In [40]:
import xgboost as xgb
from sklearn.ensemble import AdaBoostRegressor
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import numpy as np
import joblib
from hyperopt import fmin, tpe, hp, Trials

import warnings

# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides warnings raised by the model libraries while
# loading or predicting -- consider narrowing the filter to a specific category.
warnings.simplefilter("ignore")


In [41]:
# Read the encoded match data from the Excel workbook into a DataFrame.
test = pd.read_excel("data/encoding.xlsx")
# Preview the first five rows (five is also the default for head()).
test.head(n=5)

Unnamed: 0,index,match_id,big_age,program_type,big_race_ethnicity,rationale_for_match,little_participant__race_ethnicity,avg_cadence_day,max_cadence_day,std,...,sentiment_change,sentiment_trend,rigidity,fixed_schedule,income_level,stability,big_age_match_start,little_age_match_start,same_gender,race_similarity
0,0,a1v2J0000027CXKQA2,25,Site,Asian;,Both are male and Hmong. They share similar in...,Asian,15.0,30,21.213203,...,0.2407,Stable,1,0,1,1,18,11,True,0.0
1,1,a1v2J0000027JFCQA2,38,Community,White or Caucasian;,L_first_name and B_first_name were matched bec...,Black or African American,76.363636,156,48.380312,...,0.0003,Stable,4,1,5,5,32,12,True,0.166667
2,2,a1v2J0000027KBoQAM,37,Site Based Facilitated,White or Caucasian;,L_first_name was really wanting to be rematche...,Black or African American,37.0,82,24.951381,...,0.0016,Stable,3,1,3,3,31,14,True,0.166667
3,3,a1v2J0000027KCEQA2,24,Site,Asian;,"BS is a leader, positive, experienced, and smi...",Asian,35.4,61,24.449949,...,0.0029,Stable,1,0,1,1,17,10,True,0.0
4,4,a1v2J0000027KCbQAM,31,Site,White or Caucasian;,Conor was open to who he worked with and I tho...,Black or African American,43.857143,90,27.853357,...,-0.0004,Stable,3,1,3,3,23,11,True,0.166667


In [42]:
# Remove the columns that are not passed to any model.
# - `columns=` states the intent directly; the previous `axis=True` only worked
#   because True compares equal to 1.
# - Reassigning instead of using `inplace=True`, together with
#   `errors="ignore"`, lets this cell be re-run without raising KeyError for
#   columns that were already dropped.
test = test.drop(
    columns=[
        # 'match_id' is deliberately kept: it identifies each row in the
        # predictions file written at the end of the notebook.
        'sentiment_trend',
        "program_type",
        "big_race_ethnicity",
        "little_participant__race_ethnicity",
        "rationale_for_match",
        "index",
    ],
    errors="ignore",
)

In [43]:
# 🚀 Define feature sets (the test file must contain these columns)
# Inputs of the time-based model.
time_features = ["call_count", "avg_cadence_day", "max_cadence_day", "std"]

# Columns that must never be treated as general-model inputs: the row
# identifier, the time features, and the prediction columns that later cells
# append to `test`. Excluding the prediction columns keeps this cell safe to
# re-run after predictions have been made.
excluded_columns = set(time_features) | {
    "match_id",
    "pred_time",
    "pred_gen",
    "match_length",
}

# Every remaining column, in the frame's own column order, feeds the general model.
# NOTE(review): presumably this order matches the one used at training time -- confirm.
general_features = [
    column for column in test.columns.to_list() if column not in excluded_columns
]


In [44]:
# 🚀 Load the trained models from disk (adjust the paths if they were saved elsewhere)
# joblib.load unpickles the files, so only load artifacts from a trusted source.
model_time, model_gen, meta_model = (
    joblib.load(model_path)
    for model_path in (
        "saved/model_time.pkl",
        "saved/model_gen.pkl",
        "saved/meta_model.pkl",
    )
)

In [45]:
# 🚀 Score every row with the time-based model
# Select the time-feature columns, then store the model output as `pred_time`.
X_test_time = test.loc[:, time_features]
test["pred_time"] = model_time.predict(X_test_time)


In [46]:
# Display the derived general-feature list to check which columns will be
# passed to the general model.
general_features

['big_age',
 'topic_consistency',
 'shared_interest',
 'career',
 'location',
 'family',
 'volunteering',
 'early_stage_score',
 'late_stage_score',
 'sentiment_change',
 'rigidity',
 'fixed_schedule',
 'income_level',
 'stability',
 'big_age_match_start',
 'little_age_match_start',
 'same_gender',
 'race_similarity']

In [47]:

# 🚀 Score every row with the general-feature model
# Select the general-feature columns, then store the model output as `pred_gen`.
X_test_gen = test.loc[:, general_features]
test["pred_gen"] = model_gen.predict(X_test_gen)


In [48]:

# 🚀 Meta-model prediction
# The meta-model takes the two base-model predictions as its only inputs and
# its output is stored as `match_length`.
meta_features = ["pred_time", "pred_gen"]
X_test_meta = test.loc[:, meta_features]
test["match_length"] = meta_model.predict(X_test_meta)

In [49]:
# Display the frame, which now includes the pred_time, pred_gen and
# match_length columns added by the prediction cells.
test

Unnamed: 0,match_id,big_age,avg_cadence_day,max_cadence_day,std,call_count,topic_consistency,shared_interest,career,location,...,fixed_schedule,income_level,stability,big_age_match_start,little_age_match_start,same_gender,race_similarity,pred_time,pred_gen,match_length
0,a1v2J0000027CXKQA2,25,15.000000,30,21.213203,2,0.589216,True,False,False,...,0,1,1,18,11,True,0.000000,3.213875,5.436646,3.117316
1,a1v2J0000027JFCQA2,38,76.363636,156,48.380312,11,0.660283,True,False,True,...,1,5,5,32,12,True,0.166667,28.791066,32.270450,28.810947
2,a1v2J0000027KBoQAM,37,37.000000,82,24.951381,8,0.631338,False,False,False,...,1,3,3,31,14,True,0.166667,13.230841,21.627053,13.778702
3,a1v2J0000027KCEQA2,24,35.400000,61,24.449949,5,0.521566,False,False,False,...,0,1,1,17,10,True,0.000000,7.748471,15.412440,8.201945
4,a1v2J0000027KCbQAM,31,43.857143,90,27.853357,7,0.599792,False,True,False,...,1,3,3,23,11,True,0.166667,13.373537,16.273606,14.306105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,a1vHt000005BXMnIAO,47,24.500000,49,34.648232,2,0.171631,True,True,False,...,1,5,5,46,12,True,0.166667,5.117086,7.904552,4.958613
296,a1vUX0000009FvtYAE,23,23.600000,36,14.536162,5,0.532257,True,False,True,...,0,1,2,22,11,False,0.000000,6.026715,3.400427,5.231708
297,a1vUX000000DcnRYAS,47,19.750000,38,17.366155,4,0.732802,True,False,True,...,1,3,3,47,10,True,0.166667,4.884884,1.896095,3.292175
298,a1vUX000000U9QrYAK,47,24.500000,35,16.663333,4,0.687605,True,False,True,...,1,5,5,47,13,True,0.000000,5.530202,10.892701,5.940356


In [50]:
# 🚀 Save predictions to a CSV file
# Only the row identifier and the final prediction are written; the DataFrame
# index is left out of the file.
output_file_path = "predictions.csv"
test.to_csv(output_file_path, columns=["match_id", "match_length"], index=False)

print(f"Predictions saved to {output_file_path}")

Predictions saved to predictions.csv
