In [2]:
import xgboost as xgb
from sklearn.ensemble import AdaBoostRegressor
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import numpy as np
import joblib
from hyperopt import fmin, tpe, hp, Trials

import warnings

warnings.simplefilter("ignore")


In [3]:
df = pd.read_excel("data/encoding.xlsx")
df.head()

Unnamed: 0,index,match_id,big_age,program_type,big_race_ethnicity,rationale_for_match,little_participant__race_ethnicity,match_length,avg_cadence_day,max_cadence_day,...,sentiment_change,sentiment_trend,rigidity,fixed_schedule,income_level,stability,big_age_match_start,little_age_match_start,same_gender,race_similarity
0,0,a1v2J0000027CWYQA2,25,Site,Asian;,"Big, little and parent were in agreement with ...",Asian,5.5,35.0,56,...,0.2265,Stable,1,0,1,1,18,12,True,0.0
1,1,a1v2J0000027CWfQAM,26,Site,White or Caucasian;,"Both seem to like the arts, books, and present...",Black or African American,8.5,46.2,83,...,-0.0037,Stable,1,0,1,2,19,11,True,0.166667
2,2,a1v2J0000027CWiQAM,27,Site,Asian;,Both BS and LB share similar interests. BS has...,Asian,6.9,35.333333,65,...,0.033,Stable,1,0,1,1,19,12,True,0.0
3,3,a1v2J0000027CWoQAM,25,Site,Asian;,B_first_name was open to the little that he is...,Asian,7.2,43.75,106,...,-0.5276,Declined,1,0,1,1,18,8,True,0.0
4,4,a1v2J0000027CWpQAM,27,Site,White or Caucasian;,Distance is 8 miles (21 mins). Both are talkti...,Black or African American,7.4,45.75,85,...,-0.0059,Stable,1,0,1,2,20,11,True,0.166667


In [4]:
# df = pd.get_dummies(df, columns=['program_type', "sentiment_trend"])
# df.head()

In [5]:
df.drop([
    'match_id',
    'sentiment_trend',
    "program_type",
    "big_race_ethnicity",
    "little_participant__race_ethnicity",
    "rationale_for_match",
    "index"
    ], axis=True, inplace=True)

In [6]:
df.dtypes

big_age                     int64
match_length              float64
avg_cadence_day           float64
max_cadence_day             int64
std                       float64
call_count                  int64
topic_consistency         float64
shared_interest              bool
career                       bool
location                     bool
family                       bool
volunteering                 bool
early_stage_score         float64
late_stage_score          float64
sentiment_change          float64
rigidity                    int64
fixed_schedule              int64
income_level                int64
stability                   int64
big_age_match_start         int64
little_age_match_start      int64
same_gender                  bool
race_similarity           float64
dtype: object

In [7]:
timely_features = ["call_count", "avg_cadence_day", "max_cadence_day", "std"]
X = df.drop(columns=['match_length'])[timely_features]  # Features (all columns except 'target')
y = df['match_length']  # Target variable


In [8]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Feature scaling for X

# Convert y_train and y_test to NumPy arrays
y_train = y_train.values.reshape(-1, 1)  # Convert to a 2D array
y_test = y_test.values.reshape(-1, 1)    # Convert to a 2D array

# Feature scaling for X
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)
joblib.dump(scaler_X, "saved/scaler_x.pkl")

# Scaling the target variable y
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train).ravel()  # Flatten after scaling
joblib.dump(scaler_y, "saved/scaler_y.pkl")


['saved/scaler_y.pkl']

In [9]:
def objective(params):
    md = int(params['max_depth'])
    ld = int(params['lambda'])
    est = int(params['n_estimators'])
    ss = params['subsample']
    lr = params['learning_rate']
    csbt = params['colsample_bytree']
    model = lgb.LGBMRegressor(
        n_estimators = est,
        max_depth = md,
        subsample = ss,
        reg_lambda = ld,
        colsample_bytree = csbt,
        learning_rate=lr,
        n_jobs = -1,  
    )

    model.fit(X_train_scaled, y_train_scaled)
    y_pred_scaled = model.predict(X_test_scaled)
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
    score = mean_squared_error(y_test, y_pred)

    return score

In [10]:
def optimize(trial):
    params = {
        "max_depth": hp.uniform('max_depth', 2, 80),
        "learning_rate": hp.uniform('learning_rate', 0.01, 0.5),
        "subsample": hp.uniform('subsample',0.5, 1),
        "lambda": hp.uniform('lambda', 1, 10),
        "n_estimators": hp.uniform('n_estimators', 20, 500),
        "colsample_bytree": hp.uniform('colsample_bytree', 0.5, 1),
    }

    rstate = np.random.default_rng(42)
    best = fmin(
        fn=objective,
        space=params,
        algo=tpe.suggest,
        trials=trial,
        max_evals=30,
        rstate=rstate
    )
    return best


In [11]:

trials = Trials()
best = optimize(trials)
joblib.dump(best, "saved/best_hp.pkl")
print(best)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001288 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 806                      
[LightGBM] [Info] Number of data points in the train set: 2611, number of used features: 4
[LightGBM] [Info] Start training from score 0.000000  
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001479 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 806                                              
[LightGBM] [Info] Number of data points in the train set: 2611, number of used features: 4
[LightGBM] [Info] Start training from score 0.000000                          
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000524 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 806                                

In [12]:
md = int(best['max_depth'])
ld = int(best['lambda'])
est = int(best['n_estimators'])
ss = best['subsample']
lr = best['learning_rate']
csbt = best['colsample_bytree']

In [13]:

# Train an XGBoost model


model = lgb.LGBMRegressor(
    objective='regression',
    n_estimators = est,
    max_depth = md,
    subsample = ss,
    reg_lambda = ld,
    colsample_bytree = csbt,
    learning_rate=lr,
    seed=42)

# # Fit the model to the training data
# model.fit(X_train, y_train)

# Train your model (replace this with your actual model training)
model.fit(X_train_scaled, y_train_scaled)

# Make predictions
y_pred_scaled = model.predict(X_test_scaled)

# Inverse transform the predictions to original scale
y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

# Compute RMSE using the original scale
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
joblib.dump(model, "saved/model.pkl")

print(f'Mean Squared Error: {rmse}')



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000691 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 806
[LightGBM] [Info] Number of data points in the train set: 2611, number of used features: 4
[LightGBM] [Info] Start training from score 0.000000
Mean Squared Error: 2.4656388810203516


In [14]:
#  Get feature importances
feature_importances = model.feature_importances_

# Create a DataFrame to show feature importances with their respective columns
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,  # Column names (features)
    'Importance': feature_importances  # Feature importances
})

# Sort the DataFrame by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
feature_importance_df


Unnamed: 0,Feature,Importance
1,avg_cadence_day,1842
2,max_cadence_day,1319
3,std,1234
0,call_count,1215


In [15]:
timely_features = ["call_count", "avg_cadence_day", "max_cadence_day", "std"]
X_other = df.drop(columns=['match_length']+timely_features)  # Features (all columns except 'target')
y = df['match_length']  # Target variable


In [16]:

# Split the data into training and testing sets
X_other_train, X_other_test, y_train, y_test = train_test_split(X_other, y, test_size=0.2, random_state=42)

## Feature scaling for X

# Convert y_train and y_test to NumPy arrays
y_train = y_train.values.reshape(-1, 1)  # Convert to a 2D array
y_test = y_test.values.reshape(-1, 1)    # Convert to a 2D array

# Feature scaling for X
scaler_X = StandardScaler()
X_other_train_scaled = scaler_X.fit_transform(X_other_train)
X_other_test_scaled = scaler_X.transform(X_other_test)
joblib.dump(scaler_X, "saved/scaler_x_2.pkl")
# Scaling the target variable y
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train).ravel()  # Flatten after scaling
joblib.dump(best, "saved/scaler_y_2.pkl")


['saved/scaler_y_2.pkl']

In [17]:
def objective(params):
    md = int(params['max_depth'])
    ld = int(params['lambda'])
    est = int(params['n_estimators'])
    ss = params['subsample']
    lr = params['learning_rate']
    csbt = params['colsample_bytree']
    model = lgb.LGBMRegressor(
        n_estimators = est,
        max_depth = md,
        subsample = ss,
        reg_lambda = ld,
        colsample_bytree = csbt,
        learning_rate=lr,
        n_jobs = -1,  
    )

    model.fit(X_other_train_scaled, y_train_scaled)
    y_pred_scaled = model.predict(X_other_test_scaled)
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
    score = mean_squared_error(y_test, y_pred)

    return score

In [18]:

trials = Trials()
best = optimize(trials)
joblib.dump(best, "saved/best_hp_2.pkl")
print(best)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004314 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1014                     
[LightGBM] [Info] Number of data points in the train set: 2611, number of used features: 18
[LightGBM] [Info] Start training from score 0.000000  
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002624 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1014                                              
[LightGBM] [Info] Number of data points in the train set: 2611, number of used features: 18
[LightGBM] [Info] Start training from score 0.000000                           
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002718

In [19]:
md = int(best['max_depth'])
ld = int(best['lambda'])
est = int(best['n_estimators'])
ss = best['subsample']
lr = best['learning_rate']
csbt = best['colsample_bytree']

In [20]:

# Train an XGBoost model


model = lgb.LGBMRegressor(
    objective='regression',
    n_estimators = est,
    max_depth = md,
    subsample = ss,
    reg_lambda = ld,
    colsample_bytree = csbt,
    learning_rate=lr,
    seed=42)

# # Fit the model to the training data
# model.fit(X_train, y_train)

# Train your model (replace this with your actual model training)
model.fit(X_other_train_scaled, y_train_scaled)

# Make predictions
y_pred_scaled = model.predict(X_other_test_scaled)

# Inverse transform the predictions to original scale
y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

# Compute RMSE using the original scale
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
joblib.dump(model, "saved/model_2.pkl")

print(f'Mean Squared Error: {rmse}')



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003144 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1014
[LightGBM] [Info] Number of data points in the train set: 2611, number of used features: 18
[LightGBM] [Info] Start training from score 0.000000
Mean Squared Error: 11.925791853940897


In [21]:
#  Get feature importances
feature_importances = model.feature_importances_

# Create a DataFrame to show feature importances with their respective columns
feature_importance_df = pd.DataFrame({
    'Feature': X_other.columns,  # Column names (features)
    'Importance': feature_importances  # Feature importances
})

# Sort the DataFrame by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
feature_importance_df

Unnamed: 0,Feature,Importance
1,topic_consistency,1829
0,big_age,1231
14,big_age_match_start,1178
8,late_stage_score,827
7,early_stage_score,767
9,sentiment_change,734
15,little_age_match_start,620
17,race_similarity,479
13,stability,273
10,rigidity,250
