In [13]:
import xgboost as xgb
from sklearn.ensemble import AdaBoostRegressor
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import numpy as np

from hyperopt import fmin, tpe, hp, Trials

import warnings

warnings.simplefilter("ignore")


In [14]:
df = pd.read_excel("../../data/interim/modeling.xlsx")
df.head()

Unnamed: 0,match_id,big_age,program_type,match_length,avg_cadence_day,early_stage_score,late_stage_score,sentiment_change,sentiment_trend,busyness_level,big_age_match_start,little_age_match_start,same_gender,race_similarity
0,a1v2J0000027CWYQA2,25,Site,5.5,35.0,0.765,0.9905,0.2255,Stable,3,18,12,True,0.0
1,a1v2J0000027CWfQAM,26,Site,8.5,46.2,0.9987,0.9913,-0.0074,Stable,3,19,11,True,0.166667
2,a1v2J0000027CWiQAM,27,Site,6.9,35.333333,0.9612,0.9893,0.0281,Stable,3,19,12,True,0.0
3,a1v2J0000027CWoQAM,25,Site,7.2,43.75,0.9972,0.4696,-0.5276,Declined,3,18,8,True,0.0
4,a1v2J0000027CWpQAM,27,Site,7.4,45.75,0.9995,0.9825,-0.017,Stable,3,20,11,True,0.166667


In [15]:
# df = pd.get_dummies(df, columns=['program_type', "sentiment_trend"])
# df.head()

In [16]:
df.drop([
    'match_id',
    'sentiment_trend',
    "program_type"
    ], axis=True, inplace=True)

In [17]:
X = df.drop(columns=['match_length'])  # Features (all columns except 'target')
y = df['match_length']  # Target variable


In [18]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Feature scaling for X

# Convert y_train and y_test to NumPy arrays
y_train = y_train.values.reshape(-1, 1)  # Convert to a 2D array
y_test = y_test.values.reshape(-1, 1)    # Convert to a 2D array

# Feature scaling for X
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# Scaling the target variable y
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train).ravel()  # Flatten after scaling


In [19]:
def objective(params):
    md = int(params['max_depth'])
    ld = int(params['lambda'])
    est = int(params['n_estimators'])
    ss = params['subsample']
    lr = params['learning_rate']
    csbt = params['colsample_bytree']
    model = lgb.LGBMRegressor(
        n_estimators = est,
        max_depth = md,
        subsample = ss,
        reg_lambda = ld,
        colsample_bytree = csbt,
        learning_rate=lr,
        n_jobs = -1,  
    )

    model.fit(X_train_scaled, y_train_scaled)
    y_pred_scaled = model.predict(X_test_scaled)
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
    score = mean_squared_error(y_test, y_pred)

    return score

In [20]:
def optimize(trial):
    params = {
        "max_depth": hp.uniform('max_depth', 2, 80),
        "learning_rate": hp.uniform('learning_rate', 0.01, 0.5),
        "subsample": hp.uniform('subsample',0.5, 1),
        "lambda": hp.uniform('lambda', 1, 10),
        "n_estimators": hp.uniform('n_estimators', 20, 500),
        "colsample_bytree": hp.uniform('colsample_bytree', 0.5, 1),
    }

    rstate = np.random.default_rng(42)
    best = fmin(
        fn=objective,
        space=params,
        algo=tpe.suggest,
        trials=trial,
        max_evals=30,
        rstate=rstate
    )
    return best


In [21]:

trials = Trials()
best = optimize(trials)
print(best)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001437 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 982                      
[LightGBM] [Info] Number of data points in the train set: 2611, number of used features: 10
[LightGBM] [Info] Start training from score 0.000000  
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001048 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 982                                               
[LightGBM] [Info] Number of data points in the train set: 2611, number of used features: 10
[LightGBM] [Info] Start training from score 0.000000                           
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000698 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 982                            

In [22]:
md = int(best['max_depth'])
ld = int(best['lambda'])
est = int(best['n_estimators'])
ss = best['subsample']
lr = best['learning_rate']
csbt = best['colsample_bytree']

In [28]:

# Train an XGBoost model


model = lgb.LGBMRegressor(
    objective='regression',
    n_estimators = est,
    max_depth = md,
    subsample = ss,
    reg_lambda = ld,
    colsample_bytree = csbt,
    learning_rate=lr,
    seed=42)

# # Fit the model to the training data
# model.fit(X_train, y_train)

# Train your model (replace this with your actual model training)
model.fit(X_train_scaled, y_train_scaled)

# Make predictions
y_pred_scaled = model.predict(X_test_scaled)

# Inverse transform the predictions to original scale
y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

# Compute RMSE using the original scale
rmse = np.sqrt(mean_squared_error(y_test, y_pred))


print(f'Mean Squared Error: {rmse}')



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001264 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 982
[LightGBM] [Info] Number of data points in the train set: 2611, number of used features: 10
[LightGBM] [Info] Start training from score 0.000000
Mean Squared Error: 7.180282641454613


In [29]:
#  Get feature importances
feature_importances = model.feature_importances_

# Create a DataFrame to show feature importances with their respective columns
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,  # Column names (features)
    'Importance': feature_importances  # Feature importances
})

# Sort the DataFrame by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
feature_importance_df


Unnamed: 0,Feature,Importance
1,avg_cadence_day,1486
0,big_age,906
6,big_age_match_start,874
2,early_stage_score,580
3,late_stage_score,551
7,little_age_match_start,349
9,race_similarity,280
4,sentiment_change,274
5,busyness_level,96
8,same_gender,34
