In [1]:
import warnings 
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score, precision_score, recall_score
import lightgbm as lgb
import keras

import pickle

2023-07-31 21:50:13.135143: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load in all predicted probabilities and labels

In [2]:
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Directory that holds every base model's saved predictions and the labels.
folder = "model_predicted_proba/"

# Labels for the validation and test splits.
y_val = _load_pickle(folder + "y_val.pkl")
y_test = _load_pickle(folder + "y_test.pkl")

# Prediction frames from the embedding-based models.
# A disabled experiment dropped the "CNN Embedding LGB Preds" column (axis=1)
# from both frames; that column is currently kept.
embedding_val = _load_pickle(folder + "embedding_val_preds.pkl")
embedding_test = _load_pickle(folder + "embedding_test_preds.pkl")

# Sentiment CSVs have a header row: take the column labelled "1"
# (presumably the positive-class probability -- TODO confirm).
sentiment_val = pd.read_csv(folder + "sentiment_probabilities_val.csv")["1"]
sentiment_test = pd.read_csv(folder + "sentiment_probabilities_test.csv")["1"]

# TF-IDF CSVs are read without a header row: take the column at position 1.
tfidf_val = pd.read_csv(folder + "tfidf_probabilities_val.csv", header=None)[1]
tfidf_test = pd.read_csv(folder + "tfidf_probabilities_test.csv", header=None)[1]

In [12]:
# Append the sentiment and TF-IDF model outputs as two extra feature columns on
# the validation and test prediction frames.
#
# Assigning a pandas Series to a DataFrame column aligns on index labels, so a
# non-default index on the unpickled frames would silently produce NaNs or
# misaligned rows. Converting to NumPy first makes the assignment positional and
# raises a ValueError on a length mismatch instead of failing silently.
# NOTE(review): assumes all prediction files list rows in the same order -- confirm.
embedding_val["Sentiment Preds"] = sentiment_val.to_numpy()
embedding_val["TFIDF Preds"] = tfidf_val.to_numpy()
embedding_test["Sentiment Preds"] = sentiment_test.to_numpy()
embedding_test["TFIDF Preds"] = tfidf_test.to_numpy()


In [13]:
# Turn both prediction frames into plain NumPy feature matrices.
val_proba = np.array(embedding_val)
test_proba = np.array(embedding_test)

# Hold out 10% of the validation-set predictions; the remaining 90% is used to
# fit the blender. The seed is fixed so the split is reproducible.
(
    val_proba_train,
    val_proba_validation,
    y_val_train,
    y_val_validation,
) = train_test_split(val_proba, y_val, random_state=42, test_size=0.1)

# Model Blending via Boosting

Use boosted trees to blend the model probabilities. The base models will most likely produce very similar outputs, so we want the blender to focus on correcting the cases where they make errors.

In [14]:
# Search space for the LightGBM blender. Every entry is a list of candidate
# values, as GridSearchCV expects.

# Large cap on boosting rounds; early stopping is relied on to end training.
n_estimators = [2000]
# Small dataset, so try small learning rates.
learning_rate = [0.001, 0.01]
# LightGBM grows trees leaf-wise rather than depth-wise, so num_leaves is the
# main complexity setting to tune.
num_leaves = [5, 10, 15, 20, 50]
# Minimum number of samples required in a leaf, to limit overfitting.
min_child_samples = [10, 25, 50, 100]
# The base models may be similar, so their predictions can be redundant;
# L1 regularisation is used to suppress unnecessary features.
reg_alpha = [0.1, 0.2, 0.3]
# Patience (in rounds) for early stopping, and a fixed seed for reproducibility.
early_stopping_rounds = [50]
random_state = [42]

# Note the key is LightGBM's singular "early_stopping_round".
param_grid = dict(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    num_leaves=num_leaves,
    min_child_samples=min_child_samples,
    reg_alpha=reg_alpha,
    early_stopping_round=early_stopping_rounds,
    random_state=random_state,
)

In [17]:
# Base learner for the blender; all tuned settings come from param_grid.
lgb_model = lgb.LGBMClassifier(force_row_wise=True)

# Exhaustive search over param_grid, scored by F1 with 5-fold cross-validation
# and run on all available cores.
grid_search = GridSearchCV(
    lgb_model,
    param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# The held-out split is passed as the evaluation set, so early stopping
# monitors log-loss on it during each fit.
grid_search.fit(
    val_proba_train,
    y_val_train,
    eval_metric="logloss",
    eval_set=[(val_proba_validation, y_val_validation)],
)

# Display the winning hyper-parameter combination.
grid_search.best_params_

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[LightGBM] [Info] Number of positive: 16698, number of negative: 16720
[LightGBM] [Info] Total Bins 2015
[LightGBM] [Info] Number of data points in the train set: 33418, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499671 -> initscore=-0.001317
[LightGBM] [Info] Start training from score -0.001317
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's binary_logloss: 0.23409
[CV] END early_stopping_round=50, learning_rate=0.001, min_child_samples=10, n_estimators=2000, num_leaves=5, random_state=42, reg_alpha=0.1; total time=  13.4s
[LightGBM] [Info] Number of positive: 16698, number of negative: 16720
[LightGBM] [Info] Total Bins 2015
[LightGBM] [Info] Number of data points in the train set: 33418, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499671 -> initscore=-0.001317
[LightGBM] [Info] 

[LightGBM] [Info] Number of positive: 16698, number of negative: 16720
[LightGBM] [Info] Total Bins 2021
[LightGBM] [Info] Number of data points in the train set: 33418, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499671 -> initscore=-0.001317
[LightGBM] [Info] Start training from score -0.001317
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's binary_logloss: 0.23413
[CV] END early_stopping_round=50, learning_rate=0.001, min_child_samples=10, n_estimators=2000, num_leaves=5, random_state=42, reg_alpha=0.1; total time=  13.6s
[LightGBM] [Info] Number of positive: 16698, number of negative: 16720
[LightGBM] [Info] Total Bins 2019
[LightGBM] [Info] Number of data points in the train set: 33418, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499671 -> initscore=-0.001317
[LightGBM] [Info] Start training from score -0.001317
Training until validation s

[LightGBM] [Info] Number of positive: 16699, number of negative: 16720
[LightGBM] [Info] Total Bins 2018
[LightGBM] [Info] Number of data points in the train set: 33419, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499686 -> initscore=-0.001257
[LightGBM] [Info] Start training from score -0.001257
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's binary_logloss: 0.234132
[CV] END early_stopping_round=50, learning_rate=0.001, min_child_samples=10, n_estimators=2000, num_leaves=5, random_state=42, reg_alpha=0.1; total time=  13.7s
[LightGBM] [Info] Number of positive: 16698, number of negative: 16720
[LightGBM] [Info] Total Bins 2021
[LightGBM] [Info] Number of data points in the train set: 33418, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499671 -> initscore=-0.001317
[LightGBM] [Info] Start training from score -0.001317
Training until validation 

[LightGBM] [Info] Number of positive: 16698, number of negative: 16720
[LightGBM] [Info] Total Bins 2019
[LightGBM] [Info] Number of data points in the train set: 33418, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499671 -> initscore=-0.001317
[LightGBM] [Info] Start training from score -0.001317
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's binary_logloss: 0.234052
[CV] END early_stopping_round=50, learning_rate=0.001, min_child_samples=10, n_estimators=2000, num_leaves=5, random_state=42, reg_alpha=0.1; total time=  13.3s
[LightGBM] [Info] Number of positive: 16699, number of negative: 16720
[LightGBM] [Info] Total Bins 2013
[LightGBM] [Info] Number of data points in the train set: 33419, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499686 -> initscore=-0.001257
[LightGBM] [Info] Start training from score -0.001257
Training until validation 

[LightGBM] [Info] Number of positive: 20873, number of negative: 20900
[LightGBM] [Info] Total Bins 2021
[LightGBM] [Info] Number of data points in the train set: 41773, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499677 -> initscore=-0.001293
[LightGBM] [Info] Start training from score -0.001293
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[709]	valid_0's binary_logloss: 0.1941


{'early_stopping_round': 50,
 'learning_rate': 0.01,
 'min_child_samples': 50,
 'n_estimators': 2000,
 'num_leaves': 10,
 'random_state': 42,
 'reg_alpha': 0.3}


n_estimators = [2000] 

learning_rate = [0.001, 0.01] 

num_leaves = [5, 10, 15, 20, 50] 

min_child_samples = [10, 25, 50, 100] 

reg_alpha = [0.1, 0.2, 0.3] 


In [18]:
# Keep the estimator selected by the grid search as the final blending model.
blended_model = grid_search.best_estimator_

In [19]:
# Persist the fitted blender with pickle (relative path, so it is written to
# the current working directory).
with open('models_blended.pkl', 'wb') as handle:
    pickle.dump(blended_model, handle)

In [21]:
# Predict class labels for the test-set feature matrix with the blended model.
blended_test_preds = blended_model.predict(test_proba)

In [25]:
# Per-class precision / recall / F1 on the test set, shown to three decimals.
print(classification_report(y_test, blended_test_preds, digits = 3))

              precision    recall  f1-score   support

           0      0.933     0.933     0.933     23248
           1      0.932     0.933     0.932     23167

    accuracy                          0.933     46415
   macro avg      0.933     0.933     0.933     46415
weighted avg      0.933     0.933     0.933     46415



In [29]:
# Print one "<column> - <importance>" line per blender input, pairing each
# column of the test prediction frame with the model's importance score.
for name, score in zip(embedding_test.columns, blended_model.feature_importances_):
    print(f"{name} - {score}")

Word Embedding Preds - 568
CNN Embedding Preds - 813
Doc2Vec LR Preds - 500
Doc2Vec NN Preds - 701
Word Embedding LGB Preds - 677
CNN Embedding LGB Preds - 1071
Sentiment Preds - 438
TFIDF Preds - 1613


The TF-IDF predictions were the most important feature for the blended model, followed by the predictions from the models that used the word embeddings generated by the 1D CNN.