In [None]:
import pandas as pd
pd.set_option('display.max_rows', 100)
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

import re
from tqdm import tqdm

from sklearn.model_selection import ParameterGrid
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Base learners whose saved predictions become the meta-model's input columns.
models = ["LR", "NB", "SVM"]  #["RF", "LR", "NB", "SVM"]

fold_ids = range(1, 6)
# One accumulator per CV fold; each base learner's predictions are appended column-wise.
meta_by_fold = {k: pd.DataFrame() for k in fold_ids}
meta_model_train = pd.DataFrame()
meta_model_test = pd.DataFrame()

for model in models:
    fold_preds = []
    for k in fold_ids:
        pred_k = pd.read_csv(f'fold_predictions/{model}/{model}_fold{k}.csv')
        meta_by_fold[k] = pd.concat([meta_by_fold[k], pred_k], axis=1)
        fold_preds.append(pred_k)

    # Stack this learner's five folds row-wise, then attach them as new columns.
    all_pred_train = pd.concat(fold_preds, axis=0)
    meta_model_train = pd.concat([meta_model_train, all_pred_train], axis=1)

    testpred = pd.read_csv(f'fold_predictions/{model}/{model}_test.csv')
    meta_model_test = pd.concat([meta_model_test, testpred], axis=1)

(meta_model_fold_1, meta_model_fold_2, meta_model_fold_3,
 meta_model_fold_4, meta_model_fold_5) = (meta_by_fold[k] for k in fold_ids)

# Positions 0-4 hold the individual folds; position 5 holds all folds combined.
data = [meta_model_fold_1, meta_model_fold_2, meta_model_fold_3,
        meta_model_fold_4, meta_model_fold_5, meta_model_train]
meta_model_train.head()

# Stacking

In [None]:
# Search space for the logistic-regression meta-learner (5 * 2 * 2 * 2 = 40 combinations).
# NOTE(review): scikit-learn 1.2 deprecated the string "none" for `penalty` in favour
# of None -- confirm against the installed version before changing it.
logreg_params = dict(
    C=[0.1, 0.5, 1.0, 1.5, 5],
    solver=["lbfgs", "newton-cg"],
    penalty=["l2", "none"],
    class_weight=["balanced", None],
)
logreg_paramgrid = [*ParameterGrid(logreg_params)]

# Search space for the SVM meta-learner (5 * 3 * 2 * 2 = 60 combinations).
svm_params = dict(
    C=[0.1, 0.5, 1.0, 1.5, 5],
    kernel=["poly", "rbf", "sigmoid"],
    gamma=["scale", "auto"],
    class_weight=["balanced", None],
)
svm_paramgrid = [*ParameterGrid(svm_params)]

In [None]:
def oversample_smote(X,y):
    """Resample (X, y) with SMOTE and return the resampled features and labels.

    The sampler is created with a fixed ``random_state`` (4103) on every call.
    """
    sampler = SMOTE(random_state=4103)
    X_resampled, y_resampled = sampler.fit_resample(X, y)
    return X_resampled, y_resampled

In [None]:
def _score_predictions(y_true, y_pred):
    """Return (accuracy, weighted F1, class-0 F1, class-1 F1) for one set of predictions."""
    report = classification_report(y_true, y_pred, output_dict=True)
    return (
        report["accuracy"],
        report["weighted avg"]["f1-score"],
        report["0"]["f1-score"],
        report["1"]["f1-score"],
    )


def models_grid_search(model_name, model_fn, model_paramgrid, data, test):
    """Grid-search a meta-model with k-fold validation plus a final test-set evaluation.

    For every parameter set, a fresh model is fitted per fold on the SMOTE-oversampled
    remaining folds and scored on the held-out fold; validation scores are averaged
    over folds. One more model is fitted on the oversampled combined training frame
    and scored on ``test``.

    Parameters
    ----------
    model_name : str
        Label stored under the ``"model"`` key of every result record.
    model_fn : callable
        Estimator factory called as ``model_fn(**params)``; the result must expose
        ``fit`` and ``predict``.
    model_paramgrid : iterable of dict
        Keyword-argument dictionaries to evaluate.
    data : sequence of DataFrame
        The per-fold frames followed by one final frame containing all training
        rows (the last element). Each frame needs a ``relevance`` label column;
        every column except the last is used as a feature.
    test : DataFrame
        Held-out test frame with the same layout.

    Returns
    -------
    list of dict
        One record per parameter set: ``"model"``, the parameters themselves, and
        the ``val_*`` / ``test_*`` scores (weighted F1, per-class F1, accuracy).

    Notes
    -----
    The train+val -> test fit does not depend on the fold, so it is done once per
    parameter set rather than once per fold. For an estimator that is deterministic
    given its parameters this yields the same ``test_*`` values as averaging the
    identical per-fold refits (up to floating-point rounding of the mean).
    """
    n_folds = len(data) - 1
    trainval_set = data[n_folds]

    # Fold assembly and the seeded SMOTE resampling do not depend on the
    # hyper-parameters, so they are prepared once up front instead of once per
    # parameter combination.
    fold_splits = []
    for i in range(n_folds):
        train_set = pd.concat([data[x] for x in range(n_folds) if x != i], axis=0)
        val_set = data[i]
        vec_train_over, label_train_over = oversample_smote(
            train_set.iloc[:, :-1], train_set.relevance
        )
        fold_splits.append(
            (vec_train_over, label_train_over, val_set.iloc[:, :-1], val_set.relevance)
        )

    vec_tv_over, label_tv_over = oversample_smote(
        trainval_set.iloc[:, :-1], trainval_set.relevance
    )
    test_features = test.iloc[:, :-1]
    test_label = test.relevance

    gridsearch_results = []
    for model_param in model_paramgrid:
        # k-fold cross-validation: train on the other folds, score on the held-out one.
        val_scores = []
        for i, (vec_train, label_train, vec_val, label_val) in enumerate(fold_splits):
            print(f"fold {i}")
            model = model_fn(**model_param)
            model.fit(vec_train, label_train)
            val_scores.append(_score_predictions(label_val, model.predict(vec_val)))
        val_accuracy, val_f1_weighted, val_f1_zero, val_f1_one = zip(*val_scores)

        # Final fit on every training row, scored once on the test set.
        model = model_fn(**model_param)
        model.fit(vec_tv_over, label_tv_over)
        test_accuracy, test_f1_weighted, test_f1_zero, test_f1_one = _score_predictions(
            test_label, model.predict(test_features)
        )

        results = {"model": model_name}
        results.update(model_param)
        results.update({"val_f1_weighted": np.mean(val_f1_weighted),
                        "val_f1_zero": np.mean(val_f1_zero), "val_f1_one": np.mean(val_f1_one),
                        "val_accuracy": np.mean(val_accuracy)})
        results.update({"test_f1_weighted": test_f1_weighted,
                        "test_f1_zero": test_f1_zero, "test_f1_one": test_f1_one,
                        "test_accuracy": test_accuracy})
        gridsearch_results.append(results)
    return gridsearch_results

In [None]:
# Run the stacking grid search with logistic regression as the meta-learner.
model_name, model_fn, model_paramgrid = "logreg", LogisticRegression, logreg_paramgrid
logreg_records = models_grid_search(model_name, model_fn, model_paramgrid, data, meta_model_test)
final_logreg_results = pd.DataFrame.from_records(logreg_records)

In [None]:
# Best configurations first: rank by validation weighted F1, ties broken by test weighted F1.
final_logreg_results = final_logreg_results.sort_values(
    by=["val_f1_weighted", "test_f1_weighted"],
    ascending=False,
)
final_logreg_results

In [None]:
# Re-apply the ranking so this cell can be run on its own, then persist the table.
final_logreg_results = final_logreg_results.sort_values(
    by=["val_f1_weighted", "test_f1_weighted"],
    ascending=False,
)
final_logreg_results.to_csv('model_results/stacking_results_lr_no_rf.csv')

In [None]:
# Run the stacking grid search with an SVM as the meta-learner.
model_name, model_fn, model_paramgrid = "svm", SVC, svm_paramgrid
svm_records = models_grid_search(model_name, model_fn, model_paramgrid, data, meta_model_test)
final_svm_results = pd.DataFrame.from_records(svm_records)

In [None]:
# Rank by validation weighted F1 (ties broken by test weighted F1) and persist the table.
final_svm_results = final_svm_results.sort_values(
    by=["val_f1_weighted", "test_f1_weighted"],
    ascending=False,
)
final_svm_results.to_csv('model_results/stacking_results_svm_norf.csv')

# Prediction Correlation

In [None]:
# Stack the training and test meta-features row-wise for the correlation analysis.
alldata = pd.concat((meta_model_train, meta_model_test), axis=0)

In [None]:
# Pairwise correlation of every column except the last one, computed once and reused.
pred_corr = alldata.iloc[:, :-1].corr()

plt.figure(figsize=(16, 6))
# Mask the upper triangle (diagonal included) so each pair is drawn only once.
# The built-in `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(pred_corr, dtype=bool))
heatmap = sns.heatmap(pred_corr, mask=mask, vmin=-1, vmax=1, annot=True)
heatmap.set_title('Correlation of Predictions of Base Models', fontdict={'fontsize':18}, pad=16);