Tuning models with embeddings

In [1]:
%run imports.py

In [2]:
# import all standard libraries
from datetime import datetime, date, timedelta
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
import feather
import nltk

import warnings
warnings.filterwarnings("ignore")

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import uniform
from statistics import mean

Choose the dataset

In [4]:
# read embeddings data
data = feather.read_dataframe("data/embeddings.feather")

In [5]:
X = data.loc[:, data.columns != 'feature_label']
y = data['feature_label']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=15, test_size=0.2)

Hyperparameter Tuning 

In [14]:
clf_lr = LogisticRegression(penalty = 'l2', class_weight='Balanced', solver='saga', random_state=17)

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }

grid_lr = GridSearchCV(clf_lr, param_grid, refit = True, verbose = 3, n_jobs=6) 
grid_lr.fit(X_train, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:  7.1min
[Parallel(n_jobs=6)]: Done  35 out of  35 | elapsed: 11.1min finished


GridSearchCV(estimator=LogisticRegression(class_weight='Balanced',
                                          random_state=17, solver='saga'),
             n_jobs=6, param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
             verbose=3)

In [15]:
valid_pred_lr = grid_lr.predict(X_test)

In [19]:
print(accuracy_score(y_test, valid_pred_lr))

0.7344017426482029


In [17]:
grid_lr.best_params_

{'C': 0.001}

In [18]:
print(classification_report(y_test, valid_pred_lr))

              precision    recall  f1-score   support

           0       0.72      0.78      0.75     22545
           1       0.76      0.69      0.72     22444

    accuracy                           0.73     44989
   macro avg       0.74      0.73      0.73     44989
weighted avg       0.74      0.73      0.73     44989



In [None]:
param_grid = {'max_depth': [10,20,50,100,200],
              'n_estimators': [100,200,500,1000]}

clf_rf = RandomForestClassifier(n_jobs=6, class_weight='balanced')

grid_rf = GridSearchCV(clf_rf, param_grid, refit = True, verbose = 3, n_jobs=6) 

grid_rf.fit(X_train, y_train)

In [24]:
grid_rf.best_params_

{'max_depth': 200, 'n_estimators': 1000}

In [None]:
valid_pred_rf = grid_rf.predict(X_test)

In [8]:
clf_rf = RandomForestClassifier(n_jobs=6, random_state=15, class_weight='balanced', max_depth=200, n_estimators=1000)
clf_rf.fit(X_train, y_train)

RandomForestClassifier(class_weight='balanced', max_depth=200,
                       n_estimators=1000, n_jobs=6, random_state=15)

In [10]:
valid_pred_rf = clf_rf.predict(X_test)

In [11]:
print(accuracy_score(y_test, valid_pred_rf))

0.7336237747004823


In [12]:
print(classification_report(y_test, valid_pred_rf))

              precision    recall  f1-score   support

           0       0.71      0.78      0.75     22545
           1       0.76      0.69      0.72     22444

    accuracy                           0.73     44989
   macro avg       0.74      0.73      0.73     44989
weighted avg       0.74      0.73      0.73     44989



In [20]:
# log reg
report = classification_report(y_test, valid_pred_lr, output_dict=True)
report_df = pd.DataFrame(report).transpose()
report_df.to_csv("results/clf_report_lr_embeddings.csv")

In [21]:
# rf 
report = classification_report(y_test, valid_pred_rf, output_dict=True)
report_df = pd.DataFrame(report).transpose()
report_df.to_csv("results/clf_report_rf_embeddings.csv")