In [43]:
import pandas as pd
import numpy as np
import re
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from seaborn import barplot
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

In [44]:
train = pd.read_csv('data/train_clean.csv')

In [45]:
test = pd.read_csv('data/test_clean.csv')

In [46]:
#Don't include "Unnamed:0" column. Separate features and outcome.

X_train = train.iloc[:, 1:5001]
X_test = test.iloc[:, 1:5001]
y_train = train['rating_class']
y_test = test['rating_class']

## Lower Level Model Testing

In [47]:
models = [MultinomialNB(), LogisticRegression(), DecisionTreeClassifier(max_depth=100), RandomForestClassifier(max_depth=100)]

In [48]:
model_names = ['MultinomialNB','LogisticRegression','DecisionTreeClassifier','RandomForestClassifier']

In [49]:
#Test low level models: Naive Bayes, DT, LogReg, RF

In [50]:
metrics=[]

for m,m_name in zip(models,model_names):
    
    m = m.fit(X=X_train,y=y_train)
    y_pred_train = m.predict(X_train)
    y_pred_test = m.predict(X_test)
    
    #Train metrics
    accuracy_train = m.score(X_train, y_train)
    precision_train = precision_score(y_train,y_pred_train)
    recall_train = recall_score(y_train,y_pred_train)
    f1_train = f1_score(y_train,y_pred_train)
    
    #Test metrics
    accuracy_test = m.score(X_test, y_test)
    precision_test = precision_score(y_test,y_pred_test)
    recall_test = recall_score(y_test,y_pred_test)
    f1_test = f1_score(y_test,y_pred_test)
    
    params = {
        'model': m_name
    }
        
    metrics.append(params | {'acc_train': accuracy_train,'prec_train':precision_train,
                            'recall_train':recall_train,'f1_train':f1_train,
                            'acc_test':accuracy_test,'prec_test':precision_test,
                            'recall_test':recall_test,'f1_test':f1_test})

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [51]:
pd.DataFrame(metrics).set_index('model')

Unnamed: 0_level_0,acc_train,prec_train,recall_train,f1_train,acc_test,prec_test,recall_test,f1_test
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MultinomialNB,0.800421,0.802665,0.949055,0.869743,0.79501,0.796735,0.948544,0.866037
LogisticRegression,0.851925,0.870411,0.927121,0.897871,0.839502,0.860979,0.918559,0.888837
DecisionTreeClassifier,0.962375,0.94968,0.999361,0.973887,0.85465,0.871926,0.928277,0.899219
RandomForestClassifier,0.973217,0.963308,0.999938,0.981281,0.890314,0.873999,0.984981,0.926177


## Hyperparameter tuning of models

In [52]:
from sklearn.model_selection import GridSearchCV

In [53]:
# 1. Multinomial Naive Bayes
# Hyperparameter: Alpha

In [54]:
param_grid = {"alpha": np.arange(0.1,1.1,0.1)}
grid_search = GridSearchCV(MultinomialNB(), param_grid)
grid_search.fit(X=X_train, y=y_train)

GridSearchCV(estimator=MultinomialNB(),
             param_grid={'alpha': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])})

In [55]:
print("Best parameters: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)
print("Test set score: ", grid_search.score(X_test, y_test))

Best parameters:  {'alpha': 0.1}
Best cross-validation score:  0.7975660559471361
Test set score:  0.7958332554094742


In [56]:
m = MultinomialNB(alpha=0.1).fit(X=X_train,y=y_train)
y_pred_train = m.predict(X_train)
y_pred_test = m.predict(X_test)

#Train metrics
accuracy_train = m.score(X_train, y_train)
precision_train = precision_score(y_train,y_pred_train)
recall_train = recall_score(y_train,y_pred_train)
f1_train = f1_score(y_train,y_pred_train)

#Test metrics
accuracy_test = m.score(X_test, y_test)
precision_test = precision_score(y_test,y_pred_test)
recall_test = recall_score(y_test,y_pred_test)
f1_test = f1_score(y_test,y_pred_test)

In [58]:
print("Test accuracy:", accuracy_test)
print("Test precision:", precision_test)
print("Test recall:", recall_test)
print("Test F1:", f1_test)

(0.7958332554094742, 0.7978725801726275, 0.947847504819019, 0.8664178647904559)

In [29]:
# 2. Decision Tree
# Grid search was taking way too long. Decision Tree had used max_depth=100
# to reduce complexity and time originally,
# so we tried increasing it to 150. It performed better on both
# test and train sets so we used this value.

In [59]:
dt = DecisionTreeClassifier(max_depth=150).fit(X=X_train,y=y_train)

In [60]:
y_pred_train = dt.predict(X_train)
y_pred_test = dt.predict(X_test)

#Train metrics
accuracy_train = dt.score(X_train, y_train)
precision_train = precision_score(y_train,y_pred_train)
recall_train = recall_score(y_train,y_pred_train)
f1_train = f1_score(y_train,y_pred_train)

#Test metrics
accuracy_test = dt.score(X_test, y_test)
precision_test = precision_score(y_test,y_pred_test)
recall_test = recall_score(y_test,y_pred_test)
f1_test = f1_score(y_test,y_pred_test)

In [61]:
print("Test accuracy:", accuracy_test)
print("Test precision:", precision_test)
print("Test recall:", recall_test)
print("Test F1:", f1_test)

(0.8647865197957771,
 0.8921726903447558,
 0.9173002784322125,
 0.9045620148899097)

In [32]:
# 3. Logistic regression - No hyperparameter tuning was done.

In [62]:
# 4. Random Forest
rf = RandomForestClassifier(max_depth=150).fit(X=X_train,y=y_train)

In [63]:
y_pred_train = rf.predict(X_train)
y_pred_test = rf.predict(X_test)

#Train metrics
accuracy_train = rf.score(X_train, y_train)
precision_train = precision_score(y_train,y_pred_train)
recall_train = recall_score(y_train,y_pred_train)
f1_train = f1_score(y_train,y_pred_train)

#Test metrics
accuracy_test = rf.score(X_test, y_test)
precision_test = precision_score(y_test,y_pred_test)
recall_test = recall_score(y_test,y_pred_test)
f1_test = f1_score(y_test,y_pred_test)

In [64]:
print("Test accuracy:", accuracy_test)
print("Test precision:", precision_test)
print("Test recall:", recall_test)
print("Test F1:", f1_test)

Test accuracy: 0.9081745245086121
Test precision: 0.8969508613938919
Test recall: 0.9812861426429642
Test F1: 0.9372251201800142
