In [63]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from bs4 import BeautifulSoup
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from seaborn import barplot
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

In [64]:
train = pd.read_csv('data/train_clean.csv')

In [65]:
test = pd.read_csv('data/test_clean.csv')

In [66]:
#Don't include "Unnamed:0" column. Separate features and outcome.

X_train = train.iloc[:, 1:5001]
X_test = test.iloc[:, 1:5001]
y_train = train['rating_class']
y_test = test['rating_class']

In [67]:
X_train.isna().sum().sum()

0

In [68]:
y_train.isna().sum().sum()

0

In [69]:
X_test.isna().sum().sum()

0

In [70]:
y_test.isna().sum().sum()

0

## Lower Level Model Testing

In [80]:
models = [MultinomialNB(), LogisticRegression(), DecisionTreeClassifier(max_depth=100), RandomForestClassifier(max_depth=100)]

In [81]:
model_names = ['MultinomialNB','LogisticRegression','DecisionTreeClassifier','RandomForestClassifier']

In [82]:
#Test low level models: Naive Bayes, DT, LogReg, RF

In [83]:
metrics=[]

for m,m_name in zip(models,model_names):
    
    m = m.fit(X=X_train,y=y_train)
    y_pred_train = m.predict(X_train)
    y_pred_test = m.predict(X_test)
    
    #Train metrics
    accuracy_train = m.score(X_train, y_train)
    precision_train = precision_score(y_train,y_pred_train)
    recall_train = recall_score(y_train,y_pred_train)
    f1_train = f1_score(y_train,y_pred_train)
    
    #Test metrics
    accuracy_test = m.score(X_test, y_test)
    precision_test = precision_score(y_test,y_pred_test)
    recall_test = recall_score(y_test,y_pred_test)
    f1_test = f1_score(y_test,y_pred_test)
    
    params = {
        'model': m_name
    }
        
    metrics.append(params | {'acc_train': accuracy_train,'prec_train':precision_train,
                            'recall_train':recall_train,'f1_train':f1_train,
                            'acc_test':accuracy_test,'prec_test':precision_test,
                            'recall_test':recall_test,'f1_test':f1_test})

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [84]:
pd.DataFrame(metrics).set_index('model')

Unnamed: 0_level_0,acc_train,prec_train,recall_train,f1_train,acc_test,prec_test,recall_test,f1_test
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MultinomialNB,0.800428,0.802703,0.948993,0.869739,0.795141,0.796767,0.948731,0.866134
LogisticRegression,0.851563,0.870095,0.92697,0.897632,0.83881,0.860162,0.918585,0.888414
DecisionTreeClassifier,0.960698,0.948178,0.998597,0.972735,0.854201,0.871886,0.927581,0.898871
RandomForestClassifier,0.970879,0.960251,0.999911,0.97968,0.888912,0.873117,0.983963,0.925232


## Hyperparameter tuning of models

## Create NN