In [10]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from bs4 import BeautifulSoup
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from seaborn import barplot
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

In [11]:
train = pd.read_csv('data/train_letters_removed.csv')

In [12]:
test = pd.read_csv('data/test_letters_removed.csv')

In [13]:
train

Unnamed: 0.1,Unnamed: 0,review,rating_class
0,0,It has no side effect I take it in combinati...,1
1,1,My son is halfway through his fourth week of ...,1
2,2,I used to take another oral contraceptive wh...,0
3,3,This is my first time using any form of birth...,1
4,4,Suboxone has completely turned my life around...,1
...,...,...,...
160393,161292,I wrote my first report in Mid October of ...,1
160394,161293,I was given this in IV before surgey I immed...,0
160395,161294,Limited improvement after months developed...,0
160396,161295,I ve been on thyroid medication years I s...,1


## Finding the best parameters for TfidfVectorizer

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer 

In [19]:
X_train = train[['review']]
X_test = test[['review']]

In [20]:
y_train = train['rating_class']
y_test = test['rating_class']

In [31]:
models = [MultinomialNB(), LogisticRegression(), DecisionTreeClassifier(max_depth=100)]

In [32]:
for model in models:
    
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)),
        ('clf', model),
    ])

    parameters = {
        'tfidf__max_df': (0.5, 0.75),
        'tfidf__min_df': (2,5),
        'tfidf__ngram_range': [(1, 2), (1, 3)],
        'tfidf__max_features': (1000,5000)
    }

    grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=2, verbose=3)
    grid_search_tune.fit(X_train['review'].values, y_train)

    print("Model:",model)
    print("Best parameters: ", grid_search_tune.best_params_)
    print("Best cross-validation score: ", grid_search_tune.best_score_)
    #the performance of the best found parameters on the test set
    #this is what you report for the evaluation of your model
    print("Test set score: ", grid_search_tune.score(X_test['review'].values, y_test))

Fitting 2 folds for each of 16 candidates, totalling 32 fits
Model: MultinomialNB()
Best parameters:  {'tfidf__max_df': 0.5, 'tfidf__max_features': 5000, 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 2)}
Best cross-validation score:  0.7960573074477237
Test set score:  0.7951412915412093
Fitting 2 folds for each of 16 candidates, totalling 32 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model: LogisticRegression()
Best parameters:  {'tfidf__max_df': 0.5, 'tfidf__max_features': 5000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Best cross-validation score:  0.836107682140675
Test set score:  0.8386602083372295
Fitting 2 folds for each of 16 candidates, totalling 32 fits
Model: DecisionTreeClassifier(max_depth=100)
Best parameters:  {'tfidf__max_df': 0.75, 'tfidf__max_features': 5000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Best cross-validation score:  0.7888315315652314
Test set score:  0.8537899048082138
[CV 1/2] END tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__min_df=2, tfidf__ngram_range=(1, 2);, score=0.766 total time=  13.8s
[CV 1/2] END tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__min_df=2, tfidf__ngram_range=(1, 3);, score=0.766 total time=  25.5s
[CV 1/2] END tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__min_df=5, tfidf__ngram_range=(1, 2);, score=0.766 total time=  14.6s
[CV 1/2] END tfidf__max_df=0.5, tfidf__max_features=10

[CV 2/2] END tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__min_df=2, tfidf__ngram_range=(1, 2);, score=0.761 total time=  13.9s
[CV 2/2] END tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__min_df=2, tfidf__ngram_range=(1, 3);, score=0.761 total time=  25.6s
[CV 2/2] END tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__min_df=5, tfidf__ngram_range=(1, 2);, score=0.762 total time=  14.6s
[CV 2/2] END tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__min_df=5, tfidf__ngram_range=(1, 3);, score=0.761 total time=  25.3s
[CV 2/2] END tfidf__max_df=0.5, tfidf__max_features=5000, tfidf__min_df=2, tfidf__ngram_range=(1, 2);, score=0.795 total time=  13.9s
[CV 2/2] END tfidf__max_df=0.5, tfidf__max_features=5000, tfidf__min_df=2, tfidf__ngram_range=(1, 3);, score=0.794 total time=  25.8s
[CV 2/2] END tfidf__max_df=0.5, tfidf__max_features=5000, tfidf__min_df=5, tfidf__ngram_range=(1, 2);, score=0.795 total time=  13.4s
[CV 2/2] END tfidf__max_df=0.5, tfidf__max_features=5000, tfid

In [30]:
## Random Forest has to be done separately because of time issues. 
## To reduce time it takes to run RF, specify metrics that were already chosen by other models 
## For example all 3 models tested said max_df=0.5 was the best, as well as ngram_range=(1,2)
## We are trying to decide if min_df should be 2 or 5, and max_df should be 0.5 or 0.75.

In [33]:
pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS,max_df=0.5,ngram_range=(1,2),max_features=5000)),
        ('clf', RandomForestClassifier(max_depth=100)),
    ])

parameters = {
    'tfidf__min_df': (2,5),
    'tfidf__max_df': (0.5,0.75)
}

grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=2, verbose=3)
grid_search_tune.fit(X_train['review'].values, y_train)

print("Model:",model)
print("Best parameters: ", grid_search_tune.best_params_)
print("Best cross-validation score: ", grid_search_tune.best_score_)
#the performance of the best found parameters on the test set
#this is what you report for the evaluation of your model
print("Test set score: ", grid_search_tune.score(X_test['review'].values, y_test))

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 1/2] END tfidf__max_df=0.5, tfidf__min_df=2;, score=0.842 total time= 2.4min
[CV 1/2] END tfidf__max_df=0.5, tfidf__min_df=5;, score=0.842 total time= 2.3min
[CV 1/2] END tfidf__max_df=0.75, tfidf__min_df=2;, score=0.842 total time= 2.4min
[CV 1/2] END tfidf__max_df=0.75, tfidf__min_df=5;, score=0.843 total time= 2.4min
[CV 2/2] END tfidf__max_df=0.5, tfidf__min_df=2;, score=0.839 total time= 2.5min
[CV 2/2] END tfidf__max_df=0.5, tfidf__min_df=5;, score=0.841 total time= 2.4min
[CV 2/2] END tfidf__max_df=0.75, tfidf__min_df=2;, score=0.840 total time= 2.4min
[CV 2/2] END tfidf__max_df=0.75, tfidf__min_df=5;, score=0.839 total time= 2.4min
Model: DecisionTreeClassifier(max_depth=100)
Best parameters:  {'tfidf__max_df': 0.5, 'tfidf__min_df': 5}
Best cross-validation score:  0.841569096871532
Test set score:  0.8894166931607788
