In [1]:
# library
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib
#from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
# confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
# precision/recall
from sklearn.metrics import precision_score, recall_score, f1_score
# undersampling
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline, Pipeline

In [2]:
# result reproducibility
np.random.seed(42)

In [3]:
# The text data is already cleaned
inputfile = './csvfiles/output_sentiment.csv'
review = pd.read_csv(inputfile, skip_blank_lines=False)
review = review[['text', 'ovsentiment']]
# exclude NaN in 'text' column (count: 11248)
review = review[~pd.isna(review['text'])]
X = review['text'].values
y = review['ovsentiment'].values

# len(review[review['ovsentiment'] == -1])
# total number of -1 (negative review) is 12566

# len(review[review['ovsentiment'] == 0])
# total number of 0 (neutral) is 135292

# len(review[review['ovsentiment'] == 1])
# total number of 1 (positive review) is 455689

# splitting the dataset into the training set and test set
# stratify=y --> stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
        random_state=42, shuffle=True, stratify=y)

In [4]:
# TOKENIZATION
# see psentiment.py, tokenization is done using
# snowball englishstemmer
# Here, we only need to split the text
##############################################
def tokenizer(text):
    return text.split()

In [62]:
X.shape

(603547,)

In [72]:
# -1
len(review[review['ovsentiment'] == -1])/len(review) * 100

2.0820250949801755

In [9]:
# -1
len(review[review['ovsentiment'] == -1])

12566

In [71]:
# 0
len(review[review['ovsentiment'] == 0])/len(review) * 100

22.416149860739925

In [10]:
# 0
len(review[review['ovsentiment'] == 0])

135292

In [8]:
# 1
(len(review[review['ovsentiment'] == 1])/len(review)) * 100

75.5018250442799

In [None]:
# 1
len(review[review['ovsentiment'] == 1])

# test

In [6]:
rus = RandomUnderSampler(random_state=0, replacement=True)

In [8]:
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None,
                       ngram_range=(1,1), stop_words=None, tokenizer=str.split)

In [9]:
param_grid = [{'clf__penalty': ['l2'],
               'clf__C': [1.0, 10.0, 15.0, 20.0, 30.0, 40.0, 
                          50.0, 60.0, 70.0, 80.0, 90.0, 100.0],
               'clf__solver': ['newton-cg', 'saga', 'lbfgs']
              }, 
              {'clf__penalty': ['l2'],
               'clf__C': [1.0, 10.0, 15.0, 20.0, 30.0, 40.0, 
                          50.0, 60.0, 70.0, 80.0, 90.0, 100.0],
               'clf__solver': ['newton-cg', 'saga', 'lbfgs']
              }
             ]

In [None]:
lr_tfidf = Pipeline([('vect', tfidf),
                     ('rus', rus),
                     ('clf', LogisticRegression(random_state=42,
                                                multi_class='multinomial')
                      )
                    ])

# 1 - Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression

In [5]:
rus = RandomUnderSampler(random_state=42, replacement=True)

In [6]:
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None,
                       ngram_range=(1,1), stop_words=None, tokenizer=str.split)

In [7]:
param_grid = [{'clf__penalty': ['l2'],
               'clf__C': [1.0, 10.0, 15.0, 20.0, 30.0, 40.0, 
                          50.0, 60.0, 70.0, 80.0, 90.0, 100.0],
               'clf__solver': ['newton-cg', 'saga', 'lbfgs']
              }, 
              {'clf__penalty': ['l2'],
               'clf__C': [1.0, 10.0, 15.0, 20.0, 30.0, 40.0, 
                          50.0, 60.0, 70.0, 80.0, 90.0, 100.0],
               'clf__solver': ['newton-cg', 'saga', 'lbfgs']
              }
             ]

In [10]:
lr_tfidf = Pipeline([('vect', tfidf),
                     ('rus', rus),
                     ('clf', LogisticRegression(random_state=42,
                                                multi_class='multinomial')
                      )
                    ])

In [11]:
gs_lr_tfidf = GridSearchCV(estimator=lr_tfidf,
        param_grid=param_grid,
        scoring=['accuracy', 'f1_micro'],
        cv=5,
        verbose=2,
        refit='f1_micro',
        n_jobs=-1)

In [12]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] clf__C=1.0, clf__penalty=l2, clf__solver=newton-cg ..............
[CV] clf__C=1.0, clf__penalty=l2, clf__solver=newton-cg ..............
[CV] clf__C=1.0, clf__penalty=l2, clf__solver=newton-cg ..............
[CV] clf__C=1.0, clf__penalty=l2, clf__solver=newton-cg ..............
[CV] clf__C=1.0, clf__penalty=l2, clf__solver=newton-cg ..............
[CV] clf__C=1.0, clf__penalty=l2, clf__solver=saga ...................
[CV] clf__C=1.0, clf__penalty=l2, clf__solver=saga ...................
[CV] clf__C=1.0, clf__penalty=l2, clf__solver=saga ...................
[CV] clf__C=1.0, clf__penalty=l2, clf__solver=saga ...................
[CV] clf__C=1.0, clf__penalty=l2, clf__solver=saga ...................
[CV] clf__C=1.0, clf__penalty=l2, clf__solver=lbfgs ..................
[CV] clf__C=1.0, clf__penalty=l2, clf__solver=lbfgs ..................
[CV]  clf__C=1.0, clf__penalty=l2, clf__solver=newton-cg, total=  11.5s
[CV] clf__C=1.

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   52.8s


[CV] clf__C=10.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV] ... clf__C=1.0, clf__penalty=l2, clf__solver=lbfgs, total=  21.4s
[CV]  clf__C=10.0, clf__penalty=l2, clf__solver=newton-cg, total=  20.2s
[CV] clf__C=10.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV] ... clf__C=10.0, clf__penalty=l2, clf__solver=saga, total=  19.6s
[CV] clf__C=10.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV] clf__C=15.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] clf__C=15.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=10.0, clf__penalty=l2, clf__solver=saga, total=  20.4s
[CV] clf__C=15.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] clf__C=15.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=10.0, clf__penalty=l2, clf__solver=saga, total=  18.4s
[CV] clf__C=15.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=10.0, clf__penalty=l2, clf__solver=saga, total=  14.8s
[CV]



[CV]  clf__C=20.0, clf__penalty=l2, clf__solver=newton-cg, total=  13.4s
[CV] ... clf__C=20.0, clf__penalty=l2, clf__solver=saga, total=  15.4s
[CV] clf__C=30.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] clf__C=30.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV]  clf__C=20.0, clf__penalty=l2, clf__solver=newton-cg, total=  17.7s
[CV] ... clf__C=20.0, clf__penalty=l2, clf__solver=saga, total=  17.3s
[CV] clf__C=30.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] clf__C=30.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=20.0, clf__penalty=l2, clf__solver=saga, total=  20.9s
[CV] clf__C=30.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=20.0, clf__penalty=l2, clf__solver=saga, total=  21.8s
[CV] clf__C=30.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=20.0, clf__penalty=l2, clf__solver=lbfgs, total=  21.7s
[CV] clf__C=30.0, clf__penalty=l2, clf__solver=saga ..................
[C



[CV]  clf__C=30.0, clf__penalty=l2, clf__solver=newton-cg, total=  17.7s
[CV] clf__C=30.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=30.0, clf__penalty=l2, clf__solver=newton-cg, total=  17.5s
[CV]  clf__C=30.0, clf__penalty=l2, clf__solver=newton-cg, total=  18.1s
[CV] clf__C=30.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV] clf__C=30.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=30.0, clf__penalty=l2, clf__solver=newton-cg, total=  17.1s
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV]  clf__C=30.0, clf__penalty=l2, clf__solver=newton-cg, total=  19.3s
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=30.0, clf__penalty=l2, clf__solver=saga, total=  21.0s
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=30.0, clf__penalty=l2, clf__solver=saga, total=  20.4s
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=newton-cg ..........



[CV] .. clf__C=30.0, clf__penalty=l2, clf__solver=lbfgs, total=  27.2s
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=saga ..................




[CV] .. clf__C=30.0, clf__penalty=l2, clf__solver=lbfgs, total=  25.1s
[CV]  clf__C=40.0, clf__penalty=l2, clf__solver=newton-cg, total=  23.9s
[CV] .. clf__C=30.0, clf__penalty=l2, clf__solver=lbfgs, total=  25.6s
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=30.0, clf__penalty=l2, clf__solver=lbfgs, total=  25.6s
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=40.0, clf__penalty=l2, clf__solver=newton-cg, total=  24.9s
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=40.0, clf__penalty=l2, clf__solver=newton-cg, total=  24.3s
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=40.0, clf__penalty=l2, clf__solver=newton-cg, total=  24.1s
[CV] ... clf__C=40.0, clf__penalty=l2, clf__solver=saga, total=  23.9



[CV] clf__C=50.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=40.0, clf__penalty=l2, clf__solver=saga, total=  15.5s




[CV] clf__C=50.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=40.0, clf__penalty=l2, clf__solver=saga, total=   9.1s
[CV] clf__C=50.0, clf__penalty=l2, clf__solver=saga ..................
[CV] ... clf__C=40.0, clf__penalty=l2, clf__solver=saga, total=  17.5s
[CV] clf__C=50.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=40.0, clf__penalty=l2, clf__solver=lbfgs, total=  22.7s
[CV] clf__C=50.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=40.0, clf__penalty=l2, clf__solver=lbfgs, total=  23.0s
[CV] .. clf__C=40.0, clf__penalty=l2, clf__solver=lbfgs, total=  23.1s
[CV] clf__C=50.0, clf__penalty=l2, clf__solver=saga ..................
[CV] clf__C=50.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=40.0, clf__penalty=l2, clf__solver=lbfgs, total=  25.8s
[CV] clf__C=50.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV] .. clf__C=40.0, clf__penalty=l2, clf__solver=lbfgs, total=  25.8s
[CV] c



[CV]  clf__C=50.0, clf__penalty=l2, clf__solver=newton-cg, total=  27.3s
[CV] clf__C=50.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV] clf__C=60.0, clf__penalty=l2, clf__solver=newton-cg .............




[CV]  clf__C=50.0, clf__penalty=l2, clf__solver=newton-cg, total=  26.8s
[CV] clf__C=60.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=50.0, clf__penalty=l2, clf__solver=saga, total=  10.4s
[CV] clf__C=60.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=50.0, clf__penalty=l2, clf__solver=saga, total=  12.1s
[CV] clf__C=60.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=50.0, clf__penalty=l2, clf__solver=saga, total=  17.3s
[CV] ... clf__C=50.0, clf__penalty=l2, clf__solver=saga, total=  16.2s
[CV] clf__C=60.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] clf__C=60.0, clf__penalty=l2, clf__solver=saga ..................
[CV] ... clf__C=50.0, clf__penalty=l2, clf__solver=saga, total=  17.7s
[CV] clf__C=60.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=50.0, clf__penalty=l2, clf__solver=lbfgs, total=  21.5s
[CV] clf__C=60.0, clf__penalty=l2, clf__solver=saga ..................
[CV]



[CV]  clf__C=60.0, clf__penalty=l2, clf__solver=newton-cg, total=  22.4s
[CV] clf__C=60.0, clf__penalty=l2, clf__solver=lbfgs .................




[CV]  clf__C=60.0, clf__penalty=l2, clf__solver=newton-cg, total=  18.4s
[CV] clf__C=60.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=60.0, clf__penalty=l2, clf__solver=newton-cg, total=  17.0s
[CV] clf__C=70.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=60.0, clf__penalty=l2, clf__solver=saga, total=  18.3s
[CV] clf__C=70.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=60.0, clf__penalty=l2, clf__solver=saga, total=  17.8s
[CV]  clf__C=60.0, clf__penalty=l2, clf__solver=newton-cg, total=  19.6s
[CV] clf__C=70.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] clf__C=70.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=60.0, clf__penalty=l2, clf__solver=saga, total=  21.1s
[CV] ... clf__C=60.0, clf__penalty=l2, clf__solver=saga, total=  19.8s
[CV] clf__C=70.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] clf__C=70.0, clf__penalty=l2, clf__solver=saga ..................




[CV] .. clf__C=60.0, clf__penalty=l2, clf__solver=lbfgs, total=  22.7s




[CV] clf__C=70.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=70.0, clf__penalty=l2, clf__solver=newton-cg, total=  20.6s
[CV] clf__C=70.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV] .. clf__C=60.0, clf__penalty=l2, clf__solver=lbfgs, total=  25.0s
[CV] clf__C=70.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=70.0, clf__penalty=l2, clf__solver=newton-cg, total=  22.8s
[CV]  clf__C=70.0, clf__penalty=l2, clf__solver=newton-cg, total=  24.1s
[CV] clf__C=70.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV] clf__C=70.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=70.0, clf__penalty=l2, clf__solver=newton-cg, total=  23.0s
[CV] clf__C=80.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=70.0, clf__penalty=l2, clf__solver=saga, total=  22.4s
[CV] clf__C=80.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV]  clf__C=70.0, clf__penalty=l2, clf__solver=newton-cg, total=  23



[CV] .. clf__C=70.0, clf__penalty=l2, clf__solver=lbfgs, total=  22.5s
[CV] clf__C=80.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=70.0, clf__penalty=l2, clf__solver=lbfgs, total=  27.2s
[CV] clf__C=80.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=70.0, clf__penalty=l2, clf__solver=lbfgs, total=  28.2s
[CV] clf__C=80.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=70.0, clf__penalty=l2, clf__solver=lbfgs, total=  29.5s
[CV] clf__C=80.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV] .. clf__C=70.0, clf__penalty=l2, clf__solver=lbfgs, total=  30.6s
[CV]  clf__C=80.0, clf__penalty=l2, clf__solver=newton-cg, total=  29.4s
[CV] clf__C=80.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV] clf__C=80.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=80.0, clf__penalty=l2, clf__solver=newton-cg, total=  30.3s
[CV] clf__C=80.0, clf__penalty=l2, clf__solver=lbfgs .................
[C

[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  6.3min


[CV] clf__C=80.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=80.0, clf__penalty=l2, clf__solver=newton-cg, total=  28.8s
[CV]  clf__C=80.0, clf__penalty=l2, clf__solver=newton-cg, total=  29.3s
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=80.0, clf__penalty=l2, clf__solver=saga, total=  25.8s
[CV] ... clf__C=80.0, clf__penalty=l2, clf__solver=saga, total=  24.6s
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=newton-cg .............




[CV] ... clf__C=80.0, clf__penalty=l2, clf__solver=saga, total=  15.4s
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=80.0, clf__penalty=l2, clf__solver=saga, total=  19.5s
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=saga ..................
[CV] ... clf__C=80.0, clf__penalty=l2, clf__solver=saga, total=  21.6s
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=80.0, clf__penalty=l2, clf__solver=lbfgs, total=  24.8s
[CV] .. clf__C=80.0, clf__penalty=l2, clf__solver=lbfgs, total=  24.9s
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=saga ..................
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=80.0, clf__penalty=l2, clf__solver=lbfgs, total=  24.8s
[CV] .. clf__C=80.0, clf__penalty=l2, clf__solver=lbfgs, total=  24.5s
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=saga ..................
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV] .



[CV]  clf__C=90.0, clf__penalty=l2, clf__solver=newton-cg, total=  29.4s
[CV] clf__C=100.0, clf__penalty=l2, clf__solver=newton-cg ............




[CV] ... clf__C=90.0, clf__penalty=l2, clf__solver=saga, total=  12.2s
[CV] clf__C=100.0, clf__penalty=l2, clf__solver=newton-cg ............
[CV] ... clf__C=90.0, clf__penalty=l2, clf__solver=saga, total=  15.1s
[CV] clf__C=100.0, clf__penalty=l2, clf__solver=newton-cg ............
[CV]  clf__C=90.0, clf__penalty=l2, clf__solver=newton-cg, total=  20.2s
[CV] clf__C=100.0, clf__penalty=l2, clf__solver=newton-cg ............
[CV] ... clf__C=90.0, clf__penalty=l2, clf__solver=saga, total=  20.8s
[CV] clf__C=100.0, clf__penalty=l2, clf__solver=newton-cg ............
[CV] ... clf__C=90.0, clf__penalty=l2, clf__solver=saga, total=  21.6s
[CV] ... clf__C=90.0, clf__penalty=l2, clf__solver=saga, total=  22.1s
[CV] clf__C=100.0, clf__penalty=l2, clf__solver=saga .................
[CV] clf__C=100.0, clf__penalty=l2, clf__solver=saga .................
[CV] .. clf__C=90.0, clf__penalty=l2, clf__solver=lbfgs, total=  22.2s
[CV] clf__C=100.0, clf__penalty=l2, clf__solver=saga .................
[CV]



[CV]  clf__C=100.0, clf__penalty=l2, clf__solver=newton-cg, total=  25.6s




[CV] clf__C=100.0, clf__penalty=l2, clf__solver=lbfgs ................




[CV]  clf__C=100.0, clf__penalty=l2, clf__solver=newton-cg, total=  17.8s
[CV] clf__C=100.0, clf__penalty=l2, clf__solver=lbfgs ................
[CV]  clf__C=100.0, clf__penalty=l2, clf__solver=newton-cg, total=  19.4s
[CV] clf__C=100.0, clf__penalty=l2, clf__solver=lbfgs ................
[CV]  clf__C=100.0, clf__penalty=l2, clf__solver=newton-cg, total=  19.0s
[CV] clf__C=1.0, clf__penalty=l2, clf__solver=newton-cg ..............
[CV] .. clf__C=100.0, clf__penalty=l2, clf__solver=saga, total=  21.5s
[CV] clf__C=1.0, clf__penalty=l2, clf__solver=newton-cg ..............
[CV]  clf__C=100.0, clf__penalty=l2, clf__solver=newton-cg, total=  20.2s
[CV] .. clf__C=100.0, clf__penalty=l2, clf__solver=saga, total=  21.4s
[CV] clf__C=1.0, clf__penalty=l2, clf__solver=newton-cg ..............
[CV] .. clf__C=100.0, clf__penalty=l2, clf__solver=saga, total=  19.8s
[CV] clf__C=1.0, clf__penalty=l2, clf__solver=newton-cg ..............
[CV] .. clf__C=100.0, clf__penalty=l2, clf__solver=saga, total=  

[CV] clf__C=20.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV] .. clf__C=15.0, clf__penalty=l2, clf__solver=lbfgs, total=  25.3s
[CV] clf__C=20.0, clf__penalty=l2, clf__solver=lbfgs .................




[CV]  clf__C=20.0, clf__penalty=l2, clf__solver=newton-cg, total=  19.1s
[CV] clf__C=20.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=20.0, clf__penalty=l2, clf__solver=newton-cg, total=  15.4s
[CV] clf__C=20.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=20.0, clf__penalty=l2, clf__solver=newton-cg, total=  17.9s
[CV] clf__C=20.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=20.0, clf__penalty=l2, clf__solver=newton-cg, total=  17.1s
[CV] clf__C=30.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV]  clf__C=20.0, clf__penalty=l2, clf__solver=newton-cg, total=  16.7s
[CV] clf__C=30.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=20.0, clf__penalty=l2, clf__solver=saga, total=  16.8s
[CV] clf__C=30.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=20.0, clf__penalty=l2, clf__solver=saga, total=  16.9s
[CV] ... clf__C=20.0, clf__penalty=l2, clf__solver=saga, total=  16



[CV] clf__C=30.0, clf__penalty=l2, clf__solver=saga ..................




[CV] .. clf__C=20.0, clf__penalty=l2, clf__solver=lbfgs, total=  24.8s
[CV] clf__C=30.0, clf__penalty=l2, clf__solver=saga ..................




[CV] .. clf__C=20.0, clf__penalty=l2, clf__solver=lbfgs, total=  25.3s
[CV] .. clf__C=20.0, clf__penalty=l2, clf__solver=lbfgs, total=  26.4s
[CV] clf__C=30.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV] clf__C=30.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=30.0, clf__penalty=l2, clf__solver=newton-cg, total=  24.4s
[CV] clf__C=30.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=30.0, clf__penalty=l2, clf__solver=newton-cg, total=  24.3s




[CV] clf__C=30.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=30.0, clf__penalty=l2, clf__solver=newton-cg, total=  23.3s
[CV] clf__C=30.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=30.0, clf__penalty=l2, clf__solver=newton-cg, total=  22.6s
[CV]  clf__C=30.0, clf__penalty=l2, clf__solver=newton-cg, total=  23.1s
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=30.0, clf__penalty=l2, clf__solver=saga, total=  22.8s
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=30.0, clf__penalty=l2, clf__solver=saga, total=  21.0s
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=newton-cg .............




[CV] ... clf__C=30.0, clf__penalty=l2, clf__solver=saga, total=  10.7s
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=30.0, clf__penalty=l2, clf__solver=saga, total=   8.1s
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=saga ..................
[CV] ... clf__C=30.0, clf__penalty=l2, clf__solver=saga, total=  11.1s
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=30.0, clf__penalty=l2, clf__solver=lbfgs, total=  24.3s




[CV] clf__C=40.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=30.0, clf__penalty=l2, clf__solver=lbfgs, total=  24.7s
[CV] .. clf__C=30.0, clf__penalty=l2, clf__solver=lbfgs, total=  23.8s
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=saga ..................
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=30.0, clf__penalty=l2, clf__solver=lbfgs, total=  26.0s
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV] .. clf__C=30.0, clf__penalty=l2, clf__solver=lbfgs, total=  25.4s
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=40.0, clf__penalty=l2, clf__solver=newton-cg, total=  26.3s
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=40.0, clf__penalty=l2, clf__solver=newton-cg, total=  26.1s
[CV]  clf__C=40.0, clf__penalty=l2, clf__solver=newton-cg, total=  26.2s
[CV] clf__C=40.0, clf__penalty=l2, clf__solver=lbfgs .................




[CV]  clf__C=40.0, clf__penalty=l2, clf__solver=newton-cg, total=  27.3s
[CV] clf__C=50.0, clf__penalty=l2, clf__solver=newton-cg .............




[CV]  clf__C=40.0, clf__penalty=l2, clf__solver=newton-cg, total=  17.2s
[CV] clf__C=50.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=40.0, clf__penalty=l2, clf__solver=saga, total=  11.2s
[CV] clf__C=50.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=40.0, clf__penalty=l2, clf__solver=saga, total=   9.4s
[CV] clf__C=50.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=40.0, clf__penalty=l2, clf__solver=saga, total=  17.6s
[CV] clf__C=50.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=40.0, clf__penalty=l2, clf__solver=saga, total=  19.2s
[CV] ... clf__C=40.0, clf__penalty=l2, clf__solver=saga, total=  18.7s
[CV] clf__C=50.0, clf__penalty=l2, clf__solver=saga ..................
[CV] clf__C=50.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=40.0, clf__penalty=l2, clf__solver=lbfgs, total=  21.1s
[CV] clf__C=50.0, clf__penalty=l2, clf__solver=saga ..................
[CV]



[CV]  clf__C=50.0, clf__penalty=l2, clf__solver=newton-cg, total=  15.5s
[CV] clf__C=50.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=50.0, clf__penalty=l2, clf__solver=newton-cg, total=  18.6s
[CV] clf__C=50.0, clf__penalty=l2, clf__solver=lbfgs .................




[CV]  clf__C=50.0, clf__penalty=l2, clf__solver=newton-cg, total=  19.0s
[CV] clf__C=60.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV]  clf__C=50.0, clf__penalty=l2, clf__solver=newton-cg, total=  17.8s
[CV] ... clf__C=50.0, clf__penalty=l2, clf__solver=saga, total=  16.0s
[CV] ... clf__C=50.0, clf__penalty=l2, clf__solver=saga, total=  17.0s
[CV] clf__C=60.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] clf__C=60.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] clf__C=60.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=50.0, clf__penalty=l2, clf__solver=saga, total=  19.9s
[CV] ... clf__C=50.0, clf__penalty=l2, clf__solver=saga, total=  19.4s
[CV] clf__C=60.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] .. clf__C=50.0, clf__penalty=l2, clf__solver=lbfgs, total=  20.7s
[CV] clf__C=60.0, clf__penalty=l2, clf__solver=saga ..................
[CV] ... clf__C=50.0, clf__penalty=l2, clf__solver=saga, total=  19.4s
[C



[CV] .. clf__C=50.0, clf__penalty=l2, clf__solver=lbfgs, total=  23.6s




[CV] clf__C=60.0, clf__penalty=l2, clf__solver=lbfgs .................




[CV]  clf__C=60.0, clf__penalty=l2, clf__solver=newton-cg, total=  23.2s
[CV] clf__C=60.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=60.0, clf__penalty=l2, clf__solver=newton-cg, total=  20.0s
[CV] clf__C=60.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=60.0, clf__penalty=l2, clf__solver=newton-cg, total=  20.0s
[CV]  clf__C=60.0, clf__penalty=l2, clf__solver=newton-cg, total=  19.2s
[CV]  clf__C=60.0, clf__penalty=l2, clf__solver=newton-cg, total=  20.1s
[CV] ... clf__C=60.0, clf__penalty=l2, clf__solver=saga, total=  18.7s
[CV] clf__C=60.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV] clf__C=70.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=60.0, clf__penalty=l2, clf__solver=saga, total=  18.4s
[CV] clf__C=70.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=60.0, clf__penalty=l2, clf__solver=saga, total=  19.1s
[CV] clf__C=70.0, clf__penalty=l2, clf__solver=newton-cg ..........



[CV] .. clf__C=60.0, clf__penalty=l2, clf__solver=lbfgs, total=  25.6s
[CV] clf__C=70.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=60.0, clf__penalty=l2, clf__solver=lbfgs, total=  28.8s




[CV] clf__C=70.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=60.0, clf__penalty=l2, clf__solver=lbfgs, total=  26.5s
[CV] .. clf__C=60.0, clf__penalty=l2, clf__solver=lbfgs, total=  25.4s
[CV] clf__C=70.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV] clf__C=70.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=70.0, clf__penalty=l2, clf__solver=newton-cg, total=  26.2s
[CV] clf__C=70.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=70.0, clf__penalty=l2, clf__solver=newton-cg, total=  27.5s
[CV] clf__C=70.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=70.0, clf__penalty=l2, clf__solver=newton-cg, total=  28.0s
[CV] clf__C=70.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=70.0, clf__penalty=l2, clf__solver=newton-cg, total=  27.4s
[CV]  clf__C=70.0, clf__penalty=l2, clf__solver=newton-cg, total=  26.9s
[CV] clf__C=80.0, clf__penalty=l2, clf__solver=newton-cg ..........



[CV] ... clf__C=70.0, clf__penalty=l2, clf__solver=saga, total=  20.6s
[CV] clf__C=80.0, clf__penalty=l2, clf__solver=newton-cg .............




[CV] ... clf__C=70.0, clf__penalty=l2, clf__solver=saga, total=   8.8s
[CV] clf__C=80.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=70.0, clf__penalty=l2, clf__solver=saga, total=  11.9s
[CV] clf__C=80.0, clf__penalty=l2, clf__solver=saga ..................
[CV] ... clf__C=70.0, clf__penalty=l2, clf__solver=saga, total=  21.9s
[CV] clf__C=80.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=70.0, clf__penalty=l2, clf__solver=lbfgs, total=  23.3s
[CV] clf__C=80.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=70.0, clf__penalty=l2, clf__solver=lbfgs, total=  25.5s
[CV] .. clf__C=70.0, clf__penalty=l2, clf__solver=lbfgs, total=  25.1s
[CV] clf__C=80.0, clf__penalty=l2, clf__solver=saga ..................
[CV] clf__C=80.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=70.0, clf__penalty=l2, clf__solver=lbfgs, total=  25.2s
[CV] clf__C=80.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV] .



[CV]  clf__C=80.0, clf__penalty=l2, clf__solver=newton-cg, total=  27.2s
[CV] clf__C=80.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=80.0, clf__penalty=l2, clf__solver=newton-cg, total=  27.5s
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=newton-cg .............




[CV]  clf__C=80.0, clf__penalty=l2, clf__solver=newton-cg, total=  17.0s
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=80.0, clf__penalty=l2, clf__solver=saga, total=  11.0s
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=80.0, clf__penalty=l2, clf__solver=saga, total=  13.6s
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=80.0, clf__penalty=l2, clf__solver=saga, total=  16.8s
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=newton-cg .............
[CV] ... clf__C=80.0, clf__penalty=l2, clf__solver=saga, total=  18.5s
[CV] ... clf__C=80.0, clf__penalty=l2, clf__solver=saga, total=  18.1s
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=saga ..................
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=saga ..................
[CV] .. clf__C=80.0, clf__penalty=l2, clf__solver=lbfgs, total=  20.4s
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=saga ..................
[CV]



[CV]  clf__C=90.0, clf__penalty=l2, clf__solver=newton-cg, total=  19.9s
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=90.0, clf__penalty=l2, clf__solver=newton-cg, total=  18.7s
[CV] clf__C=90.0, clf__penalty=l2, clf__solver=lbfgs .................
[CV]  clf__C=90.0, clf__penalty=l2, clf__solver=newton-cg, total=  19.8s
[CV] clf__C=100.0, clf__penalty=l2, clf__solver=newton-cg ............
[CV]  clf__C=90.0, clf__penalty=l2, clf__solver=newton-cg, total=  20.2s
[CV] clf__C=100.0, clf__penalty=l2, clf__solver=newton-cg ............
[CV] ... clf__C=90.0, clf__penalty=l2, clf__solver=saga, total=  17.5s
[CV] ... clf__C=90.0, clf__penalty=l2, clf__solver=saga, total=  19.3s
[CV] clf__C=100.0, clf__penalty=l2, clf__solver=newton-cg ............
[CV] clf__C=100.0, clf__penalty=l2, clf__solver=newton-cg ............
[CV] ... clf__C=90.0, clf__penalty=l2, clf__solver=saga, total=  20.5s
[CV] ... clf__C=90.0, clf__penalty=l2, clf__solver=saga, total=  18.5



[CV] .. clf__C=90.0, clf__penalty=l2, clf__solver=lbfgs, total=  22.8s
[CV] clf__C=100.0, clf__penalty=l2, clf__solver=lbfgs ................
[CV] .. clf__C=90.0, clf__penalty=l2, clf__solver=lbfgs, total=  22.6s




[CV] clf__C=100.0, clf__penalty=l2, clf__solver=lbfgs ................
[CV]  clf__C=100.0, clf__penalty=l2, clf__solver=newton-cg, total=  23.5s
[CV]  clf__C=100.0, clf__penalty=l2, clf__solver=newton-cg, total=  24.7s
[CV] clf__C=100.0, clf__penalty=l2, clf__solver=lbfgs ................
[CV]  clf__C=100.0, clf__penalty=l2, clf__solver=newton-cg, total=  24.9s
[CV] clf__C=100.0, clf__penalty=l2, clf__solver=lbfgs ................
[CV]  clf__C=100.0, clf__penalty=l2, clf__solver=newton-cg, total=  24.8s
[CV] clf__C=100.0, clf__penalty=l2, clf__solver=lbfgs ................
[CV] .. clf__C=100.0, clf__penalty=l2, clf__solver=saga, total=  23.8s
[CV] .. clf__C=100.0, clf__penalty=l2, clf__solver=saga, total=  21.9s
[CV]  clf__C=100.0, clf__penalty=l2, clf__solver=newton-cg, total=  26.0s
[CV] .. clf__C=100.0, clf__penalty=l2, clf__solver=saga, total=  22.4s
[CV] .. clf__C=100.0, clf__penalty=l2, clf__solver=saga, total=  14.0s
[CV] .. clf__C=100.0, clf__penalty=l2, clf__solver=saga, total

[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 16.5min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...alty='l2', random_state=42, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'clf__penalty': ['l2'], 'clf__C': [1.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0], 'clf__solver': ['newton-cg', 'saga', 'lbfgs']}, {'clf__penalty': ['l2'], 'clf__C': [1.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0], 'clf__solver': ['newton-cg', 'saga', 'lbfgs']}],
       pre_dispatch='2*n_jobs', refit='f1_micro',
       return_train_score='warn', scoring=['accuracy', 'f1_micro'],
       verbos

In [13]:
best_parameters = gs_lr_tfidf.best_params_
best_estimator = gs_lr_tfidf.best_estimator_
result = gs_lr_tfidf.cv_results_

In [14]:
best_parameters

{'clf__C': 80.0, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs'}

In [15]:
# PERFORMANCE MEASURE
#####################
# Stratified k-fold CV
skfolds = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

for train_index, test_index in skfolds.split(X_train, y_train):
    clone_estimator = clone(best_estimator)
    X_train_folds = X_train[train_index]
    y_train_folds = (y_train[train_index])
    X_test_fold = X_train[test_index]
    y_test_fold = (y_train[test_index])

    clone_estimator.fit(X_train_folds, y_train_folds)
    y_pred = clone_estimator.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))

0.9481199971005188
0.9492274873664154
0.9479744843012178
0.9488743450075596
0.9506969326678126


In [30]:
# Dumb Classifier
from sklearn.base import BaseEstimator
class DumbClassifier(BaseEstimator):
    def fit(self, X, y=None):
        pass
    def predict(self, X):
        return np.zeros((len(X), 1), dtype=bool)
dumb = DumbClassifier()
cross_val_score(dumb, X_train, y_train, cv=5, scoring='accuracy')

array([0.22451537, 0.22286886, 0.22508725, 0.22405169, 0.22427952])

In [16]:
y_train_pred = cross_val_predict(best_estimator, X_train, y_train, cv=5,
        verbose=2, n_jobs=-1)

conf_mx = confusion_matrix(y_train, y_train_pred)

[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   13.5s remaining:    9.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.4s finished


In [17]:
conf_mx

array([[  9434,    545,     74],
       [  5294, 101051,   1888],
       [  2528,  14881, 347142]])

In [18]:
(precision_score(y_train, y_train_pred, average='micro'),
        recall_score(y_train, y_train_pred, average='micro'))

(0.9477877627439488, 0.9477877627439488)

In [19]:
f1_score(y_train, y_train_pred, average='micro')

0.9477877627439488

In [20]:
# TEST MODEL ON TEST DATA
#########################
y_test_pred = cross_val_predict(best_estimator, X_test, y_test, cv=5,
        verbose=2, n_jobs=-1)

conf_mx = confusion_matrix(y_test, y_test_pred)

[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    4.7s remaining:    3.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.0s finished


In [21]:
conf_mx

array([[ 2291,   189,    33],
       [ 1955, 24489,   615],
       [ 1260,  5667, 84211]])

In [22]:
(precision_score(y_test, y_test_pred, average='micro'),
        recall_score(y_test, y_test_pred, average='micro'))

(0.919484715433684, 0.919484715433684)

In [23]:
f1_score(y_test, y_test_pred, average='micro')

0.919484715433684

In [24]:
# save fitted model to file
joblib.dump(best_estimator, './training/sentiment_logreg_undersample.pkl')

['./training/sentiment_logreg_undersample.pkl']

# 2 - Decision Trees

In [6]:
from sklearn.tree import DecisionTreeClassifier

In [26]:
# The text data is already cleaned
inputfile = './csvfiles/output_sentiment.csv'
review = pd.read_csv(inputfile, skip_blank_lines=False)
review = review[['text', 'ovsentiment']]
# exclude NaN in 'text' column (count: 11248)
review = review[~pd.isna(review['text'])]
X = review['text'].values
y = review['ovsentiment'].values

# len(review[review['ovsentiment'] == -1])
# total number of -1 (negative review) is 12566

# len(review[review['ovsentiment'] == 0])
# total number of 0 (neutral) is 135292

# len(review[review['ovsentiment'] == 1])
# total number of 1 (positive review) is 455689

# splitting the dataset into the training set and test set
# stratify=y --> stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, 
        random_state=42, shuffle=True, stratify=y)

In [27]:
X_train.shape

(301773,)

In [28]:
rus = RandomUnderSampler(random_state=42, replacement=True)

In [29]:
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None,
                       ngram_range=(1,1), stop_words=None, tokenizer=str.split)

In [30]:
param_grid = [{'clf__max_depth': [5, 6, 7],
               'clf__min_samples_leaf': [4, 5, 6],
               'clf__min_samples_split': [5, 6, 7],
               'clf__max_features': ['auto', 'log2', 'sqrt']
              }, 
              {'clf__max_depth': [5, 6, 7],
               'clf__min_samples_leaf': [4, 5, 6],
               'clf__min_samples_split': [5, 6, 7],
               'clf__max_features': ['auto', 'log2','sqrt']
              }
             ]

In [32]:
lr_tfidf = Pipeline([('vect', tfidf),
                     ('rus', rus),
                     ('clf', DecisionTreeClassifier(random_state=42)
                     )
                    ])

In [33]:
gs_lr_tfidf = GridSearchCV(estimator=lr_tfidf,
        param_grid=param_grid,
        scoring=['accuracy', 'f1_micro'],
        cv=5,
        verbose=2,
        refit='f1_micro',
        n_jobs=-1)

In [34]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV] clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=5 
[CV] clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=5 
[CV] clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=5 
[CV] clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=5 
[CV] clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=5 
[CV] clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=6 
[CV] clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=6 
[CV] clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=6 
[CV] clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=6 
[CV] clf__max_depth=5, clf__max_features=auto, clf__min

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   16.5s


[CV] clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=7 
[CV]  clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=5, total=   3.6s
[CV]  clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=5, total=   3.7s
[CV] clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=7 
[CV]  clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=5, total=   3.6s
[CV] clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=5 
[CV]  clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=6, total=   3.5s
[CV] clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=5 
[CV] clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=5 
[CV]  clf__max_depth=5, clf__max_features=auto, clf__min_s

[CV]  clf__max_depth=5, clf__max_features=log2, clf__min_samples_leaf=4, clf__min_samples_split=7, total=   3.7s
[CV] clf__max_depth=5, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=6 
[CV]  clf__max_depth=5, clf__max_features=log2, clf__min_samples_leaf=4, clf__min_samples_split=7, total=   3.6s
[CV] clf__max_depth=5, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=6 
[CV] clf__max_depth=5, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=6 
[CV]  clf__max_depth=5, clf__max_features=log2, clf__min_samples_leaf=4, clf__min_samples_split=7, total=   3.7s
[CV] clf__max_depth=5, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=7 
[CV]  clf__max_depth=5, clf__max_features=log2, clf__min_samples_leaf=4, clf__min_samples_split=7, total=   3.5s
[CV] clf__max_depth=5, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=7 
[CV]  clf__max_depth=5, clf__max_features=log2, clf__min_s

[CV]  clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=4, clf__min_samples_split=6, total=   3.5s
[CV] clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=5, clf__min_samples_split=5 
[CV] clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=5, clf__min_samples_split=5 
[CV]  clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=4, clf__min_samples_split=6, total=   3.7s
[CV] clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=5, clf__min_samples_split=5 
[CV]  clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=4, clf__min_samples_split=6, total=   3.5s
[CV] clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=5, clf__min_samples_split=5 
[CV]  clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=4, clf__min_samples_split=6, total=   3.6s
[CV]  clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=4, clf__min_samples_split=7, total=   3.5s
[CV]  clf__max_depth=5, clf__max_features=s

[CV] clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=7 
[CV]  clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=7, total=   3.7s
[CV] clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=7 
[CV]  clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=5, total=   3.8s
[CV] clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=7 
[CV]  clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=5, total=   3.6s
[CV] clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=7 
[CV]  clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=5, total=   3.8s
[CV]  clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=5, total=   3.5s
[CV] clf__max_depth=6, clf__max_features=au

[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.6min


[CV]  clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=5, total=   3.9s
[CV] clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=5 
[CV]  clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=6, total=   3.5s
[CV] clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=5 
[CV] clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=5 
[CV]  clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=6, total=   3.5s
[CV]  clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=6, total=   3.5s
[CV]  clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=6, total=   3.5s
[CV] clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=5 
[CV] clf__max_depth=6, clf__max_features=au

[CV] clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=4, clf__min_samples_split=6 
[CV]  clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=7, total=   3.5s
[CV] clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=4, clf__min_samples_split=7 
[CV]  clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=7, total=   3.5s
[CV] clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=4, clf__min_samples_split=7 
[CV]  clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=4, clf__min_samples_split=5, total=   3.6s
[CV] clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=4, clf__min_samples_split=7 
[CV]  clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=4, clf__min_samples_split=5, total=   3.6s
[CV] clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=4, clf__min_samples_split=7 
[CV]  clf__max_depth=6, clf__max_features=log2, clf__min_s

[CV] clf__max_depth=6, clf__max_features=sqrt, clf__min_samples_leaf=4, clf__min_samples_split=5 
[CV]  clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=6, total=   3.6s
[CV] clf__max_depth=6, clf__max_features=sqrt, clf__min_samples_leaf=4, clf__min_samples_split=5 
[CV]  clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=6, total=   3.5s
[CV] clf__max_depth=6, clf__max_features=sqrt, clf__min_samples_leaf=4, clf__min_samples_split=6 
[CV]  clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=6, total=   3.6s
[CV]  clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=7, total=   3.5s
[CV] clf__max_depth=6, clf__max_features=sqrt, clf__min_samples_leaf=4, clf__min_samples_split=6 
[CV]  clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=7, total=   3.7s
[CV] clf__max_depth=6, clf__max_features=sq

[CV]  clf__max_depth=6, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=5, total=   3.7s
[CV] clf__max_depth=6, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=7 
[CV]  clf__max_depth=6, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=5, total=   3.7s
[CV]  clf__max_depth=6, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=5, total=   3.6s
[CV] clf__max_depth=6, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=7 
[CV]  clf__max_depth=6, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=5, total=   3.6s
[CV] clf__max_depth=6, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=7 
[CV] clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=5 
[CV] clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=5 
[CV]  clf__max_depth=6, clf__max_features=sqrt, clf__min_s

[CV]  clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=7, total=   3.5s
[CV] clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=6 
[CV] clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=6 
[CV]  clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=7, total=   3.6s
[CV] clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=6 
[CV]  clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=7, total=   3.5s
[CV] clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=6 
[CV]  clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=7, total=   3.9s
[CV] clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=7 
[CV]  clf__max_depth=7, clf__max_features=auto, clf__min_s

[CV] clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=5 
[CV]  clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=5, total=   3.4s
[CV] clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=5 
[CV]  clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=6, total=   3.6s
[CV] clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=5 
[CV]  clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=6, total=   3.6s
[CV] clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=5 
[CV]  clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=6, total=   3.5s
[CV] clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=5 
[CV]  clf__max_depth=7, clf__max_features=log2, clf__min_s

[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  3.9min


[CV]  clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=7, total=   3.6s
[CV]  clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=7, total=   3.9s
[CV] clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=6 
[CV]  clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=7, total=   3.6s
[CV] clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=6 
[CV] clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=7 
[CV]  clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=7, total=   3.7s
[CV] clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=7 
[CV]  clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=5, total=   3.5s
[CV] clf__max_depth=7, clf__max_features=lo

[CV]  clf__max_depth=7, clf__max_features=sqrt, clf__min_samples_leaf=5, clf__min_samples_split=6, total=   3.5s
[CV] clf__max_depth=7, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=5 
[CV]  clf__max_depth=7, clf__max_features=sqrt, clf__min_samples_leaf=5, clf__min_samples_split=6, total=   3.5s
[CV]  clf__max_depth=7, clf__max_features=sqrt, clf__min_samples_leaf=5, clf__min_samples_split=6, total=   3.4s
[CV] clf__max_depth=7, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=5 
[CV] clf__max_depth=7, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=5 
[CV]  clf__max_depth=7, clf__max_features=sqrt, clf__min_samples_leaf=5, clf__min_samples_split=6, total=   3.5s
[CV] clf__max_depth=7, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=6 
[CV]  clf__max_depth=7, clf__max_features=sqrt, clf__min_samples_leaf=5, clf__min_samples_split=6, total=   3.8s
[CV] clf__max_depth=7, clf__max_features=sq

[CV]  clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=7, total=   3.7s
[CV] clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=7 
[CV]  clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=5, total=   3.7s
[CV] clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=7 
[CV]  clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=5, total=   3.6s
[CV] clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=7 
[CV]  clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=5, total=   3.7s
[CV] clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=7 
[CV]  clf__max_depth=5, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=5, total=   3.8s
[CV] clf__max_depth=5, clf__max_features=au

[CV]  clf__max_depth=5, clf__max_features=log2, clf__min_samples_leaf=4, clf__min_samples_split=6, total=   4.0s
[CV]  clf__max_depth=5, clf__max_features=log2, clf__min_samples_leaf=4, clf__min_samples_split=6, total=   3.8s
[CV] clf__max_depth=5, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=6 
[CV] clf__max_depth=5, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=6 
[CV]  clf__max_depth=5, clf__max_features=log2, clf__min_samples_leaf=4, clf__min_samples_split=7, total=   3.8s
[CV] clf__max_depth=5, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=6 
[CV]  clf__max_depth=5, clf__max_features=log2, clf__min_samples_leaf=4, clf__min_samples_split=7, total=   3.7s
[CV]  clf__max_depth=5, clf__max_features=log2, clf__min_samples_leaf=4, clf__min_samples_split=7, total=   4.0s
[CV] clf__max_depth=5, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=6 
[CV] clf__max_depth=5, clf__max_features=lo

[CV]  clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=4, clf__min_samples_split=5, total=   3.7s
[CV] clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=4, clf__min_samples_split=7 
[CV]  clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=4, clf__min_samples_split=5, total=   3.5s
[CV] clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=5, clf__min_samples_split=5 
[CV]  clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=4, clf__min_samples_split=5, total=   3.9s
[CV] clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=5, clf__min_samples_split=5 
[CV]  clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=4, clf__min_samples_split=6, total=   3.5s
[CV]  clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=4, clf__min_samples_split=6, total=   3.5s
[CV]  clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=4, clf__min_samples_split=6, total=   3.7s
[CV] clf__max_depth=5, clf__

[CV] clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=6 
[CV]  clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=7, total=   3.6s
[CV]  clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=7, total=   3.4s
[CV] clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=6 
[CV] clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=6 
[CV] clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=7 
[CV]  clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=5, total=   3.4s
[CV]  clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=7, total=   3.6s
[CV] clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=7 
[CV] clf__max_depth=6, clf__max_features=auto, clf__min_sa

[CV]  clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=6, total=   3.6s
[CV] clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=4, clf__min_samples_split=5 
[CV]  clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=6, total=   3.4s
[CV] clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=4, clf__min_samples_split=5 
[CV]  clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=6, total=   3.8s
[CV] clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=4, clf__min_samples_split=5 
[CV]  clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=6, total=   3.6s
[CV] clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=4, clf__min_samples_split=6 
[CV]  clf__max_depth=6, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=6, total=   3.8s
[CV] clf__max_depth=6, clf__max_features=lo

[CV]  clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=7, total=   4.0s
[CV] clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=7 
[CV]  clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=5, total=   3.6s
[CV] clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=7 
[CV]  clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=5, total=   3.5s
[CV] clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=7 
[CV]  clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=5, total=   3.6s
[CV] clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=7 
[CV]  clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=5, total=   3.6s
[CV] clf__max_depth=6, clf__max_features=sq

[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  7.1min


[CV]  clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=6, total=   3.4s
[CV] clf__max_depth=6, clf__max_features=sqrt, clf__min_samples_leaf=4, clf__min_samples_split=6 
[CV]  clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=7, total=   3.5s
[CV] clf__max_depth=6, clf__max_features=sqrt, clf__min_samples_leaf=4, clf__min_samples_split=6 
[CV]  clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=7, total=   3.5s
[CV] clf__max_depth=6, clf__max_features=sqrt, clf__min_samples_leaf=4, clf__min_samples_split=6 
[CV]  clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=7, total=   3.8s
[CV]  clf__max_depth=6, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=7, total=   3.5s
[CV] clf__max_depth=6, clf__max_features=sqrt, clf__min_samples_leaf=4, clf__min_samples_split=6 
[CV] clf__max_depth=6, clf__max_features=sq

[CV]  clf__max_depth=6, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=5, total=   4.1s
[CV]  clf__max_depth=6, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=6, total=   3.6s
[CV] clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=5 
[CV]  clf__max_depth=6, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=6, total=   3.5s
[CV] clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=5 
[CV]  clf__max_depth=6, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=5, total=   3.9s
[CV] clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=5 
[CV] clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=4, clf__min_samples_split=5 
[CV]  clf__max_depth=6, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=6, total=   3.5s
[CV]  clf__max_depth=6, clf__max_features=s

[CV] clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=6 
[CV] clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=6 
[CV]  clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=7, total=   3.5s
[CV]  clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=5, clf__min_samples_split=7, total=   3.6s
[CV] clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=7 
[CV] clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=7 
[CV]  clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=5, total=   3.6s
[CV] clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=7 
[CV]  clf__max_depth=7, clf__max_features=auto, clf__min_samples_leaf=6, clf__min_samples_split=5, total=   3.5s
[CV]  clf__max_depth=7, clf__max_features=auto, clf__min_s

[CV] clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=5 
[CV]  clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=6, total=   3.5s
[CV] clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=5 
[CV] clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=5 
[CV]  clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=6, total=   3.4s
[CV] clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=6 
[CV]  clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=6, total=   3.5s
[CV] clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=6, clf__min_samples_split=6 
[CV]  clf__max_depth=7, clf__max_features=log2, clf__min_samples_leaf=5, clf__min_samples_split=7, total=   3.5s
[CV] clf__max_depth=7, clf__max_features=log2, clf__min_sa

[CV]  clf__max_depth=7, clf__max_features=sqrt, clf__min_samples_leaf=5, clf__min_samples_split=5, total=   3.7s
[CV] clf__max_depth=7, clf__max_features=sqrt, clf__min_samples_leaf=5, clf__min_samples_split=7 
[CV]  clf__max_depth=7, clf__max_features=sqrt, clf__min_samples_leaf=5, clf__min_samples_split=5, total=   3.4s
[CV] clf__max_depth=7, clf__max_features=sqrt, clf__min_samples_leaf=5, clf__min_samples_split=7 
[CV]  clf__max_depth=7, clf__max_features=sqrt, clf__min_samples_leaf=5, clf__min_samples_split=5, total=   3.5s
[CV] clf__max_depth=7, clf__max_features=sqrt, clf__min_samples_leaf=5, clf__min_samples_split=7 
[CV]  clf__max_depth=7, clf__max_features=sqrt, clf__min_samples_leaf=5, clf__min_samples_split=5, total=   3.5s
[CV] clf__max_depth=7, clf__max_features=sqrt, clf__min_samples_leaf=6, clf__min_samples_split=5 
[CV]  clf__max_depth=7, clf__max_features=sqrt, clf__min_samples_leaf=5, clf__min_samples_split=5, total=   3.5s
[CV]  clf__max_depth=7, clf__max_features=s

[Parallel(n_jobs=-1)]: Done 810 out of 810 | elapsed:  9.1min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...        min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'clf__max_depth': [5, 6, 7], 'clf__min_samples_leaf': [4, 5, 6], 'clf__min_samples_split': [5, 6, 7], 'clf__max_features': ['auto', 'log2', 'sqrt']}, {'clf__max_depth': [5, 6, 7], 'clf__min_samples_leaf': [4, 5, 6], 'clf__min_samples_split': [5, 6, 7], 'clf__max_features': ['auto', 'log2', 'sqrt']}],
       pre_dispatch='2*n_jobs', refit='f1_micro',
       return_train_score='warn', scoring=['accuracy', 'f1_micro'],
       verbose=2)

In [35]:
best_parameters = gs_lr_tfidf.best_params_
best_estimator = gs_lr_tfidf.best_estimator_
result = gs_lr_tfidf.cv_results_

In [36]:
best_parameters

{'clf__max_depth': 5,
 'clf__max_features': 'auto',
 'clf__min_samples_leaf': 4,
 'clf__min_samples_split': 5}

In [37]:
# PERFORMANCE MEASURE
#####################
# Stratified k-fold CV
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

for train_index, test_index in skfolds.split(X_train, y_train):
    clone_estimator = clone(best_estimator)
    X_train_folds = X_train[train_index]
    y_train_folds = (y_train[train_index])
    X_test_fold = X_train[test_index]
    y_test_fold = (y_train[test_index])

    clone_estimator.fit(X_train_folds, y_train_folds)
    y_pred = clone_estimator.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))

0.2869640135197826
0.24717090547593407
0.2212244221688344
0.7431156178546575
0.7553062813778934


In [38]:
y_train_pred = cross_val_predict(best_estimator, X_train, y_train, cv=5,
        verbose=2, n_jobs=-1)

conf_mx = confusion_matrix(y_train, y_train_pred)

[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    2.4s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.5s finished


In [39]:
conf_mx

array([[  1553,   2375,   2355],
       [ 13765,  27615,  26266],
       [ 47276,  69716, 110852]])

In [40]:
(precision_score(y_train, y_train_pred, average='micro'),
        recall_score(y_train, y_train_pred, average='micro'))

(0.46399114566246813, 0.46399114566246813)

In [41]:
f1_score(y_train, y_train_pred, average='micro')

0.46399114566246813

In [42]:
# TEST MODEL ON TEST DATA
#########################
y_test_pred = cross_val_predict(best_estimator, X_test, y_test, cv=5,
        verbose=2, n_jobs=-1)

conf_mx = confusion_matrix(y_test, y_test_pred)

[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    2.3s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.5s finished


In [43]:
conf_mx

array([[   659,   2210,   3414],
       [   780,  26621,  40245],
       [  1795,  88986, 137064]])

In [44]:
(precision_score(y_test, y_test_pred, average='micro'),
        recall_score(y_test, y_test_pred, average='micro'))

(0.5445929735497426, 0.5445929735497426)

In [45]:
f1_score(y_test, y_test_pred, average='micro')

0.5445929735497426

In [46]:
joblib.dump(best_estimator, './training/sentiment_destree_undersample.pkl')

['./training/sentiment_destree_undersample.pkl']

# 3 - SGDClassifier behaves like Linear SVC

In [47]:
# hence loss='hinge' and penalty='l2'

In [7]:
from sklearn.linear_model import SGDClassifier

In [52]:
# The text data is already cleaned
inputfile = './csvfiles/output_sentiment.csv'
review = pd.read_csv(inputfile, skip_blank_lines=False)
review = review[['text', 'ovsentiment']]
# exclude NaN in 'text' column (count: 11248)
review = review[~pd.isna(review['text'])]
X = review['text'].values
y = review['ovsentiment'].values

# len(review[review['ovsentiment'] == -1])
# total number of -1 (negative review) is 12566

# len(review[review['ovsentiment'] == 0])
# total number of 0 (neutral) is 135292

# len(review[review['ovsentiment'] == 1])
# total number of 1 (positive review) is 455689

# splitting the dataset into the training set and test set
# stratify=y --> stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
        random_state=42, shuffle=True, stratify=y)

In [53]:
X_train.shape

(482837,)

In [54]:
rus = RandomUnderSampler(random_state=42, replacement=True)

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None,
                       ngram_range=(1,1), stop_words=None, tokenizer=str.split)

param_grid = [{'clf__loss': ['hinge'],
               'clf__penalty': ['l2'],
               'clf__max_iter': [200, 300, 400, 500, 600]
              }
             ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('run', rus),
                     ('clf', SGDClassifier(random_state=42))
                     ])

gs_lr_tfidf = GridSearchCV(estimator=lr_tfidf,
        param_grid=param_grid,
        scoring=['accuracy', 'f1_macro', 'f1_micro'],
        cv=5,
        verbose=2,
        refit='f1_micro',
        n_jobs=-1)

In [55]:
gs_lr_tfidf.fit(X_train, y_train)

best_parameters = gs_lr_tfidf.best_params_
best_estimator = gs_lr_tfidf.best_estimator_
result = gs_lr_tfidf.cv_results_

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] clf__loss=hinge, clf__max_iter=200, clf__penalty=l2 .............
[CV] clf__loss=hinge, clf__max_iter=200, clf__penalty=l2 .............
[CV] clf__loss=hinge, clf__max_iter=200, clf__penalty=l2 .............
[CV] clf__loss=hinge, clf__max_iter=200, clf__penalty=l2 .............
[CV] clf__loss=hinge, clf__max_iter=200, clf__penalty=l2 .............
[CV] clf__loss=hinge, clf__max_iter=300, clf__penalty=l2 .............
[CV] clf__loss=hinge, clf__max_iter=300, clf__penalty=l2 .............
[CV] clf__loss=hinge, clf__max_iter=300, clf__penalty=l2 .............
[CV] clf__loss=hinge, clf__max_iter=300, clf__penalty=l2 .............
[CV] clf__loss=hinge, clf__max_iter=300, clf__penalty=l2 .............
[CV] clf__loss=hinge, clf__max_iter=400, clf__penalty=l2 .............
[CV] clf__loss=hinge, clf__max_iter=400, clf__penalty=l2 .............
[CV]  clf__loss=hinge, clf__max_iter=200, clf__penalty=l2, total=   8.8s
[CV] clf__loss=

[Parallel(n_jobs=-1)]: Done  15 out of  25 | elapsed:   45.2s remaining:   30.2s


[CV]  clf__loss=hinge, clf__max_iter=500, clf__penalty=l2, total=  13.0s
[CV]  clf__loss=hinge, clf__max_iter=500, clf__penalty=l2, total=  13.8s
[CV]  clf__loss=hinge, clf__max_iter=500, clf__penalty=l2, total=  14.1s
[CV]  clf__loss=hinge, clf__max_iter=500, clf__penalty=l2, total=  14.0s
[CV]  clf__loss=hinge, clf__max_iter=600, clf__penalty=l2, total=  15.7s
[CV]  clf__loss=hinge, clf__max_iter=500, clf__penalty=l2, total=  15.2s
[CV]  clf__loss=hinge, clf__max_iter=600, clf__penalty=l2, total=  15.3s
[CV]  clf__loss=hinge, clf__max_iter=600, clf__penalty=l2, total=  15.2s
[CV]  clf__loss=hinge, clf__max_iter=600, clf__penalty=l2, total=  15.1s
[CV]  clf__loss=hinge, clf__max_iter=600, clf__penalty=l2, total=   8.8s


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   57.5s finished


In [56]:
best_parameters

{'clf__loss': 'hinge', 'clf__max_iter': 400, 'clf__penalty': 'l2'}

In [57]:
# PERFORMANCE MEASURE
#####################
# Stratified k-fold CV
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

for train_index, test_index in skfolds.split(X_train, y_train):
    clone_estimator = clone(best_estimator)
    X_train_folds = X_train[train_index]
    y_train_folds = (y_train[train_index])
    X_test_fold = X_train[test_index]
    y_test_fold = (y_train[test_index])

    clone_estimator.fit(X_train_folds, y_train_folds)
    y_pred = clone_estimator.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))

0.9467945199805321
0.9480469720818491
0.9486993621075305
0.947362425698486
0.9485222542095562


In [58]:
# confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(best_estimator, X_train, y_train, cv=5,
        verbose=2, n_jobs=-1)

conf_mx = confusion_matrix(y_train, y_train_pred)

[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    6.2s remaining:    4.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.7s finished


In [59]:
conf_mx

array([[  9686,    265,    102],
       [  6309,  99557,   2367],
       [  6338,   9655, 348558]])

In [60]:
(precision_score(y_train, y_train_pred, average='micro'),
        recall_score(y_train, y_train_pred, average='micro'))

(0.9481481328067236, 0.9481481328067236)

In [61]:
f1_score(y_train, y_train_pred, average='micro')

0.9481481328067236

In [62]:
# TEST MODEL ON TEST DATA
#########################
y_test_pred = cross_val_predict(best_estimator, X_test, y_test, cv=5,
        verbose=2, n_jobs=-1)

conf_mx = confusion_matrix(y_test, y_test_pred)

[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    1.8s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.0s finished


In [63]:
conf_mx

array([[ 2388,   103,    22],
       [ 1799, 24594,   666],
       [ 1614,  2828, 86696]])

In [64]:
(precision_score(y_test, y_test_pred, average='micro'),
        recall_score(y_test, y_test_pred, average='micro'))

(0.9417446773258222, 0.9417446773258222)

In [65]:
f1_score(y_test, y_test_pred, average='micro')

0.9417446773258222

In [66]:
joblib.dump(best_estimator, './training/sentiment_svm_undersample.pkl')

['./training/sentiment_svm_undersample.pkl']

# 4 - SGD

In [8]:
from sklearn.linear_model import SGDClassifier

In [68]:
rus = RandomUnderSampler(random_state=42, replacement=True)

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None,
                       ngram_range=(1,1), stop_words=None, tokenizer=str.split)
param_grid = [{'clf__loss': ['hinge', 'modified_huber', 'squared_hinge', 'perceptron', 'log'],
               'clf__penalty': ['l2', 'l1', 'elasticnet'],
               'clf__max_iter': [200, 300, 400, 500, 600]
            }]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('rus', rus),
                     ('clf', SGDClassifier(random_state=42))
                    ])

gs_lr_tfidf = GridSearchCV(estimator=lr_tfidf,
        param_grid=param_grid,
        scoring=['accuracy', 'f1_macro', 'f1_micro'],
        cv=5,
        verbose=2,
        refit='f1_micro',
        n_jobs=-1)

In [69]:
gs_lr_tfidf.fit(X_train, y_train)

best_parameters = gs_lr_tfidf.best_params_
best_estimator = gs_lr_tfidf.best_estimator_
result = gs_lr_tfidf.cv_results_

Fitting 5 folds for each of 75 candidates, totalling 375 fits
[CV] clf__loss=hinge, clf__max_iter=200, clf__penalty=l2 .............
[CV] clf__loss=hinge, clf__max_iter=200, clf__penalty=l2 .............
[CV] clf__loss=hinge, clf__max_iter=200, clf__penalty=l2 .............
[CV] clf__loss=hinge, clf__max_iter=200, clf__penalty=l2 .............
[CV] clf__loss=hinge, clf__max_iter=200, clf__penalty=l2 .............
[CV] clf__loss=hinge, clf__max_iter=200, clf__penalty=l1 .............
[CV] clf__loss=hinge, clf__max_iter=200, clf__penalty=l1 .............
[CV] clf__loss=hinge, clf__max_iter=200, clf__penalty=l1 .............
[CV] clf__loss=hinge, clf__max_iter=200, clf__penalty=l1 .............
[CV] clf__loss=hinge, clf__max_iter=200, clf__penalty=l1 .............
[CV] clf__loss=hinge, clf__max_iter=200, clf__penalty=elasticnet .....
[CV] clf__loss=hinge, clf__max_iter=200, clf__penalty=elasticnet .....
[CV]  clf__loss=hinge, clf__max_iter=200, clf__penalty=l2, total=   8.5s
[CV] clf__los

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   45.2s


[CV] clf__loss=hinge, clf__max_iter=300, clf__penalty=elasticnet .....
[CV]  clf__loss=hinge, clf__max_iter=300, clf__penalty=l2, total=  11.2s
[CV]  clf__loss=hinge, clf__max_iter=300, clf__penalty=l2, total=  11.9s
[CV] clf__loss=hinge, clf__max_iter=300, clf__penalty=elasticnet .....
[CV] clf__loss=hinge, clf__max_iter=400, clf__penalty=l2 .............
[CV]  clf__loss=hinge, clf__max_iter=300, clf__penalty=l2, total=  11.4s
[CV] clf__loss=hinge, clf__max_iter=400, clf__penalty=l2 .............
[CV]  clf__loss=hinge, clf__max_iter=300, clf__penalty=l1, total=  12.5s
[CV] clf__loss=hinge, clf__max_iter=400, clf__penalty=l2 .............
[CV]  clf__loss=hinge, clf__max_iter=300, clf__penalty=l1, total=  13.3s
[CV]  clf__loss=hinge, clf__max_iter=300, clf__penalty=l1, total=  13.3s
[CV] clf__loss=hinge, clf__max_iter=400, clf__penalty=l2 .............
[CV]  clf__loss=hinge, clf__max_iter=300, clf__penalty=l1, total=  13.2s
[CV] clf__loss=hinge, clf__max_iter=400, clf__penalty=l2 ......

[CV] clf__loss=modified_huber, clf__max_iter=200, clf__penalty=l1 ....
[CV]  clf__loss=modified_huber, clf__max_iter=200, clf__penalty=l2, total=   9.3s
[CV] clf__loss=modified_huber, clf__max_iter=200, clf__penalty=elasticnet 
[CV]  clf__loss=hinge, clf__max_iter=600, clf__penalty=elasticnet, total=  16.6s
[CV] clf__loss=modified_huber, clf__max_iter=200, clf__penalty=elasticnet 
[CV]  clf__loss=modified_huber, clf__max_iter=200, clf__penalty=l2, total=   8.7s
[CV] clf__loss=modified_huber, clf__max_iter=200, clf__penalty=elasticnet 
[CV]  clf__loss=hinge, clf__max_iter=600, clf__penalty=elasticnet, total=  16.5s
[CV] clf__loss=modified_huber, clf__max_iter=200, clf__penalty=elasticnet 
[CV]  clf__loss=modified_huber, clf__max_iter=200, clf__penalty=l2, total=   8.9s
[CV] clf__loss=modified_huber, clf__max_iter=200, clf__penalty=elasticnet 
[CV]  clf__loss=modified_huber, clf__max_iter=200, clf__penalty=l2, total=  10.2s
[CV] clf__loss=modified_huber, clf__max_iter=300, clf__penalty=l

[CV]  clf__loss=modified_huber, clf__max_iter=500, clf__penalty=l1, total=  15.4s
[CV] clf__loss=modified_huber, clf__max_iter=600, clf__penalty=l2 ....
[CV]  clf__loss=modified_huber, clf__max_iter=500, clf__penalty=l1, total=  17.1s
[CV]  clf__loss=modified_huber, clf__max_iter=500, clf__penalty=l1, total=  17.3s
[CV] clf__loss=modified_huber, clf__max_iter=600, clf__penalty=l2 ....
[CV] clf__loss=modified_huber, clf__max_iter=600, clf__penalty=l2 ....
[CV]  clf__loss=modified_huber, clf__max_iter=500, clf__penalty=l1, total=  18.1s
[CV]  clf__loss=modified_huber, clf__max_iter=500, clf__penalty=l1, total=  18.2s
[CV] clf__loss=modified_huber, clf__max_iter=600, clf__penalty=l1 ....
[CV]  clf__loss=modified_huber, clf__max_iter=500, clf__penalty=elasticnet, total=  18.2s
[CV] clf__loss=modified_huber, clf__max_iter=600, clf__penalty=l1 ....
[CV] clf__loss=modified_huber, clf__max_iter=600, clf__penalty=l1 ....
[CV]  clf__loss=modified_huber, clf__max_iter=500, clf__penalty=elasticnet

[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  4.9min


[CV] clf__loss=modified_huber, clf__max_iter=600, clf__penalty=elasticnet 
[CV]  clf__loss=modified_huber, clf__max_iter=600, clf__penalty=l2, total=  13.9s
[CV] clf__loss=squared_hinge, clf__max_iter=200, clf__penalty=l2 .....
[CV]  clf__loss=modified_huber, clf__max_iter=600, clf__penalty=l2, total=  14.1s
[CV] clf__loss=squared_hinge, clf__max_iter=200, clf__penalty=l2 .....
[CV]  clf__loss=modified_huber, clf__max_iter=600, clf__penalty=l1, total=  18.2s
[CV] clf__loss=squared_hinge, clf__max_iter=200, clf__penalty=l2 .....
[CV]  clf__loss=modified_huber, clf__max_iter=600, clf__penalty=l1, total=  18.4s
[CV]  clf__loss=modified_huber, clf__max_iter=600, clf__penalty=l1, total=  19.0s
[CV] clf__loss=squared_hinge, clf__max_iter=200, clf__penalty=l2 .....
[CV] clf__loss=squared_hinge, clf__max_iter=200, clf__penalty=l2 .....
[CV]  clf__loss=modified_huber, clf__max_iter=600, clf__penalty=l1, total=  17.8s
[CV] clf__loss=squared_hinge, clf__max_iter=200, clf__penalty=l1 .....
[CV]  c

[CV] clf__loss=squared_hinge, clf__max_iter=500, clf__penalty=l1 .....
[CV]  clf__loss=squared_hinge, clf__max_iter=400, clf__penalty=elasticnet, total=  12.3s
[CV]  clf__loss=squared_hinge, clf__max_iter=400, clf__penalty=elasticnet, total=  14.4s
[CV]  clf__loss=squared_hinge, clf__max_iter=400, clf__penalty=elasticnet, total=  14.2s
[CV] clf__loss=squared_hinge, clf__max_iter=500, clf__penalty=l1 .....
[CV] clf__loss=squared_hinge, clf__max_iter=500, clf__penalty=l1 .....
[CV] clf__loss=squared_hinge, clf__max_iter=500, clf__penalty=elasticnet 
[CV]  clf__loss=squared_hinge, clf__max_iter=400, clf__penalty=elasticnet, total=  12.1s
[CV] clf__loss=squared_hinge, clf__max_iter=500, clf__penalty=elasticnet 
[CV]  clf__loss=squared_hinge, clf__max_iter=500, clf__penalty=l2, total=  10.8s
[CV]  clf__loss=squared_hinge, clf__max_iter=500, clf__penalty=l2, total=  11.1s
[CV] clf__loss=squared_hinge, clf__max_iter=500, clf__penalty=elasticnet 
[CV] clf__loss=squared_hinge, clf__max_iter=500

[CV]  clf__loss=perceptron, clf__max_iter=300, clf__penalty=l2, total=   9.0s
[CV] clf__loss=perceptron, clf__max_iter=400, clf__penalty=l2 ........
[CV]  clf__loss=perceptron, clf__max_iter=300, clf__penalty=l1, total=  10.9s
[CV] clf__loss=perceptron, clf__max_iter=400, clf__penalty=l2 ........
[CV]  clf__loss=perceptron, clf__max_iter=300, clf__penalty=l1, total=  10.5s
[CV] clf__loss=perceptron, clf__max_iter=400, clf__penalty=l2 ........
[CV]  clf__loss=perceptron, clf__max_iter=300, clf__penalty=l1, total=  11.2s
[CV] clf__loss=perceptron, clf__max_iter=400, clf__penalty=l2 ........
[CV]  clf__loss=perceptron, clf__max_iter=300, clf__penalty=l1, total=  11.0s
[CV] clf__loss=perceptron, clf__max_iter=400, clf__penalty=l1 ........
[CV]  clf__loss=perceptron, clf__max_iter=300, clf__penalty=l1, total=  10.7s
[CV] clf__loss=perceptron, clf__max_iter=400, clf__penalty=l1 ........
[CV]  clf__loss=perceptron, clf__max_iter=300, clf__penalty=elasticnet, total=  11.3s
[CV] clf__loss=perce

[CV] clf__loss=log, clf__max_iter=200, clf__penalty=elasticnet .......
[CV]  clf__loss=log, clf__max_iter=200, clf__penalty=l2, total=  12.2s
[CV] clf__loss=log, clf__max_iter=200, clf__penalty=elasticnet .......
[CV]  clf__loss=log, clf__max_iter=200, clf__penalty=l2, total=  10.9s
[CV] clf__loss=log, clf__max_iter=200, clf__penalty=elasticnet .......
[CV]  clf__loss=perceptron, clf__max_iter=600, clf__penalty=elasticnet, total=  16.1s
[CV] clf__loss=log, clf__max_iter=200, clf__penalty=elasticnet .......
[CV]  clf__loss=log, clf__max_iter=200, clf__penalty=l2, total=  10.9s
[CV] clf__loss=log, clf__max_iter=200, clf__penalty=elasticnet .......
[CV]  clf__loss=log, clf__max_iter=200, clf__penalty=l2, total=  10.8s
[CV]  clf__loss=log, clf__max_iter=200, clf__penalty=l2, total=  10.5s
[CV] clf__loss=log, clf__max_iter=300, clf__penalty=l2 ...............
[CV] clf__loss=log, clf__max_iter=300, clf__penalty=l2 ...............
[CV]  clf__loss=log, clf__max_iter=200, clf__penalty=l1, total

[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed: 11.7min


[CV] clf__loss=log, clf__max_iter=500, clf__penalty=l1 ...............
[CV]  clf__loss=log, clf__max_iter=400, clf__penalty=elasticnet, total=  19.1s
[CV] clf__loss=log, clf__max_iter=500, clf__penalty=l1 ...............
[CV]  clf__loss=log, clf__max_iter=400, clf__penalty=elasticnet, total=  18.8s
[CV] clf__loss=log, clf__max_iter=500, clf__penalty=l1 ...............
[CV]  clf__loss=log, clf__max_iter=400, clf__penalty=elasticnet, total=  18.8s
[CV] clf__loss=log, clf__max_iter=500, clf__penalty=elasticnet .......
[CV]  clf__loss=log, clf__max_iter=500, clf__penalty=l2, total=  17.3s
[CV]  clf__loss=log, clf__max_iter=400, clf__penalty=elasticnet, total=  18.0s
[CV] clf__loss=log, clf__max_iter=500, clf__penalty=elasticnet .......
[CV] clf__loss=log, clf__max_iter=500, clf__penalty=elasticnet .......
[CV]  clf__loss=log, clf__max_iter=500, clf__penalty=l2, total=  17.8s
[CV] clf__loss=log, clf__max_iter=500, clf__penalty=elasticnet .......
[CV]  clf__loss=log, clf__max_iter=500, clf__

[Parallel(n_jobs=-1)]: Done 375 out of 375 | elapsed: 13.1min finished


In [73]:
best_parameters

{'clf__loss': 'modified_huber', 'clf__max_iter': 400, 'clf__penalty': 'l1'}

In [74]:
skfolds = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

for train_index, test_index in skfolds.split(X_train, y_train):
    clone_estimator = clone(best_estimator)
    X_train_folds = X_train[train_index]
    y_train_folds = (y_train[train_index])
    X_test_fold = X_train[test_index]
    y_test_fold = (y_train[test_index])

    clone_estimator.fit(X_train_folds, y_train_folds)
    y_pred = clone_estimator.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))

0.9655790160403442
0.9668420180598127
0.9683849722475354
0.9678665368763333
0.9673384006793281


In [75]:
y_train_pred = cross_val_predict(best_estimator, X_train, y_train, cv=5,
        verbose=2, n_jobs=-1)

conf_mx = confusion_matrix(y_train, y_train_pred)

[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    8.1s remaining:    5.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    8.5s finished


In [76]:
conf_mx

array([[  9732,    245,     76],
       [  5540, 100840,   1853],
       [  2282,   5434, 356835]])

In [77]:
(precision_score(y_train, y_train_pred, average='micro'),
        recall_score(y_train, y_train_pred, average='micro'))

(0.9680430455826707, 0.9680430455826707)

In [78]:
f1_score(y_train, y_train_pred, average='micro')

0.9680430455826707

In [79]:
y_test_pred = cross_val_predict(best_estimator, X_test, y_test, cv=5,
        verbose=2, n_jobs=-1)

conf_mx = confusion_matrix(y_test, y_test_pred)

[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    2.2s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.3s finished


In [80]:
conf_mx

array([[ 2354,   129,    30],
       [ 1968, 24449,   642],
       [ 1039,  2832, 87267]])

In [81]:
(precision_score(y_test, y_test_pred, average='micro'),
        recall_score(y_test, y_test_pred, average='micro'))

(0.9449921298981029, 0.9449921298981029)

In [82]:
f1_score(y_test, y_test_pred, average='micro')

0.9449921298981029

In [83]:
joblib.dump(best_estimator, './training/sentiment_sgd_undersample.pkl')

['./training/sentiment_sgd_undersample.pkl']

# 5 - Passive Aggressive Classifier

In [9]:
from sklearn.linear_model import PassiveAggressiveClassifier

In [85]:
rus = RandomUnderSampler(random_state=42, replacement=True)

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None,
                       ngram_range=(1,1), stop_words=None, tokenizer=str.split)

param_grid = [{'clf__loss': ['hinge', 'squared_hinge'],
               'clf__C': [0.1, 0.5, 1.0, 1.5],
               'clf__shuffle': [True, False]
              }]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('rus', rus),
                     ('clf', PassiveAggressiveClassifier(random_state=42))
                    ])

gs_lr_tfidf = GridSearchCV(estimator=lr_tfidf,
        param_grid=param_grid,
        scoring=['accuracy', 'f1_micro'],
        cv=5,
        verbose=2,
        refit='f1_micro',
        n_jobs=-1)

In [86]:
gs_lr_tfidf.fit(X_train, y_train)

best_parameters = gs_lr_tfidf.best_params_
best_estimator = gs_lr_tfidf.best_estimator_
result = gs_lr_tfidf.cv_results_

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] clf__C=0.1, clf__loss=hinge, clf__shuffle=True ..................
[CV] clf__C=0.1, clf__loss=hinge, clf__shuffle=True ..................
[CV] clf__C=0.1, clf__loss=hinge, clf__shuffle=True ..................
[CV] clf__C=0.1, clf__loss=hinge, clf__shuffle=True ..................
[CV] clf__C=0.1, clf__loss=hinge, clf__shuffle=True ..................
[CV] clf__C=0.1, clf__loss=hinge, clf__shuffle=False .................
[CV] clf__C=0.1, clf__loss=hinge, clf__shuffle=False .................
[CV] clf__C=0.1, clf__loss=hinge, clf__shuffle=False .................
[CV] clf__C=0.1, clf__loss=hinge, clf__shuffle=False .................
[CV] clf__C=0.1, clf__loss=hinge, clf__shuffle=False .................
[CV] clf__C=0.1, clf__loss=squared_hinge, clf__shuffle=True ..........
[CV] clf__C=0.1, clf__loss=squared_hinge, clf__shuffle=True ..........




[CV] ... clf__C=0.1, clf__loss=hinge, clf__shuffle=True, total=   5.6s
[CV] ... clf__C=0.1, clf__loss=hinge, clf__shuffle=True, total=   5.6s
[CV] clf__C=0.1, clf__loss=squared_hinge, clf__shuffle=True ..........
[CV] ... clf__C=0.1, clf__loss=hinge, clf__shuffle=True, total=   5.5s
[CV] clf__C=0.1, clf__loss=squared_hinge, clf__shuffle=True ..........
[CV] clf__C=0.1, clf__loss=squared_hinge, clf__shuffle=True ..........
[CV] ... clf__C=0.1, clf__loss=hinge, clf__shuffle=True, total=   6.1s
[CV] clf__C=0.1, clf__loss=squared_hinge, clf__shuffle=False .........
[CV] ... clf__C=0.1, clf__loss=hinge, clf__shuffle=True, total=   6.3s
[CV] clf__C=0.1, clf__loss=squared_hinge, clf__shuffle=False .........
[CV] .. clf__C=0.1, clf__loss=hinge, clf__shuffle=False, total=   6.5s
[CV] .. clf__C=0.1, clf__loss=hinge, clf__shuffle=False, total=   6.3s
[CV] .. clf__C=0.1, clf__loss=hinge, clf__shuffle=False, total=   6.2s
[CV] clf__C=0.1, clf__loss=squared_hinge, clf__shuffle=False .........
[CV] c



[CV]  clf__C=0.1, clf__loss=squared_hinge, clf__shuffle=True, total=   6.4s
[CV]  clf__C=0.1, clf__loss=squared_hinge, clf__shuffle=True, total=   6.5s
[CV] clf__C=0.5, clf__loss=hinge, clf__shuffle=True ..................
[CV] clf__C=0.5, clf__loss=hinge, clf__shuffle=False .................
[CV]  clf__C=0.1, clf__loss=squared_hinge, clf__shuffle=True, total=   6.6s
[CV] clf__C=0.5, clf__loss=hinge, clf__shuffle=False .................
[CV]  clf__C=0.1, clf__loss=squared_hinge, clf__shuffle=False, total=   6.5s
[CV] clf__C=0.5, clf__loss=hinge, clf__shuffle=False .................
[CV]  clf__C=0.1, clf__loss=squared_hinge, clf__shuffle=False, total=   6.7s


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   28.0s


[CV] clf__C=0.5, clf__loss=hinge, clf__shuffle=False .................
[CV]  clf__C=0.1, clf__loss=squared_hinge, clf__shuffle=False, total=   6.4s
[CV]  clf__C=0.1, clf__loss=squared_hinge, clf__shuffle=False, total=   7.1s
[CV] clf__C=0.5, clf__loss=hinge, clf__shuffle=False .................
[CV] ... clf__C=0.5, clf__loss=hinge, clf__shuffle=True, total=   6.0s
[CV] clf__C=0.5, clf__loss=squared_hinge, clf__shuffle=True ..........
[CV]  clf__C=0.1, clf__loss=squared_hinge, clf__shuffle=False, total=   7.0s
[CV] ... clf__C=0.5, clf__loss=hinge, clf__shuffle=True, total=   6.6s
[CV] ... clf__C=0.5, clf__loss=hinge, clf__shuffle=True, total=   6.3s
[CV] clf__C=0.5, clf__loss=squared_hinge, clf__shuffle=True ..........
[CV] clf__C=0.5, clf__loss=squared_hinge, clf__shuffle=True ..........
[CV] ... clf__C=0.5, clf__loss=hinge, clf__shuffle=True, total=   6.1s
[CV] clf__C=0.5, clf__loss=squared_hinge, clf__shuffle=True ..........
[CV] clf__C=0.5, clf__loss=squared_hinge, clf__shuffle=True



[CV] .. clf__C=0.5, clf__loss=hinge, clf__shuffle=False, total=   6.1s
[CV] clf__C=0.5, clf__loss=squared_hinge, clf__shuffle=False .........
[CV] ... clf__C=0.5, clf__loss=hinge, clf__shuffle=True, total=   6.4s
[CV] .. clf__C=0.5, clf__loss=hinge, clf__shuffle=False, total=   6.4s
[CV] clf__C=0.5, clf__loss=squared_hinge, clf__shuffle=False .........
[CV] .. clf__C=0.5, clf__loss=hinge, clf__shuffle=False, total=   6.7s
[CV] clf__C=0.5, clf__loss=squared_hinge, clf__shuffle=False .........
[CV] clf__C=0.5, clf__loss=squared_hinge, clf__shuffle=False .........
[CV] .. clf__C=0.5, clf__loss=hinge, clf__shuffle=False, total=   6.2s
[CV] .. clf__C=0.5, clf__loss=hinge, clf__shuffle=False, total=   7.0s
[CV] clf__C=1.0, clf__loss=hinge, clf__shuffle=True ..................
[CV]  clf__C=0.5, clf__loss=squared_hinge, clf__shuffle=True, total=   6.5s
[CV] clf__C=1.0, clf__loss=hinge, clf__shuffle=True ..................
[CV]  clf__C=0.5, clf__loss=squared_hinge, clf__shuffle=True, total=   6



[CV]  clf__C=0.5, clf__loss=squared_hinge, clf__shuffle=False, total=   6.3s
[CV]  clf__C=0.5, clf__loss=squared_hinge, clf__shuffle=False, total=   5.9s
[CV]  clf__C=0.5, clf__loss=squared_hinge, clf__shuffle=False, total=   6.2s
[CV] clf__C=1.0, clf__loss=hinge, clf__shuffle=False .................
[CV] clf__C=1.0, clf__loss=hinge, clf__shuffle=False .................
[CV] clf__C=1.0, clf__loss=squared_hinge, clf__shuffle=True ..........
[CV]  clf__C=0.5, clf__loss=squared_hinge, clf__shuffle=False, total=   6.6s
[CV] ... clf__C=1.0, clf__loss=hinge, clf__shuffle=True, total=   6.2s
[CV] clf__C=1.0, clf__loss=squared_hinge, clf__shuffle=True ..........
[CV] ... clf__C=1.0, clf__loss=hinge, clf__shuffle=True, total=   6.3s
[CV] ... clf__C=1.0, clf__loss=hinge, clf__shuffle=True, total=   6.0s
[CV] clf__C=1.0, clf__loss=squared_hinge, clf__shuffle=True ..........
[CV] clf__C=1.0, clf__loss=squared_hinge, clf__shuffle=True ..........
[CV] .. clf__C=1.0, clf__loss=hinge, clf__shuffle=Fal



[CV] .. clf__C=1.0, clf__loss=hinge, clf__shuffle=False, total=   6.0s
[CV] clf__C=1.5, clf__loss=hinge, clf__shuffle=True ..................
[CV]  clf__C=1.0, clf__loss=squared_hinge, clf__shuffle=True, total=   6.2s
[CV] .. clf__C=1.0, clf__loss=hinge, clf__shuffle=False, total=   6.6s
[CV] clf__C=1.5, clf__loss=hinge, clf__shuffle=True ..................
[CV] clf__C=1.5, clf__loss=hinge, clf__shuffle=True ..................
[CV]  clf__C=1.0, clf__loss=squared_hinge, clf__shuffle=True, total=   6.3s
[CV]  clf__C=1.0, clf__loss=squared_hinge, clf__shuffle=True, total=   6.3s
[CV] clf__C=1.5, clf__loss=hinge, clf__shuffle=True ..................
[CV] clf__C=1.5, clf__loss=hinge, clf__shuffle=True ..................
[CV]  clf__C=1.0, clf__loss=squared_hinge, clf__shuffle=True, total=   6.2s
[CV]  clf__C=1.0, clf__loss=squared_hinge, clf__shuffle=True, total=   6.7s
[CV] clf__C=1.5, clf__loss=hinge, clf__shuffle=False .................
[CV]  clf__C=1.0, clf__loss=squared_hinge, clf__shuf



[CV] ... clf__C=1.5, clf__loss=hinge, clf__shuffle=True, total=   6.1s
[CV] clf__C=1.5, clf__loss=squared_hinge, clf__shuffle=True ..........
[CV] ... clf__C=1.5, clf__loss=hinge, clf__shuffle=True, total=   6.4s
[CV] ... clf__C=1.5, clf__loss=hinge, clf__shuffle=True, total=   6.3s
[CV] clf__C=1.5, clf__loss=squared_hinge, clf__shuffle=True ..........
[CV] clf__C=1.5, clf__loss=squared_hinge, clf__shuffle=True ..........
[CV] ... clf__C=1.5, clf__loss=hinge, clf__shuffle=True, total=   6.3s
[CV] ... clf__C=1.5, clf__loss=hinge, clf__shuffle=True, total=   6.5s
[CV] clf__C=1.5, clf__loss=squared_hinge, clf__shuffle=False .........
[CV] clf__C=1.5, clf__loss=squared_hinge, clf__shuffle=False .........
[CV] .. clf__C=1.5, clf__loss=hinge, clf__shuffle=False, total=   6.7s
[CV] .. clf__C=1.5, clf__loss=hinge, clf__shuffle=False, total=   6.7s
[CV] .. clf__C=1.5, clf__loss=hinge, clf__shuffle=False, total=   6.1s
[CV] clf__C=1.5, clf__loss=squared_hinge, clf__shuffle=False .........
[CV] .



[CV]  clf__C=1.5, clf__loss=squared_hinge, clf__shuffle=True, total=   5.1s
[CV]  clf__C=1.5, clf__loss=squared_hinge, clf__shuffle=False, total=   4.4s
[CV]  clf__C=1.5, clf__loss=squared_hinge, clf__shuffle=False, total=   5.0s
[CV]  clf__C=1.5, clf__loss=squared_hinge, clf__shuffle=False, total=   4.2s
[CV]  clf__C=1.5, clf__loss=squared_hinge, clf__shuffle=False, total=   4.8s
[CV]  clf__C=1.5, clf__loss=squared_hinge, clf__shuffle=True, total=   5.2s
[CV]  clf__C=1.5, clf__loss=squared_hinge, clf__shuffle=True, total=   5.7s
[CV]  clf__C=1.5, clf__loss=squared_hinge, clf__shuffle=False, total=   5.8s


[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  1.6min finished


In [87]:
best_parameters

{'clf__C': 0.1, 'clf__loss': 'hinge', 'clf__shuffle': True}

In [88]:
# PERFORMANCE MEASURE
#####################
# Stratified k-fold CV
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

for train_index, test_index in skfolds.split(X_train, y_train):
    clone_estimator = clone(best_estimator)
    X_train_folds = X_train[train_index]
    y_train_folds = (y_train[train_index])
    X_test_fold = X_train[test_index]
    y_test_fold = (y_train[test_index])

    clone_estimator.fit(X_train_folds, y_train_folds)
    y_pred = clone_estimator.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))



0.95499590966045




0.9559170739789579




0.9575325159473117




0.9571070563138164




0.9579562164737071


In [89]:
y_train_pred = cross_val_predict(best_estimator, X_train, y_train, cv=5,
        verbose=2, n_jobs=-1)

conf_mx = confusion_matrix(y_train, y_train_pred)

[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    4.0s remaining:    2.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.3s finished


In [90]:
conf_mx

array([[  9706,    237,    110],
       [  6202,  99493,   2538],
       [  5378,   6565, 352608]])

In [91]:
(precision_score(y_train, y_train_pred, average='micro'),
        recall_score(y_train, y_train_pred, average='micro'))

(0.9564449286198033, 0.9564449286198033)

In [92]:
f1_score(y_train, y_train_pred, average='micro')

0.9564449286198033

In [93]:
# TEST MODEL ON TEST DATA
#########################
y_test_pred = cross_val_predict(best_estimator, X_test, y_test, cv=5,
        verbose=2, n_jobs=-1)

conf_mx = confusion_matrix(y_test, y_test_pred)

[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    1.2s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.3s finished


In [94]:
conf_mx

array([[ 2351,   131,    31],
       [ 1937, 24427,   695],
       [ 1746,  2738, 86654]])

In [95]:
(precision_score(y_test, y_test_pred, average='micro'),
        recall_score(y_test, y_test_pred, average='micro'))

(0.9397067351503604, 0.9397067351503604)

In [96]:
f1_score(y_test, y_test_pred, average='micro')

0.9397067351503604

In [97]:
joblib.dump(best_estimator, './training/sentiment_passiveagressive_undersample.pkl')

['./training/sentiment_passiveagressive_undersample.pkl']

# 6 - Perceptron

In [10]:
from sklearn.linear_model import Perceptron

In [99]:
rus = RandomUnderSampler(random_state=42, replacement=True)

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None,
                       ngram_range=(1,1), stop_words=None, tokenizer=str.split)

param_grid = [{'clf__penalty': ['l2', 'l1', 'elasticnet'],
               'clf__alpha': [0.0001, 0.001, 0.01, 1],
               'clf__max_iter': [1000],
               'clf__warm_start': [0.0001]
              }]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('rus', rus),
                     ('clf', Perceptron(random_state=42))
                    ])

gs_lr_tfidf = GridSearchCV(estimator=lr_tfidf,
        param_grid=param_grid,
        scoring=['accuracy', 'f1_micro'],
        cv=5,
        verbose=2,
        refit='f1_micro',
        n_jobs=-1)

In [100]:
gs_lr_tfidf.fit(X_train, y_train)

best_parameters = gs_lr_tfidf.best_params_
best_estimator = gs_lr_tfidf.best_estimator_
result = gs_lr_tfidf.cv_results_

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] clf__alpha=0.0001, clf__max_iter=1000, clf__penalty=l2, clf__warm_start=0.0001 
[CV] clf__alpha=0.0001, clf__max_iter=1000, clf__penalty=l2, clf__warm_start=0.0001 
[CV] clf__alpha=0.0001, clf__max_iter=1000, clf__penalty=l2, clf__warm_start=0.0001 
[CV] clf__alpha=0.0001, clf__max_iter=1000, clf__penalty=l2, clf__warm_start=0.0001 
[CV] clf__alpha=0.0001, clf__max_iter=1000, clf__penalty=l2, clf__warm_start=0.0001 
[CV] clf__alpha=0.0001, clf__max_iter=1000, clf__penalty=l1, clf__warm_start=0.0001 
[CV] clf__alpha=0.0001, clf__max_iter=1000, clf__penalty=l1, clf__warm_start=0.0001 
[CV] clf__alpha=0.0001, clf__max_iter=1000, clf__penalty=l1, clf__warm_start=0.0001 
[CV] clf__alpha=0.0001, clf__max_iter=1000, clf__penalty=l1, clf__warm_start=0.0001 
[CV] clf__alpha=0.0001, clf__max_iter=1000, clf__penalty=l1, clf__warm_start=0.0001 
[CV] clf__alpha=0.0001, clf__max_iter=1000, clf__penalty=elasticnet, clf__warm_start=0.00

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.1min


[CV] clf__alpha=0.001, clf__max_iter=1000, clf__penalty=elasticnet, clf__warm_start=0.0001 
[CV]  clf__alpha=0.001, clf__max_iter=1000, clf__penalty=l2, clf__warm_start=0.0001, total=  20.9s
[CV]  clf__alpha=0.001, clf__max_iter=1000, clf__penalty=l2, clf__warm_start=0.0001, total=  21.0s
[CV] clf__alpha=0.001, clf__max_iter=1000, clf__penalty=elasticnet, clf__warm_start=0.0001 
[CV]  clf__alpha=0.0001, clf__max_iter=1000, clf__penalty=elasticnet, clf__warm_start=0.0001, total=  28.2s
[CV] clf__alpha=0.01, clf__max_iter=1000, clf__penalty=l2, clf__warm_start=0.0001 
[CV] clf__alpha=0.01, clf__max_iter=1000, clf__penalty=l2, clf__warm_start=0.0001 
[CV]  clf__alpha=0.001, clf__max_iter=1000, clf__penalty=l1, clf__warm_start=0.0001, total=  24.1s
[CV] clf__alpha=0.01, clf__max_iter=1000, clf__penalty=l2, clf__warm_start=0.0001 
[CV]  clf__alpha=0.001, clf__max_iter=1000, clf__penalty=l1, clf__warm_start=0.0001, total=  26.5s
[CV]  clf__alpha=0.001, clf__max_iter=1000, clf__penalty=l1, cl

[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 17.7min finished


In [101]:
best_parameters

{'clf__alpha': 0.0001,
 'clf__max_iter': 1000,
 'clf__penalty': 'l1',
 'clf__warm_start': 0.0001}

In [102]:
# PERFORMANCE MEASURE
#####################
# Stratified k-fold CV
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

for train_index, test_index in skfolds.split(X_train, y_train):
    clone_estimator = clone(best_estimator)
    X_train_folds = X_train[train_index]
    y_train_folds = (y_train[train_index])
    X_test_fold = X_train[test_index]
    y_test_fold = (y_train[test_index])

    clone_estimator.fit(X_train_folds, y_train_folds)
    y_pred = clone_estimator.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))

0.9481614182605184
0.9194557203214315
0.9115649076298566
0.9223743346519478
0.9337862187519417


In [103]:
y_train_pred = cross_val_predict(best_estimator, X_train, y_train, cv=5,
        verbose=2, n_jobs=-1)

conf_mx = confusion_matrix(y_train, y_train_pred)

[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   12.9s remaining:    8.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.2s finished


In [104]:
conf_mx

array([[  8688,   1014,    351],
       [ 12724,  90147,   5362],
       [  5023,  23968, 335560]])

In [105]:
(precision_score(y_train, y_train_pred, average='micro'),
        recall_score(y_train, y_train_pred, average='micro'))

(0.8996721460865675, 0.8996721460865675)

In [106]:
f1_score(y_train, y_train_pred, average='micro')

0.8996721460865675

In [107]:
y_test_pred = cross_val_predict(best_estimator, X_test, y_test, cv=5,
        verbose=2, n_jobs=-1)

conf_mx = confusion_matrix(y_test, y_test_pred)

[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    3.0s remaining:    2.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.3s finished


In [108]:
conf_mx

array([[ 2264,   139,   110],
       [ 4345, 21713,  1001],
       [ 1867,  2958, 86313]])

In [109]:
(precision_score(y_test, y_test_pred, average='micro'),
        recall_score(y_test, y_test_pred, average='micro'))

(0.9136774086653964, 0.9136774086653964)

In [110]:
f1_score(y_test, y_test_pred, average='micro')

0.9136774086653964

In [112]:
joblib.dump(best_estimator, './training/sentiment_perceptron_undersample.pkl')

['./training/sentiment_perceptron_undersample.pkl']

In [55]:
estimator = joblib.load('./training/sentiment_perceptron_undersample.pkl')

In [56]:
y_train_pred = cross_val_predict(estimator, X_train, y_train, cv=5,
        verbose=2, n_jobs=-1)

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   47.4s finished


In [57]:
(precision_score(y_train, y_train_pred, average='macro'),
        recall_score(y_train, y_train_pred, average='macro'))

(0.698307851142644, 0.8725306522869007)

In [58]:
f1_score(y_train, y_train_pred, average='macro')

0.7447420837346092

In [59]:
y_test_pred = cross_val_predict(estimator, X_test, y_test, cv=5,
        verbose=2, n_jobs=-1)

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    8.4s finished


In [60]:
(precision_score(y_test, y_test_pred, average='macro'),
        recall_score(y_test, y_test_pred, average='macro'))

(0.7098567490257722, 0.8834684237339981)

In [61]:
f1_score(y_test, y_test_pred, average='macro')

0.738676534417607