In [1]:
import numpy as np
import pandas as pd
import time
import os

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
def loadNLPVectors(folder, filename):
    file = folder + '/' + filename + '.npy'
    return np.load(file)

In [4]:
def loadData(name):
    X_train = loadNLPVectors('train', name)
    y_train = loadNLPVectors('train', 'labels')
    
    X_test = loadNLPVectors('test', name)
    y_test = loadNLPVectors('test', 'labels')
    
    train = [X_train, y_train]
    test = [X_test, y_test]
    
    return train, test

# Load NLP Training/Test Data

In [5]:
train_uni, test_uni = loadData('unigram')

In [6]:
train_big, test_big = loadData('bigram')

In [7]:
train_tfidf, test_tfidf = loadData('tfidf')

In [8]:
train_vec, test_vec = loadData('word2vec')

In [9]:
train_runi, test_runi = loadData('runigram')

In [10]:
train_rbig, test_rbig = loadData('rbigram')

In [11]:
train_rtfidf, test_rtfidf = loadData('rtfidf')

# Import Models

In [12]:
from sklearn.externals import joblib

In [13]:
def loadModel(folder, clf):
    model_path = 'models/' + folder + '/' + clf + '.pkl'
    model = joblib.load(model_path)
    return model

# Hyperparameter Optimization

### Selecting Parameters to Tune for Classifiers

In [14]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

  from numpy.core.umath_tests import inner1d


In [15]:
def RandomForestHyperparameters():
    
    n_estimators = [68, 70, 72, 74, 76]
    max_depth = [30, 40, 50, 60, 70, 80]
    max_depth.append(None)
    max_features = ['auto', 'sqrt', 'log2']
    min_samples_split = [int(x) for x in np.linspace(start = 2, stop = 100, num = 10)]
    min_samples_leaf = [int(x) for x in np.linspace(start = 1, stop = 100, num = 10)]
    
    params = {'n_estimators': n_estimators, 
              'max_depth': max_depth,
              'max_features': max_features,
              'min_samples_split': min_samples_split, 
              'min_samples_leaf': min_samples_leaf}
    
    return params

In [16]:
def LogisticRegressionHyperparameters():
    
    penalty = ['l1', 'l2']
    tol = [0.01, 0.001, 0.0001, .00001]
    C = np.logspace(0, 4, 10)
    max_iter = [int(x) for x in np.linspace(start = 50, stop = 200, num = 15)]
    
    params = {'penalty': penalty,
              'tol': tol,
              'C': C,
              'max_iter': max_iter}

    return params

In [17]:
def SupportVectorHyperparameters():
    
    C = np.logspace(0, 4, 10)
    kernel = ['linear', 'rbf', 'poly', 'sigmoid']
    tol = [0.01, 0.001, 0.0001, 0.00001]
    
    params = {'C': C,
              'kernel': kernel,
              'tol': tol}
    
    return params

In [18]:
def NaiveBayesHyperparameters():
    
    alpha = [x for x in range(0, 11)]
    class_prior = [[.1, .9], [.2, .8], [.3, .7], [.4, .6]]
    
    params = {'alpha': alpha,
              'class_prior': class_prior}
    
    return params

### Importing Untuned Models

In [19]:
rf_uni = loadModel('uni', 'rand_forest')
rf_uni

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [20]:
rf_tfidf = loadModel('tfidf', 'rand_forest')
rf_tfidf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [21]:
lr_uni = loadModel('uni', 'log_reg')
lr_uni

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
lr_big = loadModel('big', 'log_reg')
lr_big

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
lr_tfidf = loadModel('tfidf', 'log_reg')
lr_tfidf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [24]:
nb_big = loadModel('big', 'naive_bayes')
nb_big

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Create Model and Parameter Instances for Tuning

In [25]:
rf_clf = RandomForestClassifier()

In [26]:
lr_clf = LogisticRegression()

In [27]:
nb_clf = MultinomialNB()

In [28]:
rf_params = RandomForestHyperparameters()
lr_params = LogisticRegressionHyperparameters()
nb_params = NaiveBayesHyperparameters()

### Tuning Function

In [29]:
def randGridOptimizer(clf, params, iterations, folds, train):
    
    tuningGrid = RandomizedSearchCV(estimator = clf,
                                    param_distributions = params,
                                    n_iter = iterations,
                                    scoring = 'accuracy',
                                    n_jobs = -1,
                                    cv = folds,
                                    verbose = 2,
                                    random_state = 42)
    
    best_clf = tuningGrid.fit(train[0], train[1])
    
    return tuningGrid

# Create Tuned Models

#### Random Forest - Unigram

In [30]:
rf_uni_grid = randGridOptimizer(rf_clf, rf_params, 30, 10, train_uni)

Fitting 10 folds for each of 30 candidates, totalling 300 fits
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV]  n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70, total=  10.0s
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV]  n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70, total=   9.3s
[CV]  n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70, total=   8.8s
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV]  n_es

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.7min


[CV] n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60, total=  17.8s
[CV] n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60, total=  15.6s
[CV] n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60, total=  16.2s
[CV] n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60, total=  16.7s
[CV] n_estimators=76, min_samples_split=67, min_samples_leaf=45, max_features=log2, max_depth=60 
[CV]  n_estimators=70, min_samples_split=89, min_samples_l

[CV] n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50 
[CV]  n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50, total=   6.2s
[CV] n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50 
[CV]  n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50, total=   5.7s
[CV] n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50 
[CV]  n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50, total=   5.7s
[CV] n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50 
[CV]  n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50, total=   5.1s
[CV] n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50 
[CV]  n_estimators=70, min_samples_split=56, min_samples_l

[CV]  n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30, total=  13.1s
[CV] n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30 
[CV] n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30 
[CV]  n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30, total=  11.6s
[CV] n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30 
[CV]  n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30, total=   9.2s
[CV] n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30 
[CV]  n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30, total=  10.1s
[CV] n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30 
[CV]  n_estimators=76, min_samples_split=34, min_samples_l

[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total=  50.8s
[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total=  53.9s
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total=  50.7s
[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total=  49.8s


[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  8.0min


[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total= 1.0min
[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total= 1.1min
[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total= 1.1min
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=89, max_features=auto, max_depth=70 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total= 1.1min
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=89, m

[CV]  n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50, total=   8.7s
[CV] n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50 
[CV]  n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50, total=   9.4s
[CV] n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50 
[CV]  n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50, total=   9.1s
[CV] n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50 
[CV]  n_estimators=68, min_samples_split=12, min_samples_leaf=12, max_features=auto, max_depth=60, total=  48.2s
[CV] n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50 
[CV]  n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50, total=   8.9s
[CV] n_estimators=70, min_samples_split=100

[CV] n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40 
[CV] n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40 
[CV]  n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40, total=   5.3s
[CV] n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40 
[CV]  n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40, total=   4.6s
[CV] n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40 
[CV]  n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40, total=   5.1s
[CV] n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40 
[CV]  n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40, total=   5.8s
[CV] n_estimators=74, min_samples_split=12, min_samples_le

[CV]  n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None, total=  15.1s
[CV] n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None 
[CV]  n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None, total=  16.1s
[CV] n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None 
[CV]  n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None, total=  17.7s
[CV] n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None 
[CV]  n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None, total=  20.2s
[CV] n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None 
[CV]  n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None, total=  17.9s
[CV] n_estimators=72, min

[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 20.4min finished


In [31]:
rf_uni_tuned = rf_uni_grid.best_estimator_
rf_uni_tuned

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=40, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, n_estimators=72, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

#### Random Forest -TFIDF

In [32]:
rf_tfidf_grid = randGridOptimizer(rf_clf, rf_params, 30, 10, train_tfidf)

Fitting 10 folds for each of 30 candidates, totalling 300 fits
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV]  n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70, total=   3.0s
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV]  n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70, total=   3.8s
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV]  n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70, total=   5.8s
[CV] n_est

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.7min


[CV] n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60, total=  18.3s
[CV] n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60, total=  16.0s
[CV] n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60, total=  17.7s
[CV] n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60, total=  16.1s
[CV] n_estimators=76, min_samples_split=67, min_samples_leaf=45, max_features=log2, max_depth=60 
[CV]  n_estimators=70, min_samples_split=89, min_samples_l

[CV] n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50 
[CV]  n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50, total=   5.4s
[CV] n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50 
[CV]  n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50, total=   4.6s
[CV] n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50 
[CV]  n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50, total=   4.9s
[CV] n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50 
[CV]  n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50, total=   4.7s
[CV] n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50 
[CV]  n_estimators=70, min_samples_split=56, min_samples_l

[CV]  n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30, total=  11.2s
[CV] n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30 
[CV] n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30 
[CV]  n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30, total=   9.6s
[CV] n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30 
[CV]  n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30, total=   8.0s
[CV] n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30 
[CV]  n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30, total=   9.6s
[CV] n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30 
[CV]  n_estimators=76, min_samples_split=34, min_samples_l

[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total= 1.1min
[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total= 1.1min
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total= 1.1min
[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total= 1.1min


[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  8.3min


[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total=  49.2s
[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total=  51.5s
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total=  49.9s
[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=89, max_features=auto, max_depth=70 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total=  53.9s
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=89, m

[CV]  n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50, total=   8.4s
[CV] n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50 
[CV]  n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50, total=   8.0s
[CV] n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50 
[CV]  n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50, total=   8.4s
[CV] n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50 
[CV]  n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50, total=   7.8s
[CV] n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50 
[CV]  n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50, total=   7.6s
[CV] n_estimators=70, min_samples_split=100

[CV]  n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40, total=   6.9s
[CV] n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40 
[CV]  n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40, total=   5.5s
[CV] n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40 
[CV]  n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40, total=   4.8s
[CV] n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40 
[CV]  n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40, total=   4.9s
[CV] n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40 
[CV]  n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40, total=   4.9s
[CV] n_estimators=74, min_samples_split=12,

[CV]  n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None, total=  13.7s
[CV] n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None 
[CV]  n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None, total=  16.1s
[CV] n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None 
[CV]  n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None, total=  14.3s
[CV] n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None 
[CV]  n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None, total=  16.0s
[CV] n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None 
[CV]  n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None, total=  17.5s
[CV] n_estimators=72, min

[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 20.1min finished


In [33]:
rf_tfidf_tuned = rf_tfidf_grid.best_estimator_
rf_tfidf_tuned

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=40, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, n_estimators=72, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

#### Logistic Regression - Unigram

In [34]:
lr_uni_grid = randGridOptimizer(lr_clf, lr_params, 30, 10, train_uni)

Fitting 10 folds for each of 30 candidates, totalling 300 fits
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total=   6.4s
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total=  10.4s
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total=  11.5s
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total=  12.5s
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total=  10.1s
[CV] tol=0.000

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.7min


[CV] tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832 .......
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=   8.6s
[CV] tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832 .......
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=   8.9s
[CV] tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832 .......
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=   8.2s
[CV] tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832 .......
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=   8.2s
[CV] tol=0.0001, penalty=l1, max_iter=125, C=1.0 .....................
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=   8.7s
[CV] tol=0.0001, penalty=l1, max_iter=125, C=1.0 .....................
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=   8.8s
[CV] tol=0.0001, penalty=l1, max_iter=125, C=1.0 .....................
[CV]  tol=1e-05, penalty=l2, 

[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l1, max_iter=114, C=21.544346900318832, total=   8.6s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=   9.4s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=  10.1s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=   9.1s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=   9.6s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=   9.5s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_i

[CV] tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269 ..........
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total=   7.8s
[CV] tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269 ..........
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total=   8.3s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total=   8.8s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total=   9.5s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total=   9.0s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   8.8s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114,

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  7.2min


[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   8.4s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   8.8s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   8.1s
[CV] tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409 ........
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   8.2s
[CV] tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409 ........
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   9.3s
[CV] tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409 ........
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   9.6s
[CV] tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409 ........
[CV]  tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409, total=   9.9s
[CV] tol=1e-05, 

[CV]  tol=0.01, penalty=l2, max_iter=71, C=21.544346900318832, total=   8.9s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=   8.9s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=   9.1s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=   8.5s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=   7.8s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=   7.9s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=   7.5s
[CV] tol=0.001, penalty=l1, m

[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total=   8.6s
[CV] tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total=   9.1s
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total=   9.2s
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total=   9.1s
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total=   9.5s
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409, total=  11.3s
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409, total=  12.8s
[CV] tol=0.0001, pena

[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 13.8min finished


In [35]:
lr_uni_tuned = lr_uni_grid.best_estimator_
lr_uni_tuned

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=125, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#### Logistic Regression - Bigram

In [38]:
lr_big_grid = randGridOptimizer(lr_clf, lr_params, 20, 3, train_big)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] tol=0.001, penalty=l1, max_iter=82, C=1291.5496650148827 ........
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total= 2.5min
[CV] tol=0.001, penalty=l1, max_iter=82, C=1291.5496650148827 ........
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total= 3.1min
[CV] tol=0.001, penalty=l1, max_iter=82, C=1291.5496650148827 ........
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total= 3.1min
[CV]  tol=0.001, penalty=l1, max_iter=82, C=1291.5496650148827, total= 2.7min
[CV] tol=0.001, penalty=l2, max_iter=178, C=1.0 ......................
[CV] tol=0.001, penalty=l2, max_iter=178, C=1.0 ......................
[CV]  tol=0.001, penalty=l1, max_iter=82, C=1291.5496650148827, total= 3.0min
[C

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 31.0min


[CV] ........ tol=0.01, penalty=l2, max_iter=103, C=1.0, total= 3.1min
[CV] tol=0.001, penalty=l1, max_iter=114, C=1.0 ......................
[CV] ........ tol=0.01, penalty=l2, max_iter=103, C=1.0, total= 2.9min
[CV] tol=0.001, penalty=l1, max_iter=114, C=1.0 ......................
[CV] ........ tol=0.01, penalty=l2, max_iter=103, C=1.0, total= 2.6min
[CV] tol=0.001, penalty=l1, max_iter=60, C=1291.5496650148827 ........
[CV] ....... tol=0.001, penalty=l1, max_iter=114, C=1.0, total= 2.8min
[CV] tol=0.001, penalty=l1, max_iter=60, C=1291.5496650148827 ........
[CV] ....... tol=0.001, penalty=l1, max_iter=114, C=1.0, total= 3.0min
[CV] tol=0.001, penalty=l1, max_iter=60, C=1291.5496650148827 ........
[CV] ....... tol=0.001, penalty=l1, max_iter=114, C=1.0, total= 3.0min
[CV] tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269 ..........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=1291.5496650148827, total= 2.8min
[CV] tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269 ..........

[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 51.3min finished


In [39]:
lr_big_tuned = lr_big_grid.best_estimator_
lr_big_tuned

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=114, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.001,
          verbose=0, warm_start=False)

#### Logistic Regression - TFIDF

In [40]:
lr_tfidf_grid = randGridOptimizer(lr_clf, lr_params, 30, 10, train_tfidf)

Fitting 10 folds for each of 30 candidates, totalling 300 fits
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total=   4.1s
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total=   4.1s
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total=   5.2s
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total=   8.2s
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total=   6.9s
[CV] tol=0.000

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.0min


[CV] tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832 .......
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=   3.4s
[CV] tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832 .......
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=   3.5s
[CV] tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832 .......
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=   3.4s
[CV] tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832 .......
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=   3.5s
[CV] tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832 .......
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=   3.4s
[CV] tol=0.0001, penalty=l1, max_iter=125, C=1.0 .....................
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=   3.4s
[CV] tol=0.0001, penalty=l1, max_iter=125, C=1.0 .....................
[CV]  tol=1e-05, penalty=l2, 

[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=   4.7s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l1, max_iter=114, C=21.544346900318832, total= 1.3min
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=   5.0s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=   5.7s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=   5.2s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=   5.2s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=   5.1s
[CV] tol=1e-05, penalty=l2,

[CV] tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269 ..........
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total=   3.3s
[CV] tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269 ..........
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total=   3.3s
[CV] tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269 ..........
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total=   3.2s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total=   3.3s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total=   3.5s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   3.7s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114,

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  8.2min


[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   3.4s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   3.3s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   3.3s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   3.3s
[CV] tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409 ........
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   3.3s
[CV] tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409 ........
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   3.3s
[CV] tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409 ........
[CV]  tol=1e-05, penalt

[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.01, penalty=l2, max_iter=71, C=21.544346900318832, total=   3.2s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=   3.4s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=   3.4s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=   3.4s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=   3.4s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=   3.5s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_ite

[CV] tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total=   3.4s
[CV] tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total=   3.3s
[CV] tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total=   3.3s
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total=   3.3s
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total=   3.4s
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409, total=   3.5s
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.0001, penalty=l1,

[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 12.7min finished


In [41]:
lr_tfidf_tuned = lr_tfidf_grid.best_estimator_
lr_tfidf_tuned

LogisticRegression(C=2.7825594022071245, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=114,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

#### Naive Bayes - Bigram

In [42]:
nb_big_grid = randGridOptimizer(nb_clf, nb_params, 20, 3, train_big)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] class_prior=[0.2, 0.8], alpha=9 .................................
[CV] class_prior=[0.2, 0.8], alpha=9 .................................
[CV] class_prior=[0.2, 0.8], alpha=9 .................................
[CV] class_prior=[0.1, 0.9], alpha=6 .................................
[CV] .................. class_prior=[0.2, 0.8], alpha=9, total= 2.3min
[CV] class_prior=[0.1, 0.9], alpha=6 .................................
[CV] .................. class_prior=[0.2, 0.8], alpha=9, total= 2.9min
[CV] class_prior=[0.1, 0.9], alpha=6 .................................
[CV] .................. class_prior=[0.2, 0.8], alpha=9, total= 3.2min
[CV] class_prior=[0.2, 0.8], alpha=6 .................................
[CV] .................. class_prior=[0.1, 0.9], alpha=6, total= 3.5min
[CV] class_prior=[0.2, 0.8], alpha=6 .................................
[CV] .................. class_prior=[0.1, 0.9], alpha=6, total= 3.1min
[CV] class_prior

  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.4, 0.6], alpha=0, total= 3.2min
[CV] class_prior=[0.3, 0.7], alpha=1 .................................


  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.4, 0.6], alpha=0, total= 3.2min
[CV] class_prior=[0.3, 0.7], alpha=1 .................................


  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.4, 0.6], alpha=0, total= 3.1min
[CV] class_prior=[0.4, 0.6], alpha=6 .................................
[CV] .................. class_prior=[0.3, 0.7], alpha=1, total= 3.1min
[CV] class_prior=[0.4, 0.6], alpha=6 .................................
[CV] .................. class_prior=[0.3, 0.7], alpha=1, total= 3.1min
[CV] class_prior=[0.4, 0.6], alpha=6 .................................
[CV] .................. class_prior=[0.3, 0.7], alpha=1, total= 3.1min
[CV] class_prior=[0.4, 0.6], alpha=7 .................................


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 30.5min


[CV] .................. class_prior=[0.4, 0.6], alpha=6, total= 2.9min
[CV] class_prior=[0.4, 0.6], alpha=7 .................................
[CV] .................. class_prior=[0.4, 0.6], alpha=6, total= 3.0min
[CV] class_prior=[0.4, 0.6], alpha=7 .................................
[CV] .................. class_prior=[0.4, 0.6], alpha=6, total= 3.0min
[CV] class_prior=[0.4, 0.6], alpha=4 .................................
[CV] .................. class_prior=[0.4, 0.6], alpha=7, total= 3.0min
[CV] class_prior=[0.4, 0.6], alpha=4 .................................
[CV] .................. class_prior=[0.4, 0.6], alpha=7, total= 2.9min
[CV] class_prior=[0.4, 0.6], alpha=4 .................................
[CV] .................. class_prior=[0.4, 0.6], alpha=7, total= 3.0min
[CV] class_prior=[0.2, 0.8], alpha=4 .................................
[CV] .................. class_prior=[0.4, 0.6], alpha=4, total= 3.0min
[CV] class_prior=[0.2, 0.8], alpha=4 .................................
[CV] .

[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 52.3min finished


In [43]:
nb_big_tuned = nb_big_grid.best_estimator_
nb_big_tuned

MultinomialNB(alpha=7, class_prior=[0.4, 0.6], fit_prior=True)

# Tuned vs Untuned Models

In [44]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [45]:
def compare(original, tuned, test, clf_name, nlp_name):
    
    X_test = test[0]
    y_test = test[1]
    
    y_pred = original.predict(X_test)
    y_pred_t = tuned.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    acc_t = accuracy_score(y_test, y_pred_t)
    roc_auc_t = roc_auc_score(y_test, y_pred_t)
    f1_t = f1_score(y_test, y_pred_t)
    precision_t = precision_score(y_test, y_pred_t)
    recall_t = recall_score(y_test, y_pred_t)
    
    print('Untuned ' + clf_name + ' - ' + nlp_name + ' Metrics: ')
    print(clf_name + ' Accuracy: ' + str(acc))
    print(clf_name + ' ROC AUC Score: '+ str(roc_auc))
    print(clf_name + ' F Score: ' + str(f1))
    print(clf_name + ' Precision Score: ' + str(precision))
    print(clf_name + ' Recall Score ' + str(recall))
    print(" ")
    
    print('Tuned ' + clf_name + ' - ' + nlp_name + ' Metrics: ')
    print(clf_name + ' Accuracy: ' + str(acc_t))
    print(clf_name + ' ROC AUC Score: '+ str(roc_auc_t))
    print(clf_name + ' F Score: ' + str(f1_t))
    print(clf_name + ' Precision Score: ' + str(precision_t))
    print(clf_name + ' Recall Score ' + str(recall_t))
    print(" ")

### Random Forest Unigram Comparison

In [46]:
compare(rf_uni, rf_uni_tuned, test_uni, 'Random Forest', 'Unigram')

Untuned Random Forest - Unigram Metrics: 
Random Forest Accuracy: 0.9023452518262207
Random Forest ROC AUC Score: 0.9017452599667019
Random Forest F Score: 0.8975806451612904
Random Forest Precision Score: 0.9175597691673537
Random Forest Recall Score 0.8784530386740331
 
Tuned Random Forest - Unigram Metrics: 
Random Forest Accuracy: 0.8992695117262591
Random Forest ROC AUC Score: 0.8974187925768765
Random Forest F Score: 0.8887000849617672
Random Forest Precision Score: 0.9622815087396505
Random Forest Recall Score 0.8255722178374112
 


### Random Forest TFIDF Comparison

In [47]:
compare(rf_tfidf, rf_tfidf_tuned, test_tfidf, 'Random Forest', 'TFIDF')

Untuned Random Forest - TFIDF Metrics: 
Random Forest Accuracy: 0.9042675893886967
Random Forest ROC AUC Score: 0.9031634537900742
Random Forest F Score: 0.8974886784685056
Random Forest Precision Score: 0.9380378657487092
Random Forest Recall Score 0.8602999210734017
 
Tuned Random Forest - TFIDF Metrics: 
Random Forest Accuracy: 0.8977316416762784
Random Forest ROC AUC Score: 0.8960781053829834
Random Forest F Score: 0.8879528222409435
Random Forest Precision Score: 0.952122854561879
Random Forest Recall Score 0.8318863456985004
 


### Logistic Regression Unigram Comparison

In [48]:
compare(lr_uni, lr_uni_tuned, test_uni, 'Logistic Regression', 'Unigram')

Untuned Logistic Regression - Unigram Metrics: 
Logistic Regression Accuracy: 0.9158016147635525
Logistic Regression ROC AUC Score: 0.9147844191558522
Logistic Regression F Score: 0.9101354123922856
Logistic Regression Precision Score: 0.9478632478632478
Logistic Regression Recall Score 0.8752959747434885
 
Tuned Logistic Regression - Unigram Metrics: 
Logistic Regression Accuracy: 0.9215686274509803
Logistic Regression ROC AUC Score: 0.9207237344232383
Logistic Regression F Score: 0.9168704156479218
Logistic Regression Precision Score: 0.9477674810446504
Logistic Regression Recall Score 0.8879242304656669
 


### Logistic Regression Bigram Comparison

In [49]:
compare(lr_big, lr_big_tuned, test_big, 'Logistic Regression', 'Bigram')

Untuned Logistic Regression - Bigram Metrics: 
Logistic Regression Accuracy: 0.9142637447135717
Logistic Regression ROC AUC Score: 0.913186066792965
Logistic Regression F Score: 0.9082682023858494
Logistic Regression Precision Score: 0.9484536082474226
Logistic Regression Recall Score 0.8713496448303079
 
Tuned Logistic Regression - Bigram Metrics: 
Logistic Regression Accuracy: 0.922722029988466
Logistic Regression ROC AUC Score: 0.9219076333971925
Logistic Regression F Score: 0.9181929181929182
Logistic Regression Precision Score: 0.9478991596638655
Logistic Regression Recall Score 0.8902920284135754
 


### Logistic Regression TFIDF Comparison

In [50]:
compare(lr_tfidf, lr_tfidf_tuned, test_tfidf, 'Logistic Regression', 'TFIDF')

Untuned Logistic Regression - TFIDF Metrics: 
Logistic Regression Accuracy: 0.9081122645136486
Logistic Regression ROC AUC Score: 0.9066539145581115
Logistic Regression F Score: 0.9001253656498119
Logistic Regression Precision Score: 0.9564831261101243
Logistic Regression Recall Score 0.8500394632991318
 
Tuned Logistic Regression - TFIDF Metrics: 
Logistic Regression Accuracy: 0.9219530949634756
Logistic Regression ROC AUC Score: 0.9211778286074012
Logistic Regression F Score: 0.9175132060138156
Logistic Regression Precision Score: 0.9455611390284757
Logistic Regression Recall Score 0.8910812943962115
 


### Naive Bayes Bigram Comparison

In [51]:
compare(nb_big, nb_big_tuned, test_big, 'Naive Bayes', 'Bigram')

Untuned Naive Bayes - Bigram Metrics: 
Naive Bayes Accuracy: 0.8777393310265282
Naive Bayes ROC AUC Score: 0.8779158171506195
Naive Bayes F Score: 0.87578125
Naive Bayes Precision Score: 0.8669760247486465
Naive Bayes Recall Score 0.8847671665351223
 
Tuned Naive Bayes - Bigram Metrics: 
Naive Bayes Accuracy: 0.8677431757016532
Naive Bayes ROC AUC Score: 0.8683490732928723
Naive Bayes F Score: 0.8678955453149001
Naive Bayes Precision Score: 0.8451757666417352
Naive Bayes Recall Score 0.8918705603788477
 


# Save Tuned Models

In [52]:
path = os.getcwd()
folderpath = path + '/models' + '/tuned'
os.mkdir(folderpath)

In [53]:
def saveModel(model, name, test):
    filepath = folderpath + '/' + name + '.pkl'
    joblib.dump(model, filepath)
    
    # test to see if model correctly saved
    model_load = joblib.load(filepath)
    X = test[0]
    y = test[1]
    
    assert model.score(X, y) == model_load.score(X, y)

# Random Forest - Unigram

In [54]:
saveModel(rf_uni_tuned, 'rf_uni', test_uni)

# Random Forest - TFIDF

In [55]:
saveModel(rf_tfidf_tuned, 'rf_tfidf', test_tfidf)

# Logistic Regression - Unigram

In [56]:
saveModel(lr_uni_tuned, 'lr_uni', test_uni)

# Logistic Regression - Bigram

In [57]:
saveModel(lr_big_tuned, 'lr_big', test_big)

# Logistic Regression - TFIDF

In [58]:
saveModel(lr_tfidf_tuned, 'lr_tfidf', test_tfidf)

# Naive Bayes - Bigram

In [59]:
saveModel(nb_big_tuned, 'nb_big', test_big)