In [1]:
import numpy as np
import pandas as pd
import time
import os

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
def loadNLPVectors(filename):
    file = 'nlp_data/' + filename + '.npy'
    return np.load(file)

In [4]:
def loadLabels():
    return loadNLPVectors("labels")

In [5]:
BOTH = 0
TRAIN = 1
TEST = 2
def getData(nlp, tag = 0):
    # 0 = train and test
    # 1 = train only
    # 2 = test only
    X_train, X_test, y_train, y_test = train_test_split(nlp, 
                                                        labels, 
                                                        test_size = 0.2, 
                                                        random_state = 42, 
                                                        shuffle = True, 
                                                        stratify = labels)
    train = [X_train, y_train]
    test = [X_test, y_test]
    if tag == BOTH:
        return train, test
    elif tag == TRAIN:
        return train
    else:
        return test

# Load NLP Data

In [6]:
unigram_array = "feature_array_unigram"
bigram_array = "feature_array_bigram"
tfidf_array = "feature_array_tfidf"
wordvec_array = "feature_array_word2vec"
unigram_reduced = "reduced_unigram"
bigram_reduced = "reduced_bigram"
tfidf_reduced = "reduced_tfidf"

In [7]:
unigram = loadNLPVectors(unigram_array)
bigram = loadNLPVectors(bigram_array)
tfidf = loadNLPVectors(tfidf_array)
word2vec = loadNLPVectors(wordvec_array)
reduced_unigram = loadNLPVectors(unigram_reduced)
reduced_bigram = loadNLPVectors(bigram_reduced)
reduced_tfidf = loadNLPVectors(tfidf_reduced)
labels = loadLabels()

# Create Test Datasets

In [8]:
train_uni, test_uni = getData(unigram)

In [9]:
train_big, test_big = getData(bigram)

In [10]:
train_tfidf, test_tfidf = getData(tfidf)

In [11]:
train_vec, test_vec = getData(word2vec)

In [12]:
train_runi, test_runi = getData(reduced_unigram)

In [13]:
train_rbig, test_rbig = getData(reduced_bigram)

In [14]:
train_rtfidf, test_rtfidf = getData(reduced_tfidf)

# Import Models

In [15]:
from sklearn.externals import joblib

In [16]:
feature_folders = ['unigram', 'bigram', 'tfidf', 'word2vec', 'reduced_unigram', 'reduced_bigram', 'reduced_tfidf']
classifiers = ['rand_forest', 'log_reg', 'lin_reg', 'naive_bayes', 'svm']

In [17]:
def loadModel(nlp_index, clf_index):
    model_path = 'models/' + feature_folders[nlp_index] + '/' + classifiers[clf_index] + '.pkl'
    model = joblib.load(model_path)
    return model

# Hyperparameter Optimization

### Selecting Parameters to Tune for Classifiers

In [38]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [68]:
def RandomForestHyperparameters():
    
    n_estimators = [68, 70, 72, 74, 76]
    max_depth = [30, 40, 50, 60, 70, 80]
    max_depth.append(None)
    max_features = ['auto', 'sqrt', 'log2']
    min_samples_split = [int(x) for x in np.linspace(start = 2, stop = 100, num = 10)]
    min_samples_leaf = [int(x) for x in np.linspace(start = 1, stop = 100, num = 10)]
    
    params = {'n_estimators': n_estimators, 
              'max_depth': max_depth,
              'max_features': max_features,
              'min_samples_split': min_samples_split, 
              'min_samples_leaf': min_samples_leaf}
    
    return params

In [58]:
def LogisticRegressionHyperparameters():
    
    penalty = ['l1', 'l2']
    tol = [0.01, 0.001, 0.0001, .00001]
    C = np.logspace(0, 4, 10)
    max_iter = [int(x) for x in np.linspace(start = 50, stop = 200, num = 15)]
    
    params = {'penalty': penalty,
              'tol': tol,
              'C': C,
              'max_iter': max_iter}

    return params

In [59]:
def SupportVectorHyperparameters():
    
    C = np.logspace(0, 4, 10)
    kernel = ['linear', 'rbf', 'poly', 'sigmoid']
    tol = [0.01, 0.001, 0.0001, 0.00001]
    
    params = {'C': C,
              'kernel': kernel,
              'tol': tol}
    
    return params

In [60]:
def NaiveBayesHyperparameters():
    
    alpha = [x for x in range(0, 11)]
    class_prior = [[.1, .9], [.2, .8], [.3, .7], [.4, .6]]
    
    params = {'alpha': alpha,
              'class_prior': class_prior}
    
    return params

### Importing Untuned Models

In [23]:
rf_tfidf = loadModel(2, 0)
rf_tfidf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [24]:
rf_rtfidf = loadModel(6, 0)
rf_rtfidf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [25]:
lr_uni = loadModel(0, 1)
lr_uni

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [26]:
lr_big = loadModel(1, 1)
lr_big

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [27]:
lr_rbig = loadModel(5, 1)
lr_rbig

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
nb_big = loadModel(1, 3)
nb_big

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Create Model and Parameter Instances for Tuning

In [69]:
rf_clf = RandomForestClassifier()

In [70]:
lr_clf = LogisticRegression()

In [71]:
nb_clf = MultinomialNB()

In [73]:
rf_params = RandomForestHyperparameters()
lr_params = LogisticRegressionHyperparameters()
nb_params = NaiveBayesHyperparameters()

### Tuning Function

In [74]:
def randGridOptimizer(clf, params, iterations, folds, train):
    
    tuningGrid = RandomizedSearchCV(estimator = clf,
                                    param_distributions = params,
                                    n_iter = iterations,
                                    scoring = 'accuracy',
                                    n_jobs = -1,
                                    cv = folds,
                                    verbose = 2,
                                    random_state = 42)
    
    best_clf = tuningGrid.fit(train[0], train[1])
    
    return tuningGrid

# Create Tuned Models

#### Random Forest -TFIDF

In [75]:
rf_tfidf_grid = randGridOptimizer(rf_clf, rf_params, 30, 10, train_tfidf)

Fitting 10 folds for each of 30 candidates, totalling 300 fits
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV]  n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70, total=  13.2s
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV]  n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70, total=  16.4s
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV]  n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70, total=  20.4s
[CV] n_est

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.3min


[CV] n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60, total=  27.8s
[CV] n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60, total=  29.3s
[CV] n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60, total=  32.1s
[CV] n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60, total=  31.4s
[CV] n_estimators=76, min_samples_split=67, min_samples_leaf=45, max_features=log2, max_depth=60 
[CV]  n_estimators=70, min_samples_split=89, min_samples_l

[CV] n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50 
[CV]  n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50, total=  16.1s
[CV] n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50 
[CV]  n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50, total=  16.3s
[CV] n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50 
[CV]  n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50, total=  17.0s
[CV] n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50 
[CV]  n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50, total=  18.1s
[CV] n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50 
[CV]  n_estimators=70, min_samples_split=56, min_samples_l

[CV] n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30 
[CV]  n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30, total=  23.9s
[CV] n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30 
[CV]  n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30, total=  23.4s
[CV] n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30 
[CV]  n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30, total=  26.7s
[CV] n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30 
[CV]  n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30, total=  27.6s
[CV] n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30 
[CV]  n_estimators=76, min_samples_split=34, min_samples_l

[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total= 1.7min
[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total= 1.7min
[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total= 1.7min
[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total= 1.7min


[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 20.8min


[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total= 1.7min
[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total= 1.6min
[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total= 1.6min
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=89, max_features=auto, max_depth=70 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total= 1.6min
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=89, max_features=auto, max_depth=70 
[CV]  n_estimators=68, min_samples_split=45, min_samples_leaf=89,

[CV] n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50 
[CV]  n_estimators=68, min_samples_split=12, min_samples_leaf=12, max_features=auto, max_depth=60, total= 1.7min
[CV] n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50 
[CV]  n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50, total=  21.5s
[CV] n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50 
[CV]  n_estimators=68, min_samples_split=12, min_samples_leaf=12, max_features=auto, max_depth=60, total= 1.7min
[CV] n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50 
[CV]  n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50, total=  23.0s
[CV] n_estimators=70, min_samples_split=100, min_samples_leaf=78, max_features=auto, max_depth=70 
[CV]  n_estimators=68, min_samples_split=78, min_samples_

[CV] n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40 
[CV]  n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40, total=  14.5s
[CV] n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40 
[CV]  n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40, total=  14.8s
[CV] n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40 
[CV]  n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40, total=  18.3s
[CV] n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40 
[CV]  n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40, total=  19.5s
[CV] n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40 
[CV]  n_estimators=74, min_samples_split=12, min_samples_l

[CV] n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None 
[CV]  n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None, total=  29.7s
[CV] n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None 
[CV]  n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None, total=  33.8s
[CV] n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None 
[CV]  n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None, total=  31.7s
[CV] n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None 
[CV]  n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None, total=  30.9s
[CV] n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None 
[CV]  n_estimators=72, min_samples_split

[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 46.4min finished


In [76]:
rf_tfidf_tuned = rf_tfidf_grid.best_estimator_
rf_tfidf_tuned

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=40, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, n_estimators=72, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

#### Random Forest - Reduced TFIDF

In [77]:
rf_rtfidf_grid = randGridOptimizer(rf_clf, rf_params, 30, 10, train_rtfidf)

Fitting 10 folds for each of 30 candidates, totalling 300 fits
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV]  n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70, total=   4.4s
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70 
[CV]  n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70, total=   4.6s
[CV]  n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70, total=   4.6s
[CV]  n_estimators=68, min_samples_split=45, min_samples_leaf=56, max_features=log2, max_depth=70, total=   

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.2min


[CV]  n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60, total=  10.4s
[CV] n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60, total=  10.8s
[CV] n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60, total=  10.3s
[CV] n_estimators=76, min_samples_split=67, min_samples_leaf=45, max_features=log2, max_depth=60 
[CV]  n_estimators=70, min_samples_split=89, min_samples_leaf=34, max_features=sqrt, max_depth=60, total=  10.4s
[CV] n_estimators=76, min_samples_split=67, min_samples_leaf=45, max_features=log2, max_depth=60 
[CV]  n_estimators=76, min_samples_split=67, min_samples_leaf=45, max_features=log2, max_depth=60, total=   5.5s
[CV] n_estimators=76, min_samples_split=67,

[CV]  n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50, total=   4.4s
[CV] n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50 
[CV]  n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50, total=   4.3s
[CV] n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50 
[CV]  n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50, total=   4.4s
[CV] n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50 
[CV]  n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50, total=   4.4s
[CV] n_estimators=74, min_samples_split=56, min_samples_leaf=12, max_features=log2, max_depth=60 
[CV]  n_estimators=70, min_samples_split=56, min_samples_leaf=89, max_features=log2, max_depth=50, total=   4.4s
[CV] n_estimators=74, min_samples_split=56,

[CV]  n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30, total=  10.0s
[CV] n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30 
[CV]  n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30, total=  10.2s
[CV] n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30 
[CV]  n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30, total=  10.0s
[CV] n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30 
[CV]  n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30, total=  10.0s
[CV] n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30 
[CV]  n_estimators=76, min_samples_split=34, min_samples_leaf=56, max_features=sqrt, max_depth=30, total=   9.7s
[CV] n_estimators=76, min_samples_split=100

[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total=  12.6s
[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total=  12.3s
[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total=  13.0s
[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 


[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  5.5min


[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total=  12.3s
[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total=  12.6s
[CV] n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total=  12.5s
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=89, max_features=auto, max_depth=70 
[CV]  n_estimators=70, min_samples_split=2, min_samples_leaf=12, max_features=sqrt, max_depth=60, total=  12.5s
[CV] n_estimators=68, min_samples_split=45, min_samples_leaf=89, max_features=auto, max_depth=70 
[CV]  n_estimators=68, min_samples_split=45, min_samples_leaf=89, max_features=auto, max_depth=70, total=   8.1s
[CV] n_estimators=68, min_samples_split=45, min_s

[CV]  n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50, total=   8.1s
[CV] n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50 
[CV]  n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50, total=   8.5s
[CV] n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50 
[CV]  n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50, total=   8.6s
[CV] n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50 
[CV]  n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50, total=   8.4s
[CV] n_estimators=70, min_samples_split=100, min_samples_leaf=78, max_features=auto, max_depth=70 
[CV]  n_estimators=68, min_samples_split=78, min_samples_leaf=78, max_features=auto, max_depth=50, total=   8.4s
[CV] n_estimators=70, min_samples_split=10

[CV]  n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40, total=   6.5s
[CV] n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40 
[CV]  n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40, total=   6.7s
[CV] n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40 
[CV]  n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40, total=   6.6s
[CV] n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40 
[CV]  n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40, total=   6.5s
[CV] n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40 
[CV]  n_estimators=74, min_samples_split=12, min_samples_leaf=12, max_features=log2, max_depth=40, total=   6.6s
[CV] n_estimators=76, min_samples_split=100

[CV]  n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None, total=  11.0s
[CV] n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None 
[CV]  n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None, total=  11.1s
[CV] n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None 
[CV]  n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None, total=  11.2s
[CV] n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None 
[CV]  n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None, total=  10.6s
[CV] n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None 
[CV]  n_estimators=72, min_samples_split=34, min_samples_leaf=34, max_features=auto, max_depth=None, total=  10.6s
[CV] n_estimators=72, min

[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 11.3min finished


In [78]:
rf_rtfidf_tuned = rf_rtfidf_grid.best_estimator_
rf_tfidf_tuned

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=40, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, n_estimators=72, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

#### Logistic Regression - Unigram

In [79]:
lr_uni_grid = randGridOptimizer(lr_clf, lr_params, 30, 10, train_uni)

Fitting 10 folds for each of 30 candidates, totalling 300 fits
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total=  22.9s
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total=  20.8s
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total=  21.0s
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total=  22.9s
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total=  20.6s
[CV] tol=0.000

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.9min


[CV] tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832 .......
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=  25.3s
[CV] tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832 .......
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=  26.4s
[CV] tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832 .......
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=  24.9s
[CV] tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832 .......
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=  23.9s
[CV] tol=0.0001, penalty=l1, max_iter=125, C=1.0 .....................
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=  23.9s
[CV] tol=0.0001, penalty=l1, max_iter=125, C=1.0 .....................
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=  23.4s
[CV] tol=0.0001, penalty=l1, max_iter=125, C=1.0 .....................
[CV]  tol=1e-05, penalty=l2, 

[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l1, max_iter=114, C=21.544346900318832, total=  23.1s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=  24.0s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=  24.6s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=  25.1s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=  25.3s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=  25.0s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_i

[CV] tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269 ..........
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total=  23.2s
[CV] tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269 ..........
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total=  23.4s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total=  22.9s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total=  23.2s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total=  23.2s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=  23.1s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114,

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 21.8min


[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=  22.7s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=  22.8s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=  22.4s
[CV] tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409 ........
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=  23.3s
[CV] tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409 ........
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=  23.0s
[CV] tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409 ........
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=  25.0s
[CV] tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409 ........
[CV]  tol=1e-05, penalt

[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.01, penalty=l2, max_iter=71, C=21.544346900318832, total=  22.5s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=  22.0s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=  22.1s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=  22.9s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=  22.8s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=  22.9s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_ite

[CV] tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total=  24.5s
[CV] tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total=  23.7s
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total=  24.8s
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total=  25.0s
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total=  25.1s
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409, total=  24.0s
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.0001, penalty=l1,

[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 42.1min finished


In [80]:
lr_uni_tuned = lr_uni_grid.best_estimator_
lr_uni_tuned

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=125, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#### Logistic Regression - Bigram

In [81]:
lr_big_grid = randGridOptimizer(lr_clf, lr_params, 30, 10, train_big)

Fitting 10 folds for each of 30 candidates, totalling 300 fits
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total= 1.9min
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total= 2.8min
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total= 3.1min
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total= 3.1min
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total= 2.9min
[CV] tol=0.000

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 29.4min


[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total= 2.5min
[CV] tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832 .......
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total= 2.5min
[CV] tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832 .......
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total= 2.5min
[CV] tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832 .......
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total= 2.6min
[CV] tol=0.0001, penalty=l1, max_iter=125, C=1.0 .....................
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total= 2.6min
[CV] tol=0.0001, penalty=l1, max_iter=125, C=1.0 .....................
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total= 2.6min
[CV] tol=0.0001, penalty=l1, max_iter=125, C=1.0 .....................
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total= 2.6min
[CV] tol=0.0001, pena

[CV]  tol=1e-05, penalty=l1, max_iter=114, C=21.544346900318832, total= 2.6min
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total= 2.6min
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total= 2.6min
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total= 2.6min
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total= 2.5min
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total= 2.5min
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total= 2.5min
[CV] tol=1e-05, penalty=l2,

[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total= 2.5min
[CV] tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269 ..........
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total= 2.5min
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total= 2.5min
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total= 2.6min
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total= 2.5min
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total= 2.5min
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total= 2.5min
[CV] tol=0.0001, penalty=l1, max_i

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 126.0min


[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total= 2.5min
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total= 2.5min
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total= 2.5min
[CV] tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409 ........
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total= 2.5min
[CV] tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409 ........
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total= 2.6min
[CV] tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409 ........
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total= 2.6min
[CV] tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409 ........
[CV]  tol=1e-05, penalt

[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.01, penalty=l2, max_iter=71, C=21.544346900318832, total= 2.6min
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total= 2.5min
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total= 2.5min
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total= 2.6min
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total= 2.5min
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total= 2.5min
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_ite

[CV] tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total= 2.5min
[CV] tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total= 2.6min
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total= 2.6min
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total= 2.6min
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total= 2.6min
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409, total= 2.6min
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.0001, penalty=l1,

[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 241.0min finished


In [82]:
lr_big_tuned = lr_big_grid.best_estimator_
lr_big_tuned

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=125, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#### Logistic Regression - Reduced Bigram

In [83]:
lr_rbig_grid = randGridOptimizer(lr_clf, lr_params, 30, 10, train_rbig)

Fitting 10 folds for each of 30 candidates, totalling 300 fits
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total= 1.1min
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total= 1.4min
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total= 1.7min
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total= 1.8min
[CV] tol=0.0001, penalty=l1, max_iter=178, C=10000.0 .................
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total=  29.7s
[CV] tol=0.000

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.5min


[CV] tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832 .......
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=   6.0s
[CV] tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832 .......
[CV] .. tol=0.0001, penalty=l1, max_iter=178, C=10000.0, total= 1.7min
[CV] tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832 .......
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=   6.3s
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=   6.1s
[CV] tol=0.0001, penalty=l1, max_iter=125, C=1.0 .....................
[CV] tol=0.0001, penalty=l1, max_iter=125, C=1.0 .....................
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=   6.5s
[CV] tol=0.0001, penalty=l1, max_iter=125, C=1.0 .....................
[CV]  tol=1e-05, penalty=l2, max_iter=146, C=21.544346900318832, total=   6.5s
[CV] tol=0.0001, penalty=l1, max_iter=125, C=1.0 .....................
[CV] ...... tol=0.0001, penalty=l1, m

[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=  12.8s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l1, max_iter=114, C=21.544346900318832, total=  31.6s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=  16.3s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=  18.4s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=  17.1s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827, total=  16.0s
[CV] tol=1e-05, penalty=l2, max_iter=60, C=1291.5496650148827 ........
[CV]  tol=1e-05, penalty=l2, max_i

[CV] tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269 ..........
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total=   1.6s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total=   1.6s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total=   1.6s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.01, penalty=l1, max_iter=50, C=7.742636826811269, total=   1.7s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   3.8s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   4.5s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  7.9min


[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   4.4s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   4.9s
[CV] tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245 ......
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   5.1s
[CV] tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409 ........
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   4.3s
[CV] tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409 ........
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   4.4s
[CV] tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409 ........
[CV]  tol=0.0001, penalty=l1, max_iter=114, C=2.7825594022071245, total=   4.6s
[CV] tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409 ........
[CV]  tol=1e-05, penalty=l1, max_iter=103, C=59.94842503189409, total= 1.4min
[CV] tol=1e-05, 

[CV]  tol=0.01, penalty=l2, max_iter=71, C=21.544346900318832, total=   2.5s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=   6.5s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=   6.9s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=   6.5s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=  10.4s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=   6.6s
[CV] tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593 ........
[CV]  tol=0.001, penalty=l1, max_iter=60, C=166.81005372000593, total=   7.7s
[CV] tol=0.001, penalty=l1, m

[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total=   4.8s
[CV] tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total=   5.4s
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total=   5.4s
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total=   4.4s
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.001, penalty=l2, max_iter=146, C=166.81005372000593, total=   4.9s
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409, total=  23.4s
[CV] tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409 .......
[CV]  tol=0.0001, penalty=l1, max_iter=167, C=59.94842503189409, total=  25.7s
[CV] tol=0.0001, pena

[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 16.1min finished


In [84]:
lr_rbig_tuned = lr_rbig_grid.best_estimator_
lr_rbig_tuned

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=125, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#### Naive Bayes - Bigram

In [85]:
nb_big_grid = randGridOptimizer(nb_clf, nb_params, 30, 10, train_big)

Fitting 10 folds for each of 30 candidates, totalling 300 fits
[CV] class_prior=[0.2, 0.8], alpha=9 .................................
[CV] class_prior=[0.2, 0.8], alpha=9 .................................
[CV] class_prior=[0.2, 0.8], alpha=9 .................................
[CV] class_prior=[0.2, 0.8], alpha=9 .................................
[CV] .................. class_prior=[0.2, 0.8], alpha=9, total= 1.6min
[CV] class_prior=[0.2, 0.8], alpha=9 .................................
[CV] .................. class_prior=[0.2, 0.8], alpha=9, total= 2.2min
[CV] class_prior=[0.2, 0.8], alpha=9 .................................
[CV] .................. class_prior=[0.2, 0.8], alpha=9, total= 2.5min
[CV] class_prior=[0.2, 0.8], alpha=9 .................................
[CV] .................. class_prior=[0.2, 0.8], alpha=9, total= 2.6min
[CV] class_prior=[0.2, 0.8], alpha=9 .................................
[CV] .................. class_prior=[0.2, 0.8], alpha=9, total= 2.4min
[CV] class_pri

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 29.7min


[CV] .................. class_prior=[0.1, 0.9], alpha=9, total= 2.6min
[CV] class_prior=[0.1, 0.9], alpha=9 .................................
[CV] .................. class_prior=[0.1, 0.9], alpha=9, total= 2.6min
[CV] class_prior=[0.1, 0.9], alpha=9 .................................
[CV] .................. class_prior=[0.1, 0.9], alpha=9, total= 2.6min
[CV] class_prior=[0.1, 0.9], alpha=9 .................................
[CV] .................. class_prior=[0.1, 0.9], alpha=9, total= 2.6min
[CV] class_prior=[0.3, 0.7], alpha=8 .................................
[CV] .................. class_prior=[0.1, 0.9], alpha=9, total= 2.6min
[CV] class_prior=[0.3, 0.7], alpha=8 .................................
[CV] .................. class_prior=[0.1, 0.9], alpha=9, total= 2.6min
[CV] class_prior=[0.3, 0.7], alpha=8 .................................
[CV] .................. class_prior=[0.1, 0.9], alpha=9, total= 2.6min
[CV] class_prior=[0.3, 0.7], alpha=8 .................................
[CV] .

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.4, 0.6], alpha=0, total= 2.6min
[CV] class_prior=[0.4, 0.6], alpha=0 .................................
[CV] .................. class_prior=[0.4, 0.6], alpha=0, total= 2.6min
[CV] class_prior=[0.4, 0.6], alpha=0 .................................


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.4, 0.6], alpha=0, total= 2.7min
[CV] class_prior=[0.4, 0.6], alpha=0 .................................
[CV] .................. class_prior=[0.4, 0.6], alpha=0, total= 2.6min


  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] class_prior=[0.4, 0.6], alpha=0 .................................


  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.4, 0.6], alpha=0, total= 2.5min
[CV] class_prior=[0.4, 0.6], alpha=0 .................................
[CV] .................. class_prior=[0.4, 0.6], alpha=0, total= 2.5min
[CV] class_prior=[0.4, 0.6], alpha=0 .................................


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.4, 0.6], alpha=0, total= 2.6min
[CV] class_prior=[0.3, 0.7], alpha=1 .................................
[CV] .................. class_prior=[0.4, 0.6], alpha=0, total= 2.6min
[CV] class_prior=[0.3, 0.7], alpha=1 .................................


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.4, 0.6], alpha=0, total= 2.6min
[CV] class_prior=[0.3, 0.7], alpha=1 .................................
[CV] .................. class_prior=[0.4, 0.6], alpha=0, total= 2.6min
[CV] class_prior=[0.3, 0.7], alpha=1 .................................
[CV] .................. class_prior=[0.3, 0.7], alpha=1, total= 2.6min
[CV] class_prior=[0.3, 0.7], alpha=1 .................................
[CV] .................. class_prior=[0.3, 0.7], alpha=1, total= 2.6min
[CV] class_prior=[0.3, 0.7], alpha=1 .................................
[CV] .................. class_prior=[0.3, 0.7], alpha=1, total= 2.6min
[CV] class_prior=[0.3, 0.7], alpha=1 .................................
[CV] .................. class_prior=[0.3, 0.7], alpha=1, total= 2.6min
[CV] class_prior=[0.3, 0.7], alpha=1 .................................
[CV] .................. class_prior=[0.3, 0.7], alpha=1, total= 2.6min
[CV] class_prior=[0.3, 0.7], alpha=1 .................................
[CV] .

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 130.6min


[CV] class_prior=[0.2, 0.8], alpha=3 .................................
[CV] .................. class_prior=[0.2, 0.8], alpha=3, total= 2.6min
[CV] class_prior=[0.2, 0.8], alpha=3 .................................
[CV] .................. class_prior=[0.2, 0.8], alpha=3, total= 2.5min
[CV] class_prior=[0.2, 0.8], alpha=3 .................................
[CV] .................. class_prior=[0.2, 0.8], alpha=3, total= 2.6min
[CV] class_prior=[0.2, 0.8], alpha=10 ................................
[CV] .................. class_prior=[0.2, 0.8], alpha=3, total= 2.6min
[CV] class_prior=[0.2, 0.8], alpha=10 ................................
[CV] .................. class_prior=[0.2, 0.8], alpha=3, total= 2.6min
[CV] class_prior=[0.2, 0.8], alpha=10 ................................
[CV] .................. class_prior=[0.2, 0.8], alpha=3, total= 2.6min
[CV] class_prior=[0.2, 0.8], alpha=10 ................................
[CV] ................. class_prior=[0.2, 0.8], alpha=10, total= 2.6min
[CV] c

[CV] class_prior=[0.3, 0.7], alpha=6 .................................
[CV] .................. class_prior=[0.3, 0.7], alpha=6, total= 2.6min
[CV] class_prior=[0.3, 0.7], alpha=6 .................................
[CV] .................. class_prior=[0.3, 0.7], alpha=6, total= 2.6min
[CV] class_prior=[0.3, 0.7], alpha=6 .................................
[CV] .................. class_prior=[0.3, 0.7], alpha=6, total= 2.6min
[CV] class_prior=[0.3, 0.7], alpha=6 .................................
[CV] .................. class_prior=[0.3, 0.7], alpha=6, total= 2.6min
[CV] class_prior=[0.3, 0.7], alpha=6 .................................
[CV] .................. class_prior=[0.3, 0.7], alpha=6, total= 2.6min
[CV] class_prior=[0.1, 0.9], alpha=0 .................................
[CV] .................. class_prior=[0.3, 0.7], alpha=6, total= 2.6min
[CV] class_prior=[0.1, 0.9], alpha=0 .................................
[CV] .................. class_prior=[0.3, 0.7], alpha=6, total= 2.6min
[CV] c

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.1, 0.9], alpha=0, total= 2.6min
[CV] class_prior=[0.1, 0.9], alpha=0 .................................
[CV] .................. class_prior=[0.1, 0.9], alpha=0, total= 2.6min
[CV] class_prior=[0.1, 0.9], alpha=0 .................................


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.1, 0.9], alpha=0, total= 2.7min
[CV] class_prior=[0.1, 0.9], alpha=0 .................................
[CV] .................. class_prior=[0.1, 0.9], alpha=0, total= 2.6min


  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] class_prior=[0.1, 0.9], alpha=0 .................................


  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.1, 0.9], alpha=0, total= 2.6min
[CV] class_prior=[0.1, 0.9], alpha=0 .................................
[CV] .................. class_prior=[0.1, 0.9], alpha=0, total= 2.5min
[CV] class_prior=[0.1, 0.9], alpha=0 .................................


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.1, 0.9], alpha=0, total= 2.6min
[CV] class_prior=[0.2, 0.8], alpha=7 .................................
[CV] .................. class_prior=[0.1, 0.9], alpha=0, total= 2.6min
[CV] class_prior=[0.2, 0.8], alpha=7 .................................


  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.1, 0.9], alpha=0, total= 2.6min
[CV] class_prior=[0.2, 0.8], alpha=7 .................................


  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.1, 0.9], alpha=0, total= 2.7min
[CV] class_prior=[0.2, 0.8], alpha=7 .................................
[CV] .................. class_prior=[0.2, 0.8], alpha=7, total= 2.6min
[CV] class_prior=[0.2, 0.8], alpha=7 .................................
[CV] .................. class_prior=[0.2, 0.8], alpha=7, total= 2.6min
[CV] class_prior=[0.2, 0.8], alpha=7 .................................
[CV] .................. class_prior=[0.2, 0.8], alpha=7, total= 2.6min
[CV] class_prior=[0.2, 0.8], alpha=7 .................................
[CV] .................. class_prior=[0.2, 0.8], alpha=7, total= 2.6min
[CV] class_prior=[0.2, 0.8], alpha=7 .................................
[CV] .................. class_prior=[0.2, 0.8], alpha=7, total= 2.6min
[CV] class_prior=[0.2, 0.8], alpha=7 .................................
[CV] .................. class_prior=[0.2, 0.8], alpha=7, total= 2.5min
[CV] class_prior=[0.2, 0.8], alpha=7 .................................
[CV] .

  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] class_prior=[0.2, 0.8], alpha=0 .................................
[CV] .................. class_prior=[0.2, 0.8], alpha=0, total= 2.9min
[CV] class_prior=[0.2, 0.8], alpha=0 .................................


  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.2, 0.8], alpha=0, total= 2.7min
[CV] class_prior=[0.2, 0.8], alpha=0 .................................


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.2, 0.8], alpha=0, total= 2.6min
[CV] class_prior=[0.2, 0.8], alpha=0 .................................


  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.2, 0.8], alpha=0, total= 2.6min
[CV] class_prior=[0.2, 0.8], alpha=0 .................................


  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.2, 0.8], alpha=0, total= 2.6min
[CV] class_prior=[0.2, 0.8], alpha=0 .................................
[CV] .................. class_prior=[0.2, 0.8], alpha=0, total= 2.6min
[CV] class_prior=[0.2, 0.8], alpha=0 .................................


  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.2, 0.8], alpha=0, total= 2.6min
[CV] class_prior=[0.4, 0.6], alpha=9 .................................


  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.2, 0.8], alpha=0, total= 2.6min
[CV] class_prior=[0.4, 0.6], alpha=9 .................................


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] .................. class_prior=[0.2, 0.8], alpha=0, total= 2.6min
[CV] class_prior=[0.4, 0.6], alpha=9 .................................
[CV] .................. class_prior=[0.2, 0.8], alpha=0, total= 2.7min
[CV] class_prior=[0.4, 0.6], alpha=9 .................................
[CV] .................. class_prior=[0.4, 0.6], alpha=9, total= 2.6min
[CV] class_prior=[0.4, 0.6], alpha=9 .................................
[CV] .................. class_prior=[0.4, 0.6], alpha=9, total= 2.6min
[CV] class_prior=[0.4, 0.6], alpha=9 .................................
[CV] .................. class_prior=[0.4, 0.6], alpha=9, total= 2.6min
[CV] class_prior=[0.4, 0.6], alpha=9 .................................
[CV] .................. class_prior=[0.4, 0.6], alpha=9, total= 2.5min
[CV] class_prior=[0.4, 0.6], alpha=9 .................................
[CV] .................. class_prior=[0.4, 0.6], alpha=9, total= 2.5min
[CV] class_prior=[0.4, 0.6], alpha=9 .................................
[CV] .

[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 252.9min finished


In [86]:
nb_big_tuned = nb_big_grid.best_estimator_
nb_big_tuned

MultinomialNB(alpha=9, class_prior=[0.4, 0.6], fit_prior=True)

# Tuned vs Untuned Models

In [90]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [92]:
def compare(original, tuned, test, clf_name, nlp_name):
    
    X_test = test[0]
    y_test = test[1]
    
    y_pred = original.predict(X_test)
    y_pred_t = tuned.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    acc_t = accuracy_score(y_test, y_pred_t)
    roc_auc_t = roc_auc_score(y_test, y_pred_t)
    f1_t = f1_score(y_test, y_pred_t)
    precision_t = precision_score(y_test, y_pred_t)
    recall_t = recall_score(y_test, y_pred_t)
    
    print('Untuned ' + clf_name + ' - ' + nlp_name + ' Metrics: ')
    print(clf_name + ' Accuracy: ' + str(acc))
    print(clf_name + ' ROC AUC Score: '+ str(roc_auc))
    print(clf_name + ' F Score: ' + str(f1))
    print(clf_name + ' Precision Score: ' + str(precision))
    print(clf_name + ' Recall Score ' + str(recall))
    print(" ")
    
    print('Tuned ' + clf_name + ' - ' + nlp_name + ' Metrics: ')
    print(clf_name + ' Accuracy: ' + str(acc_t))
    print(clf_name + ' ROC AUC Score: '+ str(roc_auc_t))
    print(clf_name + ' F Score: ' + str(f1_t))
    print(clf_name + ' Precision Score: ' + str(precision_t))
    print(clf_name + ' Recall Score ' + str(recall_t))
    print(" ")

### Random Forest TFIDF Comparison

In [93]:
compare(rf_tfidf, rf_tfidf_tuned, test_tfidf, 'Random Forest', 'TFIDF')

Untuned Random Forest - TFIDF Metrics: 
Random Forest Accuracy: 0.8954248366013072
Random Forest ROC AUC Score: 0.8939085114112242
Random Forest F Score: 0.8860971524288107
Random Forest Precision Score: 0.943800178412132
Random Forest Recall Score 0.835043409629045
 
Tuned Random Forest - TFIDF Metrics: 
Random Forest Accuracy: 0.8919646289888504
Random Forest ROC AUC Score: 0.8901189697179823
Random Forest F Score: 0.8806794055201699
Random Forest Precision Score: 0.953125
Random Forest Recall Score 0.8184688239936859
 


### Random Forest Reduced TFIDF Comparison

In [94]:
compare(rf_rtfidf, rf_rtfidf_tuned, test_rtfidf, 'Random Forest', 'Reduced TFIDF')

Untuned Random Forest - Reduced TFIDF Metrics: 
Random Forest Accuracy: 0.8673587081891581
Random Forest ROC AUC Score: 0.8659922209376764
Random Forest F Score: 0.8565488565488565
Random Forest Precision Score: 0.9050966608084359
Random Forest Recall Score 0.8129439621152328
 
Tuned Random Forest - Reduced TFIDF Metrics: 
Random Forest Accuracy: 0.8927335640138409
Random Forest ROC AUC Score: 0.8917406924004454
Random Forest F Score: 0.8857025809094634
Random Forest Precision Score: 0.9207836456558773
Random Forest Recall Score 0.8531965272296764
 


### Logistic Regression Unigram Comparison

In [95]:
compare(lr_uni, lr_uni_tuned, test_uni, 'Logistic Regression', 'Unigram')

Untuned Logistic Regression - Unigram Metrics: 
Logistic Regression Accuracy: 0.9154171472510573
Logistic Regression ROC AUC Score: 0.9144294269597639
Logistic Regression F Score: 0.9098360655737705
Logistic Regression Precision Score: 0.9462915601023018
Logistic Regression Recall Score 0.8760852407261247
 
Tuned Logistic Regression - Unigram Metrics: 
Logistic Regression Accuracy: 0.9215686274509803
Logistic Regression ROC AUC Score: 0.9207237344232383
Logistic Regression F Score: 0.9168704156479218
Logistic Regression Precision Score: 0.9477674810446504
Logistic Regression Recall Score 0.8879242304656669
 


### Logistic Regression Bigram Comparison

In [96]:
compare(lr_big, lr_big_tuned, test_big, 'Logistic Regression', 'Bigram')

Untuned Logistic Regression - Bigram Metrics: 
Logistic Regression Accuracy: 0.9150326797385621
Logistic Regression ROC AUC Score: 0.9139356919803714
Logistic Regression F Score: 0.9090160559901194
Logistic Regression Precision Score: 0.9500860585197934
Logistic Regression Recall Score 0.8713496448303079
 
Tuned Logistic Regression - Bigram Metrics: 
Logistic Regression Accuracy: 0.9231064975009612
Logistic Regression ROC AUC Score: 0.9223022663885106
Logistic Regression F Score: 0.9186330349877949
Logistic Regression Precision Score: 0.9479429051217464
Logistic Regression Recall Score 0.8910812943962115
 


### Logistic Regression Reduced Bigram Comparison

In [97]:
compare(lr_rbig, lr_rbig_tuned, test_rbig, 'Logistic Regression', 'Reduced Bigram')

Untuned Logistic Regression - Reduced Bigram Metrics: 
Logistic Regression Accuracy: 0.9142637447135717
Logistic Regression ROC AUC Score: 0.913186066792965
Logistic Regression F Score: 0.9082682023858494
Logistic Regression Precision Score: 0.9484536082474226
Logistic Regression Recall Score 0.8713496448303079
 
Tuned Logistic Regression - Reduced Bigram Metrics: 
Logistic Regression Accuracy: 0.9134948096885813
Logistic Regression ROC AUC Score: 0.9124166212079436
Logistic Regression F Score: 0.9074454956807897
Logistic Regression Precision Score: 0.947594501718213
Logistic Regression Recall Score 0.8705603788476717
 


### Naive Bayes Bigram Comparison

In [98]:
compare(nb_big, nb_big_tuned, test_big, 'Naive Bayes', 'Bigram')

Untuned Naive Bayes - Bigram Metrics: 
Naive Bayes Accuracy: 0.8669742406766628
Naive Bayes ROC AUC Score: 0.8677580112863853
Naive Bayes F Score: 0.8680396643783371
Naive Bayes Precision Score: 0.8398523985239852
Naive Bayes Recall Score 0.8981846882399369
 
Tuned Naive Bayes - Bigram Metrics: 
Naive Bayes Accuracy: 0.8646674356016917
Naive Bayes ROC AUC Score: 0.8655289561217814
Naive Bayes F Score: 0.8661596958174904
Naive Bayes Precision Score: 0.8356566397652238
Naive Bayes Recall Score 0.898973954222573
 


# Save Tuned Models

In [99]:
path = os.getcwd()
folderpath = path + '/models' + '/tuned'
os.mkdir(folderpath)
folderpath += '/randomSearchCV'
os.mkdir(folderpath)

In [100]:
def saveModel(model, name, test):
    filepath = folderpath + '/' + name + '.pkl'
    joblib.dump(model, filepath)
    
    # test to see if model correctly saved
    model_load = joblib.load(filepath)
    X = test[0]
    y = test[1]
    
    assert model.score(X, y) == model_load.score(X, y)

# Random Forest - TFIDF

In [101]:
saveModel(rf_tfidf_tuned, 'rf_tfidf', test_tfidf)

# Random Forest - Reduced TFIDF

In [102]:
saveModel(rf_rtfidf_tuned, 'rf_rtfidf', test_rtfidf)

# Logistic Regression - Unigram

In [103]:
saveModel(lr_uni_tuned, 'lr_uni', test_uni)

# Logistic Regression - Bigram

In [104]:
saveModel(lr_big_tuned, 'lr_big', test_big)

# Logistic Regression - Reduced Bigram

In [105]:
saveModel(lr_rbig_tuned, 'lr_rbig', test_rbig)

# Naive Bayes - Bigram

In [106]:
saveModel(nb_big_tuned, 'nb_big', test_big)