# Modeling

In [1]:
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import pickle

%matplotlib inline

  from numpy.core.umath_tests import inner1d


### Import X and y

In [2]:
# Load the pickled feature container. Later cells index it by Doc2Vec model
# name (e.g. X['model_dbow']) to get the vectors for one model.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('../Data/X_doc2vec.pkl', 'rb') as f:
    X = pickle.load(f)

In [3]:
# Load the pickled targets; y is passed unchanged to every classifier helper
# below (presumably the class labels -- confirm against the notebook that
# wrote this file).
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('../Data/y_doc2vec.pkl', 'rb') as f:
    y = pickle.load(f)

### Import Doc2Vec Models

In [4]:
# Load the pickled Doc2Vec model saved as model_dbow.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('../Models/model_dbow.pkl', 'rb') as f:
    model_dbow = pickle.load(f)

In [5]:
# Load the pickled Doc2Vec model saved as model_dm_mean.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('../Models/model_dm_mean.pkl', 'rb') as f:
    model_dm_mean = pickle.load(f)

In [6]:
# Load the pickled Doc2Vec model saved as model_dm_concat.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('../Models/model_dm_concat.pkl', 'rb') as f:
    model_dm_concat = pickle.load(f)

In [7]:
# Each entry is (loaded Doc2Vec model, name); the name doubles as the key used
# to look up that model's vectors in X.
models = [
    (model_dbow, 'model_dbow'),
    (model_dm_mean, 'model_dm_mean'),
    (model_dm_concat, 'model_dm_concat'),
]

With the vectors I got from the Doc2Vec models, I will fit a number of different classifiers and evaluate the results.

### Bernoulli Naive Bayes

In [8]:
def bern_nb_model_func(X, y):
    """Print 3-fold cross-validation scores for a Bernoulli naive Bayes classifier.

    The data is split 75/25 with ``random_state=42``. Cross-validation is then
    run separately inside the training portion and inside the held-out portion,
    so the "test" scores come from models fitted on folds of the held-out
    portion, not from a model fitted on the training portion.

    Parameters
    ----------
    X : array-like
        Feature vectors, one row per sample.
    y : array-like
        Targets aligned with ``X``.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42
    )
    # No explicit fit: cross_val_score fits a fresh clone of the estimator on
    # every fold, so a model fitted beforehand would never be used.
    bern_nb = BernoulliNB()
    print("cross-validated train scores:", cross_val_score(bern_nb, X_train, y_train, cv=3))
    print("cross-validated test scores:", cross_val_score(bern_nb, X_test, y_test, cv=3))
    print('---')

In [9]:
# Score Bernoulli naive Bayes on the vectors of each Doc2Vec model in turn.
for model in models:
    print(model)
    vector_key = model[1]  # second tuple element is the key into X
    bern_nb_model_func(X[vector_key], y)

(<gensim.models.doc2vec.Doc2Vec object at 0x1038d0240>, 'model_dbow')
cross-validated train scores: [0.80086719 0.80195952 0.79518072]
cross-validated test scores: [0.80684008 0.79576108 0.78640309]
---
(<gensim.models.doc2vec.Doc2Vec object at 0x1a254f45c0>, 'model_dm_mean')
cross-validated train scores: [0.79058937 0.7902345  0.78361446]
cross-validated test scores: [0.80684008 0.77842004 0.79074253]
---
(<gensim.models.doc2vec.Doc2Vec object at 0x1038d04a8>, 'model_dm_concat')
cross-validated train scores: [0.58294524 0.59058786 0.5913253 ]
cross-validated test scores: [0.5977842  0.57996146 0.58968177]
---


### Gaussian Naive Bayes

In [10]:
def gb_model_func(X, y):
    """Print 3-fold cross-validation scores for a Gaussian naive Bayes classifier.

    The data is split 75/25 with ``random_state=42``. Cross-validation is then
    run separately inside the training portion and inside the held-out portion,
    so the "test" scores come from models fitted on folds of the held-out
    portion, not from a model fitted on the training portion.

    Parameters
    ----------
    X : array-like
        Feature vectors, one row per sample.
    y : array-like
        Targets aligned with ``X``.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42
    )
    # No explicit fit: cross_val_score fits a fresh clone of the estimator on
    # every fold, so a model fitted beforehand would never be used.
    gauss_nb = GaussianNB()
    print("cross-validated train scores:", cross_val_score(gauss_nb, X_train, y_train, cv=3))
    print("cross-validated test scores:", cross_val_score(gauss_nb, X_test, y_test, cv=3))
    print('---')

In [11]:
# Score Gaussian naive Bayes on the vectors of each Doc2Vec model in turn.
for model in models:
    print(model)
    vector_key = model[1]  # second tuple element is the key into X
    gb_model_func(X[vector_key], y)

(<gensim.models.doc2vec.Doc2Vec object at 0x1038d0240>, 'model_dbow')
cross-validated train scores: [0.84278144 0.83841953 0.84048193]
cross-validated test scores: [0.83863198 0.83044316 0.82401157]
---
(<gensim.models.doc2vec.Doc2Vec object at 0x1a254f45c0>, 'model_dm_mean')
cross-validated train scores: [0.79058937 0.80228076 0.8       ]
cross-validated test scores: [0.75626204 0.76974952 0.7656702 ]
---
(<gensim.models.doc2vec.Doc2Vec object at 0x1038d04a8>, 'model_dm_concat')
cross-validated train scores: [0.58069696 0.58207517 0.58891566]
cross-validated test scores: [0.56840077 0.58766859 0.57473481]
---


### Logistic Regression

In [12]:
def lr_model_func(X,y):
    """Grid-search an L1-penalised logistic regression and print its scores.

    The data is split 75/25 with ``random_state=42``; a 2-fold ``GridSearchCV``
    over the (single-point) grid below is fitted on the training portion.

    Parameters
    ----------
    X : array-like
        Feature vectors, one row per sample.
    y : array-like
        Targets aligned with ``X``.

    Returns
    -------
    None
        Train score, test score, best cross-validated score and best
        parameters are printed, not returned.
    """
    # The solver is named explicitly because the grid uses the L1 penalty:
    # liblinear supports it, whereas the default solver of newer scikit-learn
    # releases (lbfgs) rejects penalty='l1'.
    lr = LogisticRegression(solver='liblinear')
    pipe = Pipeline([
        ('lr', lr)
    ])

    params = {
        'lr__penalty': ['l1'],
        'lr__C': [0.5]
    }

    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=42)
    gs_d2v_lr = GridSearchCV(pipe, param_grid=params, cv=2)
    gs_d2v_lr.fit(X_train, y_train)
    print('train score:', gs_d2v_lr.score(X_train, y_train))
    print('test score:', gs_d2v_lr.score(X_test, y_test))
    print('best score:', gs_d2v_lr.best_score_)
    print('best params:', gs_d2v_lr.best_params_)
    print('---')

In [13]:
# Score logistic regression on the vectors of each Doc2Vec model in turn.
for model in models:
    print(model)
    vector_key = model[1]  # second tuple element is the key into X
    lr_model_func(X[vector_key], y)

(<gensim.models.doc2vec.Doc2Vec object at 0x1038d0240>, 'model_dbow')
train score: 0.8792161901702538
test score: 0.8784131063283007
best score: 0.8746118428097227
best params: {'lr__C': 0.5, 'lr__penalty': 'l1'}
---
(<gensim.models.doc2vec.Doc2Vec object at 0x1a254f45c0>, 'model_dm_mean')
train score: 0.8543205910697077
test score: 0.8597815611949887
best score: 0.8492879323268016
best params: {'lr__C': 0.5, 'lr__penalty': 'l1'}
---
(<gensim.models.doc2vec.Doc2Vec object at 0x1038d04a8>, 'model_dm_concat')
train score: 0.6685405289645572
test score: 0.6723417924831352
best score: 0.6604561516222294
best params: {'lr__C': 0.5, 'lr__penalty': 'l1'}
---


### Random Forest

In [14]:
def rf_model_func(X,y):
    """Grid-search a random forest and print its scores.

    The data is split 75/25 with ``random_state=42``; a 2-fold ``GridSearchCV``
    over ``max_features`` (with 750 trees) is fitted on the training portion.

    Parameters
    ----------
    X : array-like
        Feature vectors, one row per sample.
    y : array-like
        Targets aligned with ``X``.

    Returns
    -------
    None
        Train score, test score, best cross-validated score and best
        parameters are printed, not returned.
    """
    # Seed the forest (same seed as the split) so that re-running the notebook
    # reproduces the printed scores and the selected max_features.
    rf = RandomForestClassifier(random_state=42)
    pipe = Pipeline([
        ('rf', rf)
    ])

    params = {
        'rf__n_estimators': [750],
        'rf__max_features': ['log2', 'sqrt']
    }

    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=42)
    gs_d2v_rf = GridSearchCV(pipe, param_grid=params, cv=2)
    gs_d2v_rf.fit(X_train, y_train)
    print('train score:', gs_d2v_rf.score(X_train, y_train))
    print('test score:', gs_d2v_rf.score(X_test, y_test))
    print('best score:', gs_d2v_rf.best_score_)
    print('best params:', gs_d2v_rf.best_params_)
    print('---')

In [15]:
%%time

# Score the random forest on the vectors of each Doc2Vec model in turn.
for model in models:
    print(model)
    vector_key = model[1]  # second tuple element is the key into X
    rf_model_func(X[vector_key], y)

(<gensim.models.doc2vec.Doc2Vec object at 0x1038d0240>, 'model_dbow')
train score: 1.0
test score: 0.8560873755220045
best score: 0.8515901060070671
best params: {'rf__max_features': 'log2', 'rf__n_estimators': 750}
---
(<gensim.models.doc2vec.Doc2Vec object at 0x1a254f45c0>, 'model_dm_mean')
train score: 1.0
test score: 0.8311917764214584
best score: 0.8238569439982868
best params: {'rf__max_features': 'log2', 'rf__n_estimators': 750}
---
(<gensim.models.doc2vec.Doc2Vec object at 0x1038d04a8>, 'model_dm_concat')
train score: 1.0
test score: 0.6429489238676518
best score: 0.6283863368669023
best params: {'rf__max_features': 'sqrt', 'rf__n_estimators': 750}
---
CPU times: user 22min 36s, sys: 10.3 s, total: 22min 46s
Wall time: 48min 54s


### Gradient Boost

In [16]:
def grad_model_func(X,y):
    """Grid-search a gradient boosting classifier and print its scores.

    The data is split 75/25 with ``random_state=42``; a 2-fold ``GridSearchCV``
    over the (single-point) grid below is fitted on the training portion.

    Parameters
    ----------
    X : array-like
        Feature vectors, one row per sample.
    y : array-like
        Targets aligned with ``X``.

    Returns
    -------
    None
        Train score, test score, best cross-validated score and best
        parameters are printed, not returned.
    """
    # max_features='log2' makes each split consider a random subset of
    # features, so the estimator is seeded (same seed as the split) to keep
    # the printed scores reproducible across runs.
    grad = GradientBoostingClassifier(random_state=42)
    pipe = Pipeline([
        ('grad', grad)
    ])

    params = {
        'grad__n_estimators': [1000],
        'grad__max_features': ['log2']
    }

    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=42)
    gs_d2v_grad = GridSearchCV(pipe, param_grid=params, cv=2)
    gs_d2v_grad.fit(X_train, y_train)
    print('train score:', gs_d2v_grad.score(X_train, y_train))
    print('test score:', gs_d2v_grad.score(X_test, y_test))
    print('best score:', gs_d2v_grad.best_score_)
    print('best params:', gs_d2v_grad.best_params_)
    print('---')

In [17]:
%%time

# Score gradient boosting on the vectors of each Doc2Vec model in turn.
for model in models:
    print(model)
    vector_key = model[1]  # second tuple element is the key into X
    grad_model_func(X[vector_key], y)

(<gensim.models.doc2vec.Doc2Vec object at 0x1038d0240>, 'model_dbow')
train score: 0.9561516222293608
test score: 0.8766463218760039
best score: 0.8671163936181604
best params: {'grad__max_features': 'log2', 'grad__n_estimators': 1000}
---
(<gensim.models.doc2vec.Doc2Vec object at 0x1a254f45c0>, 'model_dm_mean')
train score: 0.9406253346182675
test score: 0.847414070028911
best score: 0.8393296926865832
best params: {'grad__max_features': 'log2', 'grad__n_estimators': 1000}
---
(<gensim.models.doc2vec.Doc2Vec object at 0x1038d04a8>, 'model_dm_concat')
train score: 0.8335474890245208
test score: 0.6567619659492451
best score: 0.6375950315879645
best params: {'grad__max_features': 'log2', 'grad__n_estimators': 1000}
---
CPU times: user 2min 27s, sys: 935 ms, total: 2min 28s
Wall time: 2min 55s


---

When pairing the Doc2Vec models with classifiers, the two pairings that produced the best scores were Doc2Vec DBOW with logistic regression and Doc2Vec DBOW with gradient boosting (test scores of about 0.878 and 0.877, respectively).  I decided to use the gradient boosting classifier in my production model.  I made this decision because gradient boosting is an ensemble technique, which means it combines many different predictors that all try to predict the same target variable, and, as a boosting method, it adds one classifier at a time so that each new classifier is trained to improve the already-trained ensemble.  Logistic regression is a linear classifier and is not equipped to make the types of decisions that gradient boosting can.