# NLP Modelling

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#### Import preprocessed dataframes

In [17]:
# Read the two preprocessed CSVs: the training comments and the holdout set
# (the holdout frame is only touched again in the final accuracy cell).
stem_df, df2 = map(
    pd.read_csv,
    ('./DataFrames/comments_stem_spellcheck_df.csv', './DataFrames/holdout_df.csv'),
)

#### Define training vars

In [18]:
# Cast both columns to unicode strings. Note this makes the labels strings as
# well, so every model fitted on `y` predicts string labels.
X, y = (stem_df[column].astype('U') for column in ('comments', 'real'))

#### GridSearch #1
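> Multinomial Naive Bayes. Comment/uncomment lines 2/3 of the cell to toggle between CountVectorizer and TfidfVectorizer; when switching to CountVectorizer, also rename the `tvec__` keys in `params` to `cvec__`, otherwise the grid search fails on an unknown parameter.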

In [19]:
pipe = Pipeline([
    #('cvec', CountVectorizer()),
    ('tvec', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Hyperparameter grid. Keys follow the '<step name>__<parameter>' convention,
# so they are tied to the step names used in the pipeline above.
params = dict(
    tvec__max_features=[4_000, 6_000, 8_000],
    tvec__ngram_range=[(1, 1), (1, 2), (1, 3)],
    nb__alpha=[0.01, 0.1, 0.2],
)

# Exhaustive search over every combination in `params` with 5-fold CV.
gs = GridSearchCV(estimator=pipe, param_grid=params, cv=5)
gs.fit(X, y)

# Printed in order: best parameter combination, its mean cross-validated
# score, and the refit model's score on the data it was trained on.
print(gs.best_params_, gs.best_score_, gs.score(X, y), sep='\n')

{'nb__alpha': 0.1, 'tvec__max_features': 8000, 'tvec__ngram_range': (1, 1)}
0.687963631618995
0.8536644165863067


#### GridSearch #2
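> Random Forest. Comment/uncomment lines 2/3 of the cell to toggle between CountVectorizer and TfidfVectorizer; when switching to CountVectorizer, also rename the `tvec__` keys in `params2` to `cvec__`, otherwise the grid search fails on an unknown parameter.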

In [20]:
pipe2 = Pipeline([
    #('cvec', CountVectorizer()),
    ('tvec', TfidfVectorizer()),
    # Fixed seed: a random forest draws random bootstrap samples and feature
    # subsets, so without it the selected parameters and the scores printed
    # below change from one run of this cell to the next.
    ('rf', RandomForestClassifier(random_state=42))
])

# Hyperparameter grid. Keys follow the '<step name>__<parameter>' convention,
# so they are tied to the step names used in the pipeline above.
params2 = {
    'tvec__max_features': [6_000, 8_000],
    'tvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'rf__n_estimators': [60, 90],
    'rf__max_depth': [3, 5, 7]
}

# Exhaustive search over every combination in `params2` with 5-fold CV.
gs2 = GridSearchCV(pipe2,
                   param_grid=params2,
                   cv=5)

gs2.fit(X, y)

# Best parameter combination, its mean cross-validated score, and the refit
# model's score on the data it was trained on.
print(gs2.best_params_)
print(gs2.best_score_)
print(gs2.score(X, y))

{'rf__max_depth': 7, 'rf__n_estimators': 60, 'tvec__max_features': 6000, 'tvec__ngram_range': (1, 3)}
0.5603502032738261
0.5639665702346512


#### GridSearch #3
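> Support Vector Classifier. Comment/uncomment lines 2/3 of the cell to toggle between CountVectorizer and TfidfVectorizer; when switching to CountVectorizer, also rename the `tvec__` keys in `params3` to `cvec__`, otherwise the grid search fails on an unknown parameter.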

In [21]:
pipe3 = Pipeline([
    #('cvec', CountVectorizer()),
    ('tvec', TfidfVectorizer()),
    ('svc', SVC()),
])

# Search space for the TF-IDF step and the SVC step. Keys follow the
# '<step name>__<parameter>' convention, so they are tied to the step names
# used in the pipeline above.
params3 = dict(
    tvec__max_features=[6_000, 8_000],
    tvec__ngram_range=[(1, 1), (1, 2)],
    svc__C=[6, 1, 14],
    svc__gamma=['scale', 'auto'],
)

# Exhaustive search over every combination in `params3` with 5-fold CV.
gs3 = GridSearchCV(estimator=pipe3, param_grid=params3, cv=5)
gs3.fit(X, y)

# Printed in order: best parameter combination, its mean cross-validated
# score, and the refit model's score on the data it was trained on.
print(gs3.best_params_, gs3.best_score_, gs3.score(X, y), sep='\n')

{'svc__C': 1, 'svc__gamma': 'scale', 'tvec__max_features': 8000, 'tvec__ngram_range': (1, 1)}
0.7207482375829857
0.953150112504018


#### Compute accuracy on holdout data

In [27]:
# The models were fitted on text cast with .astype('U'), so apply the same
# cast to the holdout text: it keeps preprocessing consistent and stops a
# missing comment (NaN) from crashing the vectorizer.
holdout_text = df2['comments'].astype('U')

# Training labels were cast to strings, so predictions come back as strings;
# convert them to ints before comparing with the holdout labels.
# NOTE(review): assumes df2['real'] holds integer labels -- confirm.
gs_preds = gs.predict(holdout_text).astype(int)
gs2_preds = gs2.predict(holdout_text).astype(int)
gs3_preds = gs3.predict(holdout_text).astype(int)

real_array = np.array(df2['real'])

# Element-wise hit/miss flags per model, kept as lists as before.
comp1 = list(gs_preds == real_array)
comp2 = list(gs2_preds == real_array)
comp3 = list(gs3_preds == real_array)

# Accuracy = fraction of holdout rows predicted correctly.
gs1_acc = np.mean(comp1)
gs2_acc = np.mean(comp2)
gs3_acc = np.mean(comp3)
print(gs1_acc)
print(gs2_acc)
print(gs3_acc)

0.5978689818468824
0.5244672454617206
0.7087608524072613
