# Non-Deep Learning Modeling

In [17]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

Bringing in the CountVectorized dataframe from the EDA section

In [2]:
# Read the count-vectorized feature table written out by the EDA notebook.
# The path is relative to this notebook's working directory.
cvec_csv_path = '../data/cvec.csv'
train_df = pd.read_csv(cvec_csv_path)

Splitting the dataframe into training and test sets. The `sentiment` column is dropped from the feature matrix and used as the target variable for classification.

In [4]:
# Separate the predictors from the label before splitting: every column except
# `sentiment` is a feature, and `sentiment` itself is the classification target.
features = train_df.drop(columns='sentiment')
target = train_df['sentiment']

# Seeded split so the same rows land in train/test on every run.
# No test_size is given, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42)

Instantiating a Logistic Regression Model and fitting it on the training data.

In [5]:
# Logistic regression with the lbfgs solver. C=0.3 is the inverse
# regularization strength (smaller C = stronger penalty), and max_iter is
# raised to 1000 to give the solver more iterations to converge.
lr = LogisticRegression(C=0.3, max_iter=1000, solver='lbfgs')

# Left as the final expression so the fitted estimator's repr is displayed.
lr.fit(X=X_train, y=y_train)

LogisticRegression(C=0.3, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [11]:
%%time
print("Logistic Regression score on the training set:", lr.score(X_train, y_train))
print("Logistic Regression score on the test set:", lr.score(X_test, y_test))

Logistic Regression score on the training set: 0.9272533333333334
Logistic Regression score on the test set: 0.86192
CPU times: user 681 ms, sys: 138 ms, total: 819 ms
Wall time: 290 ms


---

Instantiating a K-Nearest Neighbors classifier and fitting it on the training data.

In [7]:
# K-nearest-neighbors classifier with default hyperparameters; n_jobs=10 asks
# for up to 10 parallel jobs during neighbor searches.
# NOTE(review): 10 is a fixed worker count, presumably chosen for the
# author's machine -- confirm it suits the environment this runs in.
neighbors = KNeighborsClassifier(n_jobs=10)

# `fit` returns the estimator itself, so `knn` and `neighbors` refer to the
# same fitted object.
knn = neighbors.fit(X=X_train, y=y_train)

In [10]:
%%time
print("KNN train score:", knn.score(X_train, y_train))

KNN train score: 0.7584
CPU times: user 50min 26s, sys: 8.04 s, total: 50min 34s
Wall time: 5min 7s


In [9]:
%%time
print("KNN test score:", knn.score(X_test, y_test))

KNN test score: 0.62368
CPU times: user 16min 22s, sys: 1.82 s, total: 16min 24s
Wall time: 1min 39s


---

In [15]:
%%time
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

CPU times: user 23.9 s, sys: 84.7 ms, total: 24 s
Wall time: 24 s


In [16]:
%%time
print("Random Forests score on the training set:", rf.score(X_train, y_train))
print("Random Forests score on the test set:", rf.score(X_test, y_test))

Random Forests score on the training set: 1.0
Random Forests score on the test set: 0.8376
CPU times: user 1.17 s, sys: 72.5 ms, total: 1.25 s
Wall time: 1.26 s


In [23]:
%%time
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

CPU times: user 388 ms, sys: 124 ms, total: 513 ms
Wall time: 281 ms


In [24]:
%%time
print("Naive Bayes score on the training set:", mnb.score(X_train, y_train))
print("Naive Bayes score on the test set:", mnb.score(X_test, y_test))

Naive Bayes score on the training set: 0.84624
Naive Bayes score on the test set: 0.83952
CPU times: user 790 ms, sys: 161 ms, total: 952 ms
Wall time: 325 ms
