# **3. Model Optimization & Evaluation**

In [162]:
# Imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

### Read in data and establish baseline model

In [116]:
# Read the CSV and keep only the label column and the text column
df = pd.read_csv('data/agg_data_wsb_cms.csv')
reddit = df.loc[:, ['subreddit', 'text']]
print(reddit.shape)
reddit.head()

(5271, 2)


Unnamed: 0,subreddit,text
0,1,PonyShibaInu - an adorable unique hybrid of Sh...
1,1,While you hunt meme and trashcoins Amazon rele...
2,1,Could $CUMINU $8m mc beat OnlyFans valued at $...
3,1,The Revolutionary Arsenal 2.0: Elevating Your ...
4,1,Qtum Strikes the Perfect Balance: The Blockcha...


In [117]:
# Features are the `text` column; the target to predict is the `subreddit` label.
X, y = reddit['text'], reddit['subreddit']

In [118]:
# Baseline accuracy (about 50.5% here): the share of the most frequent class, i.e.
# what always predicting the majority label would score. Unlike y.mean(), this
# stays correct if the labels are not coded 0/1 or if class 1 is the minority.
y.value_counts(normalize=True).max()

0.5048377916903813

## Model #1: Multinomial Naive Bayes (Count Vectorizer)
### **Test Score: 0.97**

In [119]:
# Features (`text`) and target (`subreddit`) for this model
X, y = reddit['text'], reddit['subreddit']

In [120]:
# Seeded train/test split (default split proportions)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=95,
)

In [121]:
# Two-step pipeline: token counts feeding a multinomial Naive Bayes classifier
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB()),
])

# Tip: pipe.get_params() lists every tunable hyperparameter name

In [122]:
# Candidate CountVectorizer settings for the randomized search
pgrid = dict(
    cvec__max_df=[0.9, 0.95],
    cvec__max_features=[2000, 5000, None],
    cvec__min_df=[1, 2, 3],
    cvec__ngram_range=[(1, 1), (1, 2)],
    cvec__stop_words=[None, 'english'],
    cvec__binary=[False, True],
)

In [123]:
# Randomized hyperparameter search: 50 sampled candidates, 8 parallel workers.
# random_state pins which candidates are sampled; without it every run draws a
# different subset of the grid, so best_params_ and the scores are not repeatable.
mnb_cv = RandomizedSearchCV(
    pipe,
    param_distributions=pgrid,
    n_iter=50,
    n_jobs=8,
    random_state=95,
)

# Fit the search on the training split only
mnb_cv.fit(X_train, y_train)

In [124]:
# Sampled hyperparameter combination with the best mean cross-validated score
mnb_cv.best_params_

{'cvec__stop_words': None,
 'cvec__ngram_range': (1, 1),
 'cvec__min_df': 2,
 'cvec__max_features': None,
 'cvec__max_df': 0.95,
 'cvec__binary': True}

In [125]:
# Accuracy of the best pipeline: training split first, held-out test split second
print(
    mnb_cv.score(X_train, y_train),
    mnb_cv.score(X_test, y_test),
    sep='\n',
)

0.9739438401214268
0.9719271623672231


In [126]:
# Predict the held-out split, then tabulate true labels (rows) against predictions (columns)
preds = mnb_cv.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=preds)

array([[623,  24],
       [ 13, 658]], dtype=int64)

## Model #2: Multinomial Naive Bayes (TF-IDF Vectorizer)
### **Test Score: 0.97**

In [127]:
# Re-derive features/target and recreate the same seeded train/test split
X, y = reddit['text'], reddit['subreddit']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=95
)

In [128]:
# Two-step pipeline: TF-IDF weighted features feeding a multinomial Naive Bayes classifier
pipe = Pipeline(steps=[
    ('tvec', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Tip: pipe.get_params() lists every tunable hyperparameter name

In [129]:
# Candidate TfidfVectorizer settings for the randomized search
pgrid = dict(
    tvec__binary=[False, True],
    tvec__max_df=[0.9, 0.95],
    tvec__max_features=[5000, 7000, None],
    tvec__min_df=[1, 3, 5],
    tvec__ngram_range=[(1, 1), (1, 2)],
    tvec__stop_words=[None, 'english'],
)

In [130]:
# Randomized hyperparameter search: 50 sampled candidates, 8 parallel workers.
# random_state pins which candidates are sampled; without it every run draws a
# different subset of the grid, so best_params_ and the scores are not repeatable.
mnb_tfid = RandomizedSearchCV(
    pipe,
    param_distributions=pgrid,
    n_iter=50,
    n_jobs=8,
    random_state=95,
)

# Fit the search on the training split only
mnb_tfid.fit(X_train, y_train)

In [131]:
# Sampled hyperparameter combination with the best mean cross-validated score
mnb_tfid.best_params_

{'tvec__stop_words': None,
 'tvec__ngram_range': (1, 1),
 'tvec__min_df': 3,
 'tvec__max_features': 7000,
 'tvec__max_df': 0.95,
 'tvec__binary': True}

In [132]:
# Accuracy of the best pipeline: training split first, held-out test split second
print(
    mnb_tfid.score(X_train, y_train),
    mnb_tfid.score(X_test, y_test),
    sep='\n',
)

0.9698962813053377
0.9688922610015175


In [133]:
# Predict the held-out split, then tabulate true labels (rows) against predictions (columns)
preds = mnb_tfid.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=preds)

array([[618,  29],
       [ 12, 659]], dtype=int64)

## Model #3: KNN
### **Test Score: 0.89**

In [168]:
# Re-derive features/target and recreate the same seeded train/test split
X, y = reddit['text'], reddit['subreddit']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=95
)

In [169]:
# Two-step pipeline: token counts feeding a k-nearest-neighbours classifier
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('knn', KNeighborsClassifier()),
])

# Tip: pipe.get_params() lists every tunable hyperparameter name

In [170]:
# Candidate KNN and CountVectorizer settings for the randomized search
pgrid = dict(
    knn__n_neighbors=[5, 7, 10],
    knn__weights=['uniform', 'distance'],
    cvec__max_df=[0.9, 0.95],
    cvec__max_features=[2000, 5000, None],
    cvec__min_df=[1, 2, 3],
    cvec__ngram_range=[(1, 1), (1, 2)],
    cvec__stop_words=[None, 'english'],
    cvec__binary=[False, True],
)

In [171]:
# Randomized hyperparameter search: 50 sampled candidates, 8 parallel workers.
# random_state pins which candidates are sampled; without it every run draws a
# different subset of the grid, so best_params_ and the scores are not repeatable.
knn = RandomizedSearchCV(
    pipe,
    param_distributions=pgrid,
    n_iter=50,
    n_jobs=8,
    random_state=95,
)

# Fit the search on the training split only
knn.fit(X_train, y_train)

In [172]:
# Sampled hyperparameter combination with the best mean cross-validated score
knn.best_params_

{'knn__weights': 'uniform',
 'knn__n_neighbors': 5,
 'cvec__stop_words': None,
 'cvec__ngram_range': (1, 1),
 'cvec__min_df': 1,
 'cvec__max_features': 2000,
 'cvec__max_df': 0.9,
 'cvec__binary': False}

In [173]:
# Accuracy of the best pipeline: training split first, held-out test split second
print(
    knn.score(X_train, y_train),
    knn.score(X_test, y_test),
    sep='\n',
)

0.9281558310144195
0.8899848254931715


In [174]:
# Predict the held-out split, then tabulate true labels (rows) against predictions (columns)
preds = knn.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=preds)

array([[626,  21],
       [124, 547]], dtype=int64)

## Model #4: Logistic Regression
### **Test Score: 0.99**

In [141]:
# Re-derive features/target and recreate the same seeded train/test split
X, y = reddit['text'], reddit['subreddit']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=95
)

In [142]:
# Two-step pipeline: token counts feeding a logistic regression classifier
# NOTE(review): LogisticRegression keeps its default iteration limit here; watch for
# ConvergenceWarning while the search runs and raise max_iter if it appears — TODO confirm
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('logreg', LogisticRegression()),
])

# Tip: pipe.get_params() lists every tunable hyperparameter name

In [143]:
# Candidate logistic-regression and CountVectorizer settings for the randomized search
pgrid = dict(
    logreg__penalty=['l2', None],
    cvec__max_df=[0.9, 0.95],
    cvec__max_features=[2000, 5000, None],
    cvec__min_df=[1, 2, 3],
    cvec__ngram_range=[(1, 1), (1, 2)],
    cvec__stop_words=[None, 'english'],
    cvec__binary=[False, True],
)

In [144]:
# Randomized hyperparameter search: 50 sampled candidates, 8 parallel workers.
# random_state pins which candidates are sampled; without it every run draws a
# different subset of the grid, so best_params_ and the scores are not repeatable.
lr = RandomizedSearchCV(
    pipe,
    param_distributions=pgrid,
    n_iter=50,
    n_jobs=8,
    random_state=95,
)

# Fit the search on the training split only
lr.fit(X_train, y_train)

In [145]:
# Sampled hyperparameter combination with the best mean cross-validated score
lr.best_params_

{'logreg__penalty': None,
 'cvec__stop_words': 'english',
 'cvec__ngram_range': (1, 1),
 'cvec__min_df': 1,
 'cvec__max_features': None,
 'cvec__max_df': 0.95,
 'cvec__binary': False}

In [146]:
# Accuracy of the best pipeline: training split first, held-out test split second
print(
    lr.score(X_train, y_train),
    lr.score(X_test, y_test),
    sep='\n',
)

1.0
0.9863429438543247


In [147]:
# Predict the held-out split, then tabulate true labels (rows) against predictions (columns)
preds = lr.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=preds)

array([[643,   4],
       [ 14, 657]], dtype=int64)

## Model #5: Random Forest (RF)
### **Test Score: 0.98**

In [148]:
# Re-derive features/target and recreate the same seeded train/test split
X, y = reddit['text'], reddit['subreddit']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=95
)

In [149]:
# Two-step pipeline: token counts feeding a random forest classifier.
# The forest is a randomized estimator, so it is seeded here; unseeded, the same
# hyperparameters give slightly different scores on every fit.
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('rf', RandomForestClassifier(random_state=95)),
])

# Tip: pipe.get_params() lists every tunable hyperparameter name

In [150]:
# Candidate random-forest and CountVectorizer settings for the randomized search
pgrid = dict(
    rf__n_estimators=[10, 100, 200, 500],
    rf__criterion=['gini', 'entropy', 'log_loss'],
    rf__max_features=['sqrt', 'log2', None],
    cvec__max_df=[0.9, 0.95],
    cvec__max_features=[2000, 5000, None],
    cvec__min_df=[1, 2, 3],
    cvec__ngram_range=[(1, 1), (1, 2)],
    cvec__stop_words=[None, 'english'],
    cvec__binary=[False, True],
)

In [151]:
# Randomized hyperparameter search: 50 sampled candidates, 8 parallel workers.
# random_state pins which candidates are sampled; without it every run draws a
# different subset of the grid. The grid is passed by keyword, matching the
# other searches in this notebook.
rf = RandomizedSearchCV(
    pipe,
    param_distributions=pgrid,
    n_iter=50,
    n_jobs=8,
    random_state=95,
)

# Fit the search on the training split only
rf.fit(X_train, y_train)

In [152]:
# Sampled hyperparameter combination with the best mean cross-validated score
rf.best_params_

{'rf__n_estimators': 100,
 'rf__max_features': 'sqrt',
 'rf__criterion': 'log_loss',
 'cvec__stop_words': 'english',
 'cvec__ngram_range': (1, 2),
 'cvec__min_df': 1,
 'cvec__max_features': 2000,
 'cvec__max_df': 0.9,
 'cvec__binary': False}

In [153]:
# Accuracy of the best pipeline: training split first, held-out test split second
print(
    rf.score(X_train, y_train),
    rf.score(X_test, y_test),
    sep='\n',
)

0.9992410827219833
0.9779969650986343


In [154]:
# Predict the held-out split, then tabulate true labels (rows) against predictions (columns)
preds = rf.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=preds)

array([[640,   7],
       [ 22, 649]], dtype=int64)

## Model #6: Extra Trees Classifier
### **Test Score: 0.98**

In [155]:
# Re-derive features/target and recreate the same seeded train/test split
X, y = reddit['text'], reddit['subreddit']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=95
)

In [156]:
# Two-step pipeline: token counts feeding an extra-trees classifier.
# Extra-trees is a randomized estimator, so it is seeded here; unseeded, the same
# hyperparameters give slightly different scores on every fit.
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('et', ExtraTreesClassifier(random_state=95)),
])

# Tip: pipe.get_params() lists every tunable hyperparameter name

In [157]:
# Candidate extra-trees and CountVectorizer settings for the randomized search
pgrid = dict(
    et__n_estimators=[100, 300, 500],
    et__criterion=['gini', 'entropy', 'log_loss'],
    et__min_samples_leaf=[1, 2],
    cvec__max_df=[0.9, 0.95],
    cvec__max_features=[2000, 5000, None],
    cvec__min_df=[1, 2, 3],
    cvec__ngram_range=[(1, 1), (1, 2)],
    cvec__stop_words=[None, 'english'],
    cvec__binary=[False, True],
)

In [158]:
# Randomized hyperparameter search: 50 sampled candidates, 8 parallel workers.
# random_state pins which candidates are sampled; without it every run draws a
# different subset of the grid. The grid is passed by keyword, matching the
# other searches in this notebook.
et = RandomizedSearchCV(
    pipe,
    param_distributions=pgrid,
    n_iter=50,
    n_jobs=8,
    random_state=95,
)

# Fit the search on the training split only
et.fit(X_train, y_train)

In [159]:
# Sampled hyperparameter combination with the best mean cross-validated score
et.best_params_

{'et__n_estimators': 500,
 'et__min_samples_leaf': 2,
 'et__criterion': 'log_loss',
 'cvec__stop_words': 'english',
 'cvec__ngram_range': (1, 1),
 'cvec__min_df': 2,
 'cvec__max_features': 2000,
 'cvec__max_df': 0.95,
 'cvec__binary': True}

In [160]:
# Accuracy of the best pipeline: training split first, held-out test split second
print(
    et.score(X_train, y_train),
    et.score(X_test, y_test),
    sep='\n',
)

0.9944346066278775
0.9795144157814871


In [161]:
# Predict the held-out split, then tabulate true labels (rows) against predictions (columns)
preds = et.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=preds)

array([[644,   3],
       [ 24, 647]], dtype=int64)