# Disclaimer: Gridsearches are run using 50 CPU threads, so don't run unless you want to wait eons

## Preprocessing and Modeling

Here we will build models that determine whether a post comes from the TheSilphRoad subreddit or the pokemongo subreddit.

In [22]:
# import our libraries

import pandas as pd
import numpy as np
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from sklearn.experimental import enable_hist_gradient_boosting, enable_halving_search_cv
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, HalvingRandomSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import FunctionTransformer

from xgboost import XGBClassifier

from googletrans import Translator, LANGUAGES

In [2]:
# load the merged posts dataset and preview the first rows

merged_path = '../data/merged.csv'
merged_df = pd.read_csv(merged_path)
merged_df.head()

Unnamed: 0,subreddit,merged_text
0,TheSilphRoad,Fix to not being able to attack? Has anybody f...
1,TheSilphRoad,Attack glitch during Regi raids 2 raids today ...
2,TheSilphRoad,[Bug?] Can’t seem to earn or collect pokecoins...
3,TheSilphRoad,"[Bug?] AR suddenly freezes Using an iPhone 11,..."
4,TheSilphRoad,3 hour incense event personal results For any ...


In [3]:
# encode the target: TheSilphRoad -> 1, anything else (pokemongo) -> 0
is_silph = merged_df['subreddit'] == 'TheSilphRoad'
merged_df['subreddit'] = is_silph.astype(int)
merged_df.head()

Unnamed: 0,subreddit,merged_text
0,1,Fix to not being able to attack? Has anybody f...
1,1,Attack glitch during Regi raids 2 raids today ...
2,1,[Bug?] Can’t seem to earn or collect pokecoins...
3,1,"[Bug?] AR suddenly freezes Using an iPhone 11,..."
4,1,3 hour incense event personal results For any ...


In [4]:
# drop every row containing a missing value, then confirm that no nulls remain
merged_df.dropna(axis = 0, how = 'any', inplace = True)
merged_df.isna().sum()

subreddit      0
merged_text    0
dtype: int64

In [5]:
# (rows, columns) of merged_df at this point in the notebook
merged_df.shape

(19254, 2)

In [6]:
# build a custom stop word list that removes the similar words we found in the previous notebook

# words that would easily identify a silph post, words that are common across both subreddits,
# and the word fragments that result from lemmatizing and stemming the stop words
custom_words = ['silph', 'road', 'silphroad', 'thesilphroad', 'pokemon', 'go', 'get', 'one', 'like', 'would', 'know', 'time', 'game', 'shiny', 
               'https', 'raid', 'anyone', 'got', 'new', 'event', 'day', 'level', 'even', 'com', 'raids', 'still', 'people', 'also', 'since',
               'use', 'catch', 'amp', 'see', 'want', 'could', 'first', 'research', 'shadow', 'think', 'else', 'way', 'niantic', 'make', 
               'back', 'really', 'need', 'eggs', 'community', 'something', 'much', 'good', 'able', "'d", "'ll", "'re", "'s", "'ve", 'abl', 
                'abov', 'ani', 'anyon', 'becaus', 'befor', 'commun', 'doe', 'dure', 'egg', 'els', 'ha', 'hi', 'http', 'might', 'must', 
                "n't", 'onc', 'onli', 'ourselv', 'peopl', 'realli', 'sha', 'shini', 'sinc', 'someth', 'themselv', 'thi', 'veri', 'wa', 
                'whi', 'wo', 'yourselv', 'becau', 'el']

# the base english stopwords followed by our custom additions
new_stopwords = stopwords.words('english') + custom_words

In [7]:
# initialize CountVectorizer with our stop words and accent stripping

cvec = CountVectorizer(strip_accents = 'unicode', stop_words = new_stopwords)

# fit on the raw post text and convert the document-term matrix to a DataFrame
X = cvec.fit_transform(merged_df['merged_text'])
text_df = pd.DataFrame(data = X.todense(), columns = cvec.get_feature_names())
text_df.head()

Unnamed: 0,00,000,0000,00000000003,0000000001,0000006,000002322,000010,0004,000exp,...,что,шикарно,это,から毎週金曜よる6時55分にお引っ越しすることを記念し,そのうち10月6日,シャトウ,ツ_,テレヒ東京系にて放送中のアニメ,ホケットモンスター,特別な内容て開催されます
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Here we see that we have some text in other languages where CountVectorizer doesn't return the expected result. This needs to be handled before we can move on to modeling. In order to take care of this, we will use a Google Translate library to translate any posts that aren't English into English. 

In [8]:
# thanks to this article for the following code: https://medium.com/analytics-vidhya/popular-python-libraries-in-nlp-dealing-with-language-detection-translation-beyond-7b8e7cb2928e

# initialize Translator
trans = Translator()
texts = merged_df['merged_text'].copy()

# Iterate over the real index labels instead of range(len(texts)): rows were dropped
# earlier with dropna(), so the labels are not guaranteed to be 0..len(texts)-1.
# Counting positions would raise KeyError for the missing labels and would never
# reach the rows whose labels are >= len(texts).
failed_labels = []
for label in texts.index:
    try:
        texts.loc[label] = trans.translate(texts.loc[label]).text
    except Exception as err:
        # best effort: keep the original text for this row, but remember the failure
        # instead of silently swallowing it
        failed_labels.append((label, repr(err)))

# report how many rows were left untranslated (and a small sample of why)
print('%d of %d posts could not be translated' % (len(failed_labels), len(texts)))
for label, reason in failed_labels[:10]:
    print('  row %s: %s' % (label, reason))

# CountVectorize
cvec = CountVectorizer(stop_words = new_stopwords, strip_accents = 'unicode')

X = texts

# convert to Dataframe
X = cvec.fit_transform(X)
text_df = pd.DataFrame(X.todense(), columns = cvec.get_feature_names())
text_df.head()

Unnamed: 0,00,000,0000,00000000003,0000000001,0000006,000002322,000010,0004,000exp,...,что,шикарно,это,から毎週金曜よる6時55分にお引っ越しすることを記念し,そのうち10月6日,シャトウ,ツ_,テレヒ東京系にて放送中のアニメ,ホケットモンスター,特別な内容て開催されます
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# debugging steps to find strings that are only partially another language
has_token = text_df['特別な内容て開催されます'] == 1
text_df[has_token]
#texts.index[17182]
#texts.loc[15225]

In [21]:
# as the translation library only translates when the majority of text is another language, we need to do a few manually
# NOTE(review): row 905 is overwritten with the translation of row 906, and row 6224 with the
# translation of row 6270, while every other line below reads and writes the same row. This looks
# like an index typo, but the intended rows cannot be determined from this notebook -- confirm
# against the data before re-running.
texts.loc[905] = trans.translate(texts[906], src = 'ru', dest = 'en').text
texts.loc[1081] = trans.translate(texts[1081], src = 'ru', dest = 'en').text
texts.loc[1203] = trans.translate(texts[1203], src = 'ru', dest = 'en').text
texts.loc[6224] = trans.translate(texts[6270], src = 'ja', dest = 'en').text
texts.loc[7880] = trans.translate(texts[7880], src = 'ja', dest = 'en').text
# the remaining mixed-language posts are replaced with hand-written English text
texts.loc[8285] = 'Rank 9 player with &lt;2500 points Hi pvp guys, I think my GBL It\'s buggy (Bugs in GBL is kinda new right?@_@)\nToday i hit rank9 with 55% winrate, is practically the same as many others with a score of 2550+, but i only got 2200points which is too low for a rank9.\nAt first I thought it would be something related to the "hidden ELO" so I continued to battle and managed to get a 1-4 and i didn\'t lose points\nSo I was wondering if this happened to anyone else and if there is anyone who can help me, maybe @NianticHelp'
texts.loc[15119] = 'Is this a bug? https://i.imgur.com/e3sGsy9.png\n\n[Attraction with as [there] [were] six [hearts] Has anyone seen this before?'
texts.loc[17061] = 'Kanto Cup so far, to the tune of Kung Fu Fighting https/imgur.com/4Rj6cCW\n\nIt literally only took me one set to get the screenshots after getting this stupid song stuck in my head. Powered up that Gust Pidgeot specifically after seeing every gorram lead be one of these two. Easy pickings, I guess'

# countVectorize
cvec = CountVectorizer(stop_words = new_stopwords, strip_accents = 'unicode')

X = texts

X = cvec.fit_transform(X)
text_df = pd.DataFrame(X.todense(), columns = cvec.get_feature_names())
text_df.head()

# save our translated dataframe so we don't have to keep waiting on translating
# (the file name has to match the '../data/translated.csv' that is read back in before modeling;
# it was previously misspelled 'tranlated.csv', so the modeling cell could not find it)
merged_df['merged_text'] = texts
merged_df.to_csv('../data/translated.csv', index = False)

In [8]:
# custom preprocessing of text
# thanks to the following article for the code: https://kavita-ganesan.com/how-to-use-countvectorizer/#CountVectorizer-Plain-and-Simple
lemmatizer = WordNetLemmatizer()

def lemma_preprocessor(text):
    """Normalize `text` and lemmatize each of its words.

    The text is lowercased, every non-word character is replaced by a space,
    and the connector words in/the/all/for/and/on (when surrounded by
    whitespace) are collapsed into the single token `_connector_`. The result
    is split on whitespace, each piece is lemmatized, and the pieces are
    returned joined by single spaces.
    """
    normalized = re.sub("\\W", " ", text.lower())  # remove special characters
    normalized = re.sub("\\s+(in|the|all|for|and|on)\\s+"," _connector_ ", normalized)  # normalize certain words

    # lemmatize every whitespace-separated piece
    return ' '.join(lemmatizer.lemmatize(piece) for piece in re.split("\\s+", normalized))

porter_stemmer = PorterStemmer()
def stem_preprocessor(text):
    """Normalize `text` and stem each of its words with the Porter stemmer.

    The text is lowercased, every non-word character is replaced by a space,
    and the connector words in/the/all/for/and/on (when surrounded by
    whitespace) are collapsed into the single token `_connector_`. The result
    is split on whitespace, each piece is stemmed, and the pieces are returned
    joined by single spaces.
    """
    normalized = re.sub("\\W", " ", text.lower())  # remove special characters
    normalized = re.sub("\\s+(in|the|all|for|and|on)\\s+"," _connector_ ", normalized)  # normalize certain words

    # stem every whitespace-separated piece
    return ' '.join(porter_stemmer.stem(piece) for piece in re.split("\\s+", normalized))

Now that we have resolved the issues with other languages, we can finally move on to creating a model that differentiates posts from the TheSilphRoad and pokemongo subreddits.

In [27]:
translated_df = pd.read_csv('../data/translated.csv')
# create a train_test_split
X = translated_df['merged_text']
y = translated_df['subreddit']

# fix the seed so the split (and every score computed from it) can be reproduced,
# and stratify on the target so train and test keep the same subreddit proportions
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

In [28]:
# create pipelines using count vectorizer or tfidf along with a classification model

def _count_vec():
    # a fresh CountVectorizer with our stop words and accent stripping
    return CountVectorizer(stop_words = new_stopwords, strip_accents = 'unicode')

def _tfidf_vec():
    # a fresh TfidfVectorizer with our stop words and accent stripping
    return TfidfVectorizer(stop_words = new_stopwords, strip_accents = 'unicode')

def _log_reg():
    # a fresh liblinear logistic regression with a generous iteration cap
    return LogisticRegression(solver = 'liblinear', max_iter = 10000)

# logistic regression
cvec_lr_pipe = Pipeline(steps = [('cvec', _count_vec()), ('logr', _log_reg())])
tfidf_lr_pipe = Pipeline(steps = [('tfidf', _tfidf_vec()), ('logr', _log_reg())])

# multinomial naive bayes
cvec_nb_pipe = Pipeline(steps = [('cvec', _count_vec()), ('nb', MultinomialNB())])
tfidf_nb_pipe = Pipeline(steps = [('tfidf', _tfidf_vec()), ('nb', MultinomialNB())])

# k nearest neighbors
cvec_knn_pipe = Pipeline(steps = [('cvec', _count_vec()), ('knn', KNeighborsClassifier())])
tfidf_knn_pipe = Pipeline(steps = [('tfidf', _tfidf_vec()), ('knn', KNeighborsClassifier())])

# random forest
cvec_rforest_pipe = Pipeline(steps = [('cvec', _count_vec()), ('rforest', RandomForestClassifier())])
tfidf_rforest_pipe = Pipeline(steps = [('tfidf', _tfidf_vec()), ('rforest', RandomForestClassifier())])

# xgboost
cvec_xgb_pipe = Pipeline(steps = [('cvec', _count_vec()), ('xgb', XGBClassifier(use_label_encoder = False))])
tfidf_xgb_pipe = Pipeline(steps = [('tfidf', _tfidf_vec()), ('xgb', XGBClassifier(use_label_encoder = False))])

# support vector machine
cvec_svc_pipe = Pipeline(steps = [('cvec', _count_vec()), ('svc', SVC())])
tfidf_svc_pipe = Pipeline(steps = [('tfidf', _tfidf_vec()), ('svc', SVC())])

def _to_dense_array(X):
    """Convert the sparse document-term matrix `X` into a dense numpy ndarray.

    `.toarray()` is used instead of `.todense()` because `.todense()` returns
    an `np.matrix`, not a plain ndarray. A named function (rather than a
    lambda) also keeps the fitted pipeline picklable with the standard pickler.
    """
    return X.toarray()

# HistGradientBoostingClassifier is fed a dense array here, so the vectorizer
# output is densified in between
cvec_histboost_pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words = new_stopwords, strip_accents = 'unicode')),
    ('densify', FunctionTransformer(_to_dense_array, accept_sparse=True)), 
    ('histboost', HistGradientBoostingClassifier())
])

tfidf_histboost_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words = new_stopwords, strip_accents = 'unicode')),
    ('densify', FunctionTransformer(_to_dense_array, accept_sparse=True)),
    ('histboost', HistGradientBoostingClassifier())
])

In [31]:
# create parameters for each pipeline

def _vectorizer_space(step):
    """Search space for the vectorizer step named `step` ('cvec' or 'tfidf'); identical for every pipeline."""
    return {
        step + '__preprocessor': [lemma_preprocessor, stem_preprocessor],
        step + '__ngram_range': [(1,1), (1,2), (1,3), (2,3),(2,2),(3,3)],
        step + '__max_df': np.linspace(0.8, 1.0, 10),
        step + '__min_df': range(1,4),
        step + '__max_features': range(2000, 100000, 100),
    }

def _logr_space():
    """Search space for the 'logr' (logistic regression) step."""
    return {
        'logr__penalty': ['l1', 'l2'],
        'logr__C': np.linspace(0.0001, 1, 1000),
    }

def _nb_space():
    """Search space for the 'nb' (multinomial naive bayes) step."""
    return {
        'nb__alpha': np.logspace(0, 6, 100),
    }

def _knn_space():
    """Search space for the 'knn' (k nearest neighbors) step."""
    return {
        'knn__n_neighbors': range(3, 100, 2),
    }

def _rforest_space():
    """Search space for the 'rforest' (random forest) step."""
    return {
        'rforest__n_estimators': range(50, 1000, 50),
        'rforest__max_depth': range(10, 101, 2),
        'rforest__min_samples_split': range(2, 21, 2),
        'rforest__min_samples_leaf': range(1, 50, 2),
        'rforest__max_features': ['sqrt', 'log2'],
    }

def _xgb_space():
    """Search space for the 'xgb' (XGBoost) step."""
    return {
        'xgb__n_estimators': range(10, 1000, 10),
        'xgb__learning_rate': np.linspace(0.01, 1, 100),
        'xgb__max_depth': range(10, 101, 2),
        'xgb__min_child_weight': range(1, 1000, 100),
    }

def _svc_space():
    """Search space for the 'svc' (support vector classifier) step."""
    return {
        'svc__C': np.linspace(0.0001, 1, 100),
        'svc__kernel': ['poly', 'rbf', 'linear'],
        'svc__degree': range(2, 10),
    }

def _histboost_space():
    """Search space for the 'histboost' (histogram gradient boosting) step."""
    return {
        'histboost__loss': ['auto', 'binary_crossentropy'],
        'histboost__learning_rate': np.linspace(0.01, 1, 100),
        'histboost__max_iter': range(100, 10000, 100),
        'histboost__max_leaf_nodes': range(2, 101, 2),
        'histboost__max_depth': range(10, 101, 2),
        'histboost__min_samples_leaf': range(1, 50, 2),
    }

# every pipeline searches its vectorizer space plus the space of its model
cvec_lr_params = {**_vectorizer_space('cvec'), **_logr_space()}
tfidf_lr_params = {**_vectorizer_space('tfidf'), **_logr_space()}

cvec_nb_params = {**_vectorizer_space('cvec'), **_nb_space()}
tfidf_nb_params = {**_vectorizer_space('tfidf'), **_nb_space()}

cvec_knn_params = {**_vectorizer_space('cvec'), **_knn_space()}
tfidf_knn_params = {**_vectorizer_space('tfidf'), **_knn_space()}

cvec_rforest_params = {**_vectorizer_space('cvec'), **_rforest_space()}
tfidf_rforest_params = {**_vectorizer_space('tfidf'), **_rforest_space()}

cvec_xgb_params = {**_vectorizer_space('cvec'), **_xgb_space()}
tfidf_xgb_params = {**_vectorizer_space('tfidf'), **_xgb_space()}

cvec_svc_params = {**_vectorizer_space('cvec'), **_svc_space()}
tfidf_svc_params = {**_vectorizer_space('tfidf'), **_svc_space()}

cvec_histboost_params = {**_vectorizer_space('cvec'), **_histboost_space()}
tfidf_histboost_params = {**_vectorizer_space('tfidf'), **_histboost_space()}

In [None]:
# create gridsearches on our pipelines and parameters that have been setup

def _halving_search(pipe, params):
    """Wrap `pipe` in a HalvingRandomSearchCV over `params`.

    Every search uses the same settings: 5000 starting candidates, 5-fold
    cross validation, verbose logging and 50 parallel jobs.
    """
    return HalvingRandomSearchCV(estimator = pipe,
                                 param_distributions = params,
                                 n_candidates = 5000,
                                 verbose = 10,
                                 cv = 5,
                                 n_jobs = 50)

cvec_lr_gs = _halving_search(cvec_lr_pipe, cvec_lr_params)
tfidf_lr_gs = _halving_search(tfidf_lr_pipe, tfidf_lr_params)
cvec_nb_gs = _halving_search(cvec_nb_pipe, cvec_nb_params)
tfidf_nb_gs = _halving_search(tfidf_nb_pipe, tfidf_nb_params)
cvec_knn_gs = _halving_search(cvec_knn_pipe, cvec_knn_params)
tfidf_knn_gs = _halving_search(tfidf_knn_pipe, tfidf_knn_params)
cvec_rforest_gs = _halving_search(cvec_rforest_pipe, cvec_rforest_params)
tfidf_rforest_gs = _halving_search(tfidf_rforest_pipe, tfidf_rforest_params)
cvec_xgb_gs = _halving_search(cvec_xgb_pipe, cvec_xgb_params)
tfidf_xgb_gs = _halving_search(tfidf_xgb_pipe, tfidf_xgb_params)
cvec_svc_gs = _halving_search(cvec_svc_pipe, cvec_svc_params)
tfidf_svc_gs = _halving_search(tfidf_svc_pipe, tfidf_svc_params)
cvec_histboost_gs = _halving_search(cvec_histboost_pipe, cvec_histboost_params)
tfidf_histboost_gs = _halving_search(tfidf_histboost_pipe, tfidf_histboost_params)

# put all gridsearches into a list
gridsearches = [cvec_lr_gs, tfidf_lr_gs, cvec_nb_gs, tfidf_nb_gs, cvec_knn_gs, tfidf_knn_gs, cvec_rforest_gs, 
                tfidf_rforest_gs, cvec_xgb_gs, tfidf_xgb_gs, cvec_svc_gs, tfidf_svc_gs, cvec_histboost_gs, tfidf_histboost_gs]

# create dictionary of gridsearches (keys are positions in the `gridsearches` list)
gridsearch_dict = {0: 'Count Vectorizer Logistic Regression', 1: 'TFIDF Vectorizer Logistic Regression',
                   2: 'Count Vectorizer Naive Bayes', 3: 'TFIDF Vectorizer Naive Bayes', 
                   4: 'Count Vectorizer K Nearest Neighbors', 5: 'TFIDF K Nearest Neighbors', 
                   6: 'Count Vectorizer Random Forest', 7: 'TFIDF Random Forest', 
                   8: 'Count Vectorizer XGBoost', 9: 'TFIDF Vectorizer XGBoost', 
                   10: 'Count Vectorizer Support Vector Machine', 11: 'TFIDF Vectorizer Support Vector Machine',
                  12: 'Count Vectorizer Histogram Gradient Boost', 13: 'TFIDF Vectorizer Histogram Gradient Boost'}

# thanks to the following article for the code below: https://www.kdnuggets.com/2018/01/managing-machine-learning-workflows-scikit-learn-pipelines-part-3.html

best_acc = 0.0
best_gs = None  # replaced by the fitted search with the highest test accuracy
best_model = 0

# NOTE(review): the "best" model is picked by its accuracy on the test set, so that
# test score is no longer an unbiased estimate for the selected model.
for i, gs in enumerate(gridsearches):
    print('\nEstimator: %s' % gridsearch_dict[i])
    gs.fit(X_train, y_train)
    print('Best Parameters: %s' % gs.best_params_)
    # best_score_ is the mean cross-validated score of the best candidate,
    # not the accuracy on the training set, so label it accordingly
    print('Best CV accuracy: %.3f' % gs.best_score_)
    y_pred = gs.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)  # computed once and reused below
    print('Test accurracy: %.3f' % test_acc)
    # get best model
    if test_acc > best_acc:
        best_acc = test_acc
        best_gs = gs
        best_model = i
print('Model with best test accuracy: %s' % gridsearch_dict[best_model])


Estimator: Count Vectorizer Histogram Gradient Boost
n_iterations: 7
n_required_iterations: 8
n_possible_iterations: 7
min_resources_: 20
max_resources_: 15403
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 5000
n_resources: 20
Fitting 5 folds for each of 5000 candidates, totalling 25000 fits
----------
iter: 1
n_candidates: 1667
n_resources: 60
Fitting 5 folds for each of 1667 candidates, totalling 8335 fits




----------
iter: 2
n_candidates: 556
n_resources: 180
Fitting 5 folds for each of 556 candidates, totalling 2780 fits




----------
iter: 3
n_candidates: 186
n_resources: 540
Fitting 5 folds for each of 186 candidates, totalling 930 fits




----------
iter: 4
n_candidates: 62
n_resources: 1620
Fitting 5 folds for each of 62 candidates, totalling 310 fits




Keep track of our previous scores for our previous models in case we need to interrupt the loop:

Estimator: Count Vectorizer Logistic Regression
Best Parameters: {'logr__penalty': 'l2', 'logr__C': 0.04313873873873874, 'cvec__preprocessor': <function lemma_preprocessor at 0x7ff3584c4280>, 'cvec__ngram_range': (1, 1), 'cvec__min_df': 3, 'cvec__max_features': 53400, 'cvec__max_df': 0.8222222222222223}
Training accuracy: 0.711
Test accurracy: 0.721

Estimator: TFIDF Logistic Regression
Best Parameters: {'tfidf__preprocessor': <function stem_preprocessor at 0x7ff3584c41f0>, 'tfidf__ngram_range': (2, 3), 'tfidf__min_df': 3, 'tfidf__max_features': 40400, 'tfidf__max_df': 1.0, 'logr__penalty': 'l2', 'logr__C': 0.8578720720720721}
Training accuracy: 0.667
Test accurracy: 0.678

Estimator: Count Vectorizer Naive Bayes
Best Parameters: {'nb__alpha': 3.0538555088334154, 'cvec__preprocessor': <function lemma_preprocessor at 0x7ff3584c4280>, 'cvec__ngram_range': (1, 1), 'cvec__min_df': 1, 'cvec__max_features': 67100, 'cvec__max_df': 0.9555555555555555}
Training accuracy: 0.690
Test accurracy: 0.696

Estimator: TFIDF Multinomial Naive Bayes
Best Parameters: {'tfidf__preprocessor': <function lemma_preprocessor at 0x7ff3584c4280>, 'tfidf__ngram_range': (1, 1), 'tfidf__min_df': 3, 'tfidf__max_features': 72100, 'tfidf__max_df': 0.8, 'nb__alpha': 1.321941148466029}
Training accuracy: 0.707
Test accurracy: 0.720


Estimator: Count Vectorizer K Nearest Neighbors
Best Parameters: {'knn__n_neighbors': 59, 'cvec__preprocessor': <function lemma_preprocessor at 0x7ff3584c4280>, 'cvec__ngram_range': (1, 1), 'cvec__min_df': 3, 'cvec__max_features': 52900, 'cvec__max_df': 0.8}
Training accuracy: 0.603
Test accurracy: 0.596

Estimator: TFIDF K Nearest Neighbors
Fitting 5 folds for each of 1296 candidates, totalling 6480 fits
Best Parameters: {'tfidf__preprocessor': <function lemma_preprocessor at 0x7ff3584c4280>, 'tfidf__ngram_range': (1, 3), 'tfidf__min_df': 3, 'tfidf__max_features': 56500, 'tfidf__max_df': 0.8222222222222223, 'knn__n_neighbors': 71}
Training accuracy: 0.681
Test accurracy: 0.683

Estimator: Count Vectorizer Random Forest
Best Parameters: {'rforest__n_estimators': 500, 'rforest__min_samples_split': 10, 'rforest__min_samples_leaf': 1, 'rforest__max_features': 'sqrt', 'rforest__max_depth': 58, 'cvec__preprocessor': <function lemma_preprocessor at 0x7ff3584c4280>, 'cvec__ngram_range': (1, 1), 'cvec__min_df': 3, 'cvec__max_features': 36800, 'cvec__max_df': 0.9555555555555555}
Training accuracy: 0.706
Test accurracy: 0.722

Estimator: TFIDF Random Forest
Best Parameters: {'tfidf__preprocessor': <function stem_preprocessor at 0x7ff3584c41f0>, 'tfidf__ngram_range': (1, 3), 'tfidf__min_df': 2, 'tfidf__max_features': 58100, 'tfidf__max_df': 1.0, 'rforest__n_estimators': 300, 'rforest__min_samples_split': 16, 'rforest__min_samples_leaf': 1, 'rforest__max_features': 'sqrt', 'rforest__max_depth': 94}
Training accuracy: 0.711
Test accurracy: 0.723

Estimator: Count Vectorizer XGBoost
Best Parameters: {'xgb__n_estimators': 440, 'xgb__min_child_weight': 1, 'xgb__max_depth': 10, 'xgb__learning_rate': 0.09999999999999999, 'cvec__preprocessor': <function stem_preprocessor at 0x7ff3584c41f0>, 'cvec__ngram_range': (1, 1), 'cvec__min_df': 3, 'cvec__max_features': 8200, 'cvec__max_df': 0.9555555555555555}
Training accuracy: 0.708
Test accurracy: 0.712

Estimator: TFIDF XGBoost
Best Parameters: {'xgb__n_estimators': 90, 'xgb__min_child_weight': 1, 'xgb__max_depth': 54, 'xgb__learning_rate': 0.14, 'tfidf__preprocessor': <function lemma_preprocessor at 0x7ff3584c4280>, 'tfidf__ngram_range': (1, 2), 'tfidf__min_df': 3, 'tfidf__max_features': 17000, 'tfidf__max_df': 1.0}
Training accuracy: 0.698
Test accurracy: 0.703

Estimator: Count Vectorizer Support Vector Machine
Best Parameters: {'svc__kernel': 'linear', 'svc__degree': 9, 'svc__C': 0.1011, 'cvec__preprocessor': <function lemma_preprocessor at 0x7ff3584c4280>, 'cvec__ngram_range': (3, 3), 'cvec__min_df': 3, 'cvec__max_features': 52400, 'cvec__max_df': 1.0}
Training accuracy: 0.573
Test accurracy: 0.586

Estimator: TFIDF Support Vector Machine
Best Parameters: {'tfidf__preprocessor': <function lemma_preprocessor at 0x7ff3584c4280>, 'tfidf__ngram_range': (1, 1), 'tfidf__min_df': 3, 'tfidf__max_features': 81700, 'tfidf__max_df': 0.9333333333333333, 'svc__kernel': 'rbf', 'svc__degree': 4, 'svc__C': 0.9798}
Training accuracy: 0.715
Test accurracy: 0.723

So here we can see that our best models had a test accuracy of around 72%. While this is pretty good considering that both subreddits have people posting about the same game, we would like to improve it to prove that TheSilphRoad is truly the superior subreddit. We will further tune this model in the next notebook.

In [None]:
# create a dataframe with train and test scores of our models
# (one row per model and score type; the 'train' rows come first, then the 'test' rows)
model_names = ['Baseline', 'Count Vectorizer Logistic Regression', 'TFIDF Logistic Regression',
               'Count Vectorizer Naive Bayes', 'TFIDF Naive Bayes',
               'Count Vectorizer K-Nearest Neighbors', 'TFIDF K-Nearest Neighbors',
               'Count Vectorizer Random Forest', 'TFIDF Random Forest',
               'Count Vectorizer XGBoost', 'TFIDF XGBoost',
               'Count Vectorizer Support Vector Machine', 'TFIDF Support Vector Machine']

# scores are listed in the same order as model_names
# NOTE(review): the 'train' values match the numbers printed as "Training accuracy" by the
# search loop's recorded output -- confirm they are the scores you intend to chart as 'train'
train_scores = [0.55, 0.711, 0.667, 0.690, 0.707, 0.603, 0.681, 0.706, 0.711, 0.708, 0.698, 0.573, 0.715]
test_scores = [0.55, 0.721, 0.678, 0.696, 0.720, 0.596, 0.683, 0.722, 0.723, 0.712, 0.703, 0.586, 0.723]

# derive the train/test column from the list length instead of hard-coded 12/26
model_score = pd.DataFrame({'model': model_names * 2,
                            'train/test': ['train'] * len(model_names) + ['test'] * len(model_names),
                            'score': train_scores + test_scores})

In [None]:
# create bar chart from our dataframe
# these three names are used below but are not imported in the notebook's import cell
import textwrap

import matplotlib.pyplot as plt
import seaborn as sns

max_width = 10  # maximum characters per line in a wrapped tick label
plt.figure(figsize = (20, 10))
plt.title('Grid Search Accuracy Scores', fontsize = 30, fontweight = 'bold')
ax = sns.barplot(y = model_score['score'], x = model_score['model'], hue = model_score['train/test'])
# wrap the long model names so the tick labels do not overlap;
# pass a real list rather than a generator
ax.set_xticklabels([textwrap.fill(label.get_text(), max_width) for label in ax.get_xticklabels()])
ax.set_ylabel('score', fontsize = 20)
ax.set_xlabel('model', fontsize = 15);

As we can see, many of our models perform very similarly. I was most surprised that the random forest models were not overfit as they are known to be high variance models, but I guess that is what happens when you grid search enough hyperparameters to remove the variance. Another surprise was that one of our simpler models performed better than XGBoost, which is unexpected as XGBoost is a famous model for winning many classification competitions in the past. 