In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Importing like this to make sure the datatypes work and are correct
dtyping ={
    'title' : str,
    'url' : str,
    'author' : str 
}
df = pd.read_csv('./data/wsb_and_btc_all.csv', dtype=dtyping)
df.head()

Unnamed: 0,title,id,url,body,date_created,number_comments,author,self_post,sub,submission
0,All In The War Machine,eld4i6,https://i.redd.it/cj2vw26nmd941.jpg,,1578440000.0,780.0,everythingorange9,0.0,wsb,1
1,How to get oil back up,g55or2,https://i.redd.it/w5iqqihjo2u41.jpg,,1587461000.0,516.0,futuretrollshark,0.0,wsb,1
2,Type yy into google.,c75d5x,https://www.reddit.com/r/wallstreetbets/commen...,That’s all I’m saying.,1561875000.0,17281.0,Alopez2897,1.0,wsb,1
3,Oil is now expenzive,d51f4o,https://i.redd.it/2j386s5iuym31.png,,1568673000.0,1019.0,,0.0,wsb,1
4,"My dad, working through a 15-hour time zone di...",d29nov,https://i.redd.it/60asaz4zhsl31.jpg,,1568160000.0,1633.0,SerraTL,0.0,wsb,1


In [3]:
# Post Titles
df_titles = df[['title', 'sub']].copy()
df_titles.dropna(inplace=True)

In [4]:
# pulling a random sample from the dataset because the whole dataset
# is taking way too long to model
df_samp = df.groupby('sub').apply(lambda x: x.sample(n=20_000, random_state = 42))


#adapted from https://stackoverflow.com/questions/41035187/stratified-samples-from-pandas
# Comments + Body
df_com_bod = df_samp[['sub', 'body']].copy()

df_com_bod.dropna(inplace=True)
df_com_bod.shape

# Comments
df_com = df_samp[df_samp['submission']==0]
df_com = df_com[['sub', 'body']]

In [5]:
df[df['self_post'] == 1].shape
# There are only 286 self posts in the dataset of both subreddits, 
# too small to do much with

(286, 10)

In [6]:
# This will print out both scores for the model
# Also, this creates a dictionary of the model and adds it to a list
# This is so I can store it and access it later
def scoring(name, model, X_train, X_test, y_train, y_test):
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    print(f'{name} - Training: {train_score:.2%}')
    print(f'{name} - Testing: {test_score:.2%}')
    print('*'*15)
    models.append({'model_name' : name,
           'model' : model,
           'training_score' : train_score,
           'testing_score' : test_score})

## train/test/split the same for all models 

In [7]:
# Since I was running train, test, split with the same parameters
# Build this since it is shorter 
def ttt(X, y):
    return train_test_split(X, y, random_state=42, stratify=y)

In [8]:
# Parameters to iterate over
pipe_params = {
    'vect__max_features' : [2500, 5000, None],
    'vect__ngram_range' : [(1,1), (1,2)],
    'vect__max_df' : [.9, .95],
    'vect__min_df' : [None, .05],
    'vect__stop_words' : [None, 'english']
}

# Naive Bayes Classifier

In [9]:
# Naive Bayes Regression code
# Instantiates and fits the model returning the best performing one
# This was just easier since I was running this with the same 
# parameters each time
def nb_regging(X_train, y_train):
    pipe_nb = Pipeline([
    ('vect', TfidfVectorizer()),
    ('nb', BernoulliNB())
])
    gs = GridSearchCV(pipe_nb,
                 pipe_params,
                 cv=5,
                 n_jobs=-1)
    gs.fit(X_train, y_train)
    return gs.best_estimator_


# SVM

In [10]:
# SVC Regression
# This instantiates and fits the SVC
# Added in a few additional parameters to check here
# Using 2 degree polynomial is on the recomendation from Tim Book's lesson
def svc_regging(X_train, y_train):
    pipe_params_svc = pipe_params.copy()
    pipe_params_svc['svc__degree'] = [2, 3]
    pipe_params_svc['svc__kernel'] = ['poly', 'rbf']
    pipe_svc = Pipeline([
    ('vect', TfidfVectorizer()),
    ('svc', SVC())
])
    gs = GridSearchCV(pipe_svc,
                 pipe_params,
                 cv=5,
                 n_jobs=-1)
    gs.fit(X_train, y_train)
    return gs.best_estimator_

# Logistic Regression/Classifier

In [11]:
# Logistic Regression Fuction
# Similar to the two above, just to keep it cleaner
def log_regging(X_train, y_train):
    pipe_log = Pipeline([
    ('vect', TfidfVectorizer()),
    ('logreg', LogisticRegression())
])
    gs = GridSearchCV(pipe_log,
                 pipe_params,
                 cv=5,
                     n_jobs=-1)
    gs.fit(X_train, y_train)
    return gs.best_estimator_

# regressions

In [12]:
%%time
models = []
# Ran the same test, train, split on each dataset
# Based on Submission Title
# logistic regression
X_train, X_test, y_train, y_test = ttt(df_titles['title'], df_titles['sub'])
lr_mod = log_regging(X_train, y_train)
scoring('Submission Title - LogisticReg', lr_mod, X_train, X_test, y_train, y_test)

# Bernoulli Naive Bayes

nb_mod = nb_regging(X_train, y_train)
scoring('Submission Title - NB', nb_mod, X_train, X_test, y_train, y_test)

# Support Vector Classification

svc_mod = svc_regging(X_train, y_train)
scoring('Submission Title - SVC', svc_mod, X_train, X_test, y_train, y_test)

# Based on Submission Body(self-post) and Comments
# Logistic Regression
X_train, X_test, y_train, y_test = ttt(df_com_bod['body'], df_com_bod['sub'])
lr_mod = log_regging(X_train, y_train)
scoring('Submission Body and Comments - LogisticReg', lr_mod, X_train, X_test, y_train, y_test)

# Bernoulli Naive Bayes

nb_mod = nb_regging(X_train, y_train)
scoring('Submission Body and Comments - NB', nb_mod, X_train, X_test, y_train, y_test)

# Support Vector Classification

svc_mod = svc_regging(X_train, y_train)
scoring('Submission Body and Comments - SVC', nb_mod, X_train, X_test, y_train, y_test)

# Based on Comments
# Logistic Regression
X_train, X_test, y_train, y_test = ttt(df_com['body'], df_com['sub'])
lr_mod = log_regging(X_train, y_train)
scoring('Comments - LogisticReg', lr_mod, X_train, X_test, y_train, y_test)

# Bernoulli Naive Bayes

nb_mod = nb_regging(X_train, y_train)
scoring('Comments - NB', nb_mod, X_train, X_test, y_train, y_test)

# Support Vector Classification

svc_mod = svc_regging(X_train, y_train)
scoring('Comments - SVC', nb_mod, X_train, X_test, y_train, y_test)

Submission Title - LogisticReg - Training: 68.79%
Submission Title - LogisticReg - Testing: 64.91%
***************
Submission Title - NB - Training: 68.79%
Submission Title - NB - Testing: 64.91%
***************
Submission Title - SVC - Training: 75.15%
Submission Title - SVC - Testing: 64.91%
***************
Submission Body and Comments - LogisticReg - Training: 60.70%
Submission Body and Comments - LogisticReg - Testing: 60.00%
***************
Submission Body and Comments - NB - Training: 58.53%
Submission Body and Comments - NB - Testing: 58.38%
***************
Submission Body and Comments - SVC - Training: 58.53%
Submission Body and Comments - SVC - Testing: 58.38%
***************
Comments - LogisticReg - Training: 60.73%
Comments - LogisticReg - Testing: 59.94%
***************
Comments - NB - Training: 58.82%
Comments - NB - Testing: 58.50%
***************
Comments - SVC - Training: 58.82%
Comments - SVC - Testing: 58.50%
***************
CPU times: user 1min 44s, sys: 12.1 s, tota

Submission Title - LogisticReg - Training: 0.6878808395396073
Submission Title - LogisticReg - Testing: 0.6490872210953347
***************
Submission Title - NB - Training: 0.6878808395396073
Submission Title - NB - Testing: 0.6490872210953347
***************
Submission Title - SVC - Training: 0.7515233581584293
Submission Title - SVC - Testing: 0.6490872210953347
***************
Submission Body and comments - LogisticReg - Training: 0.6221395954801093
Submission Body and comments - LogisticReg - Testing: 0.6205155161127002
***************
Submission Body and comments - NB - Training: 0.6221395954801093
Submission Body and comments - NB - Testing: 0.6205155161127002
***************


In [16]:
models[2]['model']

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.9, max_features=2500,
                                 min_df=0.05, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', d

In [17]:
models[3]['model']

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.9, max_features=2500,
                                 min_df=0.05, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('logreg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_sc

In [18]:
models[6]['model']

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.9, max_features=2500,
                                 min_df=0.05, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('logreg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_sc