In [1]:
import praw
import numpy as np
import pandas as pd
import regex as re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize 
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from pactools.grid_search import GridSearchCVProgressBar
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline, make_pipeline
import seaborn as sns
from sklearn.metrics import classification_report
from psaw import PushshiftAPI
#from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

# classification models
import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [3]:
reddit = praw.Reddit(site_name='ga_proj3')

In [7]:
api = PushshiftAPI(reddit)

In [8]:
nosleep_gen = list(api.search_submissions(subreddit='nosleep',
                                          limit=3000))

nosleep_dict = {'id':[submission.id for submission in nosleep_gen],
                'author':[submission.author for submission in nosleep_gen],
               'title':[submission.title for submission in nosleep_gen],
              'body':[submission.selftext for submission in nosleep_gen],
               'subreddit':[submission.subreddit.display_name for submission in nosleep_gen]}

nosleep_df_raw = pd.DataFrame(nosleep_dict)

In [53]:
nosleep_df_raw.shape

(3000, 5)

In [54]:
nosleep_df_raw.isin(['[removed]','[deleted]']).any()

id           False
author       False
title        False
body          True
subreddit    False
dtype: bool

In [55]:
nosleep_df_clean = nosleep_df_raw[(nosleep_df_raw['body'] != '[removed]') &
                                  (nosleep_df_raw['body'] != '[deleted]') &
                                  (nosleep_df_raw['body'] != '')]

In [56]:
nosleep_df_clean.shape

(1196, 5)

In [57]:
n_df = nosleep_df_clean.reset_index(drop=True)
n_df.to_csv('nosleep.csv',index=False)

In [15]:
creepypasta_gen = list(api.search_submissions(subreddit='creepypasta',
                                          limit=3000))

creepypasta_dict = {'id':[submission.id for submission in creepypasta_gen],
                'author':[submission.author for submission in creepypasta_gen],
               'title':[submission.title for submission in creepypasta_gen],
              'body':[submission.selftext for submission in creepypasta_gen],
                   'subreddit':[submission.subreddit.display_name for submission in creepypasta_gen]}

creepypasta_df_raw = pd.DataFrame(creepypasta_dict)

In [59]:
creepypasta_df_clean = creepypasta_df_raw[(creepypasta_df_raw['body'] != '[removed]') &
                                (creepypasta_df_raw['body'] != '[deleted]') &
                                (creepypasta_df_raw['body'] != '')]

In [None]:
creepypasta_df_clean.shape

In [61]:
c_df = creepypasta_df_clean.reset_index(drop=True)
c_df.to_csv('creepypasta.csv',index=False)

In [62]:
c_df.head()

Unnamed: 0,id,author,title,body,subreddit
0,c00vpg,OpinionatedIMO,"Someone kept calling, and it was ‘me’",—————\nThe phone rang. There was no caller ID ...,creepypasta
1,c00cad,ChemoSans,Perfect,Catherine Miller is perfect. Her skin is flawl...,creepypasta
2,bzzqfc,reddit_rats,Just remembered this,"anyone know the creepypasta ""buyer beware"" i j...",creepypasta
3,bzygc5,story_teller_64,Recess lost episode,So I went to a goodwill and found a recess DVD...,creepypasta
4,bzwxdm,Chase-Covington,I Got A Phone Call From A Number That Doesn't ...,This started happening 2 days ago at 9:02 AM....,creepypasta


In [87]:
c_df.shape

(1423, 5)

In [64]:
n_df.shape

(1196, 5)

In [6]:
agg_df = pd.concat([n_df,c_df],ignore_index=True)

In [22]:
agg_df['text'] = agg_df['title'] +  ' ' + agg_df['body']
agg_df['creepypasta'] = agg_df['subreddit'].map(lambda x: 1 if  x == 'creepypasta' else 0)

In [111]:
y.value_counts(normalize=True) # we observe some imbalance here, with slightly more representation of creepypasta

1    0.543337
0    0.456663
Name: creepypasta, dtype: float64

In [100]:
stop = stopwords.words("english")

In [101]:
stop.extend(['http','com','net','org','www','https'])

In [102]:
stop = set(stop)

In [106]:
def regcleaner(text):
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text.lower())
    meaningful_words = [w for w in words if not w in stop]
    
    # Join the words back into one string separated by space, 
    # and return the result.
    return(" ".join(meaningful_words))

agg_df['cleantext'] = agg_df['text'].map(lambda x:regcleaner(x))

In [128]:
agg_df.head()

Unnamed: 0,id,author,title,body,subreddit,text,creepypasta,cleantext
0,c020my,OneFaraday,The Book of Autonomancy (Part 10),[Part 1](https://www.reddit.com/r/nosleep/comm...,nosleep,The Book of Autonomancy (Part 10) [Part 1](htt...,0,book autonomancy part 10 part 1 reddit r nosle...
1,c01yge,CrazybloxianEmpireNS,I think my brother found images that kill you ...,[PART 1](https://www.reddit.com/r/nosleep/comm...,nosleep,I think my brother found images that kill you ...,0,think brother found images kill part 2 part 1 ...
2,c01y1o,thayeryan,"Every night at 11:11, they call.","\r\n\r\nEvery night at 11:11, they call. I do...",nosleep,"Every night at 11:11, they call. \r\n\r\nEver...",0,every night 11 11 call every night 11 11 call ...
3,c01vnc,edelre28,The True Disney Working Experience,My name is Neil. I’ve been a walk around chara...,nosleep,The True Disney Working Experience My name is ...,0,true disney working experience name neil walk ...
4,c01px3,awsmithwrites,Don't Fall Asleep in a Russian Subway Station.,“Are you reading *Dead Souls?*” the blonde gir...,nosleep,Don't Fall Asleep in a Russian Subway Station....,0,fall asleep russian subway station reading dea...


In [140]:
X, y = agg_df['cleantext'], agg_df['creepypasta']

In [141]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                  random_state = 42,
                                                  stratify = y)

In [156]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

cvec = CountVectorizer(tokenizer=LemmaTokenizer(),
                      stop_words='english')
smt = SMOTE(random_state=42,ratio=1)
ss = StandardScaler()
mnb = MultinomialNB()
pipeline = Pipeline([('cvec',cvec),
                     ('smt',smt),
                    ('mnb',mnb)])
score = cross_val_score(pipeline, X_train, y_train, cv=5, verbose=1, n_jobs=-1)
model = pipeline.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    8.8s finished
  'stop_words.' % sorted(inconsistent))
  return matrix(data, dtype=dtype, copy=False)


In [157]:
y_pred = model.predict(X_test)

In [158]:
print(classification_report(y_test, y_pred, target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.60      0.87      0.71       299
           1       0.83      0.52      0.64       356

   micro avg       0.68      0.68      0.68       655
   macro avg       0.71      0.69      0.67       655
weighted avg       0.72      0.68      0.67       655



In [160]:
score

array([0.68274112, 0.70558376, 0.68112245, 0.71428571, 0.64285714])

In [161]:
score.mean()

0.6853180358437791

In [162]:
model.score(X_test,y_test)

0.6778625954198473

In [166]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

tfv = TfidfVectorizer(tokenizer=LemmaTokenizer(),
                      stop_words='english')
smt = SMOTE(random_state=42,ratio=1)
ss = StandardScaler()
mnb = MultinomialNB()
pipeline = Pipeline([('tfv',tfv),
                     ('smt',smt),
                    ('mnb',mnb)])
score = cross_val_score(pipeline, X_train, y_train, cv=5, verbose=1, n_jobs=-1)
model = pipeline.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.3s finished
  'stop_words.' % sorted(inconsistent))
  return matrix(data, dtype=dtype, copy=False)


In [167]:
y_pred = model.predict(X_test)

In [168]:
print(classification_report(y_test, y_pred, target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.52      1.00      0.69       299
           1       1.00      0.24      0.38       356

   micro avg       0.58      0.58      0.58       655
   macro avg       0.76      0.62      0.53       655
weighted avg       0.78      0.58      0.52       655



In [170]:
score

array([0.58375635, 0.58883249, 0.58928571, 0.60969388, 0.54591837])

In [171]:
score.mean()

0.5834973583341967

In [172]:
model.score(X_test,y_test)

0.5847328244274809

In [146]:
def gridsearch_summary(X_train, X_test, y_train, y_test, model_name, model, model_params, problem = 'classification'):
    '''
    Arguments:
    X_train, X_test, y_train, y_test : vectorized train test split of X and y
    model_name : str name of model
    model : model constructor
        example: 'LogisticRegression' : LogisticRegression()
    model_params : dictionary of param_grids for GridSearch
        example: 'LogisticRegression' : {
                      'penalty' : ['l1', 'l2'],
                      'C' : [.1, 1, 10] }
    problem : str of problem type: 'classification' or 'regression'
    
    Return:
    summary_df : a single row DataFrame containing the GridSearch model and its 
              best model, predictions, and scores.
    '''   
    problem = problem.lower()
    
    if problem != 'regression' and problem != 'classification':
        print('Invalid problem type. Try "regression" or "classification"')
        return

    summary = {}

    # Track progress
    print(f'Fitting {model_name}')
    
    # Pipeline
    model = model
    
    tfv = TfidfVectorizer(tokenizer=RegexpTokenizer,
                        stop_words=stop,
                        sublinear_tf=True)
    
    pipeline = Pipeline([('tfv', tfv),
                       ('scaler', StandardScaler()),
                       ('SMOTE', SMOTE()),
                       ('model', model)])
    for key in model_params.keys():
        model_params['model__'+key] = model_params.pop(key)
    model_params['tfv__ngram_range'] = [zip([1,1,1,1,2,2,2,3,3,4],[1,2,3,4,2,3,4,3,4,4])]
    model_params['tfv__max_df'] = np.arange(0.5,1.1,0.1)

    # GridSearch
    gs = GridSearchCVProgressBar(pipeline, model_params, cv = 5, n_jobs=-1)
    gs.fit(X_train, y_train)

    # Make predictions
    y_train_pred = gs.best_estimator_.predict(X_train)
    y_test_pred = gs.best_estimator_.predict(X_test)

    # Build summary
    summary['Model Name'] = model_name
    summary['Train Pred'] = y_train_pred
    summary['Test Pred'] = y_test_pred
    summary['Best Score'] = gs.best_score_
    summary['Best Params'] = gs.best_params_
    summary['Best Estimator'] = gs.best_estimator_
    summary['Grid Search Model'] = gs

    if problem == 'regression':
        summary['Train Score'] = r2_score(y_train, y_train_pred)
        summary['Test Score'] = r2_score(y_test, y_test_pred)
    elif problem == 'classification':
        summary['Train Score'] = accuracy_score(y_train, y_train_pred)
        summary['Test Score'] = accuracy_score(y_test, y_test_pred)
    
    # Construct output dataframe 
    summary_df = pd.DataFrame([summary])

    # Rearrange columns
    summary_df = summary_df[['Model Name', 'Best Params', 'Best Score', 'Best Estimator',
                             'Train Score', 'Test Score', 'Train Pred', 'Test Pred', 'Grid Search Model']]
        
    return summary_df

In [147]:
classifier_models = {
    'LogisticRegression' : LogisticRegression(random_state = 42),
    'KNN': KNeighborsClassifier(), 
    'NaiveBayes' : MultinomialNB(),
    'DecisionTree' : DecisionTreeClassifier(random_state = 42), 
    'BaggedDecisionTree' : BaggingClassifier(random_state = 42),
    'RandomForest' : RandomForestClassifier(random_state = 42), 
    'ExtraTrees' : ExtraTreesClassifier(random_state = 42), 
    'AdaBoost' : AdaBoostClassifier(random_state=42), 
    'GradientBoosting' : GradientBoostingClassifier(random_state = 42),
    'SVM' : SVC(random_state=42),
    'XGBoost' : XGBClassifier(random_state=42)
}

In [148]:
classifier_model_params = {
    'LogisticRegression' : {
        'penalty' : ['l1', 'l2'],
        'C' : np.arange(.05, 1, .05) },
    'KNN' : {
        'n_neighbors' : np.arange(3, 22, 2) },
    'NaiveBayes' : {
        'alpha' : np.arange(.05, 2, .05)},
    'DecisionTree': {
        'max_depth' : [None, 6, 10, 14], 
        'min_samples_leaf' : [1, 2],
        'min_samples_split': [2, 3] },
    'BaggedDecisionTree' : {
        'n_estimators' : [20, 60, 100] },
    'RandomForest' : {
        'n_estimators' : [20, 60, 100],
        'max_depth' : [None, 2, 6, 10],
        'min_samples_split' : [2, 3, 4] },
    'ExtraTrees' : {
        'n_estimators' : [20, 60, 100],
        'max_depth' : [None, 6, 10, 14],
        'min_samples_leaf' : [1, 2], 
        'min_samples_split' : [2, 3], },
    'AdaBoost' : {
        'n_estimators' : np.arange(100, 151, 25),
        'learning_rate' : np.linspace(0.05, 1, 20) },
    'GradientBoosting' : {
        'n_estimators' : np.arange(5, 150, 10),
        'learning_rate' : np.linspace(0.05, 1, 20),
        'max_depth' : [1, 2, 3] },
    'SVM' : {
        'C' : np.arange(0.05, 1, .05),
        'kernel' : ['rbf', 'linear'] },
    'XGBoost' : {
        'n_estimators'  : np.arange(100, 151, 25), 
        'learning_rate' : np.arange(0.1, 1, .3),
        'max_depth' : [3],
        'alpha' : np.arange(0, 1, .3),
        'lambda' : np.arange(0, 1, .3),
        'gamma' : np.arange(0, 1, .3),
        'subsample' : [.5],
        'n_jobs' : [4],
        }
}

In [149]:
gs_summaries = pd.DataFrame()

In [150]:
from imblearn.pipeline import Pipeline, make_pipeline

models = ['LogisticRegression', 'KNN']

for model in models:
    gs_summaries = gs_summaries.append(
                        gridsearch_summary(
                            X_train, X_test, y_train, y_test, 
                            model, classifier_models[model], 
                            classifier_model_params[model]), 
                            ignore_index = True)

Fitting LogisticRegression


ValueError: Invalid parameter model for estimator LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
models = ['LogisticRegression', 'KNN', 'NaiveBayes', 
          'DecisionTree', 'BaggedDecisionTree', 
          'RandomForest', 'ExtraTrees', 'AdaBoost', 'SVM',
          'GradientBoosting', 'XGBoost']

for model in models:
    gs_summaries = gs_summaries.append(
                        gridsearch_summary(
                            X_train, X_test, y_train, y_test, 
                            model, classifier_models[model], 
                            classifier_model_params[model]), 
                            ignore_index = True)

In [None]:
def review_to_words(review):
    letters_only = re.sub("[^a-zA-Z]", " ", review)
    
    # 3. Convert to lower case, split into individual words.
    words = letters_only.lower().split()
    
    # 4. In Python, searching a set is much faster than searching
    # a list, so convert the stop words to a set.
    stops = set(stopwords.words('english'))
    
    # 5. Remove stop words.
    meaningful_words = [w for w in words if not w in stops]
    
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return(" ".join(meaningful_words))

In [None]:
print("Cleaning and parsing the training set movie reviews...")

j = 0
for train_review in X_train['review']:
    # Convert review to words, then append to clean_train_reviews.
    clean_train_reviews.append(review_to_words(train_review))
    
    # If the index is divisible by 1000, print a message
    if (j + 1) % 1000 == 0:
        print(f'Review {j + 1} of {total_reviews}.')
    
    j += 1

# Let's do the same for our testing set.

print("Cleaning and parsing the testing set movie reviews...")

for test_review in X_test['review']:
    # Convert review to words, then append to clean_train_reviews.
    clean_test_reviews.append(review_to_words(test_review))
    
    # If the index is divisible by 1000, print a message
    if (j + 1) % 1000 == 0:
        print(f'Review {j + 1} of {total_reviews}.')
        
    j += 1