### imports

In [6]:
# libraries
import numpy as np
import pandas as pd
import time
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix as cm

In [7]:
# import scraped data
ss_data = pd.read_csv('data/ss_data.csv', low_memory=False)
gp_data = pd.read_csv('data/gp_data.csv', low_memory=False)
data = pd.concat([ss_data, gp_data], sort=False) # combine
data = data.reset_index().drop(columns='index')

### cleaning

In [8]:
bots = pd.read_csv('data/bots.csv') # import list of bots

def classifier(x):
    if x in bots['ss'].values or 'SS' in x: return 0
    if x in bots['gp'].values or 'GP' in x: return 1
    else: return np.nan

data['class'] = data['author'].apply(classifier) # add class column
data['class'].value_counts(normalize=True)

0.0    0.740614
1.0    0.259386
Name: class, dtype: float64

<div class='alert alert-warning'><b>Note unbalanced classes</b> </div>

In [9]:
df = data.dropna(subset=['class'])
df = df[['title','selftext', 'class', 'score', 'num_crossposts']]

df['title_selftext'] = df[['title', 'selftext']].fillna('').apply(lambda x: ' '.join(x), axis=1)
df.drop(columns=['title','selftext'], inplace=True)

In [10]:
df['class'].value_counts()

0.0    32293
1.0    11310
Name: class, dtype: int64

In [14]:
#small = df.sample(10)
small

Unnamed: 0,class,score,num_crossposts,title_selftext
28816,0.0,389,0.0,"In the walls?! Found this in my head, Spiders ..."
18611,0.0,5,0.0,A puffin can fly up to 180000 eggs in a rigid ...
39589,1.0,1,0.0,"Article: Ruling, Constitution"
20701,0.0,12,0.0,Vietnamese Spicy Beef Short Ribs
4437,0.0,2,,Something's not right here . . First play thro...
3214,0.0,10,,It's a good gaming PC for my graphics card by ...
4671,0.0,117,,Im quite happy relapsing once a month or so an...
14551,0.0,7,,(IL) Can a board tell us it was covered by ins...
42928,1.0,1,0.0,(F)eeling bravely horny tonight.
8483,0.0,5186,,Mark Zuckerberg Confirms That He Is A Litigiou...


In [22]:
pd.DataFrame(['Delete this part for photoshop' for i in range(100)],
            ['Delete this part for photoshop' for i in range(100)]).reset_index()

Unnamed: 0,index,0
0,Delete this part for photoshop,Delete this part for photoshop
1,Delete this part for photoshop,Delete this part for photoshop
2,Delete this part for photoshop,Delete this part for photoshop
3,Delete this part for photoshop,Delete this part for photoshop
4,Delete this part for photoshop,Delete this part for photoshop
...,...,...
95,Delete this part for photoshop,Delete this part for photoshop
96,Delete this part for photoshop,Delete this part for photoshop
97,Delete this part for photoshop,Delete this part for photoshop
98,Delete this part for photoshop,Delete this part for photoshop


### Bootstrap

In [6]:
n_boots = len(df[df['class']==0]) - len(df[df['class']==1])

boot = df[df['class']==1].sample(len(n_boots))
df_boot = pd.concat([df, boot])

KeyboardInterrupt: 

In [None]:
df_boot['class'].value_counts()

In [10]:
# export bootstrapped data
df_boot.to_csv('data/bootstrapped_gp.csv')

In [40]:
top_gp = df[df['class']==1]
top_ss = df[df['class']==0][:len(top_gp)]
top_df = pd.concat([top_gp, top_ss])
top_df.head(3)

Unnamed: 0,class,score,num_crossposts,title_selftext
32298,1.0,0,0.0,How my son who was prevent preventing from goi...
32299,1.0,10,0.0,"I hate when people say ""I'm sorry, but we're n..."
32300,1.0,1,0.0,Wen ur nan sends me a text telling me she's fr...


In [41]:
df_boot['class'].value_counts()

0.0    11310
1.0    11310
Name: class, dtype: int64

In [19]:
df_boot_gp = df_boot[(df_boot['class']==1) & ((df_boot['score']>3))]
df_boot_ss = df_boot[(df_boot['class']==0) & ((df_boot['score']>3))][:len(df_boot_gp)]
df_boot_top = pd.concat([df_boot_ss, df_boot_gp])

### tf-idf vectorize

In [20]:
# define X, y
X = df_boot_top['title_selftext']
y = df_boot_top['class']

# tfidf transform
tfid = TfidfVectorizer(max_features=10000, ngram_range=(1,3))
X = tfid.fit_transform(X)
X = pd.DataFrame(X.toarray(), columns=tfid.get_feature_names())

---
# Modeling

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

#### MultinomialNB

In [22]:
mnb = MultinomialNB(alpha=1*np.e**-20)
mnb.fit(X_train, y_train)
print('train:', mnb.score(X_train, y_train), 'test:', mnb.score(X_test, y_test))

train: 0.8824039067422811 test: 0.7911153119092628


In [23]:
mnb.fit(X_train, y_train)
print('cval:', cross_val_score(mnb, X_test, y_test, cv=5).mean())
# display confusion matrix
pd.DataFrame(data = cm(y_test, mnb.predict(X_test)),
             columns = ['ss','gp'], index = ['ss','gp']).rename_axis(index = 'act', columns = 'pred:')

cval: 0.6604361559517041


pred:,ss,gp
act,Unnamed: 1_level_1,Unnamed: 2_level_1
ss,1669,490
gp,394,1679


---

In [79]:
# filter posts
top_gp = df[(df['class']==1) & ((df['score']>3))]
top_ss = df[(df['class']==0) & ((df['score']>3))][:len(top_gp)*2]
top_df = pd.concat([top_gp, top_ss])

In [80]:
# define X, y
X = top_df['title_selftext']
y = top_df['class']

# tfidf transform
tfid = TfidfVectorizer(max_features=10000, ngram_range=(1,3))
X = tfid.fit_transform(X)
X = pd.DataFrame(X.toarray(), columns=tfid.get_feature_names())

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [82]:
mnb = MultinomialNB(alpha=.01)
mnb.fit(X_train, y_train)
print('cval:', cross_val_score(mnb, X_test, y_test, cv=5).mean())
# display confusion matrix
pd.DataFrame(data = cm(y_test, mnb.predict(X_test)),
             columns = ['ss','gp'], index = ['ss','gp']).rename_axis(index = 'act', columns = 'pred:')

cval: 0.7071357630842388


pred:,ss,gp
act,Unnamed: 1_level_1,Unnamed: 2_level_1
ss,1365,120
gp,411,337


In [83]:
# with balanced classes (all)
tn=1365
fp=120
fn=411
tp=337

accuracy = (tp+tn)/(tp+tn+fp+fn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
specificity = tn/(tn+fp)
print('accuracy:', accuracy )
print('precision:', precision )
print('recall:', recall )
print('specificity:', specificity )

accuracy: 0.7622033139274519
precision: 0.737417943107221
recall: 0.4505347593582888
specificity: 0.9191919191919192


### lemmatizing

In [16]:
# filter vectors
top_gp = df[df['class']==1]
top_ss = df[df['class']==0][:len(top_gp)]
top_df = pd.concat([top_gp, top_ss])
top_df.head(3)

Unnamed: 0,class,score,num_crossposts,title_selftext
32298,1.0,0,0.0,How my son who was prevent preventing from goi...
32299,1.0,10,0.0,"I hate when people say ""I'm sorry, but we're n..."
32300,1.0,1,0.0,Wen ur nan sends me a text telling me she's fr...


In [14]:
# from https://scikit-learn.org/stable/modules/feature_extraction.html
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
vect = TfidfVectorizer(tokenizer=LemmaTokenizer()) 

In [29]:
# filter vectors
top_gp = df[df['class']==1]
top_ss = df[df['class']==0][:len(top_gp)]
top_df = pd.concat([top_gp, top_ss])
top_df.head(3)

Unnamed: 0,class,score,num_crossposts,title_selftext
32298,1.0,0,0.0,How my son who was prevent preventing from goi...
32299,1.0,10,0.0,"I hate when people say ""I'm sorry, but we're n..."
32300,1.0,1,0.0,Wen ur nan sends me a text telling me she's fr...


In [30]:
# define X, y

X = top_df['title_selftext']
y = top_df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [34]:
Xvec_train = vect.fit_transform(X_train)

In [32]:
Xvec_test = vect.fit_transform(X_test)

In [35]:
mnb = MultinomialNB(alpha=.01)
mnb.fit(Xvec_train, y_train)
print('cval:', cross_val_score(mnb, Xvec_test, y_test, cv=5).mean())
# display confusion matrix
pd.DataFrame(data = cm(y_test, mnb.predict(Xvec_test)),
             columns = ['ss','gp'], index = ['ss','gp']).rename_axis(index = 'act', columns = 'pred:')

cval: 0.6933717360771322


ValueError: dimension mismatch