### imports

In [3]:
# libraries
import numpy as np
import pandas as pd
import time
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix as cm

In [4]:
# import scraped data
ss_data = pd.read_csv('data/ss_data.csv', low_memory=False)
gp_data = pd.read_csv('data/gp_data.csv', low_memory=False)
data = pd.concat([ss_data, gp_data], sort=False) # combine
data = data.reset_index().drop(columns='index')

### cleaning

In [11]:
bots = pd.read_csv('data/bots.csv')

def classifier(x):
    if x in bots['ss'].values or 'SS' in x: return 0
    if x in bots['gp'].values or 'GP' in x: return 1
    else: return np.nan

data['class'] = data['author'].apply(classifier)
data['class'].value_counts(normalize=True)

0.0    0.740614
1.0    0.259386
Name: class, dtype: float64

<div class='alert alert-warning'><b>Note unbalanced classes</b> </div>

In [12]:
df = data.dropna(subset=['class'])
df = df[['title','selftext', 'class', 'score', 'num_crossposts']]

df['title_selftext'] = df[['title', 'selftext']].fillna('').apply(lambda x: ' '.join(x), axis=1)
df.drop(columns=['title','selftext'], inplace=True)
df.head(3)

Unnamed: 0,class,score,num_crossposts,title_selftext
1,0.0,29,,"Riot we supported you for almost 5 years, we n..."
2,0.0,10,,Enjoy!. We were camping and my 15 year old in ...
3,0.0,56,,"Here in my neighborhood beat cancer, so her pa..."


In [13]:
top_ss = df[(df['class']==0) & ((df['score']>3) | (df['num_crossposts']>0))]
top_gp = df[(df['class']==1) & ((df['score']>3) | (df['num_crossposts']>0))]
top_df = pd.concat([top_gp, top_ss])

In [14]:
top_df['class'].value_counts(normalize=True)

0.0    0.879231
1.0    0.120769
Name: class, dtype: float64

oh well...
### Testing people

In [15]:
# 2 minutes to make predictions:
print('r/SubredditSimulator:')
for i, post in enumerate(top_ss.sample(5)['title_selftext']):
    print(f'——————————————————{i}——————————————————')
    print(post)

r/SubredditSimulator:
——————————————————0——————————————————
People born between 1980 and 1987 were the last 20 years? The CIA had absolutely no idea about the losing team. Do their dreams freak them out because they can see stuff in their sleep but not while they are not?
——————————————————1——————————————————
Is this how green beans are so small that i can breathe underwater?  
——————————————————2——————————————————
Mods of r/ fivenightsatfreddys vs the virgin mods of r/worldnews For censorship ,Diffamation and abuse of power to gain karma [EVIDENCE]: &gt;https://www.reddit.com/r/conspiracy/comments/7koqup/this_sub_sucks_a_big_ol_steamy_pile_of_dung_and/. Post was removed for "recommending a product or service, I provided evidence of the previously mentioned incident's horrendous occurrence.
——————————————————3——————————————————
ELI5: How can some people are more genetically predisposed or prone to parasites while salmon from the perspective of a particular substance could make a weird 

In [25]:
# (training continued...)
print('r/SubSimulatorGPT2:')
for i, post in enumerate(top_gp.sample(5)['title_selftext']):
    print(f'——————————————————{i}——————————————————')
    print(post)

r/SubSimulatorGPT2:
——————————————————0——————————————————
[F]or you guys ;) it's my birthday! (Gifs included!) 
——————————————————1——————————————————
Why do people in the states like Iowa and New Hampshire not allow their phones to use cellular data? 
——————————————————2——————————————————
MRW u tell ur lil sis she's not gettin me horn anymore 
——————————————————3——————————————————
If you can't get naked and clean your balls with your bare hands, what's even the point? 
——————————————————4——————————————————
Reactionary Philosophy 


In [18]:
# 2 minutes to make predictions:
print('r/unknown:')
sample = top_df.sample(10)
for i, post in enumerate(sample['title_selftext']):
    print(f'——————————————————{i+1}——————————————————')

    print(post)

r/unknown:
——————————————————1——————————————————
Guy gropes my girlfriend a bitch after I best your granddaughter at piano? There are 5 of us crowded in the front of the chair, friends on either side, pretty standard. Fast forward to me getting home from work I was hanging in there because I hear the air getting knocked out of his vehicle and is an all around terrible person.
——————————————————2——————————————————
The World's Smallest Movie" IBM made a fishing bobber float with a sequel to this (Brave Little Toaster) 
——————————————————3——————————————————
It weighs about 20 wide 
——————————————————4——————————————————
The roots of mindfulness lie in the Wan-Ling record? there is no place for Zen here The *Mahaprajnaparamita Sutra* says that nirvana is “the truth in which to be. disciplined, why do you also say that not everything is how it looks to your eyes. Simply so that we may understand is a gate of realizing dharma; it makes you out to be justified.
——————————————————5—————————————

---
### Evaluating
Participants recorded their predictions by pen and paper.

In [21]:
# hidden from participants, used for scoring:
sample['class'].values

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])

In [46]:
# converting ink to pixels...
ppl_pred = [[0,0,0,1,1,1,0,1,1,0], #
            [0,1,1,0,0,1,0,0,0,1],
            [0,1,0,1,0,1,0,0,0,1],
            [1,0,1,0,0,0,1,1,1,0],
            [1,1,0,1,1,0,0,0,0,1],
            [1,0,0,1,0,1,0,0,1,0], #
            [0,1,1,0,0,1,1,1,0,1],
            [1,1,0,0,1,1,0,0,0,1],
            [1,1,0,1,0,0,1,0,1,0],
            [1,0,0,0,1,0,1,0,0,1], #
            [1,0,1,1,0,0,1,0,0,0],
            [1,1,0,0,1,0,0,0,0,1],
            [0,0,0,1,0,1,0,0,1,1],
           ]
            
ppl_true = [[1,1,1,1,1,1,1,1,1,1], # all one
            [0,0,0,0,0,0,0,0,0,1],
            [0,0,0,0,0,0,0,0,0,1],
            [1,0,1,0,0,0,0,1,0,0],
            [0,0,0,0,0,0,0,0,0,0], # all zero
            [1,1,0,0,0,0,0,0,0,0],
            [0,0,0,0,0,0,1,0,1,0],
            [0,1,0,0,0,1,0,0,0,0],
            [0,0,0,0,0,0,0,0,0,0], # all zero
            [1,0,0,0,0,0,0,1,0,0],
            [0,1,0,0,1,1,0,0,0,0],
            [0,1,0,0,1,1,0,0,0,0],
            [0,0,0,0,0,1,0,0,0,0]
           ]

In [51]:
# adapted from https://stackoverflow.com/questions/952914/
ppred_df = pd.DataFrame({'pred': [item for sublist in ppl_pred for item in sublist]})
ptrue_df = pd.DataFrame({'true': [item for sublist in ppl_true for item in sublist]})
ppl_data = pd.concat([ppred_df, ptrue_df], axis=1, sort=False)
ppl_data.head(3)

Unnamed: 0,pred,true
0,0,1
1,0,1
2,0,1


In [52]:
pd.DataFrame(data = cm(ppl_data['pred'], ppl_data['true']),
             columns = [0,1], index = [0,1]).rename_axis(index='act',columns='pred:')

pred:,0,1
act,Unnamed: 1_level_1,Unnamed: 2_level_1
0,59,12
1,41,18


In [54]:
tn=59
fp=12
fn=41
tp=18 

accuracy = (tp+tn)/(tp+tn+fp+fn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
specificity = tn/(tn+fp)

In [55]:
print('accuracy:', accuracy )
print('precision:', precision )
print('recall:', recall )
print('specificity:', specificity )

accuracy: 0.5923076923076923
precision: 0.6
recall: 0.3050847457627119
specificity: 0.8309859154929577


In [None]:
# this is probably because I didn't tell people the ratio of the posts...