### imports

In [1]:
# libraries
import numpy as np
import pandas as pd
import time
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix as cm

In [2]:
# import scraped data
ss_data = pd.read_csv('data/ss_data.csv', low_memory=False)
gp_data = pd.read_csv('data/gp_data.csv', low_memory=False)
data = pd.concat([ss_data, gp_data], sort=False) # combine
data = data.reset_index().drop(columns='index')

### cleaning

In [3]:
bots = pd.read_csv('data/bots.csv')

def classifier(x):
    if x in bots['ss'].values or 'SS' in x: return 0
    if x in bots['gp'].values or 'GP' in x: return 1
    else: return np.nan

data['class'] = data['author'].apply(classifier)
data['class'].value_counts(normalize=True)

0.0    0.740614
1.0    0.259386
Name: class, dtype: float64

<div class='alert alert-warning'><b>Note unbalanced classes</b> </div>

In [4]:
df = data.dropna(subset=['class'])
df = df[['title','selftext', 'class', 'score', 'num_crossposts']]

df['title_selftext'] = df[['title', 'selftext']].fillna('').apply(lambda x: ' '.join(x), axis=1)
df.drop(columns=['title','selftext'], inplace=True)

In [7]:
top_gp = df[(df['class']==1) & ((df['score']>10) | (df['num_crossposts']>0))]
top_ss = df[(df['class']==0) & ((df['score']>10) | (df['num_crossposts']>0))][:len(top_gp)]
top_df = pd.concat([top_gp, top_ss])
top_df.head(3)

Unnamed: 0,class,score,num_crossposts,title_selftext
32301,1.0,52,0.0,The Black Cat I was out for a run in the dark ...
32302,1.0,27,0.0,I just took 2mg of 2C-B today. How much should...
32304,1.0,91,0.0,What are the best places to meet women?


In [8]:
top_df['class'].value_counts()

0.0    1505
1.0    1505
Name: class, dtype: int64

### tf-idf vectorize

In [9]:
# define X, y
X = top_df['title_selftext']
y = top_df['class']

# tfidf transform
tfid = TfidfVectorizer(max_features=10000, ngram_range=(1,3))
X = tfid.fit_transform(X)
X = pd.DataFrame(X.toarray(), columns=tfid.get_feature_names())

---
# Modeling

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

#### MultinomialNB

In [11]:
mnb = MultinomialNB(alpha=1*np.e**-20)
mnb.fit(X_train, y_train)
print('train:', mnb.score(X_train, y_train), 'test:', mnb.score(X_test, y_test))

train: 0.9636685866194062 test: 0.6374501992031872


In [25]:
# with unbalanced classes (~8:2)
tn=21626
fp=62
fn=2264
tp=715

accuracy = (tp+tn)/(tp+tn+fp+fn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
specificity = tn/(tn+fp)

In [26]:
print('accuracy:', accuracy )
print('precision:', precision )
print('recall:', recall )
print('specificity:', specificity )

accuracy: 0.9057039769732842
precision: 0.9202059202059202
recall: 0.24001342732460557
specificity: 0.9971412762818148


In [129]:
top_df['mnb_pred'] = mnb.predict(X)
top_df[top_df['class']!=top_df['mnb_pred']]

Unnamed: 0,class,score,num_crossposts,title_selftext,mnb_pred
32301,1.0,52,0.0,The Black Cat I was out for a run in the dark ...,0.0
32302,1.0,27,0.0,I just took 2mg of 2C-B today. How much should...,0.0
32304,1.0,91,0.0,What are the best places to meet women?,0.0
32305,1.0,4,0.0,"""The dick is a child.""",0.0
32306,1.0,44,0.0,Is this a joke? I have to ask:\n\nIs it really...,0.0
...,...,...,...,...,...
27596,0.0,14,0.0,"GangstaClause, like SantaClause, but instead i...",1.0
27769,0.0,5,0.0,Feelings don't care if it's the truth,1.0
27875,0.0,9,0.0,Let me tell you to come in when I'm sleeping *...,1.0
30043,0.0,5846,0.0,Why do we say the N word?,1.0


### Pipeline

In [137]:
X = top_df['title_selftext']
y = top_df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [138]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('mnb', MultinomialNB())
])

In [139]:
pipe_params = {
    'tfidf__max_features':[2000, 5000, 10000],
    'tfidf__ngram_range':[(1,3),(1,4),(1,5)],
    'tfidf__token_pattern': [None, r'\b[^\d\W]+\b'],
    'mnb__alpha':[1*np.e**-20, .1, 1],
}

In [140]:
gs = GridSearchCV(pipe,
                  pipe_params,
                  cv=3)

In [141]:
t0 = time.time()
gs.fit(X_train, y_train)
print(time.time() - t0)

660.723639011383


In [143]:
gs.score(X_test, y_test)

0.8907086103453867

In [144]:
gs.best_params_

{'mnb__alpha': 0.1, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 3)}

---

In [155]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('mnb', MultinomialNB())])

pipe_params = {
    'tfidf__max_features':[8000, 10000, 12000],
    'tfidf__ngram_range':[(1,2), (1,3)],
    'mnb__alpha':[.0001, .1, .3],}

gs = GridSearchCV(pipe,
                  pipe_params,
                  cv=3)

In [156]:
t0 = time.time()
gs.fit(X_train, y_train)
print(time.time() - t0)



TypeError: first argument must be string or compiled pattern

In [None]:
d