# Import 

In [109]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', -1)

from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer

# Read in Data

In [110]:
cols = ['author', 'author_flair_text', 'selftext', 'subreddit', 'title', 'id']

dc = pd.read_csv('./washingtondc_3.0yrs.csv', usecols=cols, )
chi = pd.read_csv('./chicago_3.0yrs.csv', usecols=cols)

In [111]:
#make one df with both dc and chi data  
df = dc 
df = df.append(chi)
df = df.reindex()

#create a binary target
df['target'] = np.where(df['subreddit'] == 'washingtondc', 1, 0)
df.head()

#fill na's within the post text & author's flair
df.selftext.fillna('picture', inplace=True)
df.author_flair_text.fillna('flairless', inplace=True)

#create 1 column of text to include: author, title, selftext, author_flair
df['text'] = df.author + ' ' + df.title + ' ' + df.selftext + ' ' + df.author_flair_text

# Test-Train Split

In [113]:
X = df.text
y = df.target

X_train, X_test, y_train, y_test = train_test_split(X, y)

# Vectorize

In [134]:
tfi = TfidfVectorizer(analyzer='word', ngram_range=(1,5),
                     stop_words='english', min_df=.0008, norm='l1')
X_train_vect = tfi.fit_transform(X_train)
X_test_vect = tfi.transform(X_test)
list(X_train_vect)

[<1x4747 sparse matrix of type '<class 'numpy.float64'>'
 	with 85 stored elements in Compressed Sparse Row format>,
 <1x4747 sparse matrix of type '<class 'numpy.float64'>'
 	with 32 stored elements in Compressed Sparse Row format>,
 <1x4747 sparse matrix of type '<class 'numpy.float64'>'
 	with 5 stored elements in Compressed Sparse Row format>,
 <1x4747 sparse matrix of type '<class 'numpy.float64'>'
 	with 8 stored elements in Compressed Sparse Row format>,
 <1x4747 sparse matrix of type '<class 'numpy.float64'>'
 	with 61 stored elements in Compressed Sparse Row format>,
 <1x4747 sparse matrix of type '<class 'numpy.float64'>'
 	with 20 stored elements in Compressed Sparse Row format>,
 <1x4747 sparse matrix of type '<class 'numpy.float64'>'
 	with 7 stored elements in Compressed Sparse Row format>,
 <1x4747 sparse matrix of type '<class 'numpy.float64'>'
 	with 53 stored elements in Compressed Sparse Row format>,
 <1x4747 sparse matrix of type '<class 'numpy.float64'>'
 	with 15 

In [115]:
tfi.get_params, #tfi.get_feature_names()

(<bound method BaseEstimator.get_params of TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=0.0008,
         ngram_range=(1, 5), norm='l1', preprocessor=None, smooth_idf=True,
         stop_words='english', strip_accents=None, sublinear_tf=False,
         token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
         vocabulary=None)>,)

# Cross Val Score

In [116]:
#cross val score all potential classification models
lr = LogisticRegression()
#knn = KNeighborsClassifier() #too much data to use, crashes.
nb = MultinomialNB()
dt = DecisionTreeClassifier()
et = ExtraTreeClassifier()
bag = BaggingClassifier()
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
ad = AdaBoostClassifier()
svm2 = svm.SVC()


print('lr', cross_val_score(lr, X_train_vect, y_train).mean()) #.866
#print(cross_val_score(knn, X_train, y_train).mean())
print('nb', cross_val_score(nb, X_train_vect, y_train).mean()) #.880
print('dt', cross_val_score(dt, X_train_vect, y_train).mean()) #.837
print('et', cross_val_score(et, X_train_vect, y_train).mean()) #.817
print('bag', cross_val_score(bag, X_train_vect, y_train).mean()) #.852
print('rf', cross_val_score(rf, X_train_vect, y_train).mean()) #.857
print('gb', cross_val_score(gb, X_train_vect, y_train).mean()) #.823
print('ad', cross_val_score(ad, X_train_vect, y_train).mean()) #.813
print('svm', cross_val_score(svm2, X_train_vect, y_train).mean()) #.510

lr 0.8663564087647857
nb 0.8802986232305603
dt 0.8370952103936397
et 0.8173356602675974
bag 0.8517742873763816
rf 0.8574171029668411
gb 0.822978475858057
ad 0.8132247430676749
svm 0.5100058173356603


# Tuning

In [118]:
#Tuning 3 highest models
lr = LogisticRegression(penalty='l1')
nb = MultinomialNB()
rf = RandomForestClassifier(n_estimators=50, verbose=1)

print('lr', cross_val_score(lr, X_train_vect, y_train).mean())
print('nb', cross_val_score(nb, X_train_vect, y_train).mean())
print('rf', cross_val_score(rf, X_train_vect, y_train).mean())

lr 0.8733372115571069
nb 0.8802986232305603


[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:   22.8s finished
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:   23.3s finished
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:   23.6s finished


rf 0.8719798332363777


[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    1.0s finished


# Fit Models to Train & Score Test

In [120]:
#Fit
#lr_1
lr_1 = LogisticRegression(penalty='l1')
lr_1.fit(X_train_vect, y_train);
lr_1_pred = lr_1.predict(X_test_vect)
print('lr', accuracy_score(y_test, lr_1_pred))

#nb_1
nb_1 = MultinomialNB()
nb_1.fit(X_train_vect, y_train);
nb_1_pred = nb_1.predict(X_test_vect)
print('nb', accuracy_score(y_test, nb_1_pred))

#rf_1
rf_1 = RandomForestClassifier(n_estimators=50)
rf_1.fit(X_train_vect, y_train);
rf_1_pred = rf_1.predict(X_test_vect)
print('rf', accuracy_score(y_test, rf_1_pred))

lr 0.8852821407795229
nb 0.8845840605002908
rf 0.8796393251890634


# Use Voting Classifier to Combine Highest Models

In [121]:
model = [('lr', LogisticRegression(penalty='l1')), ('nb', MultinomialNB()), 
         ('rf', RandomForestClassifier(n_estimators=50))]

#('dt', DecisionTreeClassifier()), 
#('bag', BaggingClassifier()), ('ad', AdaBoostClassifier())

vote = VotingClassifier(model)
vote.fit(X_train_vect, y_train);
y_pred = vote.predict(X_test_vect)
accuracy_score(y_test, y_pred)

  if diff:


0.8934264107038976

# Additional Test Sample

In [135]:
#10 Chicago posts from 9/9/2018
chi_sample = pd.read_csv('./chicago_0.1_sample.csv', usecols=cols)

#clean NaNs
chi_sample.selftext.fillna('picture', inplace=True)
chi_sample.author_flair_text.fillna('flairless', inplace=True)

#create extra column with key text
chi_sample['text'] = chi_sample.author + ' ' + chi_sample.title + ' ' + chi_sample.selftext + ' ' + chi_sample.author_flair_text

#Vectorize the Chi Sample
X_chi_sample = chi_sample.text
X_chi_sample_vect = tfi.transform(X_chi_sample)
y_chi_sample_pred = vote.predict(X_chi_sample_vect)

#print Chi predictions, should all be 0; accuracy = 80%
y_chi_sample_pred

  if diff:


array([1, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [133]:
#samples of posts that were misclassified
X_chi_sample[0], X_chi_sample[9]

('I_Never_Get_Gifts I need help. I\'m really struggling right now. I have a job and everything I\'m not an unemployed mooch or a druggy or anything of that sort. I\'m a young adult. 21 years old. I really need help with making it through the week. I get paid on Friday (not the 7th, the Friday after.) but until then I"m just in a really bad spot. I had some medical issues and they kinda killed my money. (It was dental stuff, I don\'t have cancer or anything like that) I haven\'t been broke in a long time because the job I have is actually..really good for my age. I make $16 an hour which is probably more than 90% of 21 year olds can say they make I was just really unlucky...If someone can help me with money through paypal or something that would be amazing. I know this subreddit isn\'t like /r/whinykidsbegformoney but I\'m really hoping someone will help me. If not, I totally underst flairless',
 'Rugged_Turtle Odd question, anyone know what bar in the Old Town area has the best deals o

In [136]:
#10 DC posts from 9/9/2018
dc_sample = pd.read_csv('./washingtondc_0.1_sample.csv', usecols=cols)

#clean NaNs
dc_sample.selftext.fillna('picture', inplace=True)
dc_sample.author_flair_text.fillna('flairless', inplace=True)

#create extra column with key text
dc_sample['text'] = dc_sample.author + ' ' + dc_sample.title + ' ' + dc_sample.selftext + ' ' + dc_sample.author_flair_text

#vectorize DC sample
X_sample_dc = dc_sample.text
X_sample_dc_vect = tfi.transform(X_sample_dc)
y_sample_dc_pred = vote.predict(X_sample_dc_vect)

#print DC predictions, should all be 1; accuracy = 100%
y_sample_dc_pred

  if diff:


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])