In [1]:
# libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier

from sklearn.metrics import confusion_matrix as cm

In [2]:
# data: one CSV of scraped posts per source
ss_scrape = pd.read_csv('data/ss_scrape.csv', low_memory=False)
gpt2_scrape = pd.read_csv('data/gpt2_scrape.csv', low_memory=False)

# Stack both sources (ss_scrape rows first, then gpt2_scrape rows).
# ignore_index=True gives the combined frame a fresh 0..N-1 index; without it
# every row label appears twice, so label-based selection or alignment against
# the RangeIndex-ed feature frames built later would silently pick wrong rows.
all_scrape = pd.concat([ss_scrape, gpt2_scrape], sort=False, ignore_index=True)

In [3]:
# (rows, columns) of each source frame, in load order
tuple(frame.shape for frame in (ss_scrape, gpt2_scrape))

((1000, 2), (1000, 2))

#### List top words in each subreddit by total TF-IDF weight

In [7]:
# Vectorise every title (both sources together) into TF-IDF weights over
# 1- to 4-grams, capped at 10,000 features. The token pattern keeps runs of
# word characters that are not digits.
tfid = TfidfVectorizer(max_features=10000, ngram_range=(1,4), token_pattern=r'\b[^\d\W]+\b')
tfidf_matrix = tfid.fit_transform(all_scrape['title'])

# get_feature_names() was removed in scikit-learn 1.2; only fall back to it on
# releases that predate get_feature_names_out().
if hasattr(tfid, 'get_feature_names_out'):
    vocab = tfid.get_feature_names_out()
else:
    vocab = tfid.get_feature_names()

# dense frame: one row per title, one column per n-gram
df_cv = pd.DataFrame(tfidf_matrix.toarray(), columns=vocab)

# Rows follow the concat order (ss_scrape first, then gpt2_scrape), so split at
# the actual length of the first frame instead of a hard-coded 999/1000.
n_ss = len(ss_scrape)
ss_cv = df_cv.iloc[:n_ss]  # vectorized subsum
gp_cv = df_cv.iloc[n_ss:]  # vectorized gpt2

In [8]:
# Shortlist the n highest-scoring vocabulary terms for each source, ranked by
# the column sums of that source's TF-IDF rows.
n = 1500
ss_ranked = ss_cv.sum().sort_values(ascending=False)
gp_ranked = gp_cv.sum().sort_values(ascending=False)
ss_freq = ss_ranked.index[:n].tolist()
gp_freq = gp_ranked.index[:n].tolist()

# union of both shortlists (terms present in both collapse to one entry)
top_all = set(ss_freq) | set(gp_freq)

#### TF-IDF Transform Combined data

In [9]:
# define X (raw titles) and y (the `sr` column — presumably the source
# subreddit label; confirm against the scrape schema)
X = all_scrape['title']
y = all_scrape['sr']

# tfidf transform
# NOTE(review): the vectoriser is fitted on every row here, i.e. before the
# train/test split made further down, so held-out titles influence the
# vocabulary and IDF weights. Scores below are therefore optimistic.
tfid = TfidfVectorizer(max_features=10000, ngram_range=(1,4), token_pattern=r'\b[^\d\W]+\b')
tfidf_matrix = tfid.fit_transform(X)

# get_feature_names() was removed in scikit-learn 1.2; only fall back to it on
# releases that predate get_feature_names_out().
if hasattr(tfid, 'get_feature_names_out'):
    vocab = tfid.get_feature_names_out()
else:
    vocab = tfid.get_feature_names()
Xf = pd.DataFrame(tfidf_matrix.toarray(), columns=vocab)

# Keep only the columns shortlisted in top_all. Iterate the frame's own
# columns rather than the set: set iteration order for strings changes between
# kernel sessions, which made the column order (and anything sensitive to it,
# e.g. tree split tie-breaking) non-reproducible.
Xf = Xf[[col for col in Xf.columns if col in top_all]]

---
# Model

In [10]:
# seeded hold-out split of the filtered TF-IDF features (default test fraction)
X_train, X_test, y_train, y_test = train_test_split(
    Xf,
    y,
    random_state=42,
)

#### MultinomialNB

In [11]:
# Multinomial naive Bayes with near-zero additive smoothing (alpha = e**-20).
mnb = MultinomialNB(alpha=np.e**-20)

# cross_val_score clones and fits the estimator on each fold itself, so the
# earlier fit on the full data was wasted work and has been dropped.
print('cval:', cross_val_score(mnb, Xf, y, cv=5).mean())

# fit on the training split, then report accuracy on both splits
mnb.fit(X_train, y_train)
print('train:', mnb.score(X_train, y_train), 'test:', mnb.score(X_test, y_test))

cval: 0.7220000000000001
train: 0.9053333333333333 test: 0.726


In [12]:
# Drop the 100 features with the highest naive-Bayes weight.
# MultinomialNB.coef_ was deprecated in scikit-learn 0.24 and removed in 1.1.
# For a two-class target it was defined as feature_log_prob_[1] (the per-feature
# log probability under the second class), which exists on every release.
# NOTE(review): this ranks by one class's log probability only, not by how
# differently the two classes use a word — confirm that is the intent.
mnb_weights = mnb.feature_log_prob_[1]
mnb_coefs = pd.DataFrame({'word': Xf.columns, 'mnb coef': mnb_weights}).sort_values('mnb coef')[-100:]

# set membership instead of scanning a NumPy array once per column
strong_words = set(mnb_coefs['word'])
Xfr = Xf[[c for c in Xf.columns if c not in strong_words]]

In [13]:
# Same naive-Bayes setup, evaluated on the reduced feature set Xfr.
mnb = MultinomialNB(alpha=np.e**-20)

# cross_val_score clones and fits the estimator on each fold itself, so the
# earlier fit on the full data was wasted work and has been dropped.
print('cval:', cross_val_score(mnb, Xfr, y, cv=5).mean())

# same seed as the other splits, so the same rows land in train and test
Xr_train, Xr_test, y_train, y_test = train_test_split(Xfr, y, random_state=42)
mnb.fit(Xr_train, y_train)
print('train:', mnb.score(Xr_train, y_train), 'test:', mnb.score(Xr_test, y_test))

cval: 0.7190000000000001
train: 0.908 test: 0.722


#### Logistic Regression (Ridge)

In [17]:
# L2-penalised logistic regression; C is chosen by 3-fold internal CV from
# 50 candidates spaced logarithmically between 10**0.1 and 10**1.5.
lr = LogisticRegressionCV(Cs=np.logspace(.1, 1.5, 50), penalty='l2', solver='lbfgs', max_iter=5000, cv=3)

# cross_val_score clones and fits the estimator on each fold itself, so the
# earlier (expensive) fit on the full data was wasted work and has been dropped.
print('cval:', cross_val_score(lr, Xf, y, cv=5).mean())

# fit on the training split, then report accuracy on both splits
Xf_train, Xf_test, y_train, y_test = train_test_split(Xf, y, random_state=42)
lr.fit(Xf_train, y_train)
print('train:', lr.score(Xf_train, y_train), 'test:', lr.score(Xf_test, y_test))

cval: 0.7185
train: 0.96 test: 0.732


In [18]:
# regularisation strength C selected by LogisticRegressionCV's internal CV
# for the most recent fit of `lr`
lr.C_

array([10.33441064])

In [289]:
# Rank every feature by its ridge coefficient and keep the 100 largest.
# Ascending sort means these are the most positive coefficients, not the
# largest in absolute value.
ranked_coefs = pd.DataFrame({'word': Xf.columns, 'lr coef': lr.coef_[0]}).sort_values('lr coef')
lr_coefs = ranked_coefs.iloc[-100:]

# rebuild the feature frame without those 100 words
strong_words = set(lr_coefs['word'])
Xfr = Xf[[col for col in Xf.columns if col not in strong_words]]

In [290]:
# Ridge logistic regression on the reduced feature set Xfr; C is chosen by
# 3-fold internal CV from 50 candidates between 10**0.1 and 10**1.
lr = LogisticRegressionCV(Cs=np.logspace(.1, 1, 50), penalty='l2', solver='lbfgs', max_iter=5000, cv=3)

# cross_val_score clones and fits the estimator on each fold itself, so the
# earlier (expensive) fit on the full data was wasted work and has been dropped.
print('cval:', cross_val_score(lr, Xfr, y, cv=5).mean())

# fit on the training split, then report accuracy on both splits
Xfr_train, Xfr_test, y_train, y_test = train_test_split(Xfr, y, random_state=42)
lr.fit(Xfr_train, y_train)
print('train:', lr.score(Xfr_train, y_train), 'test:', lr.score(Xfr_test, y_test))

cval: 0.6915
train: 0.9493333333333334 test: 0.71


### Gradient Boost

In [291]:
# rebuild the seeded hold-out split on the full filtered feature set Xf
X_train, X_test, y_train, y_test = train_test_split(
    Xf,
    y,
    random_state=42,
)

In [292]:
# Gradient-boosted trees of depth 5. random_state is fixed because tree
# building is randomised; without a seed the score changes from one run to
# the next, which breaks Restart & Run All reproducibility.
gradboost = GradientBoostingClassifier(max_depth=5, random_state=42)
gradboost.fit(X_train, y_train)

# accuracy on the held-out split
gradboost.score(X_test, y_test)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=5,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [294]:
# mean accuracy of the fitted gradient-boosting model on the held-out split
gradboost.score(X_test, y_test)

0.68

### Voting Classifier

In [None]:
# seeded hold-out split used by the models in the voting-classifier section
X_train, X_test, y_train, y_test = train_test_split(
    Xf,
    y,
    random_state=42,
)

In [None]:
# Ridge logistic regression on the full filtered feature set Xf; C is chosen
# by 3-fold internal CV from 50 candidates between 10**0.1 and 10**1.
lr = LogisticRegressionCV(Cs=np.logspace(.1, 1, 50), penalty='l2', solver='lbfgs', max_iter=5000, cv=3)

# cross_val_score clones and fits the estimator on each fold itself, so the
# earlier (expensive) fit on the full data was wasted work and has been dropped.
print('cval:', cross_val_score(lr, Xf, y, cv=5).mean())

# fit on the training split, then report accuracy on both splits
Xf_train, Xf_test, y_train, y_test = train_test_split(Xf, y, random_state=42)
lr.fit(Xf_train, y_train)
print('train:', lr.score(Xf_train, y_train), 'test:', lr.score(Xf_test, y_test))

In [304]:
# Multinomial naive Bayes with near-zero additive smoothing (alpha = e**-20).
mnb = MultinomialNB(alpha=np.e**-20)

# cross_val_score clones and fits the estimator on each fold itself, so the
# earlier fit on the full data was wasted work and has been dropped.
print('cval:', cross_val_score(mnb, Xf, y, cv=5).mean())

# fit on the training split, then report accuracy on both splits
mnb.fit(X_train, y_train)
print('train:', mnb.score(X_train, y_train), 'test:', mnb.score(X_test, y_test))

(0.532, 0.496)

In [307]:
# Gradient-boosted trees of depth 4. random_state is fixed because tree
# building is randomised; without a seed the score changes from one run to
# the next.
grb = GradientBoostingClassifier(max_depth=4, random_state=42)
grb.fit(X_train, y_train)

# accuracy on the held-out split
grb.score(X_test, y_test)

0.66

In [312]:
# Builds an unfitted estimator only to display its parameter repr; the object
# is not assigned to any name.
LogisticRegressionCV(
    Cs=np.logspace(.1, 1, 50),
    penalty='l2',
    solver='lbfgs',
    max_iter=5000,
    cv=3,
)

LogisticRegressionCV(Cs=array([ 1.25892541,  1.31331029,  1.37004456,  1.42922973,  1.49097166,
        1.5553808 ,  1.62257239,  1.69266662,  1.76578887,  1.84206997,
        1.92164637,  2.00466042,  2.09126064,  2.18160194,  2.27584593,
        2.3741612 ,  2.47672365,  2.58371673,  2.69533186,  2.8117687 ,
        2.93323554,  3.05994969,  3.19213781,  3.33003639,  3.47389211,
        3.62396232,  3.78051548,  3.94383164,  4.11420298,  4.2...
        5.5316812 ,  5.77064675,  6.01993548,  6.27999335,  6.55128557,
        6.83429746,  7.12953531,  7.43752728,  7.75882432,  8.09400122,
        8.44365757,  8.80841888,  9.18893768,  9.58589468, 10.        ]),
                     class_weight=None, cv=3, dual=False, fit_intercept=True,
                     intercept_scaling=1.0, l1_ratios=None, max_iter=5000,
                     multi_class='warn', n_jobs=None, penalty='l2',
                     random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0

In [313]:
# regularisation strength C selected by LogisticRegressionCV's internal CV
# for the most recent fit of `lr`
lr.C_

array([9.18893768])

In [1]:
# Weighted ensemble of the three model families tried above. VotingClassifier
# defaults to hard (majority) voting, so `weights` scales each model's vote.
# NOTE(review): this cell needs the imports cell to have run first; execute
# the notebook top to bottom on a fresh kernel.
vote = VotingClassifier([
    ('lr', LogisticRegressionCV(Cs=np.logspace(.1, 1, 50), penalty='l2', solver='lbfgs', max_iter=5000, cv=3)),
    # NOTE(review): default smoothing here, unlike the alpha=e**-20 used in the
    # standalone naive-Bayes cells — confirm which is intended.
    ('mnb', MultinomialNB()),
    # seeded so repeated grid searches pick the same winner
    ('grb', GradientBoostingClassifier(random_state=42)),
])

# search over boosting tree depth and the relative vote weights (lr, mnb, grb)
vote_params = {
    'grb__max_depth': [4, 3],
    'weights': [[.1,.8,.1],[.50,.25,.25],[.25,.50,.25],[.25,.25,.50],]
}
gs = GridSearchCV(vote, param_grid=vote_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_) # cross val score
gs.best_params_

NameError: name 'VotingClassifier' is not defined