In [22]:
# Standard data science imports:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline

In [3]:
import requests
import time
import pandas as pd

In [4]:
def get_data_from_pushshift(subreddit, epoch, numberofposts = 1000):    
    titles = []
    subreddits = []
    score = []
    ids = []
    df = pd.DataFrame(columns= ['title','subreddit','score','id'])
    headers = {'User-agent': 'JavierM'}
    url = 'https://api.pushshift.io/reddit/search/submission/'
    params = {
        'subreddit':subreddit,
        'after':epoch,
        'score':'>10',
        'size': numberofposts

    }

    res = requests.get(url, params=params, headers=headers)
    if res.status_code == 200:
        data = res.json()
        print(len(data['data']))
        for i in range(len(data['data'])):
            titles.append(data['data'][i]['title'])
            subreddits.append(data['data'][i]['subreddit'])
            score.append(data['data'][i]['score'])
            ids.append(data['data'][i]['id'])
    else:
        print(res.status_code)
    time.sleep(1) # input this to not hit the servers so hard
    print('DONE!')
    df['title'] = titles
    df['subreddit'] = subreddits
    df['score'] = score
    df['id'] = ids
    return df


In [5]:
subreddits = ['upliftingnews','news']
epochs = ['1451628000','1454306400','1456812000','1459486800','1462078800','1464757200','1467349200','1470027600','1472706000','1475298000','1480572000',' 1480572000','1483250400','1485928800','1488348000','1491022800','1493614800','1496293200','1498885200','1501563600','1504242000','1506834000','1509512400','1512108000','1514786400','1517464800','1519884000','1522558800','1525150800','1527829200','1530421200','1533099600','1535778000','1538370000',' 1541048400','1543644000']

In [6]:
#dfs = pd.DataFrame(columns= ['title','subreddit','score','id'])
#for subreddit in subreddits:
    #for epoch in epochs:
        #frame = get_data_from_pushshift(subreddit, epoch, numberofposts = 1000)
        #dfs = pd.concat([dfs,frame],ignore_index=True)

In [7]:
df = pd.read_csv('2016-2018news&upliftingnews')

In [8]:
df = df.drop_duplicates('id')

In [9]:
df = df.drop('Unnamed: 0', axis=1)

In [10]:
df['subreddit'].value_counts()

news             32012
UpliftingNews    18621
Name: subreddit, dtype: int64

In [11]:
df['subreddit_binary'] = df['subreddit'].map(lambda x: 1 if x == 'UpliftingNews' else 0)

In [12]:
df.head()

Unnamed: 0,title,subreddit,score,id,subreddit_binary
0,2015: The Best Year in History for the Average...,UpliftingNews,21,3z0aou,1
1,Deaf man saves deer from frozen river,UpliftingNews,25,3z0u23,1
2,Remember that deaf guy who saved and pet the d...,UpliftingNews,2162,3z16kk,1
3,"Arizona Girl, 12, Collects 1,000 Coats for the...",UpliftingNews,3272,3z3dzy,1
4,Russia unveils its body armour for DOGS follow...,UpliftingNews,42,3z3zz0,1


In [13]:
X = df['title']
y = df['subreddit_binary']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [15]:
vectorizer = CountVectorizer(stop_words='english')

X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

X_train_features.shape

(37974, 27915)

In [16]:
pipe = Pipeline([
    ('vect', TfidfVectorizer()),
    ('model', LogisticRegression())
])

params = {
    'vect__min_df':[2,4],
    'vect__stop_words':[None ,'english'],
    'model__penalty':['l1','l2'],
    'model__C':[0.1, 1, 10]
}

gs = GridSearchCV(pipe, params, cv=5, verbose=2, n_jobs=-1)

gs.fit(X_train, y_train)

print('Best Params: ',gs.best_params_)

print('Best Estimator Score: ', gs.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   45.7s finished


Best Params:  {'model__C': 1, 'model__penalty': 'l2', 'vect__min_df': 4, 'vect__stop_words': None}
Best Estimator Score:  0.8572557074018485


In [17]:
pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('model', LogisticRegression())
])

params = {
    'vect__min_df':[2,4],
    'vect__stop_words':[None ,'english'],
    'model__penalty':['l1','l2'],
    'model__C':[0.1, 1, 10]  
}

gs = GridSearchCV(pipe, params, cv=5, verbose=2, n_jobs=-1)

gs.fit(X_train, y_train)

print('Best Params: ',gs.best_params_)

print('Best Estimator Score: ', gs.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  1.0min finished


Best Params:  {'model__C': 1, 'model__penalty': 'l2', 'vect__min_df': 2, 'vect__stop_words': None}
Best Estimator Score:  0.8585986254838455


In [None]:
pipe = Pipeline([
    ('vect', TfidfVectorizer()),
    ('model', RandomForestClassifier() )
])

params = {
    'vect__min_df':[2,4],
    'vect__stop_words':[None ,'english'],
    'model__n_estimators':[10, 75, 150],
    'model__max_depth':[5, 10,15],
    'model__min_samples_split':[2,3,4]
}

gs = GridSearchCV(pipe, params, cv=5, verbose=2, n_jobs=-1)

gs.fit(X_train, y_train)

print('Best Params: ',gs.best_params_)

print('Best Estimator Score: ', gs.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   45.5s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  4.3min


In [20]:
pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('model', RandomForestClassifier() )
])

params = {
    'vect__min_df':[2,4],
    'vect__stop_words':[None ,'english'],
    'model__n_estimators':[10, 20, 30],
    'model__max_depth':[1,2,3,4,5],
    'model__min_samples_split':[2,3,4]
}

gs = GridSearchCV(pipe, params, cv=5, verbose=2, n_jobs=-1)

gs.fit(X_train, y_train)

print('Best Params: ',gs.best_params_)

print('Best Estimator Score: ', gs.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 180 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   35.5s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  3.1min finished


Best Params:  {'model__max_depth': 5, 'model__min_samples_split': 2, 'model__n_estimators': 10, 'vect__min_df': 2, 'vect__stop_words': 'english'}
Best Estimator Score:  0.6337783395212891


In [27]:
pipe = Pipeline([
    ('vect', TfidfVectorizer()),
    ('model', AdaBoostClassifier())
])

params = {
    'vect__min_df':[2,4],
    'vect__stop_words':[None ,'english'],
    'model__base_estimator':[DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=2), DecisionTreeClassifier(max_depth=3),DecisionTreeClassifier(max_depth=4)],
    'model__n_estimators':[50, 100, 200]  
}

gs = GridSearchCV(pipe, params, cv=5, verbose=2, n_jobs=-1)

gs.fit(X_train, y_train)

print('Best Params: ',gs.best_params_)

print('Best Estimator Score: ', gs.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 10.2min finished


Best Params:  {'model__base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), 'model__n_estimators': 200, 'vect__min_df': 2, 'vect__stop_words': None}
Best Estimator Score:  0.8123074492455961


In [23]:
pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('model', AdaBoostClassifier())
])

params = {
    'vect__min_df':[2,4],
    'vect__stop_words':[None ,'english'],
    'model__base_estimator':[DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=2), DecisionTreeClassifier(max_depth=3),DecisionTreeClassifier(max_depth=4)],
    'model__n_estimators':[50, 100, 200]  
}

gs = GridSearchCV(pipe, params, cv=5, verbose=2, n_jobs=-1)

gs.fit(X_train, y_train)

print('Best Params: ',gs.best_params_)

print('Best Estimator Score: ', gs.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  8.4min finished


Best Params:  {'model__base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), 'model__n_estimators': 200, 'vect__min_df': 4, 'vect__stop_words': None}
Best Estimator Score:  0.8236827553519235


In [25]:
vocab = vectorizer.get_feature_names()

In [26]:
feats = pd.DataFrame(lr.coef_, columns=vocab)
feats.loc[0,feats.iloc[0,:].abs() >2].plot(kind='barh', figsize=(10,20));

NameError: name 'lr' is not defined