# Posts that beat the median
We're going to process our data a bit more so that we can build some models that will try to answer the following question:

> Q: For a given post, does this post have more than the median score or not?

The this end we'll transform our text data into numerical data that we can work with.

In [1]:
import pandas as pd
import numpy as np
import scipy

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



In [4]:
df = pd.read_csv('../data/csv/all_processed.csv')

In [5]:
text_feats = ['title', 'selftext']
num_and_bool_feats = [col for col in df.columns if col not in text_feats + ['subreddit']]
df[num_and_bool_feats] = df[num_and_bool_feats].copy().astype(float)

df.head()

Unnamed: 0,title,score,over_18,subreddit,created_utc,is_self,higher_than_median,num_comments_log1p,title_len_log1p
0,Dave Bautista has achieved full Drax.,22715.0,0.0,funny,1527526000.0,0.0,1.0,5.802118,3.637586
1,Not even safe in Super Mario 64...,36365.0,0.0,gaming,1527525000.0,0.0,1.0,6.135565,3.555348
2,rapid-fire cigar box juggling (sort of looks l...,56601.0,0.0,gifs,1527522000.0,0.0,1.0,6.461468,4.174387
3,Very true,43332.0,0.0,pics,1527526000.0,0.0,1.0,7.870166,2.302585
4,Couple with Down's syndrome celebrate 22 years...,27582.0,0.0,aww,1527525000.0,0.0,1.0,6.431331,4.094345


The only remaining preprocessing is in tokenizing the titles. We can just do this in a pipeline without too much difficulty after we split everything into training and testing sets, so let's do that now. 

In [6]:
X = df.drop(['higher_than_median'],1)
y = df['higher_than_median']

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=41)

## Encoding Text as Numerical Features

Some explanation for NLP and feature extraction for text.


As a warm up we will encode the subreddit name as a collection of dummy variables. 

In [5]:
X_dum_train = pd.get_dummies(X_train['subreddit'])
X_dum_test = pd.get_dummies(X_test['subreddit'])

for col in X_dum_test:
    if col not in X_dum_train.columns:
        X_dum_test.drop(col, 1, inplace=True)

for col in X_dum_train:
    if col not in X_dum_test.columns:
        X_dum_test[col] = pd.Series([0 for _ in X_dum_test.index], index=X_dum_test.index)

X_train = X_train.join(X_dum_train)
X_train.drop('subreddit', 1, inplace=True)

X_test = X_test.join(X_dum_test)
X_test.drop('subreddit', 1, inplace=True)


I'm going to want to use a tf-idf tokenizer on the `title` feature, but to do this with a pipeline object from SKLearn, we can define a custom transformer that will only affect the `title`. This is not built in complete generality, so I make no promise that it will work without modification to a different data structure (i.e. if you're using different matrix encodings). 

In [6]:
from sklearn.base import TransformerMixin, BaseEstimator

class Tfidf_Col(TransformerMixin, BaseEstimator):
    def __init__(self, cols=[]):
        self.cols = cols
        self.vects = {col:TfidfVectorizer() for col in cols}
        
    def transform(self, X, *_):
        X = X.copy()
        transformed = []
        for col in self.cols:
            tmp = self.vects[col].transform(X.loc[:,col])
            transformed.append(tmp)
            X.drop(col, 1, inplace=True)
            
        X = scipy.sparse.csr_matrix(X.values)
        transformed.append(X)
        X = scipy.sparse.hstack(transformed)
        return X
        
    def fit(self, X, *_):
        for col in self.cols:
            self.vects[col].fit(X.loc[:,col])
        return self

## Logistic Regression

In [7]:
pl = [
    ('tfidf', Tfidf_Col(['title'])),
    ('ss', StandardScaler(with_mean=False)),
    ('logreg', LogisticRegression()),
]

np.random.seed(41)

my_params = {
    'logreg__C':np.random.uniform(0,5,1),
#     'logreg__penalty':['l1', 'l2'],
}

pipe = Pipeline(pl)
logreg = GridSearchCV(pipe, param_grid=my_params)

logreg.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', Tfidf_Col(cols=['title'])), ('ss', StandardScaler(copy=True, with_mean=False, with_std=True)), ('logreg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'logreg__C': array([ 1.25462])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [8]:
logreg.score(X_train, y_train)

0.99537944553346402

In [9]:
logreg.score(X_test, y_test)

0.64804703905921879

## K-Nearest Neighbors

In [10]:
pl = [
    ('tfidf', Tfidf_Col(['title'])),
    ('ss', StandardScaler(with_mean=False)),
    ('knn', KNeighborsClassifier()),
]

np.random.seed(41)

my_params = {
    'knn__n_neighbors':range(3,4)
}

pipe = Pipeline(pl)
knn = GridSearchCV(pipe, param_grid=my_params)

knn.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', Tfidf_Col(cols=['title'])), ('ss', StandardScaler(copy=True, with_mean=False, with_std=True)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'knn__n_neighbors': range(3, 4)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [11]:
slice_index = X_train.index[:2000]
slice_X = X_train.loc[[*slice_index], :]
slice_y = y_train[slice_index]
knn.score(slice_X, slice_y)

0.70799999999999996

In [12]:
slice_index = X_train.index[2000:4000]
slice_X = X_train.loc[[*slice_index], :]
slice_y = y_train[slice_index]
knn.score(slice_X, slice_y)

0.72950000000000004

In [13]:
slice_index = X_train.index[4000:]
slice_X = X_train.loc[[*slice_index], :]
slice_y = y_train[slice_index]
knn.score(slice_X, slice_y)

0.70973901973265441

In [14]:
knn.score(X_test, y_test)

0.51658966820663588

Too big to score on the training set!!!

## Decision Tree

In [15]:
pl = [
    ('tfidf', Tfidf_Col(['title'])),
    ('ss', StandardScaler(with_mean=False)),
    ('dt', DecisionTreeClassifier()),
]

np.random.seed(41)

my_params = {
    
}

pipe = Pipeline(pl)
dt = GridSearchCV(pipe, param_grid=my_params)

dt.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', Tfidf_Col(cols=['title'])), ('ss', StandardScaler(copy=True, with_mean=False, with_std=True)), ('dt', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_...     min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))]),
       fit_params=None, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [17]:
dt.score(X_test, y_test)

1.0

## Random Forest

In [20]:
pl = [
    ('tfidf', Tfidf_Col(['title'])),
    ('ss', StandardScaler(with_mean=False)),
    ('rf', RandomForestClassifier()),
]

np.random.seed(41)

my_params = {
    {'rf__n_estimators'}:range(6,14),
}

pipe = Pipeline(pl)
rf = GridSearchCV(pipe, param_grid=my_params)

rf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', Tfidf_Col(cols=['title'])), ('ss', StandardScaler(copy=True, with_mean=False, with_std=True)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [21]:
rf.score(X_test, y_test)

0.9878202435951281