# Learners or practitioners?

> Q: For the data in the given category, did it come from the expert variant subreddit or the learn variant?

In [1]:
import pandas as pd
import numpy as np
import scipy

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



In [2]:
# Load the combined, already-processed subreddit posts.
df = pd.read_csv('../data/csv/combined_subreddits_processed.csv')

In [3]:
# Free-text columns; these are vectorized later instead of being cast.
text_feats = ['title', 'selftext']

# Everything that is neither free text nor the label column.
num_and_bool_feats = [
    col for col in df.columns
    if col != 'subreddit' and col not in text_feats
]

# Give all of those columns one common float dtype.
df[num_and_bool_feats] = df[num_and_bool_feats].astype(float)

df.head()

Unnamed: 0,title,num_comments,score,subreddit,created_utc,is_self,selftext,title_len,title_len_log1p
0,How to find a math REU for this summer,44.0,66.0,math,1483229000.0,1.0,"If you, or someone you love, is an undergradua...",38.0,3.663562
1,"Found this weird typo in a book I was reading,...",0.0,0.0,math,1483230000.0,0.0,[deleted],70.0,4.26268
2,2017 is a prime number.,4.0,0.0,math,1483233000.0,1.0,[removed],23.0,3.178054
3,Injective functions,2.0,0.0,math,1483248000.0,1.0,[removed],19.0,2.995732
4,Want to go into a major requiring Calculus but...,1.0,1.0,math,1483251000.0,1.0,[removed],202.0,5.313206


The only remaining preprocessing is tokenizing the text columns (`title` and `selftext`). We can do this in a pipeline without too much difficulty after we split everything into training and testing sets, so let's do the split now.

In [5]:
# Features are every column except the label. `columns=` is used because
# pandas >= 2.0 no longer accepts `axis` as a positional argument to drop().
X = df.drop(columns=['subreddit'])
y = df['subreddit'].str.lower()

# For each topic: keep only that topic's subreddits, then turn the label into
# a boolean target that is True for the "learn" variant of the subreddit.
y_math = y[y.str.contains('math')]
y_math = y_math.str.contains('learn')

y_python = y[y.str.contains('python')]
y_python = y_python.str.contains('learn')

# The learner counterpart of `datascience` has an unrelated name, so it is
# matched explicitly.
y_datascience = y[y.str.contains('datascience') | y.str.contains('learnmachinelearning')]
y_datascience = y_datascience.str.contains('learn')


# Select the feature rows that belong to each topic's target.
X_math = X.loc[y_math.index, :]
X_python = X.loc[y_python.index, :]
X_datascience = X.loc[y_datascience.index, :]


In [7]:
# Split every topic independently; the fixed random_state keeps the
# shuffles repeatable between runs.
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_math,
    y_math,
    random_state=41,
)
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    X_datascience,
    y_datascience,
    random_state=41,
)
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_python,
    y_python,
    random_state=41,
)

## Encoding Text as Numerical Features

Unlike when we were predicting whether or not a particular post beats the median score, where we kept track of the subreddit for each post as a feature, here the subreddit is the target. The net effect in processing the data here just means that we have no more categorical features to encode as dummy variables.


I'm going to want to use a tf-idf tokenizer on the text features (`title` and `selftext`), but to do this with a pipeline object from scikit-learn, we can define a custom transformer that will only affect the columns it is given. This is not built in complete generality, so I make no promise that it will work without modification on a different data structure (e.g. if you're using different matrix encodings).

In [8]:
from sklearn.base import TransformerMixin, BaseEstimator

class Tfidf_Col(TransformerMixin, BaseEstimator):
    """Tf-idf encode selected text columns of a DataFrame and keep the rest.

    Every column named in ``cols`` gets its own ``TfidfVectorizer``. On
    ``transform`` the text columns are replaced by their tf-idf matrices and
    all remaining columns are appended as-is, producing one sparse matrix
    laid out as ``[tfidf(cols[0]), tfidf(cols[1]), ..., remaining columns]``.

    Parameters
    ----------
    cols : list of str, str or None, default=None
        Names of the text columns to vectorize. ``None`` means no column is
        vectorized; a bare string is treated as a single column name.

    Attributes
    ----------
    vects : dict
        Maps each column name to the ``TfidfVectorizer`` used for it.
    """

    def __init__(self, cols=None):
        # `None` replaces the former mutable `[]` default. The argument is
        # stored exactly as given so get_params()/clone() round-trip it.
        self.cols = cols
        self.vects = {col: TfidfVectorizer() for col in self._columns()}

    def _columns(self):
        """Return the configured text columns as a list of names."""
        if self.cols is None:
            return []
        if isinstance(self.cols, str):
            return [self.cols]
        return list(self.cols)

    def transform(self, X, *_):
        """Return the tf-idf blocks followed by the untouched columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every configured text column; all other columns
            must be convertible to a numeric sparse matrix.

        Returns
        -------
        scipy sparse matrix
            Horizontal stack of one tf-idf block per text column (in the
            order of ``cols``) and the remaining columns of ``X``.
        """
        # Imported here because a bare `import scipy` does not guarantee that
        # the `scipy.sparse` submodule has been loaded.
        from scipy import sparse

        cols = self._columns()
        blocks = [self.vects[col].transform(X[col]) for col in cols]

        # Non-inplace drop with the `columns=` keyword: leaves the caller's
        # frame untouched without a defensive copy, and avoids the positional
        # `axis` argument that pandas >= 2.0 rejects.
        remaining = X.drop(columns=cols)
        blocks.append(sparse.csr_matrix(remaining.values))
        return sparse.hstack(blocks)

    def fit(self, X, *_):
        """Fit one tf-idf vocabulary per configured text column.

        Parameters
        ----------
        X : pandas.DataFrame
            Training frame containing every configured text column.

        Returns
        -------
        self
        """
        for col in self._columns():
            # Columns configured after construction (e.g. via set_params)
            # have no vectorizer yet; create it on demand.
            if col not in self.vects:
                self.vects[col] = TfidfVectorizer()
            self.vects[col].fit(X[col])
        return self

## Base Model

In classifying models, we can compare them to a baseline model. For this we will use the most basic classification model: assume everything is the most common class. For our three classes this means the following:

|class| most common type in train data| baseline accuracy on test data|
|-|-|-|
|`datascience`| `datascience`| 67.19%|
|`math`| `math`| 53.58% | 
|`python`| `learnpython`| 61.80%| 

In [29]:
# The targets are booleans (True = "learn" subreddit), so each mean is the
# share of learner posts in that topic's training split.
y_train_d.mean(), y_train_m.mean(), y_train_p.mean()

(0.33132691278758697, 0.4667558040598031, 0.61141804788213627)

In [30]:
# Same share of learner posts, measured on the test splits.
y_test_d.mean(), y_test_m.mean(), y_test_p.mean()

(0.32811873245086243, 0.46420323325635104, 0.61802278743238581)

## Logistic Regression

In [11]:
# datascience: tf-idf on both text columns -> scaling -> logistic regression.
# with_mean=False keeps the scaler usable on the sparse matrix from Tfidf_Col.
pl = [
    ('tfidf', Tfidf_Col(cols=['title', 'selftext'])),
    ('ss', StandardScaler(with_mean=False)),
    ('logreg', LogisticRegression()),
]

np.random.seed(41)

# NOTE(review): size=1 draws a single C, so the grid holds exactly one candidate.
my_params = dict(logreg__C=np.random.uniform(low=0, high=5, size=1))

pipe = Pipeline(steps=pl)
logreg_d = GridSearchCV(estimator=pipe, param_grid=my_params)
logreg_d.fit(X_train_d, y_train_d)

# (train score, test score)
logreg_d.score(X_train_d, y_train_d), logreg_d.score(X_test_d, y_test_d)

(0.98742643124665597, 0.7673485760128359)

In [12]:
# math: tf-idf on both text columns -> scaling -> logistic regression.
# with_mean=False keeps the scaler usable on the sparse matrix from Tfidf_Col.
pl = [
    ('tfidf', Tfidf_Col(cols=['title', 'selftext'])),
    ('ss', StandardScaler(with_mean=False)),
    ('logreg', LogisticRegression()),
]

np.random.seed(41)

# NOTE(review): size=1 draws a single C, so the grid holds exactly one candidate.
my_params = dict(logreg__C=np.random.uniform(low=0, high=5, size=1))

pipe = Pipeline(steps=pl)
logreg_m = GridSearchCV(estimator=pipe, param_grid=my_params)
logreg_m.fit(X_train_m, y_train_m)

# (train score, test score)
logreg_m.score(X_train_m, y_train_m), logreg_m.score(X_test_m, y_test_m)

(0.99724484421214699, 0.81147441351647021)

In [47]:
# python: tf-idf on both text columns -> scaling -> logistic regression.
# with_mean=False keeps the scaler usable on the sparse matrix from Tfidf_Col.
pl = [
    ('tfidf', Tfidf_Col(cols=['title', 'selftext'])),
    ('ss', StandardScaler(with_mean=False)),
    ('logreg', LogisticRegression()),
]

np.random.seed(41)

# NOTE(review): size=1 draws a single C, so the grid holds exactly one candidate.
my_params = dict(logreg__C=np.random.uniform(low=0, high=5, size=1))

pipe = Pipeline(steps=pl)
logreg_p = GridSearchCV(estimator=pipe, param_grid=my_params)
logreg_p.fit(X_train_p, y_train_p)

# (train score, test score)
logreg_p.score(X_train_p, y_train_p), logreg_p.score(X_test_p, y_test_p)

(0.98680171884591772, 0.77534814132811603)

|class| baseline accuracy| logistic regression accuracy
|-|-|-|
|`datascience`| 67.19%| 76.73%
|`math`| 53.58% | 81.15%
|`python`| 61.80%| 77.53%



## Decision Tree

In [19]:
# datascience: tf-idf on both text columns -> scaling -> decision tree.
pl = [
    ('tfidf', Tfidf_Col(cols=['title', 'selftext'])),
    ('ss', StandardScaler(with_mean=False)),
    ('dt', DecisionTreeClassifier()),
]

# Seed NumPy's global RNG before fitting (presumably for repeatable trees,
# since the classifier is created without its own random_state).
np.random.seed(41)

# Nothing to tune: the search only evaluates the pipeline's default settings.
my_params = dict()

pipe = Pipeline(steps=pl)
dt_d = GridSearchCV(estimator=pipe, param_grid=my_params)

dt_d.fit(X_train_d, y_train_d)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', Tfidf_Col(cols=['title', 'selftext'])), ('ss', StandardScaler(copy=True, with_mean=False, with_std=True)), ('dt', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
    ...     min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))]),
       fit_params=None, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [20]:
# (train score, test score) of the decision tree on the datascience topic.
dt_d.score(X_train_d, y_train_d), dt_d.score(X_test_d, y_test_d)

(0.99986623863028357, 0.77256317689530685)

In [21]:
# math: tf-idf on both text columns -> scaling -> decision tree.
pl = [
    ('tfidf', Tfidf_Col(cols=['title', 'selftext'])),
    ('ss', StandardScaler(with_mean=False)),
    ('dt', DecisionTreeClassifier()),
]

# Seed NumPy's global RNG before fitting (presumably for repeatable trees,
# since the classifier is created without its own random_state).
np.random.seed(41)

# Nothing to tune: the search only evaluates the pipeline's default settings.
my_params = dict()

pipe = Pipeline(steps=pl)
dt_m = GridSearchCV(estimator=pipe, param_grid=my_params)

dt_m.fit(X_train_m, y_train_m)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', Tfidf_Col(cols=['title', 'selftext'])), ('ss', StandardScaler(copy=True, with_mean=False, with_std=True)), ('dt', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
    ...     min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))]),
       fit_params=None, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [22]:
# (train score, test score) of the decision tree on the math topic.
dt_m.score(X_train_m, y_train_m), dt_m.score(X_test_m, y_test_m)

(1.0, 0.8383371824480369)

In [23]:
# python: tf-idf on both text columns -> scaling -> decision tree.
pl = [
    ('tfidf', Tfidf_Col(cols=['title', 'selftext'])),
    ('ss', StandardScaler(with_mean=False)),
    ('dt', DecisionTreeClassifier()),
]

# Seed NumPy's global RNG before fitting (presumably for repeatable trees,
# since the classifier is created without its own random_state).
np.random.seed(41)

# Nothing to tune: the search only evaluates the pipeline's default settings.
my_params = dict()

pipe = Pipeline(steps=pl)
dt_p = GridSearchCV(estimator=pipe, param_grid=my_params)

dt_p.fit(X_train_p, y_train_p)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', Tfidf_Col(cols=['title', 'selftext'])), ('ss', StandardScaler(copy=True, with_mean=False, with_std=True)), ('dt', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
    ...     min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))]),
       fit_params=None, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [24]:
# (train score, test score) of the decision tree on the python topic.
dt_p.score(X_train_p, y_train_p), dt_p.score(X_test_p, y_test_p)

1.0

|class| baseline accuracy| decision tree accuracy
|-|-|-|
|`datascience`| 67.19%| 77.26%
|`math`| 53.58% | 83.83%
|`python`| 61.80%| 78.36%



## Random Forest

In [38]:
# datascience: tf-idf on both text columns -> scaling -> random forest.
pl = [
    ('tfidf', Tfidf_Col(cols=['title', 'selftext'])),
    ('ss', StandardScaler(with_mean=False)),
    ('rf', RandomForestClassifier()),
]

# Seed NumPy's global RNG before fitting (presumably for repeatable forests,
# since the classifier is created without its own random_state).
np.random.seed(41)

# Candidate forest sizes: 9 and 10 trees.
my_params = dict(rf__n_estimators=range(9, 11))

pipe = Pipeline(steps=pl)
rf_d = GridSearchCV(estimator=pipe, param_grid=my_params)

rf_d.fit(X_train_d, y_train_d)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', Tfidf_Col(cols=['title', 'selftext'])), ('ss', StandardScaler(copy=True, with_mean=False, with_std=True)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurit...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'rf__n_estimators': range(9, 11)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [39]:
# (train score, test score) of the random forest on the datascience topic.
rf_d.score(X_train_d, y_train_d), rf_d.score(X_test_d, y_test_d)

(0.98929909042268593, 0.80264741275571605)

In [31]:
# math: tf-idf on both text columns -> scaling -> random forest.
pl = [
    ('tfidf', Tfidf_Col(cols=['title', 'selftext'])),
    ('ss', StandardScaler(with_mean=False)),
    ('rf', RandomForestClassifier()),
]

# Seed NumPy's global RNG before fitting (presumably for repeatable forests,
# since the classifier is created without its own random_state).
np.random.seed(41)

# Candidate forest sizes: 6 through 13 trees.
my_params = dict(rf__n_estimators=range(6, 14))

pipe = Pipeline(steps=pl)
rf_m = GridSearchCV(estimator=pipe, param_grid=my_params)

rf_m.fit(X_train_m, y_train_m)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', Tfidf_Col(cols=['title', 'selftext'])), ('ss', StandardScaler(copy=True, with_mean=False, with_std=True)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurit...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'rf__n_estimators': range(6, 14)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [36]:
# (train score, test score) of the random forest on the math topic.
rf_m.score(X_train_m, y_train_m), rf_m.score(X_test_m, y_test_m)

(0.99457072241805433, 0.83712167254163117)

In [33]:
# python: tf-idf on both text columns -> scaling -> random forest.
pl = [
    ('tfidf', Tfidf_Col(cols=['title', 'selftext'])),
    ('ss', StandardScaler(with_mean=False)),
    ('rf', RandomForestClassifier()),
]

# Seed NumPy's global RNG before fitting (presumably for repeatable forests,
# since the classifier is created without its own random_state).
np.random.seed(41)

# Candidate forest sizes: 9 and 10 trees.
my_params = dict(rf__n_estimators=range(9, 11))

pipe = Pipeline(steps=pl)
rf_p = GridSearchCV(estimator=pipe, param_grid=my_params)

rf_p.fit(X_train_p, y_train_p)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', Tfidf_Col(cols=['title', 'selftext'])), ('ss', StandardScaler(copy=True, with_mean=False, with_std=True)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurit...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'rf__n_estimators': range(9, 11)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [37]:
# (train score, test score) of the random forest on the python topic.
rf_p.score(X_train_p, y_train_p), rf_p.score(X_test_p, y_test_p)

(0.98492173112338854, 0.79790539762918633)

|class| baseline accuracy| random forest accuracy
|-|-|-|
|`datascience`| 67.19%| 80.26%
|`math`| 53.58% | 83.71%
|`python`| 61.80%| 79.79%

## Summary

|class| baseline accuracy| logistic regression accuracy | decision tree accuracy| random forest accuracy |
|-|-|-|-|-|
|`datascience`| 67.19%| 76.73%| 77.26%| 80.26%|
|`math`| 53.58%| 81.15%| 83.83%| 83.71%|
|`python`| 61.80%| 77.53%| 78.36%| 79.79%|






In [72]:
# Take the estimator of the last step (the 'rf' random forest) out of the
# best pipeline found by the python-topic grid search.
r = rf_p.best_estimator_.steps[-1][-1]

In [81]:
# Largest single feature importance in the fitted forest.
max(r.feature_importances_)

0.049083076856664298

In [86]:
# Column position of the most important feature (the first one on ties),
# as a plain int. argmax avoids unpacking the whole importance array into a
# Python list and scanning it twice.
int(np.argmax(r.feature_importances_))

81835

In [87]:
# Number of features the forest has an importance for.
len(r.feature_importances_)

81838

No doubt the winner here is random forests, but each model at least beat the baseline by a bit. We'll go ahead 