# Reddit Project - 03B: Decision Tree Model
***

## Import Packages

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### Read in the csv

In [3]:
# Path is relative to the working directory the notebook is run from.
csv_path = '../data/spm_target.csv'
spm = pd.read_csv(csv_path)

In [5]:
# Preview the first five rows to sanity-check the loaded columns.
spm.head(n=5)

Unnamed: 0,text,target,char_count,word_count,sentiment
0,Soul TribeJust trying to find my spiritual fam...,1,65,11,0.0
1,THE JOURNEY TO LIFES RICHES BEGAN BY MAKING YO...,1,134,22,0.8964
2,Spiritual awakening amp relationshipsHi Im ...,1,953,174,-0.5804
3,SpartacusIn one of my visions I am in a Ludus ...,1,2133,371,0.7587
4,Mixing dreams with realities I often find ...,1,314,56,0.6705


### Create X, y, and train test split

In [34]:
# Predictors: the text column plus the two numeric columns; the label is `target`.
feature_columns = ['text', 'word_count', 'sentiment']
X = spm[feature_columns]
y = spm['target']

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=26
)

### Create one transformer that selects the `text` column and one that selects the numeric columns `word_count` and `sentiment`

In [53]:
def select_text(frame):
    """Return the 'text' column of `frame`."""
    return frame['text']


def select_numeric(frame):
    """Return the 'word_count' and 'sentiment' columns of `frame`."""
    return frame[['word_count', 'sentiment']]


# Named functions are used instead of lambdas: a FunctionTransformer wrapping a
# lambda cannot be pickled, and its repr only shows an anonymous `<lambda>`.
get_text_data = FunctionTransformer(select_text, validate=False)
get_numeric_data = FunctionTransformer(select_numeric, validate=False)

### Create a pipeline that combines the numeric and text features with a FeatureUnion (applying CountVectorizer to the text) and feeds them to a Decision Tree Classifier

In [69]:
# Two feature branches are built side by side and concatenated by FeatureUnion,
# then passed to the decision tree.
pipe = Pipeline([
    ('features', FeatureUnion([
            # Numeric branch: select the numeric columns, then standardize them.
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('ss', StandardScaler())
            ])),
            # Text branch: select the text column, then convert it to token counts.
            ('text_features', Pipeline([
                ('selector', get_text_data),
                ('cvec', CountVectorizer())
            ]))
    ])),
    # Seed the tree (same seed as the train/test split) so that refitting the
    # notebook reproduces the same model and scores.
    ('dt', DecisionTreeClassifier(random_state=26))
])

### Create Decision Tree parameters - max_depth, min_samples_split, and min_samples_leaf

In [69]:
# Grid for the 'dt' pipeline step; each hyperparameter currently has a single
# candidate value, so the search evaluates exactly one configuration.
params = dict(
    dt__max_depth=[5],
    dt__min_samples_split=[7],
    dt__min_samples_leaf=[3],
)

### Instantiate and fit GridSearchCV

In [70]:
# 5-fold cross-validated search over `params` using the full pipeline.
gs = GridSearchCV(estimator=pipe, param_grid=params, cv=5)

In [71]:
# Fit on the training split only; the test split is kept for evaluation.
gs.fit(X=X_train, y=y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('features',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('numeric_features',
                                                                        Pipeline(memory=None,
                                                                                 steps=[('selector',
                                                                                         FunctionTransformer(accept_sparse=False,
                                                                                                             check_inverse=True,
                                                                                                             func=<function <lambda> at 0x1a20b349e0>,
                                                                                        

### Use the best estimator found by the grid search to predict on X_test

In [72]:
# Keep a handle on the refit best pipeline, then predict the held-out rows.
best_model = gs.best_estimator_
preds = best_model.predict(X_test)

### Build a function to print out an accuracy score and a nice confusion matrix

In [73]:
# `preds` already holds the test-set predictions from the previous cell, so it
# is reused here rather than running the same prediction a second time.
def nice_conmat(y_test, preds, classes, labels=None):
    """Print the accuracy score and return a labelled confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True target values.
    preds : array-like
        Predicted target values, aligned with `y_test`.
    classes : sequence
        Display names for the classes, in the same order as the rows and
        columns of the confusion matrix.
    labels : array-like, optional
        Label values forwarded to `confusion_matrix` to fix the row/column
        order explicitly. When omitted, `confusion_matrix` chooses the order
        itself (sorted label values).

    Returns
    -------
    pandas.DataFrame
        Confusion matrix with 'Actual <name>' rows and 'Predicted <name>'
        columns.

    Raises
    ------
    ValueError
        If the number of names in `classes` does not match the size of the
        confusion matrix.
    """
    conmat = confusion_matrix(y_test, preds, labels=labels)

    # Fail early with a readable message instead of a pandas shape error.
    if len(classes) != conmat.shape[0]:
        raise ValueError(
            f'Expected {conmat.shape[0]} class names, got {len(classes)}'
        )

    print(f'Accuracy Score: {accuracy_score(y_test, preds)}')
    return pd.DataFrame(
        conmat,
        columns=[f'Predicted {class_}' for class_ in classes],
        index=[f'Actual {class_}' for class_ in classes],
    )
# Names must follow the matrix's label order.
# NOTE(review): presumably target 0 = meditation and 1 = spirituality — confirm against the target encoding.
nice_conmat(y_test, preds, classes=['meditation', 'spirituality'])

Accuracy Score: 0.8188650306748466


Unnamed: 0,Predicted meditation,Predicted spirituality
Actual meditation,2310,933
Actual spirituality,248,3029


### Print train and test scores to compare to the accuracy score above

In [74]:
# Score on both splits; a large gap between the two would indicate overfitting.
train_score = gs.score(X_train, y_train)
test_score = gs.score(X_test, y_test)
print(train_score)
print(test_score)

0.8176186158960411
0.8188650306748466
