# Import Libraries and Clean Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import plot_confusion_matrix, balanced_accuracy_score, f1_score, recall_score, precision_score

from sklearn.tree import DecisionTreeClassifier, plot_tree

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

In [2]:
subreddits = pd.read_csv('./data/posts_combined_clean.csv')

In [3]:
subreddits.shape

(7976, 10)

# Decision Tree Model

### X, y, train test split

In [4]:
X = subreddits['title']
y = subreddits['subreddit']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

### Baseline Model

In [6]:
y.value_counts(normalize=True)

science       0.572091
technology    0.427909
Name: subreddit, dtype: float64

In [7]:
# the target in the dataset for posts was split approximately 57% for r/science.
# This will be the baseline model - if we guessed r/science each time we would 
# be correct 57% of the time

### Modeling

In [8]:
tree_pipe = make_pipeline(
    CountVectorizer(),
    StandardScaler(with_mean=False),
    DecisionTreeClassifier(),
)

In [9]:
tree_pipe.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('standardscaler', StandardScaler(with_mean=False)),
                ('decisiontreeclassifier', DecisionTreeClassifier())])

### Model Scoring

In [10]:
preds_tree = tree_pipe.predict(X_test)

In [11]:
precision_tree = precision_score(y_test, preds_tree, pos_label='science')
recall_tree = recall_score(y_test, preds_tree, pos_label='science')
f1_tree = 2 * (precision_tree * recall_tree) / (precision_tree + recall_tree)

In [12]:
print(f'The accuracy training score is    {tree_pipe.score(X_train, y_train)}')
print(f'The accuracy testing score is     {tree_pipe.score(X_test, y_test)}')
print(f'The bac score is                  {balanced_accuracy_score(y_test, tree_pipe.predict(X_test))}')
print(f'The f1_score score is             {f1_tree}')
print(f'The precision is                  {precision_tree}')
print(f'The recall is                     {recall_tree}')

The accuracy training score is    0.9998328318288198
The accuracy testing score is     0.7938816449348044
The bac score is                  0.7891253269042171
The f1_score score is             0.8185430463576159
The precision is                  0.8096069868995633
The recall is                     0.8276785714285714


- Baseline decision tree is overfit
- The model fit the training data almost perfectly, but fit the testing data worse.
- The f1 score and balanced accuracy score did worse than the Logistic and Naive Bayes models.

### Decision Tree Grid Search Parameters

In [13]:
tree_pipe = make_pipeline(
    CountVectorizer(),
    StandardScaler(with_mean=False),
    DecisionTreeClassifier(),
)
tree_pipe.fit(X_train, y_train)
print(f'The accuracy training score is    {tree_pipe.score(X_train, y_train)}')
print(f'The accuracy testing score is     {tree_pipe.score(X_test, y_test)}')

The accuracy training score is    0.9998328318288198
The accuracy testing score is     0.7873620862587764


In [14]:
params = {
    'decisiontreeclassifier__max_depth': [2, 5, 7, 10, 30],
    'decisiontreeclassifier__min_samples_split': [5, 10, 20, 50],
    'decisiontreeclassifier__min_samples_leaf': [2, 5, 10, 20],
    'decisiontreeclassifier__ccp_alpha': [0.0001, 0.001, 0.01, 0.1, 0, 1, 10],
}

In [15]:
grid = GridSearchCV(tree_pipe, param_grid = params, n_jobs=-1)

In [16]:
grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('countvectorizer', CountVectorizer()),
                                       ('standardscaler',
                                        StandardScaler(with_mean=False)),
                                       ('decisiontreeclassifier',
                                        DecisionTreeClassifier())]),
             n_jobs=-1,
             param_grid={'decisiontreeclassifier__ccp_alpha': [0.0001, 0.001,
                                                               0.01, 0.1, 0, 1,
                                                               10],
                         'decisiontreeclassifier__max_depth': [2, 5, 7, 10, 30],
                         'decisiontreeclassifier__min_samples_leaf': [2, 5, 10,
                                                                      20],
                         'decisiontreeclassifier__min_samples_split': [5, 10,
                                                                       20,
               

In [17]:
grid.best_params_

{'decisiontreeclassifier__ccp_alpha': 0,
 'decisiontreeclassifier__max_depth': 30,
 'decisiontreeclassifier__min_samples_leaf': 2,
 'decisiontreeclassifier__min_samples_split': 10}

In [18]:
grid.score(X_train, y_train)

0.8259779338014042

In [19]:
grid.score(X_test, y_test)

0.7753259779338014

In [20]:
grid.best_params_

{'decisiontreeclassifier__ccp_alpha': 0,
 'decisiontreeclassifier__max_depth': 30,
 'decisiontreeclassifier__min_samples_leaf': 2,
 'decisiontreeclassifier__min_samples_split': 10}

### Decision Tree with Stop words

In [21]:
tree_pipe = make_pipeline(
    CountVectorizer(stop_words='english'),
    StandardScaler(with_mean=False),
    DecisionTreeClassifier(ccp_alpha=0, max_depth=30, min_samples_leaf=2, min_samples_split=10),
)
tree_pipe.fit(X_train, y_train)
print(f'The accuracy training score is    {tree_pipe.score(X_train, y_train)}')
print(f'The accuracy testing score is     {tree_pipe.score(X_test, y_test)}')
print(f'The bac score is                  {balanced_accuracy_score(y_test, tree_pipe.predict(X_test))}')

The accuracy training score is    0.7355399531929121
The accuracy testing score is     0.7216649949849548
The bac score is                  0.6856356243870546


### Feature Importance

In [22]:
tree_pipe.named_steps

{'countvectorizer': CountVectorizer(stop_words='english'),
 'standardscaler': StandardScaler(with_mean=False),
 'decisiontreeclassifier': DecisionTreeClassifier(ccp_alpha=0, max_depth=30, min_samples_leaf=2,
                        min_samples_split=10)}

In [24]:
coefs = tree_pipe.named_steps['decisiontreeclassifier'].feature_importances_

In [26]:
coef_df = pd.DataFrame({'coefs': coefs}, index = tree_pipe.named_steps['countvectorizer'].get_feature_names())
coef_df.nlargest(10, 'coefs')

Unnamed: 0,coefs
study,0.134656
19,0.085292
scientists,0.070206
researchers,0.052741
research,0.048033
app,0.04389
apple,0.033868
science,0.031222
trump,0.026862
2021,0.025017


- The words which had the highest weight on the model included words relating to scientists/research/studies, as well as some tech company names.