## Import Libraries

In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

  from numpy.core.umath_tests import inner1d


## Import Data

In [2]:
# import the combined dataframe
# and check out the first five rows
df = pd.read_csv('./datasets/combined_df.csv')
df.head()

Unnamed: 0,subreddit,title
0,WritingPrompts,It's been over 800 days since you landed on P...
1,WritingPrompts,Humans are the only species known to have dom...
2,WritingPrompts,"He has been blind all his life. Now, he is th..."
3,WritingPrompts,You’re dying...and dying. And then you die. B...
4,WritingPrompts,Humanity has found a way to circumvent the ne...


In [3]:
# check out the distribution of the target column
df.subreddit.value_counts()

Showerthoughts    998
WritingPrompts    998
Name: subreddit, dtype: int64

# Pre-Processing

### Tokenize and Stem all of the Titles

In [4]:
# import tokenizer and stemmer
from nltk import word_tokenize
from nltk.stem import PorterStemmer

In [5]:
# define a function that takes in a title
# and tokenizes and stems that title
def stem_title(title):    
    stm = PorterStemmer()
    tokenized = word_tokenize(title)
    stem_tokes = []
    for toke in tokenized:
        stem_tokes.append(stm.stem(toke))
    stem_tokes

    comb = ''
    for stemmed in stem_tokes:
        comb += stemmed + ' '
    return comb

In [6]:
# check that the function works
stem_title(df.title[3])

'you ’ re die ... and die . and then you die . but you wake up . you ’ re now an anim . you ’ ve start to enjoy thi second life . '

In [7]:
# create a new column by applying the function to the title column
# and check out the head to see if it worked
df['stemmed_titles'] = df.title.apply(stem_title)
df.head()

Unnamed: 0,subreddit,title,stemmed_titles
0,WritingPrompts,It's been over 800 days since you landed on P...,It 's been over 800 day sinc you land on plane...
1,WritingPrompts,Humans are the only species known to have dom...,human are the onli speci known to have domest ...
2,WritingPrompts,"He has been blind all his life. Now, he is th...","He ha been blind all hi life . now , he is the..."
3,WritingPrompts,You’re dying...and dying. And then you die. B...,you ’ re die ... and die . and then you die . ...
4,WritingPrompts,Humanity has found a way to circumvent the ne...,human ha found a way to circumv the need for s...


In [8]:
# check out the tail, too
df.tail()

Unnamed: 0,subreddit,title,stemmed_titles
1991,Showerthoughts,The fact that we have collectively decided to ...,the fact that we have collect decid to trick a...
1992,Showerthoughts,"""Leaving the sinking ship"" fit metaphoricly pe...",`` leav the sink ship '' fit metaphoricli perf...
1993,Showerthoughts,"If you could lift objects with your mind, you ...","If you could lift object with your mind , you ..."
1994,Showerthoughts,In the one episode of Phineas and Ferb where F...,In the one episod of phinea and ferb where fer...
1995,Showerthoughts,If Final Fantasy ever reaches the 30th main ga...,If final fantasi ever reach the 30th main game...


# Model Prep

### Set up X and y Variables

In [9]:
X = df.stemmed_titles
y = df.subreddit

### Train-Test-Split

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    stratify=y, 
                                                    random_state=42)

### TfidfVectorize

In [12]:
# Instantiate
tf = TfidfVectorizer(stop_words='english', 
                     ngram_range=(1,2), 
                     max_features=1000)

In [13]:
# Fit and Transform the training data
X_train_tf = tf.fit_transform(X_train)

# Transform the test data
X_test_tf = tf.transform(X_test)

# Modeling

### Define a function to automate the following steps:
1. Instantiate
2. Fit to training data
3. Score on the training data
4. Score on the test data

In [14]:
def default_classifier(classifier):
    # Instantiate
    clf = classifier()
    
    # Fit to the training data
    clf.fit(X_train_tf, y_train)
    
    # Score on the training data
    print(f'Training Score: {clf.score(X_train_tf, y_train)}')

    # Score on the test data
    print(f'Test Score: {clf.score(X_test_tf, y_test)}')
    
    # Return the parameters of the classifier,
    # so I have a basis for sampling GridSearch parameters
    return clf.get_params

#### Dummy Classifier
I'm running a Dummy classifier to get a sense of my baseline. Since the classes are perfectly balanced (50/50), it's unsurprising that the Dummy classifier scores are around 50 percent.

In [15]:
default_classifier(DummyClassifier)

Training Score: 0.4909819639278557
Test Score: 0.531062124248497


  k in range(self.n_outputs_)).T


<bound method BaseEstimator.get_params of DummyClassifier(constant=None, random_state=None, strategy='stratified')>

#### Naive Bayes Classifier (default parameters)

In [16]:
default_classifier(MultinomialNB)

Training Score: 0.8964595858383434
Test Score: 0.781563126252505


<bound method BaseEstimator.get_params of MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)>

#### Naive Bayes Classifier (alternate parameters)

In [17]:
mnb = MultinomialNB(alpha=0.1)
mnb.fit(X_train_tf, y_train)
print(f'Score on training set: {mnb.score(X_train_tf, y_train)}')
print(f'Score on testing set: {mnb.score(X_test_tf, y_test)}')

Score on training set: 0.9111556446225785
Score on testing set: 0.7675350701402806


#### Logistic Regression Classifier (default parameters)

In [18]:
default_classifier(LogisticRegression)

Training Score: 0.9258517034068137
Test Score: 0.8356713426853707


<bound method BaseEstimator.get_params of LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)>

#### Logistic Regression Classifier (alternate parameters)

In [19]:
clf = LogisticRegression(C=0.1)
clf.fit(X_train_tf, y_train)
print(f'Score on training set: {clf.score(X_train_tf, y_train)}')
print(f'Score on testing set: {clf.score(X_test_tf, y_test)}')

Score on training set: 0.8717434869739479
Score on testing set: 0.8236472945891784


#### DecisionTree Classifier (default parameters)

In [20]:
default_classifier(DecisionTreeClassifier)

Training Score: 0.9993319973279893
Test Score: 0.7334669338677354


<bound method BaseEstimator.get_params of DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')>

#### DecisionTree Classifier (GridSearch parameters)

In [21]:
# Instantiate the GridSearch
grid = GridSearchCV(estimator=DecisionTreeClassifier(),
                    param_grid={'max_depth': [None, 3, 10, 20],
                                'min_samples_leaf': [1, 2, 3],
                                'min_samples_split': [2, 3, 4]},
                    cv=3)

# Fit the GridSearch on the training data
grid.fit(X_train_tf, y_train)

# Instantiate the RandomForest classifier with the best parameters
dt = DecisionTreeClassifier(max_depth= grid.best_params_['max_depth'],
                            min_samples_leaf= grid.best_params_['min_samples_leaf'],
                            min_samples_split= grid.best_params_['min_samples_split'])

# Fit the classifier to the training data
dt.fit(X_train_tf, y_train)

# Evaluate model.
print(f'Score on training set: {dt.score(X_train_tf, y_train)}')
print(f'Score on testing set: {dt.score(X_test_tf, y_test)}')

Score on training set: 0.9418837675350702
Score on testing set: 0.7114228456913828


#### RandomForest Classifier (default parameters)

In [22]:
# fit using the default parameters
default_classifier(RandomForestClassifier)

Training Score: 0.9879759519038076
Test Score: 0.8136272545090181


<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)>

#### RandomForest Classifier (GridSearch parameters)

In [23]:
# Instantiate the GridSearch
grid = GridSearchCV(estimator=RandomForestClassifier(),
                    param_grid={'max_depth': [None, 3, 10, 20],
                                'min_samples_leaf': [1, 2, 3],
                                'min_samples_split': [2, 3, 4],
                                'n_estimators': [10, 20, 30]},
                    cv=3)

# Fit the GridSearch on the training data
grid.fit(X_train_tf, y_train)

# Instantiate the RandomForest classifier with the best parameters
rf = RandomForestClassifier(max_depth= grid.best_params_['max_depth'],
                            min_samples_leaf= grid.best_params_['min_samples_leaf'],
                            min_samples_split= grid.best_params_['min_samples_split'],
                            n_estimators= grid.best_params_['n_estimators'])

# Fit the classifier to the training data
rf.fit(X_train_tf, y_train)

# Evaluate model.
print(f'Score on training set: {rf.score(X_train_tf, y_train)}')
print(f'Score on testing set: {rf.score(X_test_tf, y_test)}')

Score on training set: 0.9258517034068137
Score on testing set: 0.8276553106212425


#### Support Vector Classifier (default parameters)

In [24]:
default_classifier(SVC)

Training Score: 0.5003340013360054
Test Score: 0.49899799599198397


<bound method BaseEstimator.get_params of SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)>

#### Support Vector Classifier (alternate parameters)

In [25]:
svc_2 = SVC(kernel='linear')
svc_2.fit(X_train_tf, y_train)
print(f'Score on training set: {svc_2.score(X_train_tf, y_train)}')
print(f'Score on testing set: {svc_2.score(X_test_tf, y_test)}')

Score on training set: 0.9385437541750167
Score on testing set: 0.8256513026052105


In [26]:
svc_3 = SVC(kernel='linear', C=0.1)
svc_3.fit(X_train_tf, y_train)
print(f'Score on training set: {svc_3.score(X_train_tf, y_train)}')
print(f'Score on testing set: {svc_3.score(X_test_tf, y_test)}')

Score on training set: 0.8670674682698731
Score on testing set: 0.8236472945891784


#### K Nearest Neighbors Classifier (default parameters)

In [27]:
default_classifier(KNeighborsClassifier)

Training Score: 0.5263861055444222
Test Score: 0.503006012024048


<bound method BaseEstimator.get_params of KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')>

#### K Nearest Neighbors Classifier (alternate parameters)

In [28]:
knn2 = KNeighborsClassifier(n_neighbors=3)
knn2.fit(X_train_tf, y_train)
print(f'Score on training set: {knn2.score(X_train_tf, y_train)}')
print(f'Score on testing set: {knn2.score(X_test_tf, y_test)}')

Score on training set: 0.5337341349365398
Score on testing set: 0.503006012024048


In [29]:
knn3 = KNeighborsClassifier(n_neighbors=2)
knn3.fit(X_train_tf, y_train)
print(f'Score on training set: {knn3.score(X_train_tf, y_train)}')
print(f'Score on testing set: {knn3.score(X_test_tf, y_test)}')

Score on training set: 0.5290581162324649
Score on testing set: 0.5070140280561122


#### Bagging Classifier (Default Parameters)

In [30]:
default_classifier(BaggingClassifier)

Training Score: 0.9886439545758183
Test Score: 0.7875751503006012


<bound method BaseEstimator.get_params of BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
         verbose=0, warm_start=False)>

#### AdaBoost Classifier (Default Parameters)

In [31]:
default_classifier(AdaBoostClassifier)

Training Score: 0.832999331997328
Test Score: 0.7535070140280561


<bound method BaseEstimator.get_params of AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)>