## Import Libraries

In [1]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

  from numpy.core.umath_tests import inner1d


## Import Data

In [2]:
# import the combined dataframe
# and check out the first five rows
df = pd.read_csv('./datasets/combined_df.csv')
df.head()

Unnamed: 0,subreddit,title
0,WritingPrompts,It's been over 800 days since you landed on P...
1,WritingPrompts,Humans are the only species known to have dom...
2,WritingPrompts,"He has been blind all his life. Now, he is th..."
3,WritingPrompts,You’re dying...and dying. And then you die. B...
4,WritingPrompts,Humanity has found a way to circumvent the ne...


In [3]:
# check out the distribution of the target column
df.subreddit.value_counts()

Showerthoughts    998
WritingPrompts    998
Name: subreddit, dtype: int64

### Lemmatize all of the words

In [4]:
words = ''
for entry in df.title:
    words += entry

from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
word_tokens = tokenizer.tokenize(words.lower())

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
words_lem = [lemmatizer.lemmatize(i) for i in word_tokens]

words_lem[:10]

['it', 's', 'been', 'over', '800', 'day', 'since', 'you', 'landed', 'on']

# Model Prep

### Set up X and y Variables

In [5]:
X = df.title
y = df.subreddit

### Train-Test-Split

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    stratify=y, 
                                                    random_state=42)

### CountVectorize

In [8]:
# Instantiate
cv = CountVectorizer(stop_words='english', 
                     ngram_range=(1,2), 
                     max_features=1000)

In [9]:
# Fit and Transform the training data
X_train_cv = cv.fit_transform(X_train)

# Transform the test data
X_test_cv = cv.transform(X_test)

# Modeling

### Define a function to automate the following steps:
1. Instantiate
2. Fit to training data
3. Score on the training data
4. Score on the test data

In [10]:
def default_classifier(classifier):
    # Instantiate
    clf = classifier()
    
    # Fit to the training data
    clf.fit(X_train_cv, y_train)
    
    # Score on the training data
    print(f'Training Score: {clf.score(X_train_cv, y_train)}')

    # Score on the test data
    print(f'Test Score: {clf.score(X_test_cv, y_test)}')
    
    # Return the parameters of the classifier,
    # so I have a basis for sampling GridSearch parameters
    return clf.get_params

#### Dummy Classifier
I'm running a Dummy classifier to get a sense of my baseline. Since the classes are perfectly balanced (50/50), it's unsurprising that the Dummy classifier scores are around 50 percent.

In [11]:
default_classifier(DummyClassifier)

Training Score: 0.509686038744155
Test Score: 0.5130260521042084


  k in range(self.n_outputs_)).T


<bound method BaseEstimator.get_params of DummyClassifier(constant=None, random_state=None, strategy='stratified')>

#### Naive Bayes Classifier (default parameters)

In [12]:
default_classifier(MultinomialNB)

Training Score: 0.8730794923179692
Test Score: 0.7314629258517034


<bound method BaseEstimator.get_params of MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)>

#### Naive Bayes Classifier (alternate parameters)

In [28]:
mnb = MultinomialNB(alpha=0.1)
mnb.fit(X_train_cv, y_train)
print(f'Score on training set: {mnb.score(X_train_cv, y_train)}')
print(f'Score on testing set: {mnb.score(X_test_cv, y_test)}')

Score on training set: 0.8891115564462257
Score on testing set: 0.7394789579158316


#### Logistic Regression Classifier (default parameters)

In [30]:
default_classifier(LogisticRegression)

Training Score: 0.9579158316633266
Test Score: 0.8176352705410822


<bound method BaseEstimator.get_params of LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)>

#### Logistic Regression Classifier (alternate parameters)

In [31]:
clf = LogisticRegression(C=0.1)
clf.fit(X_train_cv, y_train)
print(f'Score on training set: {clf.score(X_train_cv, y_train)}')
print(f'Score on testing set: {clf.score(X_test_cv, y_test)}')

Score on training set: 0.8804275217100869
Score on testing set: 0.8176352705410822


#### DecisionTree Classifier (default parameters)

In [14]:
default_classifier(DecisionTreeClassifier)

Training Score: 0.9979959919839679
Test Score: 0.7535070140280561


<bound method BaseEstimator.get_params of DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')>

#### DecisionTree Classifier (GridSearch parameters)

In [32]:
# Instantiate the GridSearch
grid = GridSearchCV(estimator=DecisionTreeClassifier(),
                    param_grid={'max_depth': [None, 3, 10, 20],
                                'min_samples_leaf': [1, 2, 3],
                                'min_samples_split': [2, 3, 4]},
                    cv=3)

# Fit the GridSearch on the training data
grid.fit(X_train_cv, y_train)

# Instantiate the RandomForest classifier with the best parameters
dt = DecisionTreeClassifier(max_depth= grid.best_params_['max_depth'],
                            min_samples_leaf= grid.best_params_['min_samples_leaf'],
                            min_samples_split= grid.best_params_['min_samples_split'])

# Fit the classifier to the training data
dt.fit(X_train_cv, y_train)

# Evaluate model.
print(f'Score on training set: {dt.score(X_train_cv, y_train)}')
print(f'Score on testing set: {dt.score(X_test_cv, y_test)}')

Score on training set: 0.9766199064796259
Score on testing set: 0.7755511022044088


#### RandomForest Classifier (default parameters)

In [15]:
# fit using the default parameters
default_classifier(RandomForestClassifier)

Training Score: 0.9812959251837008
Test Score: 0.7474949899799599


<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)>

#### RandomForest Classifier (GridSearch parameters)

In [16]:
# Instantiate the GridSearch
grid = GridSearchCV(estimator=RandomForestClassifier(),
                    param_grid={'max_depth': [None, 3, 10, 20],
                                'min_samples_leaf': [1, 2, 3],
                                'min_samples_split': [2, 3, 4],
                                'n_estimators': [10, 20, 30]},
                    cv=3)

# Fit the GridSearch on the training data
grid.fit(X_train_cv, y_train)

# Instantiate the RandomForest classifier with the best parameters
rf = RandomForestClassifier(max_depth= grid.best_params_['max_depth'],
                            min_samples_leaf= grid.best_params_['min_samples_leaf'],
                            min_samples_split= grid.best_params_['min_samples_split'],
                            n_estimators= grid.best_params_['n_estimators'])

# Fit the classifier to the training data
rf.fit(X_train_cv, y_train)

# Evaluate model.
print(f'Score on training set: {rf.score(X_train_cv, y_train)}')
print(f'Score on testing set: {rf.score(X_test_cv, y_test)}')

Score on training set: 0.9799599198396793
Score on testing set: 0.7895791583166333


#### Support Vector Classifier (default parameters)

In [17]:
default_classifier(SVC)

Training Score: 0.5003340013360054
Test Score: 0.49899799599198397


<bound method BaseEstimator.get_params of SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)>

#### Support Vector Classifier (alternate parameters)

In [18]:
svc_2 = SVC(kernel='linear')
svc_2.fit(X_train_cv, y_train)
print(f'Score on training set: {svc_2.score(X_train_cv, y_train)}')
print(f'Score on testing set: {svc_2.score(X_test_cv, y_test)}')

Score on training set: 0.9759519038076152
Score on testing set: 0.8016032064128257


In [19]:
svc_3 = SVC(kernel='linear', C=0.1)
svc_3.fit(X_train_cv, y_train)
print(f'Score on training set: {svc_3.score(X_train_cv, y_train)}')
print(f'Score on testing set: {svc_3.score(X_test_cv, y_test)}')

Score on training set: 0.8991315965263861
Score on testing set: 0.8196392785571143


#### K Nearest Neighbors Classifier (default parameters)

In [20]:
default_classifier(KNeighborsClassifier)

Training Score: 0.5784903139612558
Test Score: 0.5290581162324649


<bound method BaseEstimator.get_params of KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')>

#### K Nearest Neighbors Classifier (alternate parameters)

In [21]:
knn2 = KNeighborsClassifier(n_neighbors=3)
knn2.fit(X_train_cv, y_train)
print(f'Score on training set: {knn2.score(X_train_cv, y_train)}')
print(f'Score on testing set: {knn2.score(X_test_cv, y_test)}')

Score on training set: 0.6199064796259185
Score on testing set: 0.5390781563126252


In [27]:
knn3 = KNeighborsClassifier(n_neighbors=2)
knn3.fit(X_train_cv, y_train)
print(f'Score on training set: {knn3.score(X_train_cv, y_train)}')
print(f'Score on testing set: {knn3.score(X_test_cv, y_test)}')

Score on training set: 0.5918503674014696
Score on testing set: 0.5290581162324649
