# Step 3: Modeling 

In [15]:
## Imports
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import plot_confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegressionCV, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.svm import LinearSVC

%matplotlib inline

import warnings

warnings.filterwarnings('ignore')


### Read in step 1 data & prep for modeling

In [16]:
# Import the data we're looking at from part 1
both_posts_imp = pd.read_csv('../data/both_posts_step1_xl.csv')
both_posts_imp.drop(columns="Unnamed: 0", inplace=True)
both_posts_imp.head()

Unnamed: 0,created_utc,selftext,subreddit,title,ds_ind
0,1638348806,Ideally would love a program that goes at my o...,datascience,Can a master's in DS from WGU or like universi...,1
1,1638348235,,datascience,Data Scientists in Germany: What to expect as ...,1
2,1638344086,,datascience,How's Pluralsight for Data Science?,1
3,1638341696,I am trying to implement this for car dashcam ...,datascience,vid2depth on custom video,1
4,1638333037,\n\nShould I buy this offer if I now the basi...,datascience,DataCamp CyberMonday Offer,1


In [3]:
both_posts_imp.shape

(19996, 5)

In [4]:
both_posts_imp.isnull().sum()

created_utc       0
selftext       3520
subreddit         0
title             0
ds_ind            0
dtype: int64

In [5]:
both_posts_imp['subreddit'].value_counts()

datascience    10000
analytics       9996
Name: subreddit, dtype: int64

In [6]:
#Inspect target (subreddit) and feature(title)
viewing_purposes=both_posts_imp.filter(["subreddit","title"])
viewing_purposes.sort_values('title')

Unnamed: 0,subreddit,title
14711,analytics,"""A theory of everything"":Mathematics shown to ..."
14246,analytics,"""Avg Page Load time(sec.)"" metric formula in G..."
10064,analytics,"""Betting in the economic perspetives"""
15974,analytics,"""Data Analyst"" without analytics skills. Looki..."
7247,datascience,"""Data scientists will be extinct in 10 years"" ..."
...,...,...
1283,datascience,😍Straight out of science fiction: Separate cli...
14593,analytics,😓 Monthly/Quarterly Reporting Dreadful
8044,datascience,🚀 How to Crack the Facebook Data Scientist Int...
2128,datascience,🤯🖼️Remove any object or person in an image eas...


### Setup Target X and Y

In [17]:
#Set up train/test with the default .75/.25 train/test size

X=both_posts_imp['title']
y=both_posts_imp['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42, stratify=y)

### Baseline Accuracy for Comparison

In [18]:
both_posts_imp['subreddit'].value_counts(normalize=True)

datascience    0.5001
analytics      0.4999
Name: subreddit, dtype: float64

### Create Stopwords Lists

In [19]:
stopwords_custom1 = nltk.corpus.stopwords.words('english')
stopwords_custom1.append("data science")
stopwords_custom1.append("analytics")
stopwords_custom1.append("science")
stopwords_custom1.append("scientists")
stopwords_custom1.append("data")

In [20]:
stopwords_custom2 = nltk.corpus.stopwords.words('english')
stopwords_custom2.append("data science")
stopwords_custom2.append("data")
stopwords_custom2.append("analytics")
stopwords_custom2.append("science")
stopwords_custom2.append("scientists")
stopwords_custom2.append("get")
stopwords_custom2.append("amp")
stopwords_custom2.append("would")
stopwords_custom2.append("using")
stopwords_custom2.append("use")

## Initial Step 1 Models

### CountVectorizer Passes
* Explore how different models interact with CountVectorizer


In [20]:
# Count Vectorizer LogReg
pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('lr', LogisticRegressionCV(solver='liblinear'))
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.8723077948923118, 0.8013602720544108)

In [10]:
# Count Vectorizer Naive Bayes

pipe_nb = Pipeline([
    ('cv', CountVectorizer()),
    ('nb', MultinomialNB())
])

pipe_nb.fit(X_train, y_train)
pipe_nb.score(X_train, y_train), pipe_nb.score(X_test, y_test)

(0.8501033540041342, 0.7871574314862972)

In [13]:
# Count Vectorizer Decision Trees
pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('dt', DecisionTreeClassifier())
])
pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.9935987197439488, 0.7643528705741148)

In [13]:
# Count Vectorizer Bagging Classifier
pipe_bc = Pipeline([
    ('cv', CountVectorizer()),
    ('bc', BaggingClassifier(random_state=42, n_estimators=100))
])
pipe_bc.fit(X_train, y_train)
pipe_bc.score(X_train, y_train), pipe_bc.score(X_test, y_test)

(0.9935320397412816, 0.7851570314062812)

In [21]:
# Count Vectorizer Random Forests

pipe_rf_tf = Pipeline([
    ('cv', CountVectorizer(min_df=2)),
    ('rf', RandomForestClassifier())
])
pipe_rf_tf.fit(X_train, y_train)
pipe_rf_tf.score(X_train, y_train), pipe_rf_tf.score(X_test, y_test)

(0.991998399679936, 0.7953590718143628)

In [14]:
# Count Vectorizer AdaBoost Classifier

pipe_abc1 = Pipeline([
    ('cv', CountVectorizer()),
    ('abc', AdaBoostClassifier(random_state=42, n_estimators=500))
])
pipe_abc1.fit(X_train, y_train)
pipe_abc1.score(X_train, y_train), pipe_abc1.score(X_test, y_test)

(0.8255651130226045, 0.7821564312862572)

In [15]:
# Count Vectorizer Gradient Boost Classifier

pipe_gb1 = Pipeline([
    ('cv', CountVectorizer()),
    ('gb', GradientBoostingClassifier(random_state=42, n_estimators=500))
])
pipe_gb1.fit(X_train, y_train)
pipe_gb1.score(X_train, y_train), pipe_gb1.score(X_test, y_test)

(0.8271654330866173, 0.7931586317263453)

In [16]:
# Count Vectorizer SVM

pipe_svm1 = Pipeline([
    ('cv', CountVectorizer()),
    ('svm', LinearSVC(max_iter=100))
])
pipe_svm1.fit(X_train, y_train)
pipe_svm1.score(X_train, y_train), pipe_svm1.score(X_test, y_test)

(0.9399213175968527, 0.7669533906781356)

### TF-IDF Passes
* Explore how different models interact with TF-IDF

In [22]:
# tfidf Log Regression

pipe = Pipeline([
    ('tf', TfidfVectorizer()),
    ('lr', LogisticRegressionCV(solver = 'liblinear'))
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.8459691938387678, 0.8025605121024205)

In [23]:
# tfidf Naive Bayes

pipe_nb = Pipeline([
    ('tf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

pipe_nb.fit(X_train, y_train)
pipe_nb.score(X_train, y_train), pipe_nb.score(X_test, y_test)

(0.8514369540574782, 0.7699539907981596)

In [24]:
## tfidf KNN
pipe = Pipeline([
    ('tf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier())
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.6191905047676202, 0.5471094218843768)

In [25]:
# tfidf Decision Trees
pipe = Pipeline([
    ('tf', TfidfVectorizer()),
    ('dt', DecisionTreeClassifier())
])
pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.9935987197439488, 0.7473494698939788)

In [26]:
# tfidf Bagging Classifier
pipe_bc = Pipeline([
    ('tf', TfidfVectorizer()),
    ('bc', BaggingClassifier(random_state=42, n_estimators=100))
])
pipe_bc.fit(X_train, y_train)
pipe_bc.score(X_train, y_train), pipe_bc.score(X_test, y_test)

(0.9935320397412816, 0.7869573914782957)

In [27]:
# tfidf Random Forests
pipe_rf1 = Pipeline([
    ('tf', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])
pipe_rf1.fit(X_train, y_train)
pipe_rf1.score(X_train, y_train), pipe_rf1.score(X_test, y_test)

(0.9935987197439488, 0.7997599519903981)

In [28]:
# tfidf AdaBoost Classifier

pipe_abc1 = Pipeline([
    ('tf', TfidfVectorizer()),
    ('abc', AdaBoostClassifier(random_state=42, n_estimators=500))
])
pipe_abc1.fit(X_train, y_train)
pipe_abc1.score(X_train, y_train), pipe_abc1.score(X_test, y_test)

(0.8351670334066813, 0.7765553110622124)

In [6]:
# tfidf Gradient Boost Classifier

pipe_gb1 = Pipeline([
    ('tf', TfidfVectorizer()),
    ('gb', GradientBoostingClassifier(random_state=42, n_estimators=500))
])
pipe_gb1.fit(X_train, y_train)
pipe_gb1.score(X_train, y_train), pipe_gb1.score(X_test, y_test)

(0.8365673134626925, 0.7889577915583117)

In [30]:
# tfidf SVM

pipe_svm1 = Pipeline([
    ('tf', TfidfVectorizer()),
    ('svm', LinearSVC(max_iter=100))
])
pipe_svm1.fit(X_train, y_train)
pipe_svm1.score(X_train, y_train), pipe_svm1.score(X_test, y_test)

(0.9235847169433887, 0.7865573114622925)

### Preliminary Results:
* Based on the runs above, we will choose to experiment with different hyperparameters via GridSearch to tune the model.
* Logistic Regression, Random Forests, and Gradient Boosting had the highest testing accuracy scores so we will focus primarily on these. In some cases, TF-IDF did slightly better than CountVectorizer so we will focus on these models as well.
* These initial passes show that several models have high variance and overfitting, so we will explore different hyperparameters to help with the overfitting as well.



## Step 2 Models: 
* Improve model performance with hyperparameters via GridSearch


In [31]:
# TFID Vectorizer X Logistic Regression

pipe_log_reg1c = Pipeline([
    ('tf', TfidfVectorizer()),
    ('lr', LogisticRegressionCV(solver='saga'))
])
pipe_log_reg1c.fit(X_train, y_train)
pipe_log_reg1c.score(X_train, y_train), pipe_log_reg1c.score(X_test, y_test)



logreg_params = {
    'tf__max_features': [None, 1000],
    'tf__min_df': [1,2],
    'tf__stop_words': [None, 'english', stopwords_custom1, stopwords_custom2],
    'tf__ngram_range': [(1, 1), (1, 2)],
    'lr__Cs': [[.1],[.2],[.3],[.35],[.4],[.45],[.5]],
    'lr__penalty':['l1', 'l2', 'elasticnet'],
    'lr__max_iter':range(100,300,100),
    'lr__random_state':[42]
}


pipe_log_reg2c = GridSearchCV(pipe_log_reg1c, # What model do we want to fit?
                                logreg_params, # What is the dictionary of hyperparameters
                                cv = 5, # what number of folds in CV will we use?
                                #verbose = 1,
                                n_jobs= -1 )

pipe_log_reg2c.fit(X_train, y_train)
print(pipe_log_reg2c.best_params_)
print(f'Cross Val: {pipe_log_reg2c.best_score_}')
print(f'Training Accuracy: {pipe_log_reg2c.score(X_train, y_train)}')
print(f'Testing Accuracy: {pipe_log_reg2c.score(X_test, y_test)}')

{'lr__Cs': [0.45], 'lr__max_iter': 100, 'lr__penalty': 'l2', 'lr__random_state': 42, 'tf__max_features': None, 'tf__min_df': 1, 'tf__ngram_range': (1, 2), 'tf__stop_words': 'english'}
Cross Val: 0.8021606090919194
Training Accuracy: 0.8898446355937855
Testing Accuracy: 0.8063612722544509


In [39]:
# Count Vectorizer X Logistic Regression

pipe_log_reg1d = Pipeline([
    ('cv', CountVectorizer()),
    ('lr', LogisticRegressionCV(solver='saga'))
])
pipe_log_reg1d.fit(X_train, y_train)
pipe_log_reg1d.score(X_train, y_train), pipe_log_reg1d.score(X_test, y_test)



logreg_params = {
    'cv__max_features': [None, 1000],
    'cv__min_df': [1,2],
    'cv__stop_words': [None, 'english', stopwords_custom1, stopwords_custom2],
    'cv__ngram_range': [(1, 1), (1, 2)],
    'lr__Cs': [[.1],[.2],[.3],[.35],[.4],[.45],[.5]],
    'lr__penalty':['l1', 'l2', 'elasticnet'],
    'lr__max_iter':range(100,300,100),
    'lr__random_state':[42]
}


pipe_log_reg2d = GridSearchCV(pipe_log_reg1d, # What model do we want to fit?
                                logreg_params, # What is the dictionary of hyperparameters
                                cv = 5, # what number of folds in CV will we use?
                                verbose = 1,
                                n_jobs= -1 )

pipe_log_reg2d.fit(X_train, y_train)
print(pipe_log_reg2d.best_params_)
print(f'Cross Val: {pipe_log_reg2d.best_score_}')
print(f'Training Accuracy: {pipe_log_reg2d.score(X_train, y_train)}')
print(f'Testing Accuracy: {pipe_log_reg2d.score(X_test, y_test)}')

{'cv__max_features': None, 'cv__min_df': 1, 'cv__ngram_range': (1, 2), 'cv__stop_words': 'english', 'lr__Cs': [0.2], 'lr__max_iter': 100, 'lr__penalty': 'l2', 'lr__random_state': 42}
Cross Val: 0.8008936978992998
Training Accuracy: 0.909381876375275
Testing Accuracy: 0.8069613922784556


In [40]:
# TFID Vectorizer X Naive Bayes

pipe_nb1 = Pipeline([
    ('tf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])
pipe_nb1.fit(X_train, y_train)
pipe_nb1.score(X_train, y_train), pipe_nb1.score(X_test, y_test)



nb_params = {
    'tf__max_features': [None, 1000,2000],
    'tf__min_df': [1,2],
    'tf__stop_words': [None, 'english', stopwords_custom1, stopwords_custom2],
    'tf__ngram_range': [(1, 1), (1, 2)]
}


pipe_nb2 = GridSearchCV(pipe_nb1, # What model do we want to fit?
                                nb_params, # What is the dictionary of hyperparameters
                                cv = 5, # what number of folds in CV will we use?
                                verbose = 1,
                                n_jobs= -1 )

pipe_nb2.fit(X_train, y_train)
print(pipe_nb2.best_params_)
print(f'Cross Val: {pipe_nb2.best_score_}')
print(f'Training Accuracy: {pipe_nb2.score(X_train, y_train)}')
print(f'Testing Accuracy: {pipe_nb2.score(X_test, y_test)}')

Fitting 5 folds for each of 48 candidates, totalling 240 fits
{'tf__max_features': None, 'tf__min_df': 1, 'tf__ngram_range': (1, 2), 'tf__stop_words': 'english'}
Cross Val: 0.7860904301433811
Training Accuracy: 0.9381876375275054
Testing Accuracy: 0.7935587117423485


In [42]:
# TFID Vectorizer X KNN

pipe_knn1 = Pipeline([
    ('tf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier())
])
pipe_knn1.fit(X_train, y_train)
pipe_knn1.score(X_train, y_train), pipe_knn1.score(X_test, y_test)



knn_params = {
    'tf__max_features': [None, 1000,2000],
    'tf__min_df': [1,2],
    'tf__stop_words': [None, 'english', stopwords_custom1, stopwords_custom2],
    'tf__ngram_range': [(1, 1), (1, 2)],
    'knn__n_neighbors': range(1, 51, 10),
    'knn__metric': ['euclidean', 'manhattan']
}


pipe_knn2 = GridSearchCV(pipe_knn1, # What model do we want to fit?
                                knn_params, # What is the dictionary of hyperparameters
                                cv = 5, # what number of folds in CV will we use?
                                verbose = 1,
                                n_jobs= -1 )

pipe_knn2.fit(X_train, y_train)
print(pipe_knn2.best_params_)
print(f'Cross Val: {pipe_knn2.best_score_}')
print(f'Training Accuracy: {pipe_knn2.score(X_train, y_train)}')
print(f'Testing Accuracy: {pipe_knn2.score(X_test, y_test)}')

Fitting 5 folds for each of 480 candidates, totalling 2400 fits
{'knn__metric': 'euclidean', 'knn__n_neighbors': 41, 'tf__max_features': None, 'tf__min_df': 1, 'tf__ngram_range': (1, 2), 'tf__stop_words': 'english'}
Cross Val: 0.7500170056685562
Training Accuracy: 0.7596185903847437
Testing Accuracy: 0.7451490298059612


In [41]:
# TFID Vectorizer X Decision Trees

pipe_dt1 = Pipeline([
    ('tf', TfidfVectorizer()),
    ('dt', DecisionTreeClassifier())
])
pipe_dt1.fit(X_train, y_train)
pipe_dt1.score(X_train, y_train), pipe_dt1.score(X_test, y_test)



dt_params = {
    'tf__max_features': [None, 1000,2000],
    'tf__min_df': [1,2],
    'tf__stop_words': [None, 'english', stopwords_custom1, stopwords_custom2],
    'tf__ngram_range': [(1, 1), (1, 2)],
    'dt__max_depth': [None, 2, 3, 5, 7],
    'dt__min_samples_split': [2, 5, 10, 15, 20],
    'dt__min_samples_leaf': range(1, 7),
    
}


pipe_dt2 = GridSearchCV(pipe_dt1, # What model do we want to fit?
                                dt_params, # What is the dictionary of hyperparameters
                                cv = 5, # what number of folds in CV will we use?
                                verbose = 1,
                                n_jobs= -1 )

pipe_dt2.fit(X_train, y_train)
print(pipe_dt2.best_params_)
print(f'Cross Val: {pipe_dt2.best_score_}')
print(f'Training Accuracy: {pipe_dt2.score(X_train, y_train)}')
print(f'Testing Accuracy: {pipe_dt2.score(X_test, y_test)}')

Fitting 5 folds for each of 7200 candidates, totalling 36000 fits
{'dt__max_depth': None, 'dt__min_samples_leaf': 6, 'dt__min_samples_split': 20, 'tf__max_features': 2000, 'tf__min_df': 1, 'tf__ngram_range': (1, 1), 'tf__stop_words': 'english'}
Cross Val: 0.7589519839946648
Training Accuracy: 0.8529705941188238
Testing Accuracy: 0.75375075015003


In [38]:
# TFID Vectorizer X Random Forests

#Trying this with more narrowed down params (V1)


pipe_rf1 = Pipeline([
    ('tf', TfidfVectorizer(max_features=1000, ngram_range=(1,2),stop_words='english', min_df=1)),
    ('rf', RandomForestClassifier())
])
pipe_rf1.fit(X_train, y_train)
pipe_rf1.score(X_train, y_train), pipe_rf1.score(X_test, y_test)



rf_params = {
   # 'tf__max_features': [None, 1000,2000],
   # 'tf__min_df': [0,1,2],
   # 'tf__stop_words': [None, 'english', stopwords_custom1, stopwords_custom2],
   # 'tf__ngram_range': [(1, 1), (1, 2)],
    
    'rf__max_depth': [None, 2, 3, 5, 7],
    'rf__min_samples_split': [2, 5, 10, 15, 20],
    'rf__min_samples_leaf': range(1, 7),
    'rf__n_estimators': range(100, 500, 50),
    'rf__max_depth': [None, 1, 2, 3, 4, 5],
    'rf__random_state':[42]
}


pipe_rf2 = GridSearchCV(pipe_rf1, # What model do we want to fit?
                                rf_params, # What is the dictionary of hyperparameters
                                cv = 5, # what number of folds in CV will we use?
                                verbose = 1,
                                n_jobs= -1 )

pipe_rf2.fit(X_train, y_train)
print(pipe_rf2.best_params_)
print(f'Cross Val: {pipe_rf2.best_score_}')
print(f'Training Accuracy: {pipe_rf2.score(X_train, y_train)}')
print(f'Testing Accuracy: {pipe_rf2.score(X_test, y_test)}')

Fitting 5 folds for each of 1440 candidates, totalling 7200 fits
{'rf__max_depth': None, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 2, 'rf__n_estimators': 400, 'rf__random_state': 42}
Cross Val: 0.7843571857285763
Training Accuracy: 0.8822431152897247
Testing Accuracy: 0.7885577115423085


In [22]:
# TFID Vectorizer X Gradient Boosting 


pipe_gb1 = Pipeline([
    ('tf', TfidfVectorizer()),
    ('gb', GradientBoostingClassifier())
])
pipe_gb1.fit(X_train, y_train)
pipe_gb1.score(X_train, y_train), pipe_gb1.score(X_test, y_test)



gb_params = {
    'tf__max_features': [None, 1000,2000],
    'tf__min_df': [1,2],
    'tf__stop_words': [None, 'english', stopwords_custom1, stopwords_custom2],
    'tf__ngram_range': [(1, 1), (1, 2)],
    
    'gb__n_estimators': range(100, 500, 50),
    'gb__random_state':[42]
}


pipe_gb2 = GridSearchCV(pipe_gb1, # What model do we want to fit?
                                gb_params, # What is the dictionary of hyperparameters
                                cv = 5, # what number of folds in CV will we use?
                                verbose = 1,
                                n_jobs= -1 )

pipe_gb2.fit(X_train, y_train)
print(pipe_gb2.best_params_)
print(f'Cross Val: {pipe_gb2.best_score_}')
print(f'Training Accuracy: {pipe_gb2.score(X_train, y_train)}')
print(f'Testing Accuracy: {pipe_gb2.score(X_test, y_test)}')

Fitting 5 folds for each of 384 candidates, totalling 1920 fits
{'gb__n_estimators': 400, 'gb__random_state': 42, 'tf__max_features': None, 'tf__min_df': 2, 'tf__ngram_range': (1, 1), 'tf__stop_words': 'english'}
Cross Val: 0.7860240969211959
Training Accuracy: 0.8259651930386077
Testing Accuracy: 0.7885577115423085


In [24]:
# Count Vectorizer X Gradient Boosting 


pipe_gb1b = Pipeline([
    ('cv', CountVectorizer()),
    ('gb', GradientBoostingClassifier())
])
pipe_gb1b.fit(X_train, y_train)
pipe_gb1b.score(X_train, y_train), pipe_gb1b.score(X_test, y_test)



gb_b_params = {
    'cv__max_features': [None, 1000],
    'cv__min_df': [1,2],
    'cv__stop_words': [None, 'english', stopwords_custom1, stopwords_custom2],
    'cv__ngram_range': [(1, 1), (1, 2)],
    
    'gb__n_estimators': range(100, 500, 50),
    'gb__random_state':[42]
}


pipe_gb2b = GridSearchCV(pipe_gb1b, # What model do we want to fit?
                               gb_b_params, # What is the dictionary of hyperparameters
                                cv = 5, # what number of folds in CV will we use?
                                verbose = 1,
                                n_jobs= -1 )

pipe_gb2b.fit(X_train, y_train)
print(pipe_gb2b.best_params_)
print(f'Cross Val: {pipe_gb2b.best_score_}')
print(f'Training Accuracy: {pipe_gb2b.score(X_train, y_train)}')
print(f'Testing Accuracy: {pipe_gb2b.score(X_test, y_test)}')

Fitting 5 folds for each of 256 candidates, totalling 1280 fits
{'cv__max_features': 1000, 'cv__min_df': 1, 'cv__ngram_range': (1, 2), 'cv__stop_words': 'english', 'gb__n_estimators': 450, 'gb__random_state': 42}
Cross Val: 0.7898252973213292
Training Accuracy: 0.8166966726678669
Testing Accuracy: 0.7945589117823565


# Results:
* The model that performed the best was the Logistic Regression with a Count Vectorizer, with a testing accuracy of 80.7%. The cross-val score was 80.9%. The second logistic regression with TF-IDF was very close, with a test accuracy of 80.6%.
* In our dataset, Data Science had a baseline accuracy of ~50%, thus we see that our model did better than the baseline (81% vs 50%). 
* The next model that did the best was Gradient Boost with Count Vectorizer at 79.5%. 
* Most of the remaining models performed between 75%-79%. 
* Based on these results, we can see that the two subreddits of Data Science and Analytics have a lot of overlap, with many terms in common. This may be the reason that all of our models hit a ceiling with the best accuracy at 81%.
