# Model Preparation, Modeling, and Model Selection

## Imports

In [57]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# Single seed passed as random_state wherever this notebook needs one
# (e.g. the train/test splits), so reruns give the same partitions.
RANDOM_STATE = 42

## Read-In Data

In [2]:
# Load the preprocessed posts and drop the 'Unnamed: 0' column
# (presumably an index saved by an earlier to_csv — confirm).
subreddits = (
    pd.read_csv('../data/subreddits_preprocessed.csv')
      .drop(columns = 'Unnamed: 0')
)

## Model Preparation

When beginning to model the data, I was not sure whether the original, lemmatized, or stemmed text would perform better, or whether a CountVectorizer or a TfidfVectorizer would be the better choice. Because logistic regression is computationally fast, I grid searched over a logistic regression pipeline for each combination (e.g., Original + Count, Original + Tfidf, etc.).

### Train Test Split

In [3]:
# Non-text columns included in every feature set; each variant below
# appends its own text column to this list.
common_features = [
    'num_comments',
    'score',
    'post_length_char',
    'post_length_words',
    'polarity',
]

#### Original Data

In [4]:
# Target first, then the feature frame: shared columns plus the original text
y_original = subreddits['is_unethical']
X_original = subreddits[[*common_features, 'original_text']]

In [5]:
# 70/30 split, stratified on the target so both partitions keep the class balance
(X_original_train, X_original_test,
 y_original_train, y_original_test) = train_test_split(
    X_original,
    y_original,
    test_size = 0.3,
    stratify = y_original,
    random_state = RANDOM_STATE,
)

#### Lemmatized Data

In [6]:
# Target first, then the feature frame: shared columns plus the lemmatized text
y_lemma = subreddits['is_unethical']
X_lemma = subreddits[[*common_features, 'lemma_text']]

In [7]:
# 70/30 split, stratified on the target so both partitions keep the class balance
(X_lemma_train, X_lemma_test,
 y_lemma_train, y_lemma_test) = train_test_split(
    X_lemma,
    y_lemma,
    test_size = 0.3,
    stratify = y_lemma,
    random_state = RANDOM_STATE,
)

#### Stemmed Data

In [8]:
# Target first, then the feature frame: shared columns plus the stemmed text
y_stem = subreddits['is_unethical']
X_stem = subreddits[[*common_features, 'stemmer_text']]

In [9]:
# 70/30 split of the stemmed feature set.
# Stratify on this section's own target (y_stem) rather than y_original, so the
# cell does not depend on the "Original Data" cells having been run first.
X_stem_train, X_stem_test, y_stem_train, y_stem_test = train_test_split(X_stem, y_stem, test_size = 0.3, random_state = RANDOM_STATE, stratify = y_stem)

### Build Column Transformers to Only Apply Vectorizers to Text Features

I used this [source](https://towardsdatascience.com/columntransformer-meets-natural-language-processing-da1f116dd69f) for guidance during this process.

In [10]:
# Count-vectorize only the original text column; all other columns pass through unchanged.
# The column is given as a bare string (not a list) so the vectorizer receives 1-D input.
ct_cvec_original = ColumnTransformer(
    transformers = [('cvec', CountVectorizer(), 'original_text')],
    remainder = 'passthrough',
)

In [11]:
# Count-vectorize only the lemmatized text column; all other columns pass through unchanged.
# The column is given as a bare string (not a list) so the vectorizer receives 1-D input.
ct_cvec_lemma = ColumnTransformer(
    transformers = [('cvec', CountVectorizer(), 'lemma_text')],
    remainder = 'passthrough',
)

In [12]:
# Count-vectorize only the stemmed text column; all other columns pass through unchanged.
# The column is given as a bare string (not a list) so the vectorizer receives 1-D input.
ct_cvec_stemmed = ColumnTransformer(
    transformers = [('cvec', CountVectorizer(), 'stemmer_text')],
    remainder = 'passthrough',
)

In [13]:
# TF-IDF-vectorize only the original text column; all other columns pass through unchanged.
# The column is given as a bare string (not a list) so the vectorizer receives 1-D input.
ct_tfidf_original = ColumnTransformer(
    transformers = [('tfidf', TfidfVectorizer(), 'original_text')],
    remainder = 'passthrough',
)

In [14]:
# TF-IDF-vectorize only the lemmatized text column; all other columns pass through unchanged.
# The column is given as a bare string (not a list) so the vectorizer receives 1-D input.
ct_tfidf_lemma = ColumnTransformer(
    transformers = [('tfidf', TfidfVectorizer(), 'lemma_text')],
    remainder = 'passthrough',
)

In [15]:
# TF-IDF-vectorize only the stemmed text column; all other columns pass through unchanged.
# The column is given as a bare string (not a list) so the vectorizer receives 1-D input.
ct_tfidf_stemmed = ColumnTransformer(
    transformers = [('tfidf', TfidfVectorizer(), 'stemmer_text')],
    remainder = 'passthrough',
)

## Modeling

### Logistic Regression Pipelines

#### Count Vectorizer with Original Text

In [16]:
# Build Pipeline: count-vectorize the original text -> scale -> logistic regression
logr_cv_pipe = Pipeline([
    ('ct_cvec_original', ct_cvec_original),
    # No centering: centering is not supported on sparse matrices, which the
    # vectorizer step is expected to produce.
    ('rs', RobustScaler(with_centering = False)),
    # random_state is honored by the liblinear solver (data shuffling);
    # seeding it keeps reruns reproducible.
    ('logr', LogisticRegression(solver = 'liblinear',
                                max_iter = 200,
                                random_state = RANDOM_STATE))
])

# Make param grid for grid search.
# Every list holds a single value, so the search cross-validates exactly one candidate.
logr_cv_params = {
    'ct_cvec_original__cvec__stop_words': [None],
    'ct_cvec_original__cvec__ngram_range': [(1,2)],
    'ct_cvec_original__cvec__min_df': [2],
    'ct_cvec_original__cvec__max_df': [0.95],
    'logr__penalty' :['l2'],
    'logr__C' : [1]
}

# Instantiate grid search (5-fold cross-validation, progress messages on)
gs_logreg_cv = GridSearchCV(logr_cv_pipe,
                           param_grid = logr_cv_params,
                           verbose = 1,
                           cv = 5)

In [17]:
# Run the grid search on the original-text training split
# (the trailing ';' suppresses the estimator repr in the cell output).
gs_logreg_cv.fit(X_original_train, y_original_train);

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   13.6s finished


In [18]:
# Parameter combination selected by the grid search
gs_logreg_cv.best_params_

{'ct_cvec_original__cvec__max_df': 0.95,
 'ct_cvec_original__cvec__min_df': 2,
 'ct_cvec_original__cvec__ngram_range': (1, 2),
 'ct_cvec_original__cvec__stop_words': None,
 'logr__C': 1,
 'logr__penalty': 'l2'}

In [19]:
# Mean cross-validated score of the selected candidate
gs_logreg_cv.best_score_

0.7817850167886176

In [20]:
# Score on the training split; compare with the CV and test scores to gauge overfitting
gs_logreg_cv.score(X_original_train, y_original_train)

0.9973209936678032

In [21]:
# Score on the held-out test split
gs_logreg_cv.score(X_original_test, y_original_test)

0.7910278250993753

#### Count Vectorizer with Lemma Text

In [22]:
# Build Pipeline: count-vectorize the lemmatized text -> scale -> logistic regression
logr_cv_pipe = Pipeline([
    ('ct_cvec_lemma', ct_cvec_lemma),
    # No centering: centering is not supported on sparse matrices, which the
    # vectorizer step is expected to produce.
    ('rs', RobustScaler(with_centering = False)),
    # random_state is honored by the liblinear solver (data shuffling);
    # seeding it keeps reruns reproducible.
    ('logr', LogisticRegression(solver = 'liblinear',
                                max_iter = 200,
                                random_state = RANDOM_STATE))
])

# Make param grid for grid search.
# Every list holds a single value, so the search cross-validates exactly one candidate.
logr_cv_params = {
    'ct_cvec_lemma__cvec__stop_words': [None],
    'ct_cvec_lemma__cvec__ngram_range': [(1,2)],
    'ct_cvec_lemma__cvec__min_df': [2],
    'ct_cvec_lemma__cvec__max_df': [0.95],
    'logr__penalty' :['l2'],
    'logr__C' : [1]
}

# Instantiate grid search (5-fold cross-validation, progress messages on)
gs_logreg_cv = GridSearchCV(logr_cv_pipe,
                           param_grid = logr_cv_params,
                           verbose = 1,
                           cv = 5)

In [23]:
# Run the grid search on the lemmatized training split
# (the trailing ';' suppresses the estimator repr in the cell output).
gs_logreg_cv.fit(X_lemma_train, y_lemma_train);

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   12.7s finished


In [24]:
# Parameter combination selected by the grid search
gs_logreg_cv.best_params_

{'ct_cvec_lemma__cvec__max_df': 0.95,
 'ct_cvec_lemma__cvec__min_df': 2,
 'ct_cvec_lemma__cvec__ngram_range': (1, 2),
 'ct_cvec_lemma__cvec__stop_words': None,
 'logr__C': 1,
 'logr__penalty': 'l2'}

In [25]:
# Mean cross-validated score of the selected candidate
gs_logreg_cv.best_score_

0.7905512534414444

In [26]:
# Score on the training split; compare with the CV and test scores to gauge overfitting
gs_logreg_cv.score(X_lemma_train, y_lemma_train)

0.997564539698003

In [27]:
# Score on the held-out test split
gs_logreg_cv.score(X_lemma_test, y_lemma_test)

0.7847813742191937

#### Count Vectorizer with Stemmed Text

In [51]:
# Build Pipeline: count-vectorize the stemmed text -> scale -> logistic regression
logr_cv_pipe = Pipeline([
    ('ct_cvec_stemmed', ct_cvec_stemmed),
    # No centering: centering is not supported on sparse matrices, which the
    # vectorizer step is expected to produce.
    ('rs', RobustScaler(with_centering = False)),
    # random_state is honored by the liblinear solver (data shuffling);
    # seeding it keeps reruns reproducible.
    ('logr', LogisticRegression(solver = 'liblinear',
                                max_iter = 200,
                                random_state = RANDOM_STATE))
])

# Make param grid for grid search.
# Every list holds a single value, so the search cross-validates exactly one candidate.
logr_cv_params = {
    'ct_cvec_stemmed__cvec__stop_words': [None],
    'ct_cvec_stemmed__cvec__ngram_range': [(1,2)],
    'ct_cvec_stemmed__cvec__min_df': [2],
    'ct_cvec_stemmed__cvec__max_df': [0.95],
    'logr__penalty' :['l2'],
    'logr__C' : [1]
}

# Instantiate grid search (5-fold cross-validation, progress messages on)
gs_logreg_cv = GridSearchCV(logr_cv_pipe,
                           param_grid = logr_cv_params,
                           verbose = 1,
                           cv = 5)

In [52]:
# Run the grid search on the stemmed training split
# (the trailing ';' suppresses the estimator repr in the cell output).
gs_logreg_cv.fit(X_stem_train, y_stem_train);

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   12.9s finished


In [53]:
# Parameter combination selected by the grid search
gs_logreg_cv.best_params_

{'ct_cvec_stemmed__cvec__max_df': 0.95,
 'ct_cvec_stemmed__cvec__min_df': 2,
 'ct_cvec_stemmed__cvec__ngram_range': (1, 2),
 'ct_cvec_stemmed__cvec__stop_words': None,
 'logr__C': 1,
 'logr__penalty': 'l2'}

In [54]:
# Mean cross-validated score of the selected candidate
gs_logreg_cv.best_score_

0.7864129259018882

In [55]:
# Score on the training split; compare with the CV and test scores to gauge overfitting
gs_logreg_cv.score(X_stem_train, y_stem_train)

0.9978080857282027

In [56]:
# Score on the held-out test split
gs_logreg_cv.score(X_stem_test, y_stem_test)

0.7904599659284497

#### Tfidf Vectorizer with Original Text

In [33]:
# Build Pipeline: TF-IDF-vectorize the original text -> scale -> logistic regression
logr_cv_pipe = Pipeline([
    ('ct_tfidf_original', ct_tfidf_original),
    # No centering: centering is not supported on sparse matrices, which the
    # vectorizer step is expected to produce.
    ('rs', RobustScaler(with_centering = False)),
    # random_state is honored by the liblinear solver (data shuffling);
    # seeding it keeps reruns reproducible.
    ('logr', LogisticRegression(solver = 'liblinear',
                                max_iter = 200,
                                random_state = RANDOM_STATE))
])

# Make param grid for grid search.
# Every list holds a single value, so the search cross-validates exactly one candidate.
logr_cv_params = {
    'ct_tfidf_original__tfidf__stop_words': [None],
    'ct_tfidf_original__tfidf__ngram_range': [(1,2)],
    'ct_tfidf_original__tfidf__min_df': [2],
    'ct_tfidf_original__tfidf__max_df': [0.95],
    'logr__penalty' :['l2'],
    'logr__C' : [10]
}

# Instantiate grid search (5-fold cross-validation, progress messages on)
gs_logreg_cv = GridSearchCV(logr_cv_pipe,
                           param_grid = logr_cv_params,
                           verbose = 1,
                           cv = 5)

In [34]:
# Run the grid search on the original-text training split
# (the trailing ';' suppresses the estimator repr in the cell output).
gs_logreg_cv.fit(X_original_train, y_original_train);

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.3s finished


In [35]:
# Parameter combination selected by the grid search
gs_logreg_cv.best_params_

{'ct_tfidf_original__tfidf__max_df': 0.95,
 'ct_tfidf_original__tfidf__min_df': 2,
 'ct_tfidf_original__tfidf__ngram_range': (1, 2),
 'ct_tfidf_original__tfidf__stop_words': None,
 'logr__C': 10,
 'logr__penalty': 'l2'}

In [36]:
# Mean cross-validated score of the selected candidate
gs_logreg_cv.best_score_

0.7976161052185484

In [37]:
# Score on the held-out test split
# NOTE(review): unlike the other sections, no training-split score is shown here.
gs_logreg_cv.score(X_original_test, y_original_test)

0.7898921067575241

#### Tfidf Vectorizer with Lemma Text

In [38]:
# Build Pipeline: TF-IDF-vectorize the lemmatized text -> scale -> logistic regression
logr_cv_pipe = Pipeline([
    ('ct_tfidf_lemma', ct_tfidf_lemma),
    # No centering: centering is not supported on sparse matrices, which the
    # vectorizer step is expected to produce.
    ('rs', RobustScaler(with_centering = False)),
    # random_state is honored by the liblinear solver (data shuffling);
    # seeding it keeps reruns reproducible.
    ('logr', LogisticRegression(solver = 'liblinear',
                                max_iter = 200,
                                random_state = RANDOM_STATE))
])

# Make param grid for grid search.
# Every list holds a single value, so the search cross-validates exactly one candidate.
logr_cv_params = {
    'ct_tfidf_lemma__tfidf__stop_words': [None],
    'ct_tfidf_lemma__tfidf__ngram_range': [(1,2)],
    'ct_tfidf_lemma__tfidf__min_df': [2],
    'ct_tfidf_lemma__tfidf__max_df': [0.95],
    'logr__penalty' :['l2'],
    'logr__C' : [1000]
}

# Instantiate grid search (5-fold cross-validation, progress messages on)
gs_logreg_cv = GridSearchCV(logr_cv_pipe,
                           param_grid = logr_cv_params,
                           verbose = 1,
                           cv = 5)

In [39]:
# Run the grid search on the lemmatized training split
# (the trailing ';' suppresses the estimator repr in the cell output).
gs_logreg_cv.fit(X_lemma_train, y_lemma_train);

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.9s finished


In [40]:
# Parameter combination selected by the grid search
gs_logreg_cv.best_params_

{'ct_tfidf_lemma__tfidf__max_df': 0.95,
 'ct_tfidf_lemma__tfidf__min_df': 2,
 'ct_tfidf_lemma__tfidf__ngram_range': (1, 2),
 'ct_tfidf_lemma__tfidf__stop_words': None,
 'logr__C': 1000,
 'logr__penalty': 'l2'}

In [41]:
# Mean cross-validated score of the selected candidate
gs_logreg_cv.best_score_

0.7978591178640967

In [42]:
# Score on the training split; compare with the CV and test scores to gauge overfitting
gs_logreg_cv.score(X_lemma_train, y_lemma_train)

0.9995129079396006

In [43]:
# Score on the held-out test split
gs_logreg_cv.score(X_lemma_test, y_lemma_test)

0.7893242475865985

#### Tfidf Vectorizer with Stemmed Text

In [44]:
# Build Pipeline: TF-IDF-vectorize the stemmed text -> scale -> logistic regression
logr_cv_pipe = Pipeline([
    ('ct_tfidf_stemmed', ct_tfidf_stemmed),
    # No centering: centering is not supported on sparse matrices, which the
    # vectorizer step is expected to produce.
    ('rs', RobustScaler(with_centering = False)),
    # random_state is honored by the liblinear solver (data shuffling);
    # seeding it keeps reruns reproducible.
    ('logr', LogisticRegression(solver = 'liblinear',
                                max_iter = 200,
                                random_state = RANDOM_STATE))
])

# Make param grid for grid search.
# Every list holds a single value, so the search cross-validates exactly one candidate.
logr_cv_params = {
    'ct_tfidf_stemmed__tfidf__stop_words': [None],
    'ct_tfidf_stemmed__tfidf__ngram_range': [(1,2)],
    'ct_tfidf_stemmed__tfidf__min_df': [2],
    'ct_tfidf_stemmed__tfidf__max_df': [0.95],
    'logr__penalty' :['l2'],
    'logr__C' : [10]
}

# Instantiate grid search (5-fold cross-validation, progress messages on)
gs_logreg_cv = GridSearchCV(logr_cv_pipe,
                           param_grid = logr_cv_params,
                           verbose = 1,
                           cv = 5)

In [45]:
# Run the grid search on the stemmed training split
# (the trailing ';' suppresses the estimator repr in the cell output).
gs_logreg_cv.fit(X_stem_train, y_stem_train);

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.4s finished


In [46]:
# Parameter combination selected by the grid search
gs_logreg_cv.best_params_

{'ct_tfidf_stemmed__tfidf__max_df': 0.95,
 'ct_tfidf_stemmed__tfidf__min_df': 2,
 'ct_tfidf_stemmed__tfidf__ngram_range': (1, 2),
 'ct_tfidf_stemmed__tfidf__stop_words': None,
 'logr__C': 10,
 'logr__penalty': 'l2'}

In [47]:
# Mean cross-validated score of the selected candidate
gs_logreg_cv.best_score_

0.7968885490663277

In [48]:
# Score on the training split; compare with the CV and test scores to gauge overfitting
gs_logreg_cv.score(X_stem_train, y_stem_train)

0.9985387238188017

In [49]:
# Score on the held-out test split
gs_logreg_cv.score(X_stem_test, y_stem_test)

0.8006814310051107

## Model Selection

In [50]:
# Share of posts per subreddit. The larger share is the accuracy a constant
# "always predict the majority class" model would reach, i.e. the baseline.
# NOTE(review): assumes is_unethical mirrors the subreddit column — confirm.
subreddits['subreddit'].value_counts(normalize = True)

UnethicalLifeProTips    0.54389
LifeProTips             0.45611
Name: subreddit, dtype: float64