# Model Preparation, Modeling, and Model Selection

## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

RANDOM_STATE = 42

## Read-In Data

In [2]:
subreddits = pd.read_csv('../data/subreddits_preprocessed.csv')
subreddits.drop(columns = 'Unnamed: 0', inplace = True)

## Model Preparation

### Train Test Split

In [3]:
common_features = ['num_comments', 'score', 'post_length_char', 'post_length_words', 'polarity']

#### Original Data

In [4]:
X_original = subreddits[common_features + ['original_text']]
y_original = subreddits['is_unethical']

In [5]:
X_original_train, X_original_test, y_original_train, y_original_test = train_test_split(X_original, y_original, test_size = 0.3, random_state = RANDOM_STATE, stratify = y_original)

#### Lemmatized Data

In [6]:
X_lemma = subreddits[common_features + ['lemma_text']]
y_lemma = subreddits['is_unethical']

In [7]:
X_lemma_train, X_lemma_test, y_lemma_train, y_lemma_test = train_test_split(X_lemma, y_lemma, test_size = 0.3, random_state = RANDOM_STATE, stratify = y_lemma)

#### Stemmed Data

In [8]:
X_stem = subreddits[common_features + ['stemmer_text']]
y_stem = subreddits['is_unethical']

In [9]:
X_stem_train, X_stem_test, y_stem_train, y_stem_test = train_test_split(X_stem, y_stem, test_size = 0.3, random_state = RANDOM_STATE, stratify = y_original)

### Build Column Transformers to Only Apply Vectorizers to Text Features

I used this [source](https://towardsdatascience.com/columntransformer-meets-natural-language-processing-da1f116dd69f) for guidance during this process.

In [10]:
ct_cvec_original = ColumnTransformer([
    ('cvec', CountVectorizer(), 'original_text'),], 
    remainder='passthrough')

In [11]:
ct_cvec_lemma = ColumnTransformer([
    ('cvec', CountVectorizer(), 'lemma_text'),], 
    remainder='passthrough')

In [12]:
ct_cvec_stemmed = ColumnTransformer([
    ('cvec', CountVectorizer(), 'stemmer_text'),], 
    remainder='passthrough')

In [52]:
ct_tfidf_original = ColumnTransformer([
    ('tfidf', TfidfVectorizer(), 'original_text'),], 
    remainder='passthrough')

In [53]:
ct_tfidf_lemma = ColumnTransformer([
    ('tfidf', TfidfVectorizer(), 'lemma_text'),], 
    remainder='passthrough')

In [54]:
ct_tfidf_stemmed = ColumnTransformer([
    ('tfidf', TfidfVectorizer(), 'stemmer_text'),], 
    remainder='passthrough')

## Modeling

### Logistic Regression Pipelines

#### Count Vectorizer with Original Text

In [16]:
# Build Pipeline
logr_cv_pipe = Pipeline([
    ('ct_cvec_original', ct_cvec_original),
    ('rs', RobustScaler(with_centering = False)),
    ('logr', LogisticRegression(solver = 'liblinear'))
])

# Make param grid for grid search
logr_cv_params = {
    'logr__penalty' :['l1', 'l2'],
    'logr__C' : [0.01, 0.1, 1, 10, 1000]   
}

# Instantiate grid search
gs_logreg_cv = GridSearchCV(logr_cv_pipe,
                           param_grid = logr_cv_params,
                           verbose = 1,
                           cv = 5)

In [17]:
gs_logreg_cv.fit(X_original_train, y_original_train);

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   58.8s finished


In [18]:
gs_logreg_cv.best_params_

{'logr__C': 1, 'logr__penalty': 'l2'}

In [19]:
gs_logreg_cv.best_score_

0.778861159763033

In [20]:
gs_logreg_cv.score(X_original_test, y_original_test)

0.7722884724588303

#### Count Vectorizer with Lemma Text

In [24]:
# Build Pipeline
logr_cv_pipe = Pipeline([
    ('ct_cvec_lemma', ct_cvec_lemma),
    ('rs', RobustScaler(with_centering = False)),
    ('logr', LogisticRegression(solver = 'liblinear'))
])

In [25]:
# Make param grid for grid search
logr_cv_params = {
    'logr__penalty' :['l1', 'l2'],
    'logr__C' : [0.01, 0.1, 1, 10, 1000]   
}

In [26]:
gs_logreg_cv = GridSearchCV(logr_cv_pipe,
                           param_grid = logr_cv_params,
                           verbose = 1,
                           cv = 5)

In [27]:
gs_logreg_cv.fit(X_lemma_train, y_lemma_train);

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   53.6s finished


In [28]:
gs_logreg_cv.best_params_

{'logr__C': 1, 'logr__penalty': 'l2'}

In [29]:
gs_logreg_cv.best_score_

0.7820295112185899

In [30]:
gs_logreg_cv.score(X_lemma_test, y_lemma_test)

0.7643384440658717

#### Count Vectorizer with Stemmed Text

In [35]:
# Build Pipeline
logr_cv_pipe = Pipeline([
    ('ct_cvec_stemmed', ct_cvec_stemmed),
    ('rs', RobustScaler(with_centering = False)),
    ('logr', LogisticRegression(solver = 'liblinear'))
])

In [36]:
# Make param grid for grid search
logr_cv_params = {
    'logr__penalty' :['l1', 'l2'],
    'logr__C' : [0.01, 0.1, 1, 10, 1000]   
}

In [37]:
gs_logreg_cv = GridSearchCV(logr_cv_pipe,
                           param_grid = logr_cv_params,
                           verbose = 1,
                           cv = 5)

In [39]:
gs_logreg_cv.fit(X_stem_train, y_stem_train);

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   45.1s finished


In [41]:
gs_logreg_cv.best_params_

{'logr__C': 1, 'logr__penalty': 'l2'}

In [42]:
gs_logreg_cv.best_score_

0.7783763198994758

In [44]:
gs_logreg_cv.score(X_stem_test, y_stem_test)

0.7791027825099376

#### Tfidf Vectorizer with Original Text

In [57]:
# Build Pipeline
logr_cv_pipe = Pipeline([
    ('ct_tfidf_original', ct_tfidf_original),
    ('rs', RobustScaler(with_centering = False)),
    ('logr', LogisticRegression(solver = 'liblinear', max_iter = 200))
])

# Make param grid for grid search
logr_cv_params = {
    'logr__penalty' :['l1', 'l2'],
    'logr__C' : [0.01, 0.1, 1, 10, 1000]   
}

# Instantiate grid search
gs_logreg_cv = GridSearchCV(logr_cv_pipe,
                           param_grid = logr_cv_params,
                           verbose = 1,
                           cv = 5)

In [58]:
gs_logreg_cv.fit(X_original_train, y_original_train);

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   57.4s finished


In [59]:
gs_logreg_cv.best_params_

{'logr__C': 10, 'logr__penalty': 'l2'}

In [60]:
gs_logreg_cv.best_score_

0.7924983181746788

In [61]:
gs_logreg_cv.score(X_original_test, y_original_test)

0.7836456558773425

#### Tfidf Vectorizer with Lemma Text

In [70]:
# Build Pipeline
logr_cv_pipe = Pipeline([
    ('ct_tfidf_lemma', ct_tfidf_lemma),
    ('rs', RobustScaler(with_centering = False)),
    ('logr', LogisticRegression(solver = 'liblinear', max_iter = 200))
])

# Make param grid for grid search
logr_cv_params = {
    'logr__penalty' :['l1', 'l2'],
    'logr__C' : [0.01, 0.1, 1, 10, 1000]   
}

# Instantiate grid search
gs_logreg_cv = GridSearchCV(logr_cv_pipe,
                           param_grid = logr_cv_params,
                           verbose = 1,
                           cv = 5)

In [71]:
gs_logreg_cv.fit(X_lemma_train, y_lemma_train);

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   55.4s finished


In [72]:
gs_logreg_cv.best_params_

{'logr__C': 10, 'logr__penalty': 'l2'}

In [73]:
gs_logreg_cv.best_score_

0.7934751104670288

In [74]:
gs_logreg_cv.score(X_lemma_test, y_lemma_test)

0.7768313458262351

#### Tfidf Vectorizer with Stemmed Text

In [75]:
# Build Pipeline
logr_cv_pipe = Pipeline([
    ('ct_tfidf_stemmed', ct_tfidf_stemmed),
    ('rs', RobustScaler(with_centering = False)),
    ('logr', LogisticRegression(max_iter = 200))
])

# Make param grid for grid search
logr_cv_params = {
    'logr__penalty' :['l1', 'l2'],
    'logr__C' : [0.01, 0.1, 1, 10, 1000]   
}

# Instantiate grid search
gs_logreg_cv = GridSearchCV(logr_cv_pipe,
                           param_grid = logr_cv_params,
                           verbose = 1,
                           cv = 5)

In [77]:
gs_logreg_cv.fit(X_stem_train, y_stem_train);

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   46.0s finished


In [82]:
gs_logreg_cv.best_params_

{'logr__C': 10, 'logr__penalty': 'l2'}

In [78]:
gs_logreg_cv.best_score_

0.7942062229018673

In [81]:
gs_logreg_cv.score(X_stem_test, y_stem_test)

0.7779670641680864

## Model Selection