In [245]:
# gs_logreg_pipe.best_estimator_.steps[0][1].transformers_[0][1].get_feature_names()

## Imports

In [223]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
import nltk
from nltk.corpus import stopwords
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import make_scorer, recall_score, confusion_matrix

RANDOM_STATE = 42

## Read-In Data

In [2]:
# Load the preprocessed posts and discard the leftover 'Unnamed: 0' column
# in the same expression, so the cell produces its frame in one step.
subreddits = (
    pd.read_csv('../data/subreddits_preprocessed.csv')
    .drop(columns = 'Unnamed: 0')
)

In [3]:
subreddits.head(2)

Unnamed: 0,subreddit,author,num_comments,score,timestamp,original_text,post_length_char,post_length_words,is_unethical,stemmer_text,polarity,sentiment_cat
0,LifeProTips,AlienAgency,2,1,2020-07-17,: Answers to why,16,4,0,: answer to whi,0.0,Neutral
1,LifeProTips,GarbageMiserable0x0,2,1,2020-07-17,¿Quieres obtener juegos y premios gratis en tu...,60,10,0,¿quier obten juego y premio grati en tu tiempo...,0.0,Neutral


## Model Preparation

In a separate set of models, I determined that stemmed text and the Tfidf Vectorizer would be a good choice for my data. Therefore, I will conduct a train test split on the stemmed text and set up a Column Transformer to only vectorize my text data.

### Train Test Split

In [4]:
# Predictor columns: post-level counts/lengths plus the stemmed post text.
features = [
    'num_comments',
    'score',
    'post_length_char',
    'post_length_words',
    'stemmer_text',
]
X = subreddits[features]

# Target column.
y = subreddits['is_unethical']

In [5]:
# Hold out 30% of the rows for testing, stratifying on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    stratify = y,
    random_state = RANDOM_STATE,
)

#### Data for Multinomial Bayes Only

In [6]:
# Text-only predictors (kept as a one-column DataFrame) and the same target
# column, held separately for the Multinomial Naive Bayes model.
y_bayes = subreddits['is_unethical']
X_bayes = subreddits[['stemmer_text']]

In [7]:
# Hold out 30% of the rows for testing. Stratify on y_bayes, the target that
# belongs to this split, instead of the separately defined y, so the cell
# depends only on its own inputs.
X_bayes_train, X_bayes_test, y_bayes_train, y_bayes_test = train_test_split(
    X_bayes,
    y_bayes,
    test_size = 0.3,
    random_state = RANDOM_STATE,
    stratify = y_bayes,
)

### Define Custom Stop Words Hyperparameter for Vectorizer

In [8]:
# NLTK stop words for English followed by Spanish, as one list.
nltk_stopwords = [*stopwords.words('english'), *stopwords.words('spanish')]

# NLTK list extended with additional hand-picked tokens
# (these look like stemmed forms -- TODO confirm against the stemmer output).
custom_stop_words = nltk_stopwords + [
    'someon', 'll', 'like', 'know', 'ulpt',
    'lpt', 'need', 'use', 'make', 'wa',
    'way', 'peopl', 'ask', 'say', 'time',
    'thi', 'want', 'work', 'just', 'start',
    'ha', 'tri', 'becaus', 'onli', 'friend',
]

### Build Column Transformer to Only Apply Vectorizer to Text Features

In [9]:
# Vectorize only the 'stemmer_text' column; every other column is passed
# through unchanged.
tfidf_transformer = ColumnTransformer(
    transformers = [('tfidf', TfidfVectorizer(), 'stemmer_text')],
    remainder = 'passthrough',
)

## Modeling

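Apart from the null model, each model below follows the same process: build a pipeline that starts with the TF-IDF column transformer, search over the pipeline's hyperparameters with 5-fold cross validation, and then evaluate accuracy and sensitivity on the training and testing data.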

### Functions

In [10]:
def display_accuracy_scores(model, xtrain, ytrain, xtest, ytest):
    """Print cross-validation, training and testing scores for ``model``.

    The cross-validation figure is the mean of ``cross_val_score`` called on
    the training data with its default settings. The other two figures come
    from ``model.score`` on the training and testing data respectively.
    Every value is rounded to four decimal places. Nothing is returned.
    """
    cv_mean = cross_val_score(model, xtrain, ytrain).mean()
    print(f'The cross validation accuracy score is {round(cv_mean, 4)}.')

    train_score = model.score(xtrain, ytrain)
    print(f'The training accuracy score is {round(train_score, 4)}.')

    test_score = model.score(xtest, ytest)
    print(f'The testing accuracy score is {round(test_score, 4)}.')

In [11]:
def display_accuracy_scores_gs(model, xtrain, ytrain, xtest, ytest):
    """Print ``model.score`` for the training data and then the testing data.

    Each score is rounded to four decimal places. Unlike
    ``display_accuracy_scores``, no cross-validation is run here.
    Nothing is returned.
    """
    train_accuracy = round(model.score(xtrain, ytrain), 4)
    print(f'The training accuracy score is {train_accuracy}.')

    test_accuracy = round(model.score(xtest, ytest), 4)
    print(f'The testing accuracy score is {test_accuracy}.')

In [12]:
def display_cross_val_gs(model):
    """Print ``model.best_score_`` rounded to four decimal places.

    ``model`` is any object exposing a ``best_score_`` attribute, such as the
    fitted search objects used in this notebook. Nothing is returned.
    """
    best_cv_score = round(model.best_score_, 4)
    print(f'The cross_val score is {best_cv_score}.')

In [13]:
def get_training_sensitivity(actual_values, predicted_values):
    """Print the sensitivity of a set of predictions, labelled as training.

    The flattened confusion matrix is unpacked as (tn, fp, fn, tp), so exactly
    four cells are expected. Sensitivity is tp / (tp + fn), rounded to four
    decimal places. Nothing is returned.
    """
    cells = confusion_matrix(actual_values, predicted_values).ravel()
    _, _, false_negatives, true_positives = cells
    sensitivity = true_positives / (true_positives + false_negatives)
    print(f'The training sensitivity score is {round(sensitivity, 4)}.')

In [14]:
def get_testing_sensitivity(actual_values, predicted_values):
    """Print the sensitivity of a set of predictions, labelled as testing.

    The flattened confusion matrix is unpacked as (tn, fp, fn, tp), so exactly
    four cells are expected. Sensitivity is tp / (tp + fn), rounded to four
    decimal places. Nothing is returned.
    """
    cells = confusion_matrix(actual_values, predicted_values).ravel()
    _, _, false_negatives, true_positives = cells
    sensitivity = true_positives / (true_positives + false_negatives)
    print(f'The testing sensitivity score is {round(sensitivity, 4)}.')

### Model 1: Null Model

In [15]:
# Baseline (null) model: the 'stratified' strategy will respect the training
# class distribution; seeded with RANDOM_STATE.
null = DummyClassifier(strategy = 'stratified', random_state = RANDOM_STATE)

In [16]:
null.fit(X_train, y_train);

#### Evaluate Accuracy and Sensitivity Metrics

*Accuracy*

In [17]:
display_accuracy_scores(null, X_train, y_train, X_test, y_test)

The cross validation accuracy score is 0.501.
The training accuracy score is 0.5051.
The testing accuracy score is 0.5006.


*Sensitivity*

In [18]:
get_training_sensitivity(y_train, null.predict(X_train))

The training sensitivity score is 0.5437.


In [19]:
get_testing_sensitivity(y_test, null.predict(X_test))

The testing sensitivity score is 0.5397.


The null model is making predictions in order to preserve the class distributions of the training set. In order to perform better than the null model, any models that are built must perform better than 50.1% accuracy and ~54.0% sensitivity on the testing data.

### Model 2a: Logistic Regression with No Regularization

#### Create Pipeline

In [20]:
# Pipeline for Model 2a: the shared column transformer followed by a logistic
# regression with no penalty term.
# NOTE(review): newer scikit-learn releases expect `penalty = None` rather than
# the string 'none' -- confirm against the installed version before re-running.
logreg_pipe = Pipeline([
    ('tfidf', tfidf_transformer),
    ('logreg', LogisticRegression(penalty = 'none', solver = 'newton-cg', max_iter = 600))
])

# The model would not converge for the other solvers. Newton-cg can be used to fit larger datasets.

#### Grid Search Over Pipeline

In [269]:
# Only best hyperparameters shown, so this grid holds a single candidate.
logreg_pipe_params = dict(
    tfidf__tfidf__stop_words = [nltk_stopwords],
    tfidf__tfidf__ngram_range = [(1, 1)],
    tfidf__tfidf__min_df = [6],
    tfidf__tfidf__max_df = [0.25],
    tfidf__tfidf__max_features = [750],
)

In [262]:
# 5-fold grid search over the unregularized logistic-regression pipeline.
gs_logreg_pipe = GridSearchCV(
    estimator = logreg_pipe,
    param_grid = logreg_pipe_params,
    cv = 5,
    n_jobs = -1,
    verbose = 1,
)

In [263]:
gs_logreg_pipe.fit(X_train, y_train);

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   25.6s finished


In [268]:
gs_logreg_pipe.best_params_

{'tfidf__tfidf__max_df': 0.25,
 'tfidf__tfidf__max_features': 750,
 'tfidf__tfidf__min_df': 6,
 'tfidf__tfidf__ngram_range': (1, 1),
 'tfidf__tfidf__stop_words': ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'b

#### Evaluate Metrics

*Accuracy*

In [264]:
display_cross_val_gs(gs_logreg_pipe)

The cross_val score is 0.7032.


In [265]:
display_accuracy_scores_gs(model = gs_logreg_pipe, xtrain = X_train, xtest = X_test, ytrain = y_train, ytest = y_test)

The training accuracy score is 0.8404.
The testing accuracy score is 0.7176.


*Sensitivity*

In [267]:
get_training_sensitivity(y_train, gs_logreg_pipe.predict(X_train))

The training sensitivity score is 0.8491.


In [266]:
get_testing_sensitivity(y_test, gs_logreg_pipe.predict(X_test))

The testing sensitivity score is 0.7422.


Although this model performs better than baseline accuracy, it is extremely overfit. This is likely due to the large number of features in the model without any regularization. (Note: In this model, there are 3035 features: 3030 are words and 5 are numerical features.)

### Model 2b: Logistic Regression with Regularization

#### Create Pipeline

In [29]:
# Pipeline for Model 2b: vectorize the text, scale without centering
# (with_mean = False), then fit a liblinear logistic regression.
logreg_reg_pipe = Pipeline(steps = [
    ('tfidf', tfidf_transformer),
    ('ss', StandardScaler(with_mean = False)),
    ('logreg', LogisticRegression(solver = 'liblinear')),
])

#### Grid Search Over Pipeline

In [296]:
# Only best params remain in grid
logreg_reg_pipe_params = dict(
    tfidf__tfidf__stop_words = [custom_stop_words],
    tfidf__tfidf__ngram_range = [(1, 2)],
    tfidf__tfidf__max_df = [0.65],
    tfidf__tfidf__min_df = [2],
    logreg__penalty = ['l2'],
    logreg__C = [0.0001],
)

In [297]:
# 5-fold grid search over the regularized logistic-regression pipeline.
gs_logreg_reg_pipe = GridSearchCV(
    estimator = logreg_reg_pipe,
    param_grid = logreg_reg_pipe_params,
    cv = 5,
    n_jobs = -1,
    verbose = 1,
)

In [298]:
gs_logreg_reg_pipe.fit(X_train, y_train);

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.7s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.0s finished


#### Evaluate Metrics

*Accuracy*

In [299]:
display_cross_val_gs(gs_logreg_reg_pipe)

The cross_val score is 0.7654.


In [300]:
display_accuracy_scores_gs(model = gs_logreg_reg_pipe, xtrain = X_train, xtest = X_test, ytrain = y_train, ytest = y_test)

The training accuracy score is 0.9786.
The testing accuracy score is 0.783.


*Sensitivity*

In [301]:
get_training_sensitivity(y_train, gs_logreg_reg_pipe.predict(X_train))

The training sensitivity score is 0.9915.


In [302]:
get_testing_sensitivity(y_test, gs_logreg_reg_pipe.predict(X_test))

The testing sensitivity score is 0.8434.


### Model 3: Multinomial Naive Bayes

#### Create Pipe

In [37]:
# Pipeline for Model 3: the shared column transformer feeding Multinomial Naive Bayes.
nb_pipe = Pipeline(steps = [
    ('tfidf', tfidf_transformer),
    ('nb', MultinomialNB()),
])

#### Grid Search Over Pipe

In [311]:
# Single-candidate grid for the Naive Bayes pipeline.
nb_pipe_params = dict(
    tfidf__tfidf__stop_words = [nltk_stopwords],
    tfidf__tfidf__max_features = [6500],
    tfidf__tfidf__min_df = [3],
    tfidf__tfidf__max_df = [0.25],
    tfidf__tfidf__ngram_range = [(1, 2)],
)

In [312]:
# 5-fold grid search over the Naive Bayes pipeline.
gs_nb_pipe = GridSearchCV(
    estimator = nb_pipe,
    param_grid = nb_pipe_params,
    cv = 5,
    n_jobs = -1,
    verbose = 1,
)

In [313]:
gs_nb_pipe.fit(X_bayes_train, y_bayes_train);

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.4s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished


In [318]:
gs_nb_pipe.best_params_

{'tfidf__tfidf__max_df': 0.25,
 'tfidf__tfidf__max_features': 6500,
 'tfidf__tfidf__min_df': 3,
 'tfidf__tfidf__ngram_range': (1, 2),
 'tfidf__tfidf__stop_words': ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  '

#### Evaluate Metrics

*Accuracy*

In [314]:
display_cross_val_gs(gs_nb_pipe)

The cross_val score is 0.7705.


In [315]:
display_accuracy_scores_gs(gs_nb_pipe, X_bayes_train, y_bayes_train, X_bayes_test, y_bayes_test)

The training accuracy score is 0.902.
The testing accuracy score is 0.7903.


*Sensitivity*

In [317]:
get_training_sensitivity(y_bayes_train, gs_nb_pipe.predict(X_bayes_train))

The training sensitivity score is 0.953.


In [316]:
get_testing_sensitivity(y_bayes_test, gs_nb_pipe.predict(X_bayes_test))

The testing sensitivity score is 0.8841.


### Model 4: KNN

#### Create Pipe

In [45]:
# Pipeline for Model 4: vectorize the text, apply robust scaling without
# centering, then KNN. In this case, robust scaler performed better than
# standard scaler.
knn_pipe_robust = Pipeline(steps = [
    ('tfidf', tfidf_transformer),
    ('rs', RobustScaler(with_centering = False)),
    ('knn', KNeighborsClassifier()),
])

#### Grid Search Over Pipe

In [176]:
# Only best params remain
knn_pipe_params = dict(
    tfidf__tfidf__stop_words = ['english'],
    tfidf__tfidf__min_df = [8],
    tfidf__tfidf__max_df = [0.45],
    tfidf__tfidf__ngram_range = [(1, 1)],
    knn__n_neighbors = [20],
    knn__metric = ['euclidean'],
    knn__weights = ['distance'],
)

In [177]:
# 5-fold grid search over the KNN pipeline.
gs_knn_pipe_robust = GridSearchCV(
    estimator = knn_pipe_robust,
    param_grid = knn_pipe_params,
    cv = 5,
    n_jobs = -1,
    verbose = 1,
)

In [178]:
gs_knn_pipe_robust.fit(X_train, y_train);

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.4s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished


#### Evaluate Metrics

*Accuracy*

In [166]:
# Report the cross-validation score through the shared helper so the KNN
# section prints it in the same rounded format as every other model.
display_cross_val_gs(gs_knn_pipe_robust)

0.6834779121238228

In [173]:
display_accuracy_scores_gs(gs_knn_pipe_robust, X_train, y_train, X_test, y_test)

The training accuracy score is 1.0.
The testing accuracy score is 0.671.


*Sensitivity*

In [168]:
get_training_sensitivity(y_train, gs_knn_pipe_robust.predict(X_train))

The training sensitivity score is 1.0.


In [174]:
get_testing_sensitivity(y_test, gs_knn_pipe_robust.predict(X_test))

The testing sensitivity score is 0.762.


### Model 5: Decision Tree

#### Create Pipeline

In [53]:
# Pipeline for Model 5: the shared column transformer feeding a decision tree.
# The tree is seeded with RANDOM_STATE so repeated runs of the notebook give
# the same fitted tree and the same scores.
dt_pipe = Pipeline([
    ('tfidf', tfidf_transformer),
    ('dt', DecisionTreeClassifier(random_state = RANDOM_STATE))
])

#### Grid Search Over Pipeline

In [179]:
# Only best params shown
dt_pipe_params = dict(
    tfidf__tfidf__stop_words = ['english'],
    tfidf__tfidf__min_df = [2],
    tfidf__tfidf__max_df = [0.98],
    tfidf__tfidf__ngram_range = [(1, 2)],
    dt__min_samples_split = [7],
    dt__min_samples_leaf = [1],
)

In [180]:
# 5-fold grid search over the decision-tree pipeline.
gs_dt_pipe = GridSearchCV(
    estimator = dt_pipe,
    param_grid = dt_pipe_params,
    cv = 5,
    n_jobs = -1,
    verbose = 1,
)

In [181]:
gs_dt_pipe.fit(X_train, y_train);

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    1.4s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.0s finished


In [186]:
gs_dt_pipe.best_params_

{'dt__min_samples_leaf': 1,
 'dt__min_samples_split': 7,
 'tfidf__tfidf__max_df': 0.98,
 'tfidf__tfidf__min_df': 2,
 'tfidf__tfidf__ngram_range': (1, 2),
 'tfidf__tfidf__stop_words': 'english'}

#### Evaluate Metrics

*Accuracy*

In [182]:
display_cross_val_gs(gs_dt_pipe)

The cross_val score is 0.6815.


In [183]:
display_accuracy_scores_gs(gs_dt_pipe, X_train, y_train, X_test, y_test)

The training accuracy score is 0.982.
The testing accuracy score is 0.675.


*Sensitivity*

In [185]:
get_training_sensitivity(y_train, gs_dt_pipe.predict(X_train))

The training sensitivity score is 0.9852.


In [184]:
get_testing_sensitivity(y_test, gs_dt_pipe.predict(X_test))

The testing sensitivity score is 0.715.


### Model 6: Bagging Classifier

#### Create Pipe

In [206]:
# Pipeline for Model 6: the shared column transformer feeding a seeded
# bagging classifier.
bag_pipe = Pipeline(steps = [
    ('tfidf', tfidf_transformer),
    ('bc', BaggingClassifier(random_state = RANDOM_STATE)),
])

In [209]:
# Candidate values sampled by the randomized search for the bagging pipeline.
bag_pipe_params = dict(
    tfidf__tfidf__stop_words = ['english', nltk_stopwords],
    tfidf__tfidf__min_df = list(range(2, 8)),
    tfidf__tfidf__max_df = [0.7, 0.8, 0.9],
    tfidf__tfidf__ngram_range = [(1, 2)],
    bc__n_estimators = [100],
)

In [210]:
# Randomized 5-fold search over the bagging pipeline. The search is seeded
# with RANDOM_STATE so the same candidates are sampled on every run and the
# reported best parameters and scores are reproducible.
gs_bag_pipe = RandomizedSearchCV(
    bag_pipe,
    param_distributions = bag_pipe_params,
    cv = 5,
    verbose = 1,
    n_jobs = -1,
    random_state = RANDOM_STATE,
)

In [211]:
gs_bag_pipe.fit(X_train, y_train);

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  4.2min finished


In [65]:
gs_bag_pipe.best_params_

{'bc__n_estimators': 20,
 'tfidf__tfidf__max_df': 0.7,
 'tfidf__tfidf__min_df': 3,
 'tfidf__tfidf__ngram_range': (1, 2),
 'tfidf__tfidf__stop_words': 'english'}

#### Evaluate Metrics

*Accuracy*

In [199]:
display_cross_val_gs(gs_bag_pipe)

The cross_val score is 0.7393.


*Sensitivity*

In [201]:
# These are training-set predictions, so use the training helper; the testing
# helper would label this result as a testing score.
get_training_sensitivity(y_train, gs_bag_pipe.predict(X_train))

The testing sensitivity score is 1.0.


In [200]:
get_testing_sensitivity(y_test, gs_bag_pipe.predict(X_test))

The testing sensitivity score is 0.7307.


### Model 7: Random Forest

#### Create Pipe

In [224]:
# Pipeline for Model 7: the shared column transformer feeding a random forest.
# The forest is seeded with RANDOM_STATE so repeated runs of the notebook give
# the same fitted model and the same scores.
rf_pipe = Pipeline([
    ('tfidf', tfidf_transformer),
    ('forest', RandomForestClassifier(random_state = RANDOM_STATE))
])

#### Grid Search Over Pipe

In [226]:
# TODO(review): unfinished cell. As written this is a one-element set literal
# (not a dict), and 'tfidf__tfidf__' is only a parameter-name prefix. It needs
# real `'step__param': [values]` entries before it can be used as a parameter
# grid for rf_pipe.
gs_rf_pipe_params = {
    'tfidf__tfidf__'
}

#### Evaluate Metrics

1.0

0.7784090909090909