## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.compose import ColumnTransformer
import nltk
from nltk.corpus import stopwords
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import make_scorer, recall_score, confusion_matrix

RANDOM_STATE = 42

## Read-In Data

In [2]:
subreddits = pd.read_csv('../data/subreddits_preprocessed.csv')
subreddits.drop(columns = 'Unnamed: 0', inplace = True)

In [3]:
subreddits.head(2)

Unnamed: 0,title,selftext,subreddit,author,num_comments,score,timestamp,original_text,post_length_char,post_length_words,is_unethical,stemmer_text,polarity,sentiment_cat
0,: Answers to why,,LifeProTips,AlienAgency,2,1,2020-07-17,: Answers to why,16,4,0,: answer to whi,0.0,Neutral
1,¿Quieres obtener juegos y premios gratis en tu...,,LifeProTips,GarbageMiserable0x0,2,1,2020-07-17,¿Quieres obtener juegos y premios gratis en tu...,60,10,0,¿quier obten juego y premio grati en tu tiempo...,0.0,Neutral


## Model Preparation

In a separate set of models, I determined that stemmed text and the Tfidf Vectorizer would be a good choice for my data. Therefore, I will conduct a train test split on the stemmed text and set up a Column Transformer to only vectorize my text data.

### Train Test Split

In [4]:
features = ['num_comments', 'score', 'post_length_char', 'post_length_words', 'stemmer_text']
#features = ['stemmer_text']
X = subreddits[features]
y = subreddits['is_unethical']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = RANDOM_STATE, stratify = y)

#### Data for Multinomial Bayes Only

In [6]:
X_bayes = subreddits[['stemmer_text']]
y_bayes = subreddits['is_unethical']

In [7]:
X_bayes_train, X_bayes_test, y_bayes_train, y_bayes_test = train_test_split(X_bayes, y_bayes, test_size = 0.3, random_state = RANDOM_STATE, stratify = y)

### Define Custom Stop Words Hyperparameter for Vectorizer

In [8]:
custom_stop_words = ['someon', 'll', 'like', 'know', 'ulpt', 'lpt', 'need', 'use', 'make', 'wa', 'way', 'peopl', 'ask', 'say', 'time', 'thi', 'want', 'work', 'just', 'start', 'ha', 'tri', 'becaus', 'onli', 'friend']
nltk_stopwords = stopwords.words('english')

### Build Column Transformer to Only Apply Vectorizer to Text Features

In [9]:
tfidf_transformer = ColumnTransformer([
    ('tfidf', TfidfVectorizer(), 'stemmer_text'),], 
    remainder='passthrough')

## Modeling

MARKDOWN TO DESCRIBE THE PROCESS!

### Functions

In [10]:
def display_accuracy_scores(model, xtrain, ytrain, xtest, ytest):
    print(f'The cross validation accuracy score is {round(cross_val_score(model, xtrain, ytrain).mean(),4)}.')
    print(f'The training accuracy score is {round(model.score(xtrain, ytrain),4)}.')
    print(f'The testing accuracy score is {round(model.score(xtest, ytest),4)}.')

In [11]:
def display_accuracy_scores_gs(model, xtrain, ytrain, xtest, ytest):
    print(f'The training accuracy score is {round(model.score(xtrain, ytrain),4)}.')
    print(f'The testing accuracy score is {round(model.score(xtest, ytest),4)}.')

In [12]:
def display_cross_val_gs(model):
    print(f'The cross_val score is {round(model.best_score_, 4)}.')

In [13]:
def get_training_sensitivity(actual_values, predicted_values):
    tn, fp, fn, tp = confusion_matrix(actual_values, predicted_values).ravel()
    print(f'The training sensitivity score is {round(tp/(tp+fn), 4)}.')

In [14]:
def get_testing_sensitivity(actual_values, predicted_values):
    tn, fp, fn, tp = confusion_matrix(actual_values, predicted_values).ravel()
    print(f'The testing sensitivity score is {round(tp/(tp+fn), 4)}.')

### Model 1: Null Model

In [15]:
null = DummyClassifier(strategy = 'stratified', random_state = RANDOM_STATE) # Will respect training class distribution

In [16]:
null.fit(X_train, y_train);

#### Evaluate Accuracy and Sensitivity Metrics

*Accuracy*

In [17]:
display_accuracy_scores(null, X_train, y_train, X_test, y_test)

The cross validation accuracy score is 0.501.
The training accuracy score is 0.5051.
The testing accuracy score is 0.5006.


*Sensitivity*

In [19]:
get_training_sensitivity(y_train, null.predict(X_train))

The training sensitivity score is 0.5437.


In [20]:
get_testing_sensitivity(y_test, null.predict(X_test))

The testing sensitivity score is 0.5397.


The null model is making predictions in order to preserve the class distributions of the training set. In order to perform better than the null model, any models taht are built must perform better than 50.1% accuracy and ~54.0% sensitivity on the testing data.

### Model 2a: Logistic Regression with No Regularization

#### Create Pipeline

In [21]:
logreg_pipe = Pipeline([
    ('tfidf', tfidf_transformer),
    ('logreg', LogisticRegression(penalty = 'none', solver = 'newton-cg', max_iter = 600))
])

# The model would not converge for the other solvers. Newton-cg can be used to fit larger datasets.

#### Grid Search Over Pipeline

In [148]:
logreg_pipe_params = {
    'tfidf__tfidf__stop_words': [nltk_stopwords],
    'tfidf__tfidf__ngram_range': [(1,1), (1,2)],
    'tfidf__tfidf__min_df': [3, 4, 5, 6, 7, 8],
    'tfidf__tfidf__max_df': [0.30, 0.35, 0.40]
}

# Only best hyperparameters shown

In [149]:
gs_logreg_pipe = GridSearchCV(logreg_pipe, 
                              param_grid = logreg_pipe_params, 
                              cv = 5, 
                              verbose = 1, 
                              n_jobs = -1)

In [150]:
gs_logreg_pipe.fit(X_train, y_train);

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   23.2s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  1.9min finished


The best parameters for this model were determined to be a maximum occurrence in the data frame of 0.98, a minimum occurrence of 5, and and ngram range of 1.

In [151]:
gs_logreg_pipe.best_params_

{'tfidf__tfidf__max_df': 0.3,
 'tfidf__tfidf__min_df': 6,
 'tfidf__tfidf__ngram_range': (1, 1),
 'tfidf__tfidf__stop_words': ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'belo

#### Evaluate Accuracy Metric

*Accuracy*

In [131]:
display_cross_val_gs(gs_logreg_pipe)

The cross_val score is 0.7147.


In [123]:
display_accuracy_scores_gs(model = gs_logreg_pipe, xtrain = X_train, xtest = X_test, ytrain = y_train, ytest = y_test)

The training accuracy score is 0.9996.
The testing accuracy score is 0.7265.


*Sensitivity*

In [124]:
get_training_sensitivity(y_train, gs_logreg_pipe.predict(X_train))

The training sensitivity score is 0.9996.


In [125]:
get_testing_sensitivity(y_test, gs_logreg_pipe.predict(X_test))

The testing sensitivity score is 0.7265.


Although this model performs better than baseline accuracy, this model is extremely overfit, but this is likely due to the large number of features in the model without any regularization. (Note: In this model, there are 3035 features (3030 are words and 5 are numerical features).

### Model 2b: Logistic Regression with Regularization

#### Create Pipeline

In [29]:
logreg_reg_pipe = Pipeline([
    ('tfidf', tfidf_transformer),
    ('ss', StandardScaler(with_mean = False)),
    ('logreg', LogisticRegression(solver = 'liblinear'))])

#### Grid Search Over Pipeline

In [111]:
logreg_reg_pipe_params = {
    'tfidf__tfidf__stop_words': [custom_stop_words],
    'tfidf__tfidf__ngram_range': [(1,2)],
    'tfidf__tfidf__max_df': [0.65],
    'tfidf__tfidf__min_df': [2],
    'logreg__penalty': ['l2'],
    'logreg__C': [0.0001]
}
# Only best params remain in grid

In [112]:
gs_logreg_reg_pipe = GridSearchCV(logreg_reg_pipe, param_grid = logreg_reg_pipe_params, cv = 5, verbose=1, n_jobs = -1)

In [113]:
gs_logreg_reg_pipe.fit(X_train, y_train);

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    1.7s finished


In [114]:
gs_logreg_reg_pipe.best_params_

{'logreg__C': 0.0001,
 'logreg__penalty': 'l2',
 'tfidf__tfidf__max_df': 0.65,
 'tfidf__tfidf__min_df': 2,
 'tfidf__tfidf__ngram_range': (1, 2),
 'tfidf__tfidf__stop_words': ['someon',
  'll',
  'like',
  'know',
  'ulpt',
  'lpt',
  'need',
  'use',
  'make',
  'wa',
  'way',
  'peopl',
  'ask',
  'say',
  'time',
  'thi',
  'want',
  'work',
  'just',
  'start',
  'ha',
  'tri',
  'becaus',
  'onli',
  'friend']}

#### Evaluate Accuracy Metric

*Accuracy*

In [115]:
display_cross_val_gs(gs_logreg_reg_pipe)

The cross_val score is 0.7841.


In [116]:
display_accuracy_scores_gs(model = gs_logreg_reg_pipe, xtrain = X_train, xtest = X_test, ytrain = y_train, ytest = y_test)

The training accuracy score is 0.9949.
The testing accuracy score is 0.7875.


*Sensitivity*

In [117]:
get_training_sensitivity(y_train, gs_logreg_reg_pipe.predict(X_train))

The training sensitivity score is 1.0.


In [118]:
get_testing_sensitivity(y_test, gs_logreg_reg_pipe.predict(X_test))

The testing sensitivity score is 0.8455.


### Model 3: Multinomial Naive Bayes

#### Create Pipe

In [37]:
nb_pipe = Pipeline([
    ('tfidf', tfidf_transformer),
    ('nb', MultinomialNB())
])

#### Grid Search Over Pipe

In [161]:
nb_pipe_params = {
    'tfidf__tfidf__stop_words': [custom_stop_words],
    'tfidf__tfidf__max_features': [7000],
    'tfidf__tfidf__min_df': [3],
    'tfidf__tfidf__max_df': [ 0.60],
    'tfidf__tfidf__ngram_range': [(1,2)],
}

In [162]:
gs_nb_pipe = GridSearchCV(nb_pipe, param_grid = nb_pipe_params, cv = 5, verbose = 1, n_jobs = -1)

In [163]:
gs_nb_pipe.fit(X_bayes_train, y_bayes_train);

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.5s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished


#### Evaluate Sensitivity Metric

*Accuracy*

In [165]:
display_cross_val_gs(gs_nb_pipe)

The cross_val score is 0.7827.


In [166]:
display_accuracy_scores_gs(gs_nb_pipe, X_bayes_train, y_bayes_train, X_bayes_test, y_bayes_test)

The training accuracy score is 0.8962.
The testing accuracy score is 0.7983.


*Sensitivity*

In [167]:
get_training_sensitivity(y_bayes_train, gs_nb_pipe.predict(X_bayes_train))

The training sensitivity score is 0.9485.


In [168]:
get_testing_sensitivity(y_bayes_test, gs_nb_pipe.predict(X_bayes_test))

The testing sensitivity score is 0.8789.


### Model 4: KNN

#### Create Pipe

In [46]:
knn_pipe_robust = Pipeline([
    ('tfidf', tfidf_transformer),
    ('rs', RobustScaler(with_centering = False)),
    ('knn', KNeighborsClassifier())
])

# In this case, robust scaler performed better than standard scaler

#### Grid Search Over Pipe

In [231]:
knn_pipe_params = {
    'tfidf__tfidf__stop_words':['english'],
    'tfidf__tfidf__min_df': [8],
    'tfidf__tfidf__max_df': [0.45],
    'tfidf__tfidf__ngram_range': [(1,1)],
    'knn__n_neighbors' : [20],
    'knn__metric':['euclidean'],
    'knn__weights': ['distance']    
}
# Only best params remain

In [232]:
gs_knn_pipe_robust = GridSearchCV(knn_pipe_robust, knn_pipe_params, cv = 5, verbose = 1, n_jobs = -1)

In [233]:
gs_knn_pipe_robust.fit(X_train, y_train);

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.4s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished


#### Evaluate Metrics

*Accuracy*

In [234]:
gs_knn_pipe_robust.best_score_

0.6834779121238228

In [235]:
display_accuracy_scores_gs(gs_knn_pipe_robust, X_train, y_train, X_test, y_test)

The training accuracy score is 1.0.
The testing accuracy score is 0.671.


*Sensitivity*

In [236]:
get_training_sensitivity(y_train, gs_knn_pipe_robust.predict(X_train))

The training sensitivity score is 1.0.


In [237]:
get_testing_sensitivity(y_test, gs_knn_pipe_robust.predict(X_test))

The testing sensitivity score is 0.762.


### Model 5: Decision Tree

#### Create Pipeline

In [238]:
dt_pipe = Pipeline([
    ('tfidf', tfidf_transformer),
    ('dt', DecisionTreeClassifier())
])

#### Grid Search Over Pipeline

In [244]:
dt_pipe_params = {
    'tfidf__tfidf__stop_words': ['english', nltk_stopwords, custom_stop_words],
    'tfidf__tfidf__min_df': [2, 3, 4],
    'tfidf__tfidf__max_df': [0.9, 0.95, 0.98],
    'tfidf__tfidf__ngram_range': [(1,1), (1,2)],
    'dt__min_samples_split': [8, 9, 10],
    'dt__min_samples_leaf': [1, 2, 3]
}

In [245]:
gs_dt_pipe = GridSearchCV(dt_pipe, param_grid = dt_pipe_params, cv = 5, verbose = 1, n_jobs = -1)

In [246]:
gs_dt_pipe.fit(X_train, y_train);

Fitting 5 folds for each of 486 candidates, totalling 2430 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   22.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   54.9s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 2430 out of 2430 | elapsed:  5.1min finished


In [242]:
gs_dt_pipe.best_params_

{'dt__min_samples_leaf': 1,
 'dt__min_samples_split': 9,
 'tfidf__tfidf__max_df': 0.95,
 'tfidf__tfidf__min_df': 2,
 'tfidf__tfidf__ngram_range': (1, 2)}

#### Evaluate Sensitivity Metric

In [247]:
get_testing_sensitivity(y_test, gs_dt_pipe.predict(X_test))

The testing sensitivity score is 0.7098.


In [248]:
get_training_sensitivity(y_train, gs_dt_pipe.predict(X_train))

The training sensitivity score is 0.9812.


In [250]:
gs_dt_pipe.best_score_

0.6973646059237694

In [249]:
gs_dt_pipe.score(X_train, y_train)

0.9768518518518519

In [251]:
gs_dt_pipe.score(X_test, y_test)

0.6829545454545455

### Model 6: Bagging Classifier

#### Create Pipe

In [None]:
bag_pipe = Pipeline([
    ('tfidf', tfidf_transformer),
    ('bc', BaggingClassifier())
])

In [None]:
bag_pipe_params = {
    'tfidf__tfidf__min_df': [2, 3, 4],
    'tfidf__tfidf__max_df': [0.9, 0.95, 0.98],
    'tfidf__tfidf__ngram_range': [(1,1), (1,2)],
}

In [None]:
gs_bag_pipe = GridSearchCV(bag_pipe, param_grid = bag_pipe_params, cv = 5, verbose = 1, n_jobs = -1)

In [None]:
gs_bag_pipe.fit(X_train, y_train);

In [None]:
gs_bag_pipe.best_score_

In [None]:
get_sensitivity(y_test, gs_bag_pipe.predict(X_test))