# Pre-processing & Modelling

## Import

In [1]:
# Import libraries
import pandas as pd
import string
import re

import nltk
from nltk.corpus import stopwords

import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, classification_report

from imblearn.over_sampling import SMOTE

from xgboost import XGBClassifier

In [2]:
# Import Sheila's df as csv
reddits = pd.read_csv('reddits.csv')

reddits.head()

Unnamed: 0,title,post_text,id,score,total_comments,post_url,subreddit,post_type,title_&_text,title_text_stemmed,title_text_lemmatized,trending
0,Daily Fasting Check-in!,"* **Type** of fast (water, juice, smoking, etc...",16o7z6r,1,2,https://www.reddit.com/r/intermittentfasting/c...,intermittentfasting,hot,Daily Fasting Check-in! * **Type** of fast (wa...,"['daili', 'checkin', 'type', 'fast', 'water', ...","['daily', 'checkin', 'type', 'fast', 'water', ...",2
1,I decided who I wanted to be and I became her 💅🏽,"So a little background: I’m 39, have birthed t...",16ntqoy,1176,36,https://i.redd.it/fclkjnwhmgpb1.jpg,intermittentfasting,hot,I decided who I wanted to be and I became her ...,"['decid', 'want', 'becam', 'littl', 'backgroun...","['decided', 'wanted', 'became', 'little', 'bac...",42336
2,Some photos from a past vacation came up as a ...,I remember being miserable and insecure the en...,16ni914,1505,77,https://www.reddit.com/gallery/16ni914,intermittentfasting,hot,Some photos from a past vacation came up as a ...,"['photo', 'past', 'vacat', 'came', 'memori', '...","['photo', 'past', 'vacation', 'came', 'memory'...",115885
3,"Anybody find IF, lose weight, and then lose mo...",I know I am an idiot.,16nuqx9,198,78,https://www.reddit.com/r/intermittentfasting/c...,intermittentfasting,hot,"Anybody find IF, lose weight, and then lose mo...","['anybodi', 'find', 'lose', 'weight', 'lose', ...","['anybody', 'find', 'lose', 'weight', 'lose', ...",15444
4,2 and a half months of IF,From 234 to 211 in 2.5 months. It works! Once ...,16nuxqs,180,12,https://i.redd.it/30yqmtsdvgpb1.jpg,intermittentfasting,hot,2 and a half months of IF From 234 to 211 in 2...,"['2', 'half', 'month', '234', '211', '25', 'mo...","['2', 'half', 'month', '234', '211', '25', 'mo...",2160


## Pre-processing

In [3]:
# Binarize 'subreddit' for modelling
## 'AnorexiaNervosa' = 0
## 'intermittentfasting' = 1

reddits['subreddit_binarized'] = reddits['subreddit'].map({'AnorexiaNervosa': 0, 'intermittentfasting': 1})

reddits['subreddit_binarized']

0       1
1       1
2       1
3       1
4       1
       ..
2484    0
2485    0
2486    0
2487    0
2488    0
Name: subreddit_binarized, Length: 2489, dtype: int64

In [4]:
# Assemble features (x) and target (y)

X = reddits['title_text_stemmed'].tolist()
y = reddits['subreddit_binarized'].tolist()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Vectorization

From the EDA, **we have decided to use the stemmed texts** instead of lemmatized texts as they provide more insights rather than variations of the same words.

### Functions for the 3 vectorization methods
* Count vectorizer
* N-grams
* TF-IDF

In [5]:
# Define a function for CountVectorizer for unigrams

def cv_unigram(data):
    
    # Instantiate CountVectorizer 
    cv = CountVectorizer()
    
    # X stores the vectorized version of the data
    X = cv.fit_transform(data)
    
    return X, cv

In [6]:
# Define a function for CountVectorizer for bigrams

def cv_bigram(data):
    
    # Instantiate CountVectorizer 
    cv = CountVectorizer(ngram_range=(2,2))
    
    # X stores the vectorized version of the data
    X = cv.fit_transform(data)
    
    return X, cv

In [7]:
# Define a function for TF-IDF

def tfidf(data):
    
    #Instantiate TfidfVectorizer
    tfidf = TfidfVectorizer()
    X = tfidf.fit_transform(data)
    
    return X, tfidf

### Functions for the 3 Naive Bayes (NB) models
* Bernoulli NB
* Multinomial NB
* Gaussian NB

### Function for SMOTE to account for the class imbalance
* r/AnorexiaNervosa: 1588 posts
* r/intermittentfasting: 901 posts

In [8]:
# Define a function for Bernoulli NB

def bernoulli(X_train, X_test, y_train, y_test):

    # Apply SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    # Instantiate the Bernoulli model
    BernNB = BernoulliNB(binarize=0.1)
    
    # Fit the model
    BernNB.fit(X_train_resampled, y_train_resampled)
    
    # Make predictions
    y_pred_train_resampled = BernNB.predict(X_train_resampled)
    y_pred_test = BernNB.predict(X_test)
    
    # Get score
    train_score = accuracy_score(y_train_resampled, y_pred_train_resampled)
    test_score = accuracy_score(y_test, y_pred_test)
    
    return train_score, test_score

In [9]:
# Define a function for Multinomial NB

def multinomial(X_train, X_test, y_train, y_test):
    
    # Apply SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    # Instantiate the Multinomial model
    MultiNB = MultinomialNB()
    
    # Fit the model
    MultiNB.fit(X_train_resampled, y_train_resampled)
    
    # Make predictions
    y_pred_train_resampled = MultiNB.predict(X_train_resampled)
    y_pred_test = MultiNB.predict(X_test)
    
    # Get score
    train_score = accuracy_score(y_train_resampled, y_pred_train_resampled)
    test_score = accuracy_score(y_test, y_pred_test)
    
    return train_score, test_score

In [10]:
# Define a function for Gaussian NB

def gaussian(X_train, X_test, y_train, y_test):
    
    # Apply SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    # Instantiate the Gaussian model
    GausNB = GaussianNB()
    
    # Fit the model
    GausNB.fit(X_train_resampled, y_train_resampled)
    
    # Make predictions
    y_pred_train_resampled = GausNB.predict(X_train_resampled)
    y_pred_test = GausNB.predict(X_test)
    
    # Get score
    train_score = accuracy_score(y_train_resampled, y_pred_train_resampled)
    test_score = accuracy_score(y_test, y_pred_test)
    
    return train_score, test_score

In [11]:
# Define a function for cross-validation

def cross_val(X, y, model):
    
    cross_val = cross_val_score(model, X, y, cv=5)
    
    return cross_val

In [12]:
# Vectorize the data using different methods
X_cv_uni, cv_obj_uni = cv_unigram(X_train)
X_cv_bi, cv_obj_bi = cv_bigram(X_train)
X_tf, tf_obj = tfidf(X_train)

In [14]:
# Train and evaluate the Bernoulli NB model
train_bern_uni_score, test_bern_uni_score = bernoulli(X_cv_uni, cv_obj_uni.transform(X_test), y_train, y_test)
train_bern_bi_score, test_bern_bi_score = bernoulli(X_cv_bi, cv_obj_bi.transform(X_test), y_train, y_test)
train_bern_tf_score, test_bern_tf_score = bernoulli(X_tf, tf_obj.transform(X_test), y_train, y_test)

print('Bernoulli NB:')
print('Unigram:')
print(f'Train: {train_bern_uni_score}, Test: {test_bern_uni_score}')
print()
print('Bigram:')
print(f'Train: {train_bern_bi_score}, Test: {test_bern_bi_score}')
print()
print('TF-IDF:')
print(f'Train: {train_bern_tf_score}, Test: {test_bern_tf_score}')

Bernoulli NB:
Unigram:
Train: 0.8902053712480252, Test: 0.8333333333333334

Bigram:
Train: 0.8143759873617693, Test: 0.44779116465863456

TF-IDF:
Train: 0.9778830963665087, Test: 0.9397590361445783


In [15]:
#Instantiate the Bernoulli model
BernNB = BernoulliNB()

# Cross-validation on the Bernoulli NB model using unigram features
cross_val_uni_scores = cross_val(X_cv_uni, y_train, BernNB)
cross_val_uni_bern = (cross_val(X_cv_uni, y_train, BernNB)).mean()

# Cross-validation on the Bernoulli NB model using bigram features
cross_val_bi_scores = cross_val(X_cv_bi, y_train, BernNB)
cross_val_bi_bern = (cross_val(X_cv_bi, y_train, BernNB)).mean()

# Cross-validation on the Bernoulli NB model using TF-IDF features
cross_val_tf_scores = cross_val(X_tf, y_train, BernNB)
cross_val_tf_bern = (cross_val(X_tf, y_train, BernNB)).mean()

print('Cross-validation:')
print(f'Unigram scores: {cross_val_uni_scores}')
print(f'Unigram mean: {cross_val_uni_bern}')
print()
print(f'Bigram scores: {cross_val_bi_scores}')
print(f'Bigram mean: {cross_val_bi_bern}')
print()
print(f'TF-IDF scores: {cross_val_tf_scores}')
print(f'TF-IDF mean: {cross_val_tf_bern}')

Cross-validation:
Unigram scores: [0.95238095 0.94974874 0.94221106 0.96733668 0.95477387]
Unigram mean: 0.9532902608279492

Bigram scores: [0.64661654 0.67085427 0.65075377 0.66834171 0.67085427]
Bigram mean: 0.6614841122907772

TF-IDF scores: [0.95238095 0.94974874 0.94221106 0.96733668 0.95477387]
TF-IDF mean: 0.9532902608279492


In [16]:
# Train and evaluate the Multinomial NB model
train_multi_uni_score, test_multi_uni_score = multinomial(X_cv_uni, cv_obj_uni.transform(X_test), y_train, y_test)
train_multi_bi_score, test_multi_bi_score = multinomial(X_cv_bi, cv_obj_bi.transform(X_test), y_train, y_test)
train_multi_tf_score, test_multi_tf_score = multinomial(X_tf, tf_obj.transform(X_test), y_train, y_test)

print('Multinomial NB:')
print('Unigram:')
print(f'Train: {train_multi_uni_score}, Test: {test_multi_uni_score}')
print()
print('Bigrams:')
print(f'Train: {train_multi_bi_score}, Test: {test_multi_bi_score}')
print()
print('TF-IDF:')
print(f'Train: {train_multi_tf_score}, Test: {test_multi_tf_score}')

Multinomial NB:
Unigram:
Train: 0.9451026856240127, Test: 0.9518072289156626

Bigrams:
Train: 0.8554502369668247, Test: 0.9397590361445783

TF-IDF:
Train: 0.9826224328593997, Test: 0.9497991967871486


In [17]:
#Instantiate the Multinomial model
MultiNB = MultinomialNB()

# Cross-validation on the Multinomial NB model using unigram features
cross_val_uni_scores = cross_val(X_cv_uni, y_train, MultiNB)
cross_val_uni_multi = (cross_val(X_cv_uni, y_train, MultiNB)).mean()

# Cross-validation on the Multinomial NB model using bigram features
cross_val_bi_scores = cross_val(X_cv_bi, y_train, MultiNB)
cross_val_bi_multi = (cross_val(X_cv_bi, y_train, MultiNB)).mean()

# Cross-validation on the Multinomial NB model using TF-IDF features
cross_val_tf_scores = cross_val(X_tf, y_train, MultiNB)
cross_val_tf_multi = (cross_val(X_tf, y_train, MultiNB)).mean()

print('Cross-validation:')
print(f'Unigram scores: {cross_val_uni_scores}')
print(f'Unigram mean: {cross_val_uni_multi}')
print()
print(f'Bigram scores: {cross_val_bi_scores}')
print(f'Bigram mean: {cross_val_bi_multi}')
print()
print(f'TF-IDF scores: {cross_val_tf_scores}')
print(f'TF-IDF mean: {cross_val_tf_multi}')

Cross-validation:
Unigram scores: [0.95739348 0.96231156 0.93467337 0.95477387 0.95477387]
Unigram mean: 0.9527852294051712

Bigram scores: [0.73684211 0.74623116 0.74371859 0.75879397 0.72613065]
Bigram mean: 0.7423432954244908

TF-IDF scores: [0.87719298 0.85678392 0.84924623 0.87437186 0.85929648]
TF-IDF mean: 0.8633782949836905


In [18]:
# Train and evaluate the Gaussian NB model
train_gaus_uni_score, test_gaus_uni_score = gaussian(X_cv_uni.toarray(), cv_obj_uni.transform(X_test).toarray(), y_train, y_test)
train_gaus_bi_score, test_gaus_bi_score = gaussian(X_cv_bi.toarray(), cv_obj_bi.transform(X_test).toarray(), y_train, y_test)
train_gaus_tf_score, test_gaus_tf_score = gaussian(X_tf.toarray(), tf_obj.transform(X_test).toarray(), y_train, y_test)

print('Gaussian NB:')
print('Unigram:')
print(f'Train: {train_gaus_uni_score}, Test: {test_gaus_uni_score}')
print()
print('Bigram:')
print(f'Train: {train_gaus_bi_score}, Test: {test_gaus_bi_score}')
print()
print('TF-IDF:')
print(f'Train: {train_gaus_tf_score}, Test: {test_gaus_tf_score}')

Gaussian NB:
Unigram:
Train: 0.9514218009478673, Test: 0.8232931726907631

Bigram:
Train: 1.0, Test: 0.8815261044176707

TF-IDF:
Train: 0.9652448657187994, Test: 0.8313253012048193


In [19]:
#Instantiate the Gaussian model
GausNB = GaussianNB()

# Cross-validation on the Gaussian NB model using unigram features
cross_val_uni_scores = cross_val(X_cv_uni.toarray(), y_train, GausNB)
cross_val_uni_gaus = (cross_val(X_cv_uni.toarray(), y_train, GausNB)).mean()

# Cross-validation on the Gaussian NB model using bigram features
cross_val_bi_scores = cross_val(X_cv_bi.toarray(), y_train, GausNB)
cross_val_bi_gaus = (cross_val(X_cv_bi.toarray(), y_train, GausNB)).mean()

# Cross-validation on the Gaussian NB model using TF-IDF features
cross_val_tf_scores = cross_val(X_tf.toarray(), y_train, GausNB)
cross_val_tf_gaus = (cross_val(X_tf.toarray(), y_train, GausNB)).mean()

print('Cross-validation:')
print(f'Unigram scores: {cross_val_uni_scores}')
print(f'Unigram mean: {cross_val_uni_gaus}')
print()
print(f'Bigram scores: {cross_val_bi_scores}')
print(f'Bigram mean: {cross_val_bi_gaus}')
print()
print(f'TF-IDF scores: {cross_val_tf_scores}')
print(f'TF-IDF mean: {cross_val_tf_gaus}')

Cross-validation:
Unigram scores: [0.82957393 0.83919598 0.79648241 0.80150754 0.77889447]
Unigram mean: 0.8091308673694286

Bigram scores: [0.88220551 0.84924623 0.86180905 0.8919598  0.85678392]
Bigram mean: 0.868400901751867

TF-IDF scores: [0.81954887 0.83417085 0.80653266 0.78643216 0.77135678]
TF-IDF mean: 0.8036082668984017


### XGBoost

In [20]:
# Define a function for XGBoost classifier

def xgboost(X_train, X_test, y_train, y_test):
    
    # Apply SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    # Instantiate the XGBoost classifier
    xgb_classifier = XGBClassifier()
    
    # Fit the XGBoost classifier on the resampled training data
    xgb_classifier.fit(X_train_resampled, y_train_resampled)
    
    # Predict the labels for both training and test data
    y_pred_train = xgb_classifier.predict(X_train)
    y_pred_test = xgb_classifier.predict(X_test)
    
    # Get score
    train_score = accuracy_score(y_train, y_pred_train)
    test_score = accuracy_score(y_test, y_pred_test)
    
    return train_score, test_score

In [21]:
# Train and evaluate the XGBoost classifier model
train_xgb_uni_score, test_xgb_uni_score = xgboost(X_cv_uni, cv_obj_uni.transform(X_test), y_train, y_test)
train_xgb_bi_score, test_xgb_bi_score = xgboost(X_cv_bi, cv_obj_bi.transform(X_test), y_train, y_test)
train_xgb_tf_score, test_xgb_tf_score = xgboost(X_tf, tf_obj.transform(X_test), y_train, y_test)

print('XGBoost Classifier:')
print('Unigram:')
print(f'Train: {train_xgb_uni_score}, Test: {test_xgb_uni_score}')
print()
print('Bigram:')
print(f'Train: {train_xgb_bi_score}, Test: {test_xgb_bi_score}')
print()
print('TF-IDF:')
print(f'Train: {train_xgb_tf_score}, Test: {test_xgb_tf_score}')

XGBoost Classifier:
Unigram:
Train: 0.9964841788046208, Test: 0.9377510040160643

Bigram:
Train: 0.8940231039678553, Test: 0.8493975903614458

TF-IDF:
Train: 0.9969864389753893, Test: 0.9337349397590361


In [22]:
# Instantiate the XGBoost Classifier model
xgb_classifier = XGBClassifier()

# Cross-validation on the XGBoost Classifier model using unigram features
cross_val_uni_scores = cross_val(X_cv_uni, y_train, xgb_classifier)
cross_val_uni_xgb = (cross_val(X_cv_uni, y_train, xgb_classifier)).mean()

# Cross-validation on the XGBoost Classifier model using bigram features
cross_val_bi_scores = cross_val(X_cv_bi, y_train, xgb_classifier)
cross_val_bi_xgb = (cross_val(X_cv_bi, y_train, xgb_classifier)).mean()

# Cross-validation on the XGBoost Classifier model using TF-IDF features
cross_val_tf_scores = cross_val(X_tf, y_train, xgb_classifier)
cross_val_tf_xgb = (cross_val(X_tf, y_train, xgb_classifier)).mean()

print('Cross-validation:')
print(f'Unigram scores: {cross_val_uni_scores}')
print(f'Unigram mean: {cross_val_uni_multi}')
print()
print(f'Bigram scores: {cross_val_bi_scores}')
print(f'Bigram mean: {cross_val_bi_multi}')
print()
print(f'TF-IDF scores: {cross_val_tf_scores}')
print(f'TF-IDF mean: {cross_val_tf_multi}')

Cross-validation:
Unigram scores: [0.94235589 0.93467337 0.95226131 0.92713568 0.95477387]
Unigram mean: 0.9527852294051712

Bigram scores: [0.80701754 0.83165829 0.78140704 0.80653266 0.8040201 ]
Bigram mean: 0.7423432954244908

TF-IDF scores: [0.94736842 0.92713568 0.94974874 0.9120603  0.94472362]
TF-IDF mean: 0.8633782949836905


In [23]:
results = {'Model': ['Bernoulli NB', 'Multinomial NB', 'Gaussian NB', 'XGBoost', 
                     'Bernoulli NB', 'Multinomial NB', 'Gaussian NB', 'XGBoost',
                     'Bernoulli NB', 'Multinomial NB', 'Gaussian NB', 'XGBoost'],
          'Vectorizer': ['CountVectorizer', 'CountVectorizer', 'CountVectorizer', 'CountVectorizer', 
                         'N-gram (2,2)', 'N-gram (2,2)', 'N-gram (2,2)', 'N-gram (2,2)', 
                         'TF-IDF', 'TF-IDF', 'TF-IDF', 'TF-IDF'],
          'Cross-validation': [cross_val_uni_bern, cross_val_bi_bern, cross_val_tf_bern, 
                               cross_val_uni_multi, cross_val_bi_multi, cross_val_tf_multi, 
                               cross_val_uni_gaus, cross_val_bi_gaus, cross_val_tf_gaus,
                               cross_val_uni_xgb, cross_val_bi_xgb, cross_val_tf_xgb],
          'Train': [train_bern_uni_score, train_bern_bi_score, train_bern_tf_score,
                    train_multi_uni_score, train_multi_bi_score, train_multi_tf_score,
                    train_gaus_uni_score, train_gaus_bi_score, train_gaus_tf_score,
                    train_xgb_uni_score, train_xgb_bi_score, train_xgb_tf_score],
          'Test': [test_bern_uni_score, test_bern_bi_score, test_bern_tf_score,
                   test_multi_uni_score, test_multi_bi_score, test_multi_tf_score,
                   test_gaus_uni_score, test_gaus_bi_score, test_gaus_tf_score, 
                   test_xgb_uni_score, test_xgb_bi_score, test_xgb_tf_score]
}

results = pd.DataFrame(results)
results

Unnamed: 0,Model,Vectorizer,Cross-validation,Train,Test
0,Bernoulli NB,CountVectorizer,0.95329,0.890205,0.833333
1,Multinomial NB,CountVectorizer,0.661484,0.814376,0.447791
2,Gaussian NB,CountVectorizer,0.95329,0.977883,0.939759
3,XGBoost,CountVectorizer,0.952785,0.945103,0.951807
4,Bernoulli NB,"N-gram (2,2)",0.742343,0.85545,0.939759
5,Multinomial NB,"N-gram (2,2)",0.863378,0.982622,0.949799
6,Gaussian NB,"N-gram (2,2)",0.809131,0.951422,0.823293
7,XGBoost,"N-gram (2,2)",0.868401,1.0,0.881526
8,Bernoulli NB,TF-IDF,0.803608,0.965245,0.831325
9,Multinomial NB,TF-IDF,0.94224,0.996484,0.937751


Train scores are significantly higher than the test scores and cross-validation scores. This indicates **overfitting across all 3 models**.