In [2]:
import pandas as pd
import nltk
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [3]:
df = pd.read_csv("clean_table.csv")

In [4]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,subreddit,body
8042,9041,1,Women look at porn too lmao
11107,12436,1,"You can get the internet in the Arctic, just a..."
2531,2860,1,What do you suggest to bring allies to the cause?
36293,39917,0,"Uh, what?"
27231,30098,0,Best place to start is to go back the source o...
21458,23909,0,Don't feel bad about eating humanely raised eg...
20685,23085,0,yes! also would note a lot of foods should be ...
33,35,1,Context?
9726,10920,1,Second this. Few ideas:\n1. Find some soyrizo ...
33437,36825,0,gonna nut


In [5]:
df.drop(columns=['Unnamed: 0'], inplace=True)

In [6]:
df.sample(10)

Unnamed: 0,subreddit,body
7705,1,This is why I say people suck on a regular bas...
1264,1,Costco sells a vegan source of EPA/DHA.\n\nAma...
17278,1,Street by street block by block! Great pic. EC...
33874,0,"For frozen pizza, maybe, but for cheese off th..."
118,1,Just took a glance at your comment history and...
25725,0,"His street corn salad recipe is so so good, I ..."
11440,1,It’s called district eat play! It’s an arcade ...
10183,1,the fact that i’ve probably eaten a few bug le...
14055,1,u gotta freeze them and squeeze them to get th...
29672,0,"Aww geez, yes the girl I was with got sick and..."


In [7]:
X = df['body']
y = df['subreddit']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [9]:
#LOGISTIC REGRESSION
logr_model = make_pipeline(CountVectorizer(stop_words='english'),
                           LogisticRegression(solver='lbfgs', multi_class='auto', random_state=42))

cv_scores = cross_val_score(logr_model, X_train, y_train, cv=5, scoring='roc_auc')

logr_model.fit(X_train, y_train)
y_pred = logr_model.predict(X_test)

print('Logistic regression model')
print('Train score: {}, Test score {}'.format(round(cv_scores.mean(), 4), round(roc_auc_score(y_test, y_pred), 4)))
print('Number of features: {}'.format(len(logr_model.named_steps.countvectorizer.get_feature_names())))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic regression model
Train score: 0.8376, Test score 0.7606
Number of features: 27368


In [15]:
#MULTINOMAIL NAIVE BAYES CLASSIFIER
mnb_model = make_pipeline(CountVectorizer(stop_words='english'), MultinomialNB())

cv_scores = cross_val_score(mnb_model, X_train, y_train, cv=5, scoring='roc_auc')

mnb_model.fit(X_train, y_train)
y_pred = mnb_model.predict(X_test)

print('Multinomial naive Bayes classifier')
print('Train score: {}, Test score {}'.format(round(cv_scores.mean(), 5), round(roc_auc_score(y_test, y_pred), 5)))
print('Number of features: {}'.format(len(mnb_model.named_steps.countvectorizer.get_feature_names())))

Multinomial naive Bayes classifier
Train score: 0.82645, Test score 0.75704
Number of features: 27368


In [11]:
#parameters for vectorizer
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('tfidf', None),
    ('mnb', MultinomialNB())
])

params = dict(
    cvec__stop_words = [None, 'english'],
    cvec__max_df = (0.5, 0.75, 1.0),
    cvec__ngram_range = [(1, 1), (2, 2)],
    tfidf = [None, TfidfTransformer(use_idf=True), TfidfTransformer(use_idf=False)]
)

gs = GridSearchCV(pipe, params, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)
gs.fit(X_train, y_train)
print("Best params: {}".format(gs.best_params_))
print('Train score: {}, Test score {}'.format(round(gs.best_score_, 4), round(gs.best_estimator_.score(X_test, y_test), 4)))
print('Number of features: {}'.format(len(gs.best_estimator_.named_steps.cvec.get_feature_names())))

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best params: {'cvec__max_df': 0.5, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english', 'tfidf': TfidfTransformer(use_idf=False)}
Train score: 0.8388, Test score 0.7575
Number of features: 27368


In [12]:
# Tune hyperparameter for multinomial naive Bayes
params = dict(
    cvec__stop_words = ['english'],
    tfidf = [TfidfTransformer(use_idf=True)],
    mnb__alpha = np.linspace(0.1, 1.0, 20)
)

gs = GridSearchCV(pipe, params, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)
gs.fit(X_train, y_train)
print("Best params: {}".format(gs.best_params_))
print('Train score: {}, Test score {}'.format(round(gs.best_score_, 4), round(gs.best_estimator_.score(X_test, y_test), 4)))
print('Number of features: {}'.format(len(gs.best_estimator_.named_steps.cvec.get_feature_names())))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best params: {'cvec__stop_words': 'english', 'mnb__alpha': 0.7631578947368421, 'tfidf': TfidfTransformer()}
Train score: 0.8342, Test score 0.7548
Number of features: 27368


In [24]:
# Generate a confusion matrix LOGISTIC REGRESSION
y_pred = logr_model.predict(X_test)

conclusion_df = pd.DataFrame(confusion_matrix(y_test, y_pred),
                             columns=['predict /r/vegetarian', 'predict /r/vegan'],
                             index=['actual /r/vegetarian', 'actual /r/vegan'])
conclusion_df

Unnamed: 0,predict /r/vegetarian,predict /r/vegan
actual /r/vegetarian,3520,1136
actual /r/vegan,1042,3395


In [25]:
# Examine some classification metrics 
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print('Accuracy: {}'.format(round((tp+tn)/(tp+fp+tn+fn),4)))
print('Misclassification rate: {}'.format(round((fp+fn)/(tp+fp+tn+fn),4)))
print('Precision: {}'.format(round(tp/(tp+fp),4)))
print('Recall: {}'.format(round(tp/(tp+fn),4)))
print('Specificity: {}'.format(round(tn/(tn+fp),4)))

Accuracy: 0.7605
Misclassification rate: 0.2395
Precision: 0.7493
Recall: 0.7652
Specificity: 0.756


In [26]:
# Generate a confusion matrix MULTINOMAIL NAIVE BAYES CLASSIFIER
y_pred = mnb_model.predict(X_test)

conclusion_df = pd.DataFrame(confusion_matrix(y_test, y_pred),
                             columns=['predict /r/vegetarian', 'predict /r/vegan'],
                             index=['actual /r/vegetarian', 'actual /r/vegan'])
conclusion_df

Unnamed: 0,predict /r/vegetarian,predict /r/vegan
actual /r/vegetarian,3573,1083
actual /r/vegan,1124,3313


In [27]:
# Examine some classification metrics 
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print('Accuracy: {}'.format(round((tp+tn)/(tp+fp+tn+fn),4)))
print('Misclassification rate: {}'.format(round((fp+fn)/(tp+fp+tn+fn),4)))
print('Precision: {}'.format(round(tp/(tp+fp),4)))
print('Recall: {}'.format(round(tp/(tp+fn),4)))
print('Specificity: {}'.format(round(tn/(tn+fp),4)))

Accuracy: 0.7573
Misclassification rate: 0.2427
Precision: 0.7536
Recall: 0.7467
Specificity: 0.7674


#### CONCLUSION

- Our Logistic Regression model had an accuracy metric of 76.06% and naive Bayes classifier had an accuracy metric of 75.73%. Misclassification rate of Logistic Regression model is 23.95% and for naive Bayes classifier 24.27%. We definetly had a better perfomance with our Logistic Regression model. Nevertheless to get a better perfomance, I would optimize stop words, or I use a random forest classifier. 