In [75]:
# Imports
import pandas as pd
from bs4 import BeautifulSoup
import regex as re

from nltk.tokenize import RegexpTokenizer
import nltk as nltk
from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

In [76]:
# Read in the data
df = pd.read_csv('./data/train.csv')

In [77]:
# Show the first few rows of the data
df.head()

Unnamed: 0,title,comments,age,thread
0,october recommendations suggestions playlists ...,20,20551.811835,1
1,october covers thread,3,20550.578502,1
2,rock hall fame notorious b g whitney houston s...,9,580.078502,1
3,pat benatar invincible,0,106.795169,1
4,kate bush david gilmour running hill live secr...,0,544.745169,1


In [78]:
# Get the shape of the data
df.shape

(1958, 4)

In [81]:
# Sanity check
df.isnull().sum()

title       0
comments    0
age         0
thread      0
dtype: int64

In [80]:
# Drop null values
df.dropna(axis=0, inplace=True)

In [82]:
# Set up model variables X and y
X = df['title']
y = df['thread']

In [83]:
# Split the data into training and testing sets for model creation and performance evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [84]:
# Sanity check
X.loc[X.isnull() == True]

Series([], Name: title, dtype: object)

In [87]:
# Make sure the data is properly stratified
y_test.value_counts(normalize=True)

0    0.509202
1    0.490798
Name: thread, dtype: float64

In [88]:
# Define a custom function to remove a given list of words from a column of a DataFrame.
def remove_given_words(column, given_words):
    
    cv = CountVectorizer(stop_words=given_words)
    words = cv.fit_transform(column)
    df_words = pd.DataFrame(words.toarray(), columns=cv.get_feature_names())
    print(df_words.sum())
    return df_words

In [89]:
# Consider gridsearching over LR parameters
lr = LogisticRegression()
lr.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'warn',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'warn',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [90]:
# Establish a pipeline to function as the model for gridsearching
pipe = Pipeline([
    
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression())
    #('lr', LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5))
    
])


# Set the pipeline parameters that I want gridsearch to vary
pipe_params = {
    
    'cvec__max_features': [100, 500, 1000],
    'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'cvec__stop_words': [None, 'english']#,
    #'lr__C': [1, 1e3, 1e6, 1e9]
    
}

# Instantiate GridSearchCV, fit the model and find the best set of parameters
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=5)
gs.fit(X_train, y_train)
gs_model = gs.best_estimator_
print(f'CVEC/LR Best Accuracy Score: {round(gs.best_score_, 3)}')
print(f'CVEC/LR Training Score: {round(gs_model.score(X_train, y_train), 3)}')
print(f'CVEC/LR Testing Score {round(gs_model.score(X_test, y_test), 3)}')



CVEC/LR Best Accuracy Score: 0.841
CVEC/LR Training Score: 0.916
CVEC/LR Testing Score 0.84


Varying the amount and types of regularization in the model had very little effect on the scores. My original model was chosen as the best because of its simplicity and manageable amount of overfitting.

In [91]:
# Establish a pipeline to function as the model for gridsearching
pipe2 = Pipeline([
    
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5))
    
])

# Set the pipeline parameters that I want gridsearch to vary
pipe_params2 = {
    
    'tfidf__max_features': [100, 500, 1000],
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__stop_words': [None, 'english'],
    'lr__C': [1, 1e3, 1e6, 1e9]
}

# Instantiate GridSearchCV, fit the model and find the best set of parameters
gs2 = GridSearchCV(pipe2, param_grid=pipe_params2, cv=5)
gs2.fit(X_train, y_train)
gs_model2 = gs2.best_estimator_
print(f'TFIDF/LR Best Accuracy Score: {round(gs2.best_score_, 3)}')
print(f'TFIDF/LR Training Score: {round(gs_model2.score(X_train, y_train), 3)}')
print(f'TFIDF/LR Testing Score {round(gs_model2.score(X_test, y_test), 3)}')



TFIDF/LR Best Accuracy Score: 0.819
TFIDF/LR Training Score: 0.951
TFIDF/LR Testing Score 0.836


Varying the amount and types of regularization in the model had a negative effect on the scores but led to a drastic decrease in overfitting. The new model varying the amount of elasticnet regularization was chosen as best.

In [93]:
# Consider gridsearching over BernoulliNB parameters
nb = BernoulliNB()
nb.get_params()

{'alpha': 1.0, 'binarize': 0.0, 'class_prior': None, 'fit_prior': True}

In [94]:
# Establish a pipeline to function as the model for gridsearching
pipe4 = Pipeline([
    
    ('cvec', CountVectorizer()),
    ('nb', BernoulliNB())
    
])

# Set the pipeline parameters that I want gridsearch to vary
pipe_params4 = {
    
    'cvec__max_features': [100, 500, 1000],
    'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'cvec__stop_words': [None, 'english']
}

# Instantiate GridSearchCV, fit the model and find the best set of parameters
gs4 = GridSearchCV(pipe4, param_grid=pipe_params4, cv=5)
gs4.fit(X_train, y_train)
gs_model4 = gs4.best_estimator_
print(f'CVEC/BNB Best Accuracy Score: {round(gs4.best_score_, 3)}')
print(f'CVEC/BNB Training Score: {round(gs_model4.score(X_train, y_train), 3)}')
print(f'CVEC/BNB Testing Score {round(gs_model4.score(X_test, y_test), 3)}')

CVEC/BNB Best Accuracy Score: 0.834
CVEC/BNB Training Score: 0.898
CVEC/BNB Testing Score 0.843


In [95]:
# Establish a pipeline to function as the model for gridsearching
pipe5 = Pipeline([
    
    ('tfidf', TfidfVectorizer()),
    ('nb', BernoulliNB())
    
])

# Set the pipeline parameters that I want gridsearch to vary
pipe_params5 = {
    
    'tfidf__max_features': [100, 500, 1000],
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__stop_words': [None, 'english']
}

# Instantiate GridSearchCV, fit the model and find the best set of parameters
gs5 = GridSearchCV(pipe5, param_grid=pipe_params5, cv=5)
gs5.fit(X_train, y_train)
gs_model5 = gs5.best_estimator_
print(f'TFIDF/BNB Best Accuracy Score: {round(gs5.best_score_, 3)}')
print(f'TFIDF/BNB Training Score: {round(gs_model5.score(X_train, y_train), 3)}')
print(f'TFIDF/BNB Testing Score {round(gs_model5.score(X_test, y_test), 3)}')

TFIDF/BNB Best Accuracy Score: 0.834
TFIDF/BNB Training Score: 0.898
TFIDF/BNB Testing Score 0.843


The data under consideration for this project are in text form, so some form of text vectorization must occur before instantiating any sort of model. I chose to implement pipelines vectorizing the text data first using CountVectorizer and second using TfidfVectorizer. This is a binary classification problem so the first two pipelines used a Logistic Regression (LR) model and functioned as a baseline. Added benefits of using LR include interpretability, accuracy and ease of implementation. The Naive Bayes classifier was then implemented and compared to the results of LR. 

For more information regarding model selection, see README.md. 