In [26]:
# Imports
import pandas as pd
from bs4 import BeautifulSoup
import regex as re

from nltk.tokenize import RegexpTokenizer
import nltk as nltk
from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

In [27]:
# Read in the data
df = pd.read_csv('./data/train_round2.csv')

In [28]:
# Show the first few rows of the data
df.head()

Unnamed: 0,title,comments,age,thread
0,funkadelic maggot brain,16,841.220417,1
1,motorhead motorhead live germany,0,289.60375,1
2,bands prefer,5,125.55375,1
3,anyone special affinity towards lengthier songs,12,654.987084,1
4,jebediah jerks attention,0,181.837084,1


In [29]:
# Double check mapping function output
df['thread'].value_counts()

0    996
1    993
Name: thread, dtype: int64

In [30]:
# Get the shape of the data
df.shape

(1989, 4)

In [31]:
# Sanity check
df.isnull().sum()

title       5
comments    0
age         0
thread      0
dtype: int64

In [32]:
# Drop nulls 
df.dropna(inplace=True)

In [33]:
# Double/Triple check
df.isnull().sum()

title       0
comments    0
age         0
thread      0
dtype: int64

In [34]:
# Drop null values
#df.dropna(axis=0, inplace=True)

In [35]:
# Set up model variables X and y
X = df['title']
y = df['thread']

In [36]:
# Split the data into training and testing sets for model creation and performance evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y) # Stratify splits data according to the original label proportions provided

In [37]:
# Sanity check
X.loc[X.isnull() == True]

Series([], Name: title, dtype: object)

In [38]:
# Make sure the data is properly stratified by comparing the train test split proportions
y_test.value_counts(normalize=True)

1    0.5
0    0.5
Name: thread, dtype: float64

In [39]:
# with the original proportions. 
df['thread'].value_counts(normalize=True) # The percentages appear to be approximately equal.

0    0.501008
1    0.498992
Name: thread, dtype: float64

In [40]:
# Define a custom function to remove a given list of words from a dataframe column.
def remove_given_words(column, given_words):
    
    cv = CountVectorizer(stop_words=given_words)
    words = cv.fit_transform(column)
    df_words = pd.DataFrame(words.toarray(), columns=cv.get_feature_names())
    print(df_words.sum())
    return df_words

In [41]:
# Consider gridsearching over logistic regression parameters
lr = LogisticRegression()
lr.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'warn',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'warn',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [46]:
# Establish a pipeline to contain transformers and a final estimator
pipe = Pipeline([
    
    ('cvec', CountVectorizer()), # Converts a collection of text into a matrix of token counts
    ('lr', LogisticRegression()) # For predicting binary class labels
    #('lr', LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5))
    
])

# The elasticnet is a regularized expression method that linearly combines the L1 and L2 Lasso and Ridge methods.
# The saga solver supports the elasticnet regularizaion penalty
# l1_ratio sets the proportion of L1 and L2 regularization. Setting it to 0.5 applies equal amounts of L1 and L2 regularization.

# Set the pipeline parameters that I want gridsearch to vary
pipe_params = {
    
    'cvec__max_features': [100, 500, 1000], # Consider the first 100, 500 and 1000 words
    'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)], # Consider 1, 2 and 3 grams
    'cvec__stop_words': [None, 'english'] # Build the model with no stopwords and with English stopwords
    #'lr__C': [1, 1e3, 1e6, 1e9] # Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.
    
}

# Instantiate GridSearchCV, fit the model and find the best set of parameters
gs = GridSearchCV(pipe, param_grid=pipe_params, cv=5)
gs.fit(X_train, y_train)
gs_model = gs.best_estimator_

# Display relevant model scores
print(f'CVEC/LR Best Accuracy Score: {round(gs.best_score_, 3)}')
print(f'CVEC/LR Training Score: {round(gs_model.score(X_train, y_train), 3)}')
print(f'CVEC/LR Testing Score: {round(gs_model.score(X_test, y_test), 3)}')
print(f'Score Drop: {round(round(gs_model.score(X_train, y_train), 3) - round(gs_model.score(X_test, y_test), 3), 3)}')







CVEC/LR Best Accuracy Score: 0.8
CVEC/LR Training Score: 0.938
CVEC/LR Testing Score: 0.792
Score Drop: 0.146




Varying the amount and types of regularization in the model had very little effect on the scores. My original model was chosen as the best because of its simplicity and manageable amount of overfitting.

In [48]:
# Establish a pipeline to contain transformers and a final estimator
pipe2 = Pipeline([
    
    ('tfidf', TfidfVectorizer()), # TFIDF is a statistical measure of the importance of a specific word to a corpus
    ('lr', LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5)) # For predicting binary class labels
    
])

# The elasticnet is a regularized expression method that linearly combines the L1 and L2 Lasso and Ridge methods.
# The saga solver supports the elasticnet regularizaion penalty
# l1_ratio sets the proportion of L1 and L2 regularization. Setting it to 0.5 applies equal amounts of L1 and L2 regularization.

# Set the pipeline parameters that I want gridsearch to vary
pipe_params2 = {
    
    'tfidf__max_features': [100, 500, 1000], # Consider the first 100, 500 and 1000 words
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)], # Consider 1, 2 and 3 grams
    'tfidf__stop_words': [None, 'english'], # Build the model with no stopwords and with English stopwords
    'lr__C': [1, 1e3, 1e6, 1e9] # Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.
}

# Instantiate GridSearchCV, fit the model and find the best set of parameters
gs2 = GridSearchCV(pipe2, param_grid=pipe_params2, cv=5)
gs2.fit(X_train, y_train)
gs_model2 = gs2.best_estimator_

# Display relevant model scores
print(f'TFIDF/LR Best Accuracy Score: {round(gs2.best_score_, 3)}')
print(f'TFIDF/LR Training Score: {round(gs_model2.score(X_train, y_train), 3)}')
print(f'TFIDF/LR Testing Score: {round(gs_model2.score(X_test, y_test), 3)}')
print(f'Score Drop: {round(round(gs_model2.score(X_train, y_train), 3) - round(gs_model2.score(X_test, y_test), 3), 3)}')















TFIDF/LR Best Accuracy Score: 0.797
TFIDF/LR Training Score: 0.973
TFIDF/LR Testing Score: 0.786
Score Drop: 0.187




Varying the amount and types of regularization in the model had a negative effect on the scores but led to a drastic decrease in overfitting. The new model varying the amount of elasticnet regularization was chosen as best.

In [93]:
# Consider gridsearching over BernoulliNB parameters
nb = BernoulliNB()
nb.get_params()

{'alpha': 1.0, 'binarize': 0.0, 'class_prior': None, 'fit_prior': True}

In [49]:
# Establish a pipeline to contain transformers and a final estimator
pipe4 = Pipeline([
    
    ('cvec', CountVectorizer()), # Converts a collection of text into a matrix of token counts
    ('nb', BernoulliNB()) # Bernoulli Naive Bayes is used to predict a binary outcome
    
])

# Set the pipeline parameters that I want gridsearch to vary
pipe_params4 = {
    
    'cvec__max_features': [100, 500, 1000], # Consider the first 100, 500 and 1000 words
    'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)], # Consider 1, 2 and 3 grams
    'cvec__stop_words': [None, 'english'] # Build the model with no stopwords and with English stopwords
}

# Instantiate GridSearchCV, fit the model and find the best set of parameters
gs4 = GridSearchCV(pipe4, param_grid=pipe_params4, cv=5)
gs4.fit(X_train, y_train)
gs_model4 = gs4.best_estimator_

# Display relevant model scores
print(f'CVEC/BNB Best Accuracy Score: {round(gs4.best_score_, 3)}')
print(f'CVEC/BNB Training Score: {round(gs_model4.score(X_train, y_train), 3)}')
print(f'CVEC/BNB Testing Score {round(gs_model4.score(X_test, y_test), 3)}')
print(f'Score Drop: {round(round(gs_model4.score(X_train, y_train), 3) - round(gs_model4.score(X_test, y_test), 3), 3)}')

CVEC/BNB Best Accuracy Score: 0.79
CVEC/BNB Training Score: 0.894
CVEC/BNB Testing Score 0.798
Score Drop: 0.096


In [50]:
# Establish a pipeline to contain transformers and a final estimator
pipe5 = Pipeline([
    
    ('tfidf', TfidfVectorizer()), # TFIDF is a statistical measure of the importance of a specific word to a corpus
    ('nb', BernoulliNB()) # Bernoulli Naive Bayes is used to predict a binary outcome
    
])

# Set the pipeline parameters that I want gridsearch to vary
pipe_params5 = {
    
    'tfidf__max_features': [100, 500, 1000], # Consider the first 100, 500 and 1000 words
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)], # Consider 1, 2 and 3 grams
    'tfidf__stop_words': [None, 'english'] # Build the model with no stopwords and with English stopwords
}

# Instantiate GridSearchCV, fit the model and find the best set of parameters
gs5 = GridSearchCV(pipe5, param_grid=pipe_params5, cv=5)
gs5.fit(X_train, y_train)
gs_model5 = gs5.best_estimator_

# Display relevant model scores
print(f'TFIDF/BNB Best Accuracy Score: {round(gs5.best_score_, 3)}')
print(f'TFIDF/BNB Training Score: {round(gs_model5.score(X_train, y_train), 3)}')
print(f'TFIDF/BNB Testing Score {round(gs_model5.score(X_test, y_test), 3)}')
print(f'Score Drop: {round(round(gs_model5.score(X_train, y_train), 3) - round(gs_model5.score(X_test, y_test), 3), 3)}')

TFIDF/BNB Best Accuracy Score: 0.79
TFIDF/BNB Training Score: 0.894
TFIDF/BNB Testing Score 0.798
Score Drop: 0.096


The data under consideration for this project are in text form, so some form of text vectorization must occur before instantiating any sort of model. I chose to implement pipelines vectorizing the text data first using CountVectorizer and second using TfidfVectorizer. This is a binary classification problem so the first two pipelines used a Logistic Regression (LR) model and functioned as a baseline. Added benefits of using LR include interpretability, accuracy and ease of implementation. The Naive Bayes classifier was then implemented and compared to the results of LR. 

For more information regarding model selection, see README.md. 