<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# NLP & Classification Project
---

## import and Cleaning data

In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Classifiers
from sklearn.naive_bayes import MultinomialNB 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# NLP tools
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [62]:
stem = pd.read_csv('subreddit_pepsi_vs_cocacola.csv')

In [63]:
stem.shape

(1958, 8)

In [64]:
stem.head()

Unnamed: 0,title,score,id,url,comms_num,created,body,is_pepsi
0,Wall clock.,17,1godhom,https://www.reddit.com/gallery/1godhom,1,11/11/2024 5:58,I'm trying to locate a value for this clock. I...,0
1,"Happy Veterans Day!, Coca Cola poster sealed d...",4,1gojyxe,https://i.redd.it/a3xsqwwaa70e1.jpeg,0,11/11/2024 11:31,,0
2,What do yall like better,30,1go7cpq,https://www.reddit.com/gallery/1go7cpq,85,11/11/2024 1:33,Diet coke or coke and why,0
3,Every drink company cola has owned and owns an...,0,1goc0ni,https://miro.com/app/board/uXjVM_YVujs=/,1,11/11/2024 4:53,,0
4,Wall thermometer,22,1gnaebp,https://i.redd.it/13cyo811rvzd1.jpeg,4,11/9/2024 20:44,Vintage wall thermometer my Dad had.,0


In [65]:
stem.isnull().sum()

title          0
score          0
id             0
url            0
comms_num      0
created        0
body         713
is_pepsi       0
dtype: int64

In [66]:
stem.shape

(1958, 8)

In [67]:
stem['body'].head()

0    I'm trying to locate a value for this clock. I...
1                                                  NaN
2                            Diet coke or coke and why
3                                                  NaN
4                 Vintage wall thermometer my Dad had.
Name: body, dtype: object

In [68]:
#stem = stem.dropna()
stem.shape

(1958, 8)

In [69]:
stem['body'] = np.where(stem['body'].isnull(), stem['title'], stem['title'] + ' ' + stem['body'])

## Define Stopwords and remove from X

In [71]:
custom_stop_words = set(stopwords.words('english') + ['pepsi','pepsico', 'coke', 'coca', 'cola'])

In [72]:
X = stem['body']
y = stem['is_pepsi'] 

In [73]:
def remove_stop_words(text):
    tokens = text.split()  # Tokenize the text
    filtered_tokens = [word for word in tokens if word.lower() not in custom_stop_words]
    return ' '.join(filtered_tokens)

# Apply the function to the dataset X
X = X.apply(remove_stop_words)

In [74]:
# Split the data into the training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    stratify=y,
                                                    random_state=42)

In [75]:
y_train.value_counts(normalize=True)

is_pepsi
1    0.508301
0    0.491699
Name: proportion, dtype: float64

In [76]:
y_test.value_counts(normalize=True)

is_pepsi
1    0.507653
0    0.492347
Name: proportion, dtype: float64

### Define Stem and Lematize function

In [78]:
stemmer = PorterStemmer() 
lemmatizer = WordNetLemmatizer()  

def stemmer_tokenizer(tokens):
    tokens = tokens.split()
    return [stemmer.stem(token) for token in tokens]

def lemmatizer_tokenizer(tokens):
    tokens = tokens.split()
    return [lemmatizer.lemmatize(token) for token in tokens]

In [79]:
# Defualt vector params 
defualt_params = {
    'vectorizer__max_features': [None],  
    'vectorizer__ngram_range': [(1, 1)], 
    'vectorizer__lowercase': [True],     
    'vectorizer__min_df': [1],           
    'vectorizer__max_df': [1.0],         
    'vectorizer__token_pattern': [r'(?u)\b\w+\b']
}

In [80]:
# Dictionary of vectors Params for GridSearchCV
params = {
    'vectorizer__max_features': [3000, 5000],
    'vectorizer__min_df': [2, 3],
    'vectorizer__max_df': [0.8, 0.9],
    'vectorizer__ngram_range': [(1, 2), (1, 1)],
    'vectorizer__lowercase': [True, False],
    'vectorizer__tokenizer': [None, stemmer_tokenizer, lemmatizer_tokenizer],  # Custom tokenizers
    'vectorizer__token_pattern': [r'(?u)\b[a-zA-Z]+\b', r'(?u)\b[a-zA-Z0-9]+\b']
}

In [81]:
# Dictionary of model params (defualt_params be added)
defualt_param_grids = {
    'LogisticRegression': [{**params, 'classifier': [LogisticRegression()]}],
    'KNN': [{**params, 'classifier': [KNeighborsClassifier()]}],
    'DecisionTree': [{**params, 'classifier': [DecisionTreeClassifier()]}],
    'BaggedDecisionTree': [{**params, 'classifier': [BaggingClassifier(DecisionTreeClassifier())]}],
    'RandomForest': [{**params, 'classifier': [RandomForestClassifier()]}],
    'AdaBoost': [{**params,'classifier': [AdaBoostClassifier(algorithm='SAMME')]}],    
    'GradientBoosting': [{**params,  'classifier': [GradientBoostingClassifier()]}],    
    'XGBoost': [{**params,  'classifier': [XGBClassifier()]}],    
    'SVM': [{**params,  'classifier': [SVC()]}],    
    'Naive Bayes': [{**params,  'classifier': [MultinomialNB()]}]
}

In [82]:
# Function to evaluate model and return dataframe to compare train/test score by model,params 
def evaluate_models(X_train, y_train, param_grids):

    start_time = time.time()
    vectorizers = {'CountVectorizer': CountVectorizer(), 'TfidfVectorizer': TfidfVectorizer()}
    results = []

    # Iterate through vectorizer options
    for vector_name, vector in vectorizers.items():
        for clf_name, param_grid in param_grids.items():
            for params in param_grid:
                pipeline = Pipeline([
                    ('vectorizer', vector),
                    ('classifier', None)  # Will be set by GridSearchCV
                ])
                
                print(f"Running GridSearch for {clf_name}...")

                # GridSearchCV with the classifier-specific parameters
                grid_search = GridSearchCV(pipeline, params, cv=5, n_jobs=-1, scoring='accuracy', return_train_score=True)
                grid_search.fit(X_train, y_train)

                # Collecting all combinations of parameters and scores
                for param_combination, train_score, test_score in zip(grid_search.cv_results_['params'], 
                                                                      grid_search.cv_results_['mean_train_score'],
                                                                      grid_search.cv_results_['mean_test_score']): 

                    tokenizer = param_combination.get('vectorizer__tokenizer', None)
                    tokenizer_name = tokenizer.__name__ if callable(tokenizer) else 'NONE'
                    
                    if param_combination.get('vectorizer__token_pattern') == r'(?u)\b[a-zA-Z]+\b':
                        pattern = 'Keep only alphabet'
                    elif param_combination.get('vectorizer__token_pattern') == r'(?u)\b[a-zA-Z0-9]+\b':
                        pattern = 'Keep numeric and alphabet'
                    else:
                        pattern = 'Keep all characters'
                    
                    results.append({
                        'Classifier': clf_name,
                        'Vectorizer': vector_name,
                        'Tokenizer': tokenizer_name,
                        'Max Features': param_combination.get('vectorizer__max_features', 'None'),
                        'Ngram Range': param_combination.get('vectorizer__ngram_range', 'None'),
                        'Lowercase': param_combination.get('vectorizer__lowercase', 'None'),
                        'Min DF': param_combination.get('vectorizer__min_df', 'None'),
                        'Max DF': param_combination.get('vectorizer__max_df', 'None'),
                        'Token Pattern': pattern,
                        'Train Score': train_score,
                        'Test Score': test_score
                    })
    
    # Convert the list of results to a DataFrame
    results_df = pd.DataFrame(results)

    end_time = time.time()

    print(f"Time to take the model runs: {(end_time - start_time) / 60:.0f} minutes")

    return results_df


In [83]:
results_df = evaluate_models(X_train, y_train, defualt_param_grids)

Running GridSearch for LogisticRegression...
Running GridSearch for KNN...
Running GridSearch for DecisionTree...
Running GridSearch for BaggedDecisionTree...
Running GridSearch for RandomForest...
Running GridSearch for AdaBoost...
Running GridSearch for GradientBoosting...
Running GridSearch for XGBoost...
Running GridSearch for SVM...
Running GridSearch for Naive Bayes...
Running GridSearch for LogisticRegression...
Running GridSearch for KNN...
Running GridSearch for DecisionTree...
Running GridSearch for BaggedDecisionTree...
Running GridSearch for RandomForest...
Running GridSearch for AdaBoost...
Running GridSearch for GradientBoosting...
Running GridSearch for XGBoost...
Running GridSearch for SVM...
Running GridSearch for Naive Bayes...
Time to take the model runs: 161 minutes


In [110]:
results_df['gap'] = results_df['Train Score'] - results_df['Test Score']
results_df.sort_values(by='gap', ascending = True)


Unnamed: 0,Classifier,Vectorizer,Tokenizer,Max Features,Ngram Range,Lowercase,Min DF,Max DF,Token Pattern,Train Score,Test Score,gap
965,AdaBoost,CountVectorizer,lemmatizer_tokenizer,3000,"(1, 2)",True,2,0.8,Keep numeric and alphabet,0.651023,0.644973,0.006050
1001,AdaBoost,CountVectorizer,lemmatizer_tokenizer,5000,"(1, 2)",True,3,0.8,Keep numeric and alphabet,0.651023,0.644973,0.006050
1004,AdaBoost,CountVectorizer,lemmatizer_tokenizer,5000,"(1, 1)",True,3,0.8,Keep only alphabet,0.651023,0.644973,0.006050
1049,AdaBoost,CountVectorizer,lemmatizer_tokenizer,5000,"(1, 2)",True,3,0.9,Keep numeric and alphabet,0.651023,0.644973,0.006050
1010,AdaBoost,CountVectorizer,lemmatizer_tokenizer,3000,"(1, 2)",True,2,0.9,Keep only alphabet,0.651023,0.644973,0.006050
...,...,...,...,...,...,...,...,...,...,...,...,...
467,DecisionTree,CountVectorizer,lemmatizer_tokenizer,5000,"(1, 1)",True,2,0.9,Keep numeric and alphabet,0.990900,0.617513,0.373387
2435,DecisionTree,TfidfVectorizer,lemmatizer_tokenizer,5000,"(1, 1)",False,2,0.8,Keep numeric and alphabet,0.988186,0.613038,0.375148
2474,DecisionTree,TfidfVectorizer,lemmatizer_tokenizer,5000,"(1, 2)",False,2,0.9,Keep only alphabet,0.988186,0.612385,0.375801
2483,DecisionTree,TfidfVectorizer,lemmatizer_tokenizer,5000,"(1, 1)",False,2,0.9,Keep numeric and alphabet,0.988186,0.608577,0.379609


In [112]:
results_df.to_csv('model_compare_df.csv')

In [122]:
results_df.sort_values(by = 'Test Score',ascending = False).head(20)

Unnamed: 0,Classifier,Vectorizer,Tokenizer,Max Features,Ngram Range,Lowercase,Min DF,Max DF,Token Pattern,Train Score,Test Score,gap
795,RandomForest,CountVectorizer,NONE,5000,"(1, 2)",True,2,0.8,Keep numeric and alphabet,0.993455,0.735016,0.258439
1182,GradientBoosting,CountVectorizer,NONE,5000,"(1, 1)",True,2,0.8,Keep only alphabet,0.828065,0.733748,0.094318
3105,GradientBoosting,TfidfVectorizer,NONE,5000,"(1, 1)",True,2,0.8,Keep numeric and alphabet,0.862867,0.73374,0.129127
3156,GradientBoosting,TfidfVectorizer,NONE,5000,"(1, 2)",True,3,0.9,Keep only alphabet,0.863507,0.733732,0.129775
1173,GradientBoosting,CountVectorizer,NONE,3000,"(1, 1)",True,3,0.8,Keep numeric and alphabet,0.824873,0.733105,0.091768
1194,GradientBoosting,CountVectorizer,NONE,5000,"(1, 1)",True,3,0.8,Keep only alphabet,0.82615,0.733105,0.093045
777,RandomForest,CountVectorizer,NONE,3000,"(1, 1)",True,2,0.8,Keep numeric and alphabet,0.993455,0.733093,0.260362
1218,GradientBoosting,CountVectorizer,NONE,3000,"(1, 1)",True,3,0.9,Keep only alphabet,0.826789,0.732466,0.094323
3084,GradientBoosting,TfidfVectorizer,NONE,3000,"(1, 2)",True,3,0.8,Keep only alphabet,0.866859,0.732464,0.134396
1176,GradientBoosting,CountVectorizer,NONE,5000,"(1, 2)",True,2,0.8,Keep only alphabet,0.827267,0.731829,0.095438
