In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
%matplotlib inline

In [2]:
#Required text pre-processing libraries are imported
import string
import nltk
import re

# download the stopwords and wordnet corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
# import tokenize from nltk library
from nltk import tokenize
# import WordNetLemmatizer from nltk library
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer
from nltk.probability import FreqDist

#Required data visualisation libraries are imported
import plotly.express as px
import seaborn as sns 
import matplotlib.pyplot as plt


#Required prediction modelling libraries are imported
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import plot_confusion_matrix, confusion_matrix, precision_recall_curve, auc, roc_curve, accuracy_score, recall_score, classification_report, f1_score, precision_score, precision_recall_fscore_support, roc_auc_score, make_scorer
from sklearn.pipeline import Pipeline

import xgboost
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jillian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jillian/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jillian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jillian/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
#reading in single_word dataset

In [4]:
df = pd.read_csv('../data/single_word_data.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../data/single_word_data.csv'

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df[df.String.isna() == True]

In [None]:
#due to adding words to our stop list, we now have 4 "Clean" reviews with no words; 
#we will eliminate these rows for modeling

In [None]:
df = df[df.String.isna() == False]

In [None]:
df.info()

# Modeling with Single Words Vectorized

In [None]:
#will be using accuracy as metric; want to identify neutral/negative sentiments and be able
#to sample enough of them to get a clear view of if there is a consistent issue
#that we as a company can change for our soft roll out
#also want to be able to ball park which items will be most popular, so we can have
#an appropriate amount of inventory

For each model we created a pipeline that includes a TF-IDF vectorizer, a smote component to deal with class imbalance, and the classifier itself. We elected to use a TF-IDF vectorizer instead of a count vectorizer because it provides a way to understand the importance of each word to the tweet, as well as just how frequently it occurs.

To give the model a little bit more information with those same features, we'll use a TfidfVectorizer (documentation here) so that it counts not only the term frequency (tf) within a single document, it also includes the inverse document frequency (idf) — how rare the term is.


## Train-Test Split

In [None]:
# Specify X as the cleaned strings in df and y as the target-Rating.
X = df['String']
y = df['Rating']

In [None]:
#Performed the train-test split, using 20% for the hold-out data.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=42,
                                                    test_size=0.20)

In [None]:
X_train

In [None]:
# Instantiate a vectorizer 
# Instantiate and fit/transform X_train using the TF-IDF vectorizer.
tfidf = TfidfVectorizer()
X_train_vectorized = tfidf.fit_transform(X_train)
X_train_vectorized

In [None]:
# Convert vectorized X_train to a vector for easier visual inspection.
X_train_vec = pd.DataFrame.sparse.from_spmatrix(X_train_vectorized, columns=tfidf.get_feature_names())
X_train_vec

In [None]:
# Transform test data using TF-IDF Vectorizer trained on X_train, y_train
X_test_vectorized = tfidf.transform(X_test)
X_test_vectorized

In [None]:
# Convert to DataFrame for visual inspection
X_test_vec = pd.DataFrame.sparse.from_spmatrix(X_test_vectorized, columns=tfidf.get_feature_names())
X_test_vec

In [None]:
#Function to return scores in cross validation

In [None]:
# Accuracy
accuracy = make_scorer(accuracy_score)

#f1
custom_f1 = make_scorer(
    f1_score, 
    average="weighted")

# Precision
multi_prec = make_scorer(
    precision_score,
    average="weighted")

# Recall
multi_rec = make_scorer(
    recall_score,
    average="weighted")

# This function will allow for quick cross-validation of the chosen score for each of our models.
def cross_val(model, X, y, custom_scorer, kfolds=5):
    """ Perform cross-validated scoring and store/print results """
    results = cross_val_score(model, X, y, cv=kfolds, scoring=custom_scorer)
    mean = np.mean(results)
    median = np.median(results)
    std = np.std(results)
    if custom_scorer == accuracy:
        print(f"Mean accuracy score: ", {mean}, ".")
        print(f"Median acuracy score: ", {median}, ".")
        print(f"Standard Deviation in accuracy: ", {std}, ".") 
    elif custom_scorer == custom_f1:
        print(f"Mean f1 score: ", {mean}, ".")
        print(f"Median f1 score: ", {median}, ".")
        print(f"Standard Deviation in f1 score: ", {std}, ".") 
    elif custom_scorer == multi_prec:
        print(f"Mean precision score: ", {mean}, ".")
        print(f"Median precision score: ", {median}, ".")
        print(f"Standard Deviation in precision score: ", {std}, ".") 
    elif custom_scorer == multi_rec:
        print(f"Mean recall score: ", {mean}, ".")
        print(f"Median recall score: ", {median}, ".")
        print(f"Standard Deviation in recall score: ", {std}, ".") 

## Dummy Classifier

In [None]:
# Instantiate Dummy Classifier 
dummy_model = DummyClassifier()

#Fit and Evaluate Dummy Classifier
dummy_model.fit(X_train_vectorized, y_train)
dummy_yhat = dummy_model.predict(X_train)
plot_confusion_matrix(dummy_model, X_train, y_train);
print(accuracy_score(y_train, dummy_yhat))

We see our Dummy Model predicts our majority label, 1, for each observation. Due to class imbalance, the model performed at 55% accuracy score.

## Initial Model CV Results

In [None]:
benchmark = []
# Iterate over all algorithms
for algorithm in [MultinomialNB(), LogisticRegression(), KNeighborsClassifier(), DecisionTreeClassifier(), XGBClassifier(), RandomForestClassifier()]:
    # Perform cross validation
    results = cross_val_score(algorithm, X_train_vectorized, y_train)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm')

In [None]:
##Logistic Regression cannot be used for multi-class classification; should explore XGBClassifier and RandomForestClassifier

# Setting up SMOTE Subpipe

In [None]:
subpipe_smote = SMOTE(sampling_strategy='auto', random_state=42)

## Random Forest Classifier

### Initial Pipeline

In [None]:
# Create our random forest pipeline to incorporate column transformer, use default hyperparameters
rfc_pipe = Pipeline(steps=[('vect', TfidfVectorizer(max_features=2000)),
                           ('rfc', RandomForestClassifier(random_state=42))])


In [None]:
# Fit our random forest classifier to the training data
rfc_pipe.fit(X_train, y_train)

In [None]:
# Calculate initial accuracy score of random forest with default hyperparameters
rfc_pipe.score(X_train, y_train)

In [None]:
#not surprising our model has a score of 1; going to be overfit

In [None]:
# Obtained cross-validated accuracy score
cross_val(rfc_pipe, X_train, y_train, accuracy)

In [None]:
#oof... much worse score when cross validating

In [None]:
# Instantiate XGBClassifier with SMOTE subpipe
rfc_sm_pipe = Pipeline(steps=[('vect', TfidfVectorizer(max_features=2000)), 
                           ('sm', subpipe_smote),
                            ('rfc', RandomForestClassifier(random_state=42))])

In [None]:
rfc_sm_pipe.fit(X_train, y_train)

In [None]:
rfc_sm_pipe.score(X_train, y_train)

In [None]:
# Obtained cross-validated accuracy score
cross_val(rfc_sm_pipe, X_train, y_train, accuracy)

## Initial Random Forest Grid Search

In [None]:
# create our grid for the grid search parameters
params = {'rfc__n_estimators': [50, 100, 150],
          'rfc__min_samples_split': [2, 10, 50],
          'rfc__max_depth': [5, 10, 15],
          'vect__max_features': [2000, 3000],
          'vect__ngram_range': [(1,1), (2,2)]}

# set up GridSearchCV object
grid_rfc = GridSearchCV(rfc_pipe, param_grid=params, cv=5, verbose=3)

In [None]:
#grid_rfc.fit(X_train, y_train)

In [None]:
# find the best hyper parameters from our first grid search
#grid_rfc.best_params_

- 'rfc__max_depth': 15,
- 'rfc__min_samples_split': 2,
- 'rfc__n_estimators': 150,
- 'vect__max_features': 2000,
- 'vect__ngram_range': (1, 1)}

In [None]:
#maxed out parameters for max_depth and n_estimators; will increae for next grid search

In [None]:
# calculate the best accuracy score from our first grid search
#grid_rfc.best_score_

0.5529048523773297

In [None]:
pd.DataFrame(grid_rfc.cv_results_)

## XGBoost

In [None]:
# Instantiate our default XGB pipeline
XGB_pipe = ImPipeline(steps=[('vect', TfidfVectorizer(max_features=2000)), 
                             ('XGB', XGBClassifier())])

In [None]:
# Fit the xgb pipeline to our training data
XGB_pipe.fit(X_train, y_train)

In [None]:
#Assess the score
XGB_pipe.score(X_train, y_train)

In [None]:
# Obtained cross-validated accuracy score
cross_val(XGB_pipe, X_train, y_train, accuracy)

In [None]:
training_preds = XGB_pipe.predict(X_train)

In [None]:
print(classification_report(y_train, training_preds))

In [None]:
plot_confusion_matrix(XGB_pipe, X_train, y_train);

In [None]:
# Instantiate XGBClassifier with SMOTE subpipe
XGB_sm_pipe = ImPipeline(steps=[('vect', TfidfVectorizer(max_features=2000)), 
                           ('sm', subpipe_smote),
                            ('XGB', XGBClassifier())])

In [None]:
XGB_sm_pipe.fit(X_train, y_train)

In [None]:
#Assess the score
XGB_sm_pipe.score(X_train, y_train)

In [None]:
# Obtained cross-validated accuracy score
cross_val(XGB_sm_pipe, X_train, y_train, accuracy)

In [None]:
plot_confusion_matrix(XGB_sm_pipe, X_train, y_train);

In [None]:
#results not as good with SMOTE

## Initial XGBoost Grid Search

In [None]:
# create our massive grid for the grid search parameters
paramsXGB = {
    'XGB__learning_rate': [0.1, 0.2],
    'XGB__max_depth': range(3, 10, 2),
    'XGB__min_child_weight': range(1, 8, 2),
    'XGB__gamma': [0, .1, .2],
    'XGB__subsample': [.5, .75, 1],
    'vect__ngram_range': [(1,1), (2,2)]
}

In [None]:
grid_XGB = GridSearchCV(XGB_pipe, param_grid=paramsXGB, cv=5, verbose=3, n_jobs=-2)

#Fit grid search object to our training data to check the hyper parameters

In [None]:
#grid_XGB.fit(X_train, y_train)

In [None]:
# find the best hyper parameters from our first grid search
#grid_XGB.best_params_

- {'XGB__gamma': 0.1,
- 'XGB__learning_rate': 0.2,
- 'XGB__max_depth': 9,
- 'XGB__min_child_weight': 7,
- 'XGB__subsample': 0.75,
- 'vect__ngram_range': (1, 1)}

In [None]:
# calculate the best accuracy score from our first grid search
#grid_XGB.best_score_

0.6189529572493482

In [None]:
plot_confusion_matrix(grid_XGB, X_train, y_train);

In [None]:
print(classification_report(y_train, grid_XGB.predict(X_train)))

If our review is a 5, we are most likely rating that review a 5 (high recall). However, we are also rating many addition reviews a 5 (low precision). 

In [None]:
# Instantiate our XGB pipeline with best params
XGB_pipe2 = ImPipeline(steps=[('vect', TfidfVectorizer(max_features=2000)), 
                             ('XGB', XGBClassifier(gamma=.1, learning_rate=.2, max_depth=9, min_child_weight=7, subsample=.75))])

In [None]:
XGB_pipe2.fit(X_train, y_train)

In [None]:
XGB_pipe2.score(X_train, y_train)

We have achieved 79% accuracy with our params from our grid search.

In [None]:
cross_val(XGB_pipe2, X_train, y_train, accuracy)

As expected, when cross-validating our mean accuracy is only 61.9% (same as .best_score_).

Since we maxed out learning rate, max depth, and min child weight, I will adjust these and use higher ranges in my next grid search.

In [None]:
paramsXGB = {
    'XGB__learning_rate': [0.1, 0.2, 0.3],
    'XGB__max_depth': [9, 12, 15],
    'XGB__min_child_weight': [6, 8, 10],
    'XGB__gamma': [0, .1, .2],
    'XGB__subsample': [.5, .75, 1],
}

In [None]:
grid_XGB2 = GridSearchCV(XGB_pipe, param_grid=paramsXGB, cv=5, verbose=3, n_jobs=-2)


In [None]:
grid_XGB2.fit(X_train, y_train)

In [None]:
grid_XGB2.best_params_

- {'XGB__gamma': 0.2,
- 'XGB__learning_rate': 0.2,
- 'XGB__max_depth': 12,
- 'XGB__min_child_weight': 10,
- 'XGB__subsample': 0.75}

In [None]:
grid_XGB2.best_score_

0.6188977238883981

In [None]:
plot_confusion_matrix(grid_XGB2, X_train, y_train);

In [None]:
#this grid search performed worse than last grid search; will try adding
#Smote in pipeline 

In [None]:
#same as XGB_pipe2 but with Smote
XGB_pipe3 = ImPipeline(steps=[('vect', TfidfVectorizer(max_features=2000)), 
                              ('sm', subpipe_smote),
                             ('XGB', XGBClassifier(gamma=.1, learning_rate=.2, max_depth=9, min_child_weight=7, subsample=.75))])

In [None]:
XGB_pipe3.fit(X_train, y_train)

In [None]:
XGB_pipe3.score(X_train, y_train)

In [None]:
# Obtained cross-validated accuracy score
cross_val(XGB_pipe3, X_train, y_train, accuracy)

In [None]:
#cross val mean of 59.9%, median of 59.8%; not as good as results without SMOTE

In [None]:
print(classification_report(y_train, XGB_pipe3.predict(X_train)))

In [None]:
plot_confusion_matrix(XGB_pipe3, X_train, y_train);

# Modeling with Bigrams Without Additional Stop Words Removed

In [None]:
df2 = pd.read_csv('./data/bigram_data.csv')

In [None]:
df2.head()

## Train Test Split

In [None]:
#train test split for data to utilize bigrams

In [None]:
# Specify X as the cleaned strings in df and y as the target-Rating.
X2 = df2['String']
y2 = df2['Rating']

In [None]:
#Performed the train-test split, using 20% for the hold-out data.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2,
                                                    y2,
                                                    random_state=42,
                                                    test_size=0.20)

In [None]:
# Instantiate a vectorizer 
# Instantiate and fit/transform X_train using the TF-IDF vectorizer.
tfidf = TfidfVectorizer(ngram_range=(2,2), max_features=2000)
X_train2_vectorized = tfidf.fit_transform(X_train2)
X_train2_vectorized

In [None]:
# Convert vectorized X_train to a vector for easier visual inspection.
X_train2_vec = pd.DataFrame.sparse.from_spmatrix(X_train2_vectorized, columns=tfidf.get_feature_names())
X_train2_vec

In [None]:
# Transform test data using TF-IDF Vectorizer trained on X_train, y_train
X_test2_vectorized = tfidf.transform(X_test2)
X_test2_vectorized

In [None]:
# Convert to DataFrame for visual inspection
X_test2_vec = pd.DataFrame.sparse.from_spmatrix(X_test2_vectorized, columns=tfidf.get_feature_names())
X_test2_vec

In [None]:
benchmark = []
# Iterate over all algorithms
for algorithm in [MultinomialNB(), KNeighborsClassifier(), DecisionTreeClassifier(), XGBClassifier(), RandomForestClassifier()]:
    # Perform cross validation
    results = cross_val_score(algorithm, X_train2_vectorized, y_train2)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm')

In [None]:
##MultinomialNB best in initial cv with defaults; initially not peforming as well with bigrams, will try tweaking models

## Multinomial Model with Bigrams

In [None]:
# Instantiate our default MultinomialNB pipeline
mnb_pipe = ImPipeline(steps=[('vect', TfidfVectorizer(ngram_range=(2,2), max_features=2000)), 
                             ('mnb', MultinomialNB())])

In [None]:
mnb_pipe.fit(X_train2, y_train2)

In [None]:
mnb_pipe.score(X_train2, y_train2)

In [None]:
# Obtained cross-validated accuracy score
cross_val(mnb_pipe, X_train2, y_train2, accuracy)

In [None]:
#grid_XGB.fit(X_train, y_train)

## Random Forest Grid Search

In [None]:
#running the same initial grid search on this data set that did not eliminiate additional stop words

In [None]:
# create our grid for the grid search parameters
params = {'rfc__n_estimators': [50, 100, 150],
          'rfc__min_samples_split': [2, 10, 50],
          'rfc__max_depth': [5, 10, 15],
          'vect__max_features': [2000, 3000],
          'vect__ngram_range': [(1,1), (2,2)]}

# set up GridSearchCV object
grid_rfc = GridSearchCV(rfc_pipe, param_grid=params, cv=5, verbose=3)

In [None]:
#grid_rfc.fit(X_train2, y_train2)

In [None]:
# find the best hyper parameters from our first grid search
#grid_rfc.best_params_

- {'rfc__max_depth': 15,
- 'rfc__min_samples_split': 2,
- 'rfc__n_estimators': 50,
- 'vect__max_features': 2000,
- 'vect__ngram_range': (1, 1)}

In [None]:
# calculate the best accuracy score from our first grid search
#grid_rfc.best_score_

0.5550463303274542

In [None]:
##still seeing ngram_range of (1,1) (single words) producing better results; better results 
#using no additional stop words cleaned... interesting

In [None]:
# create our grid for the grid search parameters
params = {'rfc__n_estimators': [25, 50, 75],
          'rfc__min_samples_split': [2, 3, 5],
          'rfc__max_depth': [15, 20, 25],
          'vect__ngram_range': [(1,1), (2,2)]}

# set up GridSearchCV object
grid_rfc2 = GridSearchCV(rfc_pipe, param_grid=params, cv=5, verbose=3, n_jobs=-2)

In [None]:
grid_rfc2.fit(X_train, y_train)

In [None]:
grid_rfc2.best_score_

In [None]:
grid_rfc2.best_params_

In [None]:
# create our massive grid for the grid search parameters
params5 = {'rfc__n_estimators': [15, 20, 25],
           'rfc__max_depth': [20, 25, 30],
           'rfc__max_features': ['sqrt', 0.1, 0.2, 0.5],
           'rfc__min_samples_split': [2, 3, 4],
           'rfc__min_impurity_decrease': [0, 0.05, 0.1]}

# set up GridSearchCV object
grid_rfc3 = GridSearchCV(rfc_pipe, param_grid=params5, cv=5, verbose=3, n_jobs=-1)

In [None]:
grid_rfc3.fit(X_train, y_train)

In [None]:
grid_rfc3.best_score_

0.5900157236660618

In [None]:
grid_rfc3.best_params_

-{'rfc__max_depth': 30,
- 'rfc__max_features': 0.1,
- 'rfc__min_impurity_decrease': 0,
- 'rfc__min_samples_split': 3,
- 'rfc__n_estimators': 20}