In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
%matplotlib inline

In [2]:
#Required text pre-processing libraries are imported
import string
import nltk
import re

# download the stopwords and wordnet corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
# import tokenize from nltk library
from nltk import tokenize
# import WordNetLemmatizer from nltk library
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer
from nltk.probability import FreqDist

#Required data visualisation libraries are imported
import plotly.express as px
import seaborn as sns 
import matplotlib.pyplot as plt


#Required prediction modelling libraries are imported
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import plot_confusion_matrix, confusion_matrix, precision_recall_curve, auc, roc_curve, accuracy_score, recall_score, classification_report, f1_score, precision_score, precision_recall_fscore_support, roc_auc_score, make_scorer
from sklearn.pipeline import Pipeline

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

import xgboost
from xgboost import XGBClassifier

import pickle
import json


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jillian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jillian/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jillian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jillian/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
#reading in final dataset

In [4]:
df = pd.read_csv('./data/singe_word_data.csv')

In [5]:
df.head()

Unnamed: 0,Clothing_ID,Age,Title,Review_Text,Rating,Recommended_IND,Positive_Feedback_Count,Division_Name,Department Name,Class_Name,Clean,String,Positive_Rating
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,"['absolutely', 'wonderful', 'silky', 'sexy', '...",absolutely wonderful silky sexy comfortable,1
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,"['sooo', 'pretty', 'happened', 'find', 'store'...",sooo pretty happened find store glad bc never ...,1
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,"['high', 'hope', 'wanted', 'work', 'initially'...",high hope wanted work initially petite usual f...,0
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,"['jumpsuit', 'fun', 'flirty', 'fabulous', 'eve...",jumpsuit fun flirty fabulous every time get no...,1
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,"['shirt', 'flattering', 'due', 'adjustable', '...",shirt flattering due adjustable front tie perf...,1


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22640 entries, 0 to 22639
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Clothing_ID              22640 non-null  int64 
 1   Age                      22640 non-null  int64 
 2   Title                    19675 non-null  object
 3   Review_Text              22640 non-null  object
 4   Rating                   22640 non-null  int64 
 5   Recommended_IND          22640 non-null  int64 
 6   Positive_Feedback_Count  22640 non-null  int64 
 7   Division_Name            22627 non-null  object
 8   Department Name          22627 non-null  object
 9   Class_Name               22627 non-null  object
 10  Clean                    22640 non-null  object
 11  String                   22636 non-null  object
 12  Positive_Rating          22640 non-null  int64 
dtypes: int64(6), object(7)
memory usage: 2.2+ MB


In [7]:
df.isna().sum()

Clothing_ID                   0
Age                           0
Title                      2965
Review_Text                   0
Rating                        0
Recommended_IND               0
Positive_Feedback_Count       0
Division_Name                13
Department Name              13
Class_Name                   13
Clean                         0
String                        4
Positive_Rating               0
dtype: int64

# Modeling

In [8]:
#will be using accuracy as metric; want to identify neutral/negative sentiments and be able
#to sample enough of them to get a clear view of if there is a consistent issue
#that we as a company can change for our soft roll out
#also want to be able to ball park which items will be most popular, so we can have
#an appropriate amount of inventory

For each model we created a pipeline that includes a TF-IDF vectorizer, a smote component to deal with class imbalance, and the classifier itself. We elected to use a TF-IDF vectorizer instead of a count vectorizer because it provides a way to understand the importance of each word to the tweet, as well as just how frequently it occurs.

To give the model a little bit more information with those same features, we'll use a TfidfVectorizer (documentation here) so that it counts not only the term frequency (tf) within a single document, it also includes the inverse document frequency (idf) — how rare the term is.



### Train-Test Split

In [9]:
# Specify X as the cleaned strings in df and y as the target-Positive_Rating.
X = df['String']
y = df['Positive_Rating']

In [10]:
#Performed the train-test split, using 20% for the hold-out data.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=42,
                                                    test_size=0.20)

In [11]:
X_train

19805                        contrast much prettier person
11396    better hanger ive looking feminine plaid flirt...
7884     wasnt much question whether id pant several pa...
21304    legging warm comfortable theyre thick enough p...
7216                     much lower quality robe purchased
                               ...                        
11964    bought grey gorgeous long length arm though tr...
21575    pretty fun see tone subtle pretty beading neck...
5390     bought tee washed time luckily hole yet review...
860      excited see jean since came petite short insea...
15795                       bought sweater favorite season
Name: String, Length: 18112, dtype: object

In [12]:
# Instantiate a vectorizer 
# Instantiate and fit/transform X_train using the TF-IDF vectorizer.
tfidf = TfidfVectorizer()
X_train_vectorized = tfidf.fit_transform(X_train)
X_train_vectorized

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [None]:
# Convert vectorized X_train to a vector for easier visual inspection.
X_train_vec = pd.DataFrame.sparse.from_spmatrix(X_train_vectorized, columns=tfidf.get_feature_names())
X_train_vec

In [None]:
# Transform test data using TF-IDF Vectorizer trained on X_train, y_train
X_test_vectorized = tfidf.transform(X_test)
X_test_vectorized

In [None]:
# Convert to DataFrame for visual inspection
X_test_vec = pd.DataFrame.sparse.from_spmatrix(X_test_vectorized, columns=tfidf.get_feature_names())
X_test_vec

In [None]:
#Function to return scores in cross validation

In [None]:
# Accuracy
accuracy = make_scorer(accuracy_score)

#f1
custom_f1 = make_scorer(
    f1_score, 
    average="weighted")

# Precision
multi_prec = make_scorer(
    precision_score,
    average="weighted")

# Recall
multi_rec = make_scorer(
    recall_score,
    average="weighted")

# This function will allow for quick cross-validation of the chosen score for each of our models.
def cross_val(model, X, y, custom_scorer, kfolds=5):
    """ Perform cross-validated scoring and store/print results """
    results = cross_val_score(model, X, y, cv=kfolds, scoring=custom_scorer)
    mean = np.mean(results)
    median = np.median(results)
    std = np.std(results)
    if custom_scorer == accuracy:
        print(f"Mean accuracy score: ", {mean}, ".")
        print(f"Median acuracy score: ", {median}, ".")
        print(f"Standard Deviation in accuracy: ", {std}, ".") 
    elif custom_scorer == custom_f1:
        print(f"Mean f1 score: ", {mean}, ".")
        print(f"Median f1 score: ", {median}, ".")
        print(f"Standard Deviation in f1 score: ", {std}, ".") 
    elif custom_scorer == multi_prec:
        print(f"Mean precision score: ", {mean}, ".")
        print(f"Median precision score: ", {median}, ".")
        print(f"Standard Deviation in precision score: ", {std}, ".") 
    elif custom_scorer == multi_rec:
        print(f"Mean recall score: ", {mean}, ".")
        print(f"Median recall score: ", {median}, ".")
        print(f"Standard Deviation in recall score: ", {std}, ".") 

## Dummy Classifier

In [None]:
# Instantiate Dummy Classifier 
dummy_model = DummyClassifier()

#Fit and Evaluate Dummy Classifier
dummy_model.fit(X_train_vectorized, y_train)
dummy_yhat = dummy_model.predict(X_train)
plot_confusion_matrix(dummy_model, X_train, y_train);
print(accuracy_score(y_train, dummy_yhat))

We see our Dummy Model predicts our majority label, 1, for each observation. Due to class imbalance, the model performed well at 77% accuracy score.

## Multinomal Naive Bayes

Now that we have preprocessed data, we can fit and evaluate a multinomial Naive Bayes classifier using cross_val_score.

In [None]:
# Instantiate a MultinomialNB classifier
multinomial_model = MultinomialNB()

# Evaluate the classifier on X_train_vectorized and y_train
multinomial_cv = cross_val_score(multinomial_model, X_train_vectorized, y_train)
multinomial_cv

In [None]:
y_train.value_counts(normalize=True)

If we guessed the plurality class every time (class 1), we would expect about 77% accuracy score (captured by our Dummy Model). So when this model is getting 78% accuracy, that is a very minimal improvement over just guessing. Let's see if we can improve that with more sophisticated modeling techinques. 

In [None]:
#With no max_features set, we see X_train_vec and X_test_vec contains 15448 columns(unique words); this many words will create
#a lot of "noise"; we want to set our max_features to only focus on the words that are appearing more often
#setting max_features to 500 improved Multinomial NB to over 80%

## Initial Model CV Results

In [None]:
benchmark = []
# Iterate over all algorithms
for algorithm in [MultinomialNB(), LogisticRegression(), KNeighborsClassifier(), DecisionTreeClassifier(), XGBClassifier(), RandomForestClassifier()]:
    # Perform cross validation
    results = cross_val_score(algorithm, X_train_vectorized, y_train)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm')

In [None]:
#based on initial run through of algorithms, it looks like I should explore LogisticRegression, XGBoost, and RandomForestClassifier more

In [None]:
#also want to play with vectorizor params

## Grid Searching Logistic Regression 

In [None]:
lr_pipe = Pipeline([
           ('vect', TfidfVectorizer()),
           ('lr', LogisticRegression(random_state=42)),
])


In [None]:
lr_pipe.fit(X_train, y_train)

In [None]:
lr_pipe.score(X_train, y_train)

In [None]:
#initial Logistic Regression score with default parameters

In [None]:
# Perform a cross validation to determine whether or not the model is overfit
avg_lr_cv = np.mean(cross_val_score(estimator=lr_pipe, X=X_train, y=y_train, cv=5))

In [None]:
avg_lr_cv

In [None]:
#setting up grid to perform grid serach to tune hyperparameters

In [None]:
lr_params = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 500, 1000, 2000), 
    'lr__solver': ('lbfgs', 'saga', 'liblinear'),
    'lr__penalty': ('l2', 'elasticnet'),
    'lr__class_weight': (None, 'balanced')
    }

In [None]:
# set up GridSearchCV object
grid_lr = GridSearchCV(lr_pipe, param_grid=lr_params, cv=5, verbose=3)

In [None]:
# fit our grid object for Logistic Regression to the training data
#grid_lr.fit(X_train, y_train)

In [None]:
#best_parameters = grid_lr.best_params_

#print('Grid Search found the following optimal parameters: ')
#for param_name in sorted(best_parameters.keys()):
    #print('%s: %r' % (param_name, best_parameters[param_name]))

#training_preds = grid_lr.predict(X_train)
#training_accuracy = accuracy_score(y_train, training_preds)
#training_recall = recall_score(y_train, training_preds, average = None)

#print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))
#print(training_recall)

Grid Search found the following optimal parameters: 
- lr__class_weight: None
- lr__penalty: 'l2'
- lr__solver: 'lbfgs'
- vect__max_df: 0.5
- vect__max_features: 2000
- Training Accuracy: 89.38%
- [0.65782557 0.96473461]

In [None]:
lr2_params = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (2000, 3500, 5000, 7000), 
    'lr__solver': ('lbfgs', 'saga', 'liblinear'),
    'lr__penalty': ('l2', 'elasticnet'),
    'lr__class_weight': (None, 'balanced')
    }

In [None]:
grid_lr2 = GridSearchCV(lr_pipe, param_grid=lr2_params, cv=5, verbose=3)

In [None]:
#grid_lr2.fit(X_train, y_train)

In [None]:
#best_parameters = grid_lr2.best_params_

#print('Grid Search found the following optimal parameters: ')
#for param_name in sorted(best_parameters.keys()):
    #print('%s: %r' % (param_name, best_parameters[param_name]))

#training_preds = grid_lr.predict(X_train)
#training_accuracy = accuracy_score(y_train, training_preds)
#training_recall = recall_score(y_train, training_preds, average = None)

#print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))
#print(training_recall)

Grid Search found the following optimal parameters: 
- lr__class_weight: None
- lr__penalty: 'l2'
- lr__solver: 'saga'
- vect__max_df: 0.5
- vect__max_features: 3500
- Training Accuracy: 89.38%
- [0.65782557 0.96473461]

In [None]:
#same training accuracy as above paraments; best accuracy = 65.7% accurate at identifying class 0

In [None]:
#testing if we get better results with SMOTE with best params from first grid searc

In [None]:
lr2_pipe = ImPipeline([
           ('vect', TfidfVectorizer(max_df=.5, max_features=2000)),
            ('sm', SMOTE(random_state=42)),
           ('lr', LogisticRegression(random_state=42)),
])


In [None]:
lr2_pipe.fit(X_train, y_train)

In [None]:
lr2_yhat = lr2_pipe.predict(X_train)


In [None]:
plot_confusion_matrix(lr_pipe, X_train, y_train)

In [None]:
plot_confusion_matrix(lr2_pipe, X_train, y_train)

In [None]:
#comparing confusion matrix w/ smote (lr2) versus confusion matrix w/o smote (lr)

In [None]:
# Show Classification Report
print(classification_report(y_train, lr2_yhat))

In [None]:
cross_val(lr2_pipe, X_train, y_train, accuracy)

- Mean accuracy score:  {0.8565823084586492} .
- Median acuracy score:  {0.8578133627829928} .
- Standard Deviation in accuracy:  {0.0034112273820527256} .

In [None]:
#adjusting max_features params

In [None]:
lr3_params = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (1500, 2000, 3000), 
    'lr__solver': ('lbfgs', 'saga', 'liblinear'),
    'lr__penalty': ('l2', 'elasticnet'),
    'lr__class_weight': (None, 'balanced')
    }

In [None]:
grid_lr3 = GridSearchCV(lr_pipe, param_grid=lr3_params, cv=5, verbose=3)

In [None]:
#grid_lr3.fit(X_train, y_train)

In [None]:
#best_parameters = grid_lr3.best_params_

#print('Grid Search found the following optimal parameters: ')
#for param_name in sorted(best_parameters.keys()):
    #print('%s: %r' % (param_name, best_parameters[param_name]))

#training_preds = grid_lr3.predict(X_train)
#training_accuracy = accuracy_score(y_train, training_preds)
#training_recall = recall_score(y_train, training_preds, average = None)

#print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))
#print(training_recall)

Grid Search found the following optimal parameters: 
- lr__class_weight: None
- lr__penalty: 'l2'
- lr__solver: 'saga'
- vect__max_df: 0.5
- vect__max_features: 3000
- Training Accuracy: 89.62%
- [0.66332139 0.96624291]

In [None]:
#vect max features of 3000 seems to be optimal

In [None]:
##utilizing current best model as best_model to test out model with Twitter data; will still
#try to tune models

In [None]:
lr3_pipe = ImPipeline([
           ('vect', TfidfVectorizer(max_df=.5, max_features=3000)),
           ('lr', LogisticRegression(random_state=42, class_weight=None, solver='saga')),
])


In [None]:
lr3_pipe.fit(X_train, y_train)

In [None]:
lr3_pipe.score(X_train, y_train)

In [None]:
#trying using ngram_range of 1,2 to see if accuracy improves looking at single words and bigrams; silght improvement (.04%)

In [None]:
lr4_pipe = ImPipeline([
           ('vect', TfidfVectorizer(max_df=.5, max_features=3000, ngram_range=(1,2))),
           ('lr', LogisticRegression(random_state=42, class_weight=None, solver='saga')),
])

In [None]:
lr4_pipe.fit(X_train, y_train)

In [None]:
lr4)pipe.score(X_train, y_train)

## Grid Searching XGBoost

In [None]:
# Instantiate our default XGB pipeline
XGB_pipe = ImPipeline(steps=[('vect', TfidfVectorizer(max_features=3000)), 
                             ('XGB', XGBClassifier())])

In [None]:
# create our massive grid for the grid search parameters
paramsXGB = {
    'XGB__learning_rate': [0.1, 0.2],
    'XGB__max_depth': range(3, 10, 2),
    'XGB__min_child_weight': range(1, 8, 2),
    'XGB__gamma': [0, .1, .2],
    'XGB__subsample': [.5, .75, 1],
    'vect__ngram_range': [(1,1), (2,2)],
}

In [None]:
grid_XGB = GridSearchCV(XGB_pipe, param_grid=paramsXGB, cv=5, verbose=3, n_jobs=-2)

#Fit grid search object to our training data to check the hyper parameters

In [None]:
#grid_XGB.fit(X_train, y_train)

In [None]:
#grid_XGB.best_params_

- {'XGB__gamma': 0,
- 'XGB__learning_rate': 0.2,
- 'XGB__max_depth': 9,
- 'XGB__min_child_weight': 5,
- 'XGB__subsample': 0.75,
- 'vect__ngram_range': (1, 1)}

In [None]:
#grid_XGB.best_score_

0.8630436967252351

In [None]:
#plot_confusion_matrix(grid_XGB, X_train, y_train);

In [None]:
#print(classification_report(y_train, grid_XGB.predict(X_train)))

In [None]:
    #precision    recall  f1-score   support

        #0       0.89      0.72      0.80      4185
        #1       0.92      0.97      0.95     13923

    #accuracy                           0.91     18108
  # macro avg       0.91      0.85      0.87     18108
#weighted avg       0.91      0.91      0.91     18108


## Grid Searching RandomForestClassfier

In [None]:
#since I want to return predict proba, might not make sense to use Random Forest Classifier

# Pickling Model

Playing around with pickling using current "Best Model"; Note: DO NOT NEED TO TEST TWEETS

In [None]:
# save the model to disk
#filename = 'finalized_model.sav'
#pickle.dump(best_model, open(filename, 'wb'))

In [None]:
#loaded_model = pickle.load(open(filename, 'rb'))
#result = loaded_model.score(X_test, y_test)
#print(result)