## Binary Sentiment Prediction

In [2]:
import pandas as pd
import numpy as np

In [3]:
yelp_df = pd.read_csv('data/yelp_revs.csv')

In [4]:
yelp_df.head()

Unnamed: 0,author,datePublished,reviewRating,description,name
0,Jess S.,2021-07-05,5,Everything from the atmosphere to the service ...,MILA Plant-Based
1,Arlene B.,2021-10-03,4,You dont have to be vegan to appreciate the fo...,MILA Plant-Based
2,Poonam M.,2021-08-08,4,This the old Juniper so the layout is the same...,MILA Plant-Based
3,Suzan L.,2021-06-26,1,Very disappointing experience The Sakura cockt...,MILA Plant-Based
4,Sangeetha M.,2021-05-05,4,I ordered for delivery so I cant comment on th...,MILA Plant-Based


Since we're doing binary sentiment prediction, we need to place the numerical ratings into two categories: 'good' and 'bad'

In [5]:
def sentiment(rating):
    if rating > 3:
        return 'good'
    else:
        return 'bad'

In [7]:
yelp_df['reviewSentiment'] = yelp_df['reviewRating'].apply(sentiment)
yelp_df.head()

Unnamed: 0,author,datePublished,reviewRating,description,name,reviewSentiment
0,Jess S.,2021-07-05,5,Everything from the atmosphere to the service ...,MILA Plant-Based,good
1,Arlene B.,2021-10-03,4,You dont have to be vegan to appreciate the fo...,MILA Plant-Based,good
2,Poonam M.,2021-08-08,4,This the old Juniper so the layout is the same...,MILA Plant-Based,good
3,Suzan L.,2021-06-26,1,Very disappointing experience The Sakura cockt...,MILA Plant-Based,bad
4,Sangeetha M.,2021-05-05,4,I ordered for delivery so I cant comment on th...,MILA Plant-Based,good


In [9]:
yelp_df['reviewSentiment'].value_counts()

good    471
bad     115
Name: reviewSentiment, dtype: int64

There is an imbalance in class distribution so we will need to address that during training

### Data Splits

In [10]:
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
    RandomizedSearchCV
)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(yelp_df['description'], yelp_df['reviewSentiment'])

In [None]:
#!pip install --user pandas_profiling

In [None]:
#from pandas_profiling import ProfileReport

#train_good = yelp_df.query('reviewSentiment == "good"')
#train_bad = yelp_df.query('reviewSentiment == "bad"')

#good_profile = ProfileReport(train_good, title="Pandas Profiling Report")
#good_profile.to_notebook_iframe()

### Model Building

In [14]:
import re
import sys
from hashlib import sha1

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Autograding
#import tests_lab4
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# train test split and cross validation
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [15]:
def store_cross_val_results(model_name, scores, results_dict):
    """
    Stores mean scores from cross_validate in results_dict for
    the given model model_name.

    Parameters
    ----------
    model_name :
        scikit-learn classification model
    scores : dict
        object return by `cross_validate`
    results_dict: dict
        dictionary to store results

    Returns
    ----------
        None

    """
    results_dict[model_name] = {
        "mean_train_accuracy": "{:0.4f}".format(np.mean(scores["train_score"])),
        "mean_valid_accuracy": "{:0.4f}".format(np.mean(scores["test_score"])),
        "mean_fit_time (s)": "{:0.4f}".format(np.mean(scores["fit_time"])),
        "mean_score_time (s)": "{:0.4f}".format(np.mean(scores["score_time"])),
        "std_train_score": "{:0.4f}".format(scores["train_score"].std()),
        "std_valid_score": "{:0.4f}".format(scores["test_score"].std()),
    }

#### Dummy Classifier

In [17]:
results_dict = {}
dummy = DummyClassifier()
scores = cross_validate(dummy, X_train, y_train, n_jobs=-1, return_train_score=True)
store_cross_val_results("Dummy", scores, results_dict)
pd.DataFrame(results_dict).T

Unnamed: 0,mean_fit_time (s),mean_score_time (s),mean_train_accuracy,mean_valid_accuracy,std_train_score,std_valid_score
Dummy,0.002,0.0014,0.6657,0.6969,0.0097,0.0402


#### Other Models

In [19]:
models = {
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced'),
    "RBF SVM": SVC(class_weight='balanced'),
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=2000, class_weight='balanced'),
}

for model_name, model in models.items():
    pipe = make_pipeline(CountVectorizer(), model)
    scores = cross_validate(pipe, X_train, y_train, return_train_score=True, n_jobs=-1)
    store_cross_val_results(model_name, scores, results_dict)

pd.DataFrame(results_dict).T

Unnamed: 0,mean_train_accuracy,mean_valid_accuracy,mean_fit_time (s),mean_score_time (s),std_train_score,std_valid_score
Dummy,0.6657,0.6969,0.002,0.0014,0.0097,0.0402
Decision Tree,1.0,0.7608,0.099,0.0154,0.0,0.0439
RBF SVM,0.9516,0.8157,0.1536,0.0332,0.0072,0.0596
Naive Bayes,0.9789,0.8543,0.0552,0.0098,0.0043,0.0206
Logistic Regression,1.0,0.8634,0.1552,0.0094,0.0,0.02


#### Hyperparameter Optimization

In [20]:
pipe = make_pipeline(CountVectorizer(), LogisticRegression(max_iter=1000, class_weight='balanced'))
pipe.fit(X_train, y_train)
vocab_size = len(pipe["countvectorizer"].get_feature_names())  # get the vocab_size for
print("Vocab size: ", vocab_size)

Vocab size:  4619


In [21]:
from scipy.stats import lognorm, loguniform, randint

param_grid = {
    "logisticregression__C": loguniform(1e-3, 1e3),
    "countvectorizer__max_features": randint(100, vocab_size),
}

In [22]:
random_search = RandomizedSearchCV(
    pipe,
    param_grid,
    n_iter=50,
    verbose=1,
    n_jobs=-1,
    random_state=123,
    return_train_score=True,
)

random_search.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 235 out of 250 | elapsed:    6.5s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    6.8s finished


RandomizedSearchCV(estimator=Pipeline(steps=[('countvectorizer',
                                              CountVectorizer()),
                                             ('logisticregression',
                                              LogisticRegression(class_weight='balanced',
                                                                 max_iter=1000))]),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'countvectorizer__max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000208427DF640>,
                                        'logisticregression__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000208427DF520>},
                   random_state=123, return_train_score=True, verbose=1)

In [23]:
print("Best hyperparameter values: ", random_search.best_params_)
print("Best score: %0.3f" % (random_search.best_score_))

pd.DataFrame(random_search.cv_results_)[
    [
        "mean_train_score",
        "mean_test_score",
        "param_logisticregression__C",
        "param_countvectorizer__max_features",
        "mean_fit_time",
        "rank_test_score",
    ]
].set_index("rank_test_score").sort_index()

Best hyperparameter values:  {'countvectorizer__max_features': 1042, 'logisticregression__C': 5.806334557802442}
Best score: 0.868


Unnamed: 0_level_0,mean_train_score,mean_test_score,param_logisticregression__C,param_countvectorizer__max_features,mean_fit_time
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.0,0.867973,5.80633,1042,0.131202
2,1.0,0.867947,18.9554,3682,0.229753
2,1.0,0.867947,10.3725,3787,0.184199
2,1.0,0.867947,14.7411,3425,0.157998
2,1.0,0.867947,12.577,3416,0.179399
6,1.0,0.8657,326.844,967,0.153405
7,1.0,0.865674,4.15299,3637,0.168004
8,1.0,0.865648,0.42799,3581,0.132601
8,1.0,0.865648,0.358907,3962,0.144812
10,1.0,0.863401,18.5503,4033,0.193421


#### Model Interpretation

In [24]:
best_estimator = random_search.best_estimator_

In [25]:
vocab = best_estimator[
    "countvectorizer"
].get_feature_names()  # Get features (words in our case)

weights = best_estimator[
    "logisticregression"
].coef_.flatten()  # Get feature coefficients

inds = np.argsort(
    best_estimator["logisticregression"].coef_.flatten()
)  # Sort the coefficients in descending order


negative_words = [
    vocab[index] for index in inds[:20]
]  # pick the first 20 as most informative features for negative reviews

positive_words = [
    vocab[index] for index in inds[-20:]
]  # pick the last 20 features as most informative features for positive reviews

neg_words_weights = [(weights[index]) for index in inds[:20]]
pos_words_weights = [(weights[index]) for index in inds[-20:]]

df = pd.DataFrame(
    {
        "Neg feats": negative_words,
        "Neg weights": neg_words_weights,
        "Pos feats": positive_words,
        "Pos weights": pos_words_weights,
    }
)
df

Unnamed: 0,Neg feats,Neg weights,Pos feats,Pos weights
0,thats,-1.609052,flavorful,0.648075
1,disappointed,-1.474498,more,0.663106
2,ordered,-1.261018,tried,0.672982
3,never,-1.239542,end,0.688098
4,kitchen,-1.211926,yam,0.701833
5,there,-1.102574,amazing,0.702996
6,off,-1.022058,nice,0.703865
7,not,-0.989468,favourite,0.719984
8,like,-0.977897,worth,0.721632
9,dish,-0.971393,friendly,0.72834


#### Model Evaluation

In [26]:
best_model = random_search.best_estimator_
best_model.fit(
    X_train, y_train
)  # Not necessary, as by default `refit=True` in `RandomizedSearchCV`. But OK for the purpose of this lab.
print("Grid Search best model score: %0.3f" % (random_search.best_score_))
print("Train score on the full train set: %0.3f" % (best_model.score(X_train, y_train)))
print("Test score on the full test set: %0.3f" % (best_model.score(X_test, y_test)))


Grid Search best model score: 0.868
Train score on the full train set: 1.000
Test score on the full test set: 0.857


In [27]:
most_pos_prob = np.max(best_model.predict_proba(X_test)[:, 1])
most_pos_ind = np.argmax(best_model.predict_proba(X_test)[:, 1])
print(
    "Most positive review where the prob of being positive is %0.3f:\n%s"
    % (most_pos_prob, X_test.iloc[most_pos_ind])
)

Most positive review where the prob of being positive is 1.000:
I absolutely love Chau I cant believe that for years I held off trying it because I was under the false belief that I wouldnt feel fully satisfied with it being vegan I finally tried it one evening with a vegan friend and boy was I wrong The moment the golden temple broth touched my lips I was hooked Ive been coming here regularly ever since Even my meat-loving boyfriend loves it Im so obsessed with the golden-temple soup that I cant not order it every single time My favourite way to have it is with the quinoa which sounds like its going to be too healthy tasting but it just compliments the soup so well The texture of the quinoa is so nice in the soup and it doesnt really absorb the broth too much I also like to get extra veggies between my boyfriend and I we get one order of extra regular veggies broccoli cauliflower carrot and one extra order of yams The extra veggies makes it all the more hearty and I always have leftov

In [28]:
most_neg_prob = np.min(best_model.predict_proba(X_test)[:, 1])
most_neg_ind = np.argmin(best_model.predict_proba(X_test)[:, 1])
print(
    "Most negative review where the prob of being positive is %0.3f:\n%s"
    % (most_neg_prob, X_test.iloc[most_neg_ind])
)

Most negative review where the prob of being positive is 0.000:
Visit the Arbor if you feel like getting yelled at by a server My spouse and I attended the Arbor to purchase a gift card or gift certificate for a friend We like the food at the Arbor and because we live nearby we find ourselves there once a month or so often brining along friends or family Today when we approached the counter I started to request information about a gift card or gift certificate I began to say Hi would it be possible to before I was rudely cut off by the man at the desk He raised his voice interrupting me mid-sentence to say Ill be with you in a minute Needless to say my spouse and I were put off enough that we left and will not be returning With his condescending and unfriendly behavior the staffperson at the front counter not only lost our business tonight in terms of the gift card we hoped to buy but our business in the future and the business of our friends for whom we were buying the gift card It wo