# Text Processing - Yelp 2021 - Part 2

This notebook covers:
* Tf-Idf Text Vectorization
* Naive Bayes Predictions
* Support Vector Machine Predictions (a linear SVM, fitted via `SGDClassifier` with its default hinge loss)

## Imports and Global Settings

In [83]:
# Common Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Main NLP libraries
import nltk
# Classification
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import make_scorer, recall_score, precision_score

# Render every float in pandas output with five decimal places.
pd.set_option('display.float_format', '{:.5f}'.format)

## Import Data

In [2]:
# Input file name and the directory (relative to this notebook) that holds it.
# The two are concatenated as-is later, so the directory keeps its trailing slash.
filename = "text_data.json"
file_location = "../data/full_data/analytics_ready/"

In [3]:
# Number of leading records to read; the full file holds 6,907,890 records.
num_records_to_load = 10_000

In [4]:
# The file is line-delimited JSON (one record per line), which is what lets
# `nrows` stop reading after the first `num_records_to_load` records.
df = pd.read_json(
    f"{file_location}{filename}",
    orient="records",
    lines=True,
    nrows=num_records_to_load,
)

## Dataframe Pre-Processing

In [5]:
# Preview the first five rows (the default for head()).
df.head()

Unnamed: 0,review_id,review_stars,review_text,target_ufc_bool,target_ufc_count
0,---zlFD4Kgfatr0SbDh_zg,4,Been looking for a halfway decent Chinese/Amer...,False,0
1,--BcxYRlOpG0v7nVQWseYA,4,I visited Kyma last week for the first time an...,False,0
2,--KO46TSxWzv32x00s5w9Q,5,It might be the most expensive gelato I've eve...,False,0
3,--XNrIWxRUafMsGqzB5o0g,5,"Love this place! They have great antiques, be...",True,1
4,--aGgQu9HVva6F9fB2-0ew,4,Great salad and cold sandwich.. The soup is am...,False,0


In [6]:
# Check column dtypes and non-null counts before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   review_id         10000 non-null  object
 1   review_stars      10000 non-null  int64 
 2   review_text       10000 non-null  object
 3   target_ufc_bool   10000 non-null  object
 4   target_ufc_count  10000 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 390.8+ KB


In [7]:
# Class balance of the target label.
df["target_ufc_bool"].value_counts()

True     5134
False    4866
Name: target_ufc_bool, dtype: int64

## Splitting Text

In [8]:
# The target column holds the strings "True"/"False" (object dtype), so the
# class masks below compare against strings rather than booleans.
corpus = df["review_text"]
text_target = df.loc[:, ['target_ufc_bool', 'review_text']]

# Review texts split by class: Q = quality ("True"), NQ = not quality ("False").
Q_corpus = corpus[df["target_ufc_bool"] == "True"]
NQ_corpus = corpus[df["target_ufc_bool"] == "False"]

print(f'Corpus Size: Total:{corpus.size}, Quality:{Q_corpus.size}, Not Quality:{NQ_corpus.size}')

Corpus Size: Total:10000, Quality:5134, Not Quality:4866


## Prep Work

In [9]:
# Keep only the model input (review text) and the label for the baseline classifiers.
baseline_cls_data = df.loc[:, ['review_text', 'target_ufc_bool']]
baseline_cls_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   review_text      10000 non-null  object
 1   target_ufc_bool  10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


In [10]:
# Features (raw review text) and labels used by every model below.
X = baseline_cls_data["review_text"]
y = baseline_cls_data["target_ufc_bool"]

In [11]:
# Metrics collected in every cross-validation run.
# Precision and recall are given an explicit positive label because the target
# is stored as the strings "True"/"False".
scoring = dict(
    roc_auc='roc_auc',
    accuracy='accuracy',
    precision=make_scorer(precision_score, pos_label="True"),
    recall=make_scorer(recall_score, pos_label="True"),
)

In [12]:
def display_clf_results(cv_clf):
    """Print the fold-averaged timings and test scores of a cross-validation run.

    Args:
        cv_clf: Mapping with per-fold values under the keys ``fit_time``,
            ``score_time``, ``test_accuracy``, ``test_roc_auc``,
            ``test_precision`` and ``test_recall``.
    """
    def fold_mean(key):
        # Average one metric across the folds.
        return np.mean(cv_clf[key])

    print('Model Performance')
    print(f'Fit Time: {fold_mean("fit_time"):.2f} secs, Score Time: {fold_mean("score_time"):.2f} secs')

    metric_labels = (
        ('Accuracy', 'test_accuracy'),
        ('ROC AUC', 'test_roc_auc'),
        ('Precision', 'test_precision'),
        ('Recall', 'test_recall'),
    )
    for label, key in metric_labels:
        print(f'{label}: {fold_mean(key):.2f}')

### Majority Class Baseline (True or Quality)

In [84]:
# Accuracy of always predicting the most frequent label; a useful model must beat it.
# The largest class count is used instead of hard-coding "True", so the baseline
# stays correct if a different sample makes "False" the majority class.
majority_class_baseline = round(y.value_counts().max() / len(y), 2)
print(f'Majority Class Baseline: {majority_class_baseline}')

Majority Class Baseline: 0.51


### Word Count Baseline

In [85]:
# Baseline: raw token counts fed to a linear classifier trained with SGD.
# random_state is fixed because SGDClassifier shuffles the training data;
# without a seed the cross-validated scores change on every run.
word_count_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', SGDClassifier(random_state=42)),
])

In [86]:
# 5-fold cross-validation of the word-count baseline on all configured metrics.
wc_results = cross_validate(
    estimator=word_count_pipeline,
    X=X,
    y=y,
    scoring=scoring,
    cv=5,
)

In [87]:
# Fold-averaged scores of the word-count baseline.
display_clf_results(cv_clf=wc_results)

Model Performance
Fit Time: 0.77 secs, Score Time: 0.30 secs
Accuracy: 0.59
ROC AUC: 0.61
Precision: 0.61
Recall: 0.55


### Tf-idf Baseline

In [88]:
# Baseline: token counts re-weighted by tf-idf, then a linear classifier trained with SGD.
# random_state is fixed because SGDClassifier shuffles the training data;
# without a seed the cross-validated scores change on every run.
tfidf_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(random_state=42)),
])

In [89]:
# 5-fold cross-validation of the tf-idf baseline on all configured metrics.
tfidf_results = cross_validate(
    estimator=tfidf_pipeline,
    X=X,
    y=y,
    scoring=scoring,
    cv=5,
)

In [90]:
# Fold-averaged scores of the tf-idf baseline.
display_clf_results(cv_clf=tfidf_results)

Model Performance
Fit Time: 0.75 secs, Score Time: 0.32 secs
Accuracy: 0.61
ROC AUC: 0.65
Precision: 0.62
Recall: 0.62


### Tf-idf Grid Search Hyperparameter Tuning

In [20]:
# Estimator to tune: token counts -> tf-idf weighting -> multinomial Naive Bayes.
tfidf_grid_cv_pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Search space, keyed as "<step>__<parameter>".
# 2 * 2 * 2 * 3 * 2 * 3 = 144 candidate combinations.
tfidf_grid_cv_params = {
    # Tokenisation options of the count vectorizer.
    'vect__strip_accents': [None, 'ascii'],
    'vect__lowercase': [True, False],
    'vect__stop_words': [None, 'english'],
    'vect__ngram_range': [(1,1), (1,2), (1,3)],
    # Row normalisation applied to the tf-idf vectors.
    'tfidf__norm': ['l1', 'l2'],
    # Smoothing strength of the Naive Bayes classifier.
    'clf__alpha': [0.1, 1, 10],
}

In [21]:
# Exhaustive 5-fold search over the grid. All metrics are recorded, and the
# final model is refit on the full data using the best-accuracy candidate.
# NOTE(review): the recorded output of this cell shows n_jobs=-1 and
# pos_label=True, so it appears to come from an earlier version of the code —
# confirm and re-run.
tfidf_grid_cv_MNB = GridSearchCV(
    estimator=tfidf_grid_cv_pipeline,
    param_grid=tfidf_grid_cv_params,
    scoring=scoring,
    refit='accuracy',
    cv=5,
    n_jobs=1,
    verbose=3,
)
tfidf_grid_cv_MNB.fit(X, y)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'clf__alpha': [0.1, 1, 10],
                         'tfidf__norm': ['l1', 'l2'],
                         'vect__lowercase': [True, False],
                         'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
                         'vect__stop_words': [None, 'english'],
                         'vect__strip_accents': [None, 'ascii']},
             refit='accuracy',
             scoring={'accuracy': 'accuracy',
                      'precision': make_scorer(precision_score, pos_label=True),
                      'recall': make_scorer(recall_score, pos_label=True),
                      'roc_auc': 'roc_auc'},
             verbose=3)

In [25]:
# Per-candidate score table of the search and the position of the candidate
# selected by the refit metric.
results = tfidf_grid_cv_MNB.cv_results_
idx = tfidf_grid_cv_MNB.best_index_

In [26]:
def display_clf_grid_search_results(cv_clf_gs, best_idx):
    """Print the scores of one grid-search candidate plus the best score per metric.

    Args:
        cv_clf_gs: Mapping of per-candidate result sequences (as found in
            ``GridSearchCV.cv_results_``). Must provide ``mean_fit_time``,
            ``mean_score_time`` and ``mean_test_<metric>`` for accuracy,
            roc_auc, precision and recall.
        best_idx: Position of the candidate to report, e.g. ``best_index_``.

    The trailing "Best ..." lines are maxima taken over *all* candidates, so
    they may belong to different candidates than the one at ``best_idx``.
    """
    print('Model Performance')
    print(f'Fit Time: {cv_clf_gs["mean_fit_time"][best_idx]:.2f} secs, Score Time: {cv_clf_gs["mean_score_time"][best_idx]:.2f} secs')
    print(f'Accuracy: {cv_clf_gs["mean_test_accuracy"][best_idx]:.2f}')
    print(f'ROC AUC: {cv_clf_gs["mean_test_roc_auc"][best_idx]:.2f}')
    print(f'Precision: {cv_clf_gs["mean_test_precision"][best_idx]:.2f}')
    print(f'Recall: {cv_clf_gs["mean_test_recall"][best_idx]:.2f}')
    print('Best model choosen using accuracy.')
    # Candidates whose fit failed are scored NaN. The built-in max() compares
    # with ">" and therefore returns NaN (or an order-dependent value) when a
    # NaN is present, so use the NaN-ignoring maximum instead.
    print(f'Best ROC AUC: {np.nanmax(cv_clf_gs["mean_test_roc_auc"]):.2f}')
    print(f'Best Precision: {np.nanmax(cv_clf_gs["mean_test_precision"]):.2f}')
    print(f'Best Recall: {np.nanmax(cv_clf_gs["mean_test_recall"]):.2f}')

In [27]:
# Scores of the accuracy-selected candidate, plus the per-metric maxima.
display_clf_grid_search_results(cv_clf_gs=results, best_idx=idx)

Model Performance
Fit Time: 1.49 secs, Score Time: 0.74 secs
Accuracy: 0.61
ROC AUC: 0.65
Precision: 0.61
Recall: 0.66
Best model choosen using accuracy.
Best ROC AUC: 0.68
Best Precision: 0.61
Best Recall: 1.00


### Feature Reduction Using Chi-Squared Test

In [112]:
# Tf-idf over unigrams and bigrams (English stop words removed), reduced to the
# 10 features with the highest chi-squared score before the linear classifier.
# random_state is fixed because SGDClassifier shuffles the training data;
# without a seed the cross-validated scores change on every run.
tfidf_chi2_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1,2))),
    ('feature_selection', SelectKBest(chi2, k=10)),
    ('clf', SGDClassifier(random_state=42)),
])

In [113]:
# 5-fold cross-validation of the chi-squared-reduced pipeline on all configured metrics.
tfidf_chi2_results = cross_validate(
    estimator=tfidf_chi2_pipeline,
    X=X,
    y=y,
    scoring=scoring,
    cv=5,
)

In [114]:
# Fold-averaged scores of the chi-squared-reduced pipeline.
display_clf_results(cv_clf=tfidf_chi2_results)

Model Performance
Fit Time: 2.04 secs, Score Time: 0.54 secs
Accuracy: 0.55
ROC AUC: 0.55
Precision: 0.54
Recall: 0.89
