# Text Classification & Sentiment Analysis: Yelp Reviews

### Loading Libraries

In [1]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Warnings
import warnings

# Joblib, Path & Time
import joblib
from time import time
from pathlib import Path

# JSON (JavaScript Object Notation)
import json

# LightGBM (Gradient Boosting)
import lightgbm as lgb

# SciPy
from scipy import sparse

# TextBlob
from textblob import TextBlob

# Scikit-Learn
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Use seaborn's 'white' style for every figure in this notebook.
sns.set_style('white')

# Silence all Python warnings for the rest of the session.
warnings.simplefilter('ignore')

### Yelp Challenge: Business Reviews Dataset

#### Loading Data

In [4]:
# Location of the Yelp input data, relative to the notebook's working directory.
data_dir = Path('..') / 'data' / 'yelp'

In [6]:
# Load the Yelp user reviews from the parquet file under `data_dir`.
# The cells below read the columns year, stars, member_yrs and text.
yelp_reviews = pd.read_parquet(data_dir / 'user_reviews.parquet')

In [7]:
# Column dtypes and non-null counts. `show_counts` replaces the `null_counts`
# keyword, which pandas deprecated in 1.2 and removed in 2.0.
yelp_reviews.info(show_counts=True)

### Exploring Data

In [20]:
# Output locations: results go under results/yelp, stored splits and feature
# matrices under results/yelp/data.
yelp_dir = Path('results', 'yelp')

text_features_dir = yelp_dir / 'data'

# mkdir(parents=True, exist_ok=True) is already idempotent, so no separate
# exists() check is needed (and none can race with another process).
text_features_dir.mkdir(parents=True, exist_ok=True)

#### Reviews & Stars by Year

In [21]:
# Three side-by-side panels: review counts per year, star ratings per year,
# and the percentage share of each star rating.
fig, axes = plt.subplots(ncols=3, figsize=(18, 4))
ax_counts, ax_trend, ax_share = axes

# Panel 1: number of reviews per calendar year.
reviews_per_year = yelp_reviews.year.value_counts().sort_index()
reviews_per_year.plot.bar(title='Reviews per Year', ax=ax_counts, rot=0)

# Panel 2: star rating over the years.
sns.lineplot(x='year', y='stars', data=yelp_reviews, ax=ax_trend)
ax_trend.set_title('Stars per year')

# Panel 3: share of each star rating, in percent, with integer tick labels.
stars_dist = (yelp_reviews.stars
              .value_counts(normalize=True)
              .sort_index()
              .mul(100))
stars_dist.index = stars_dist.index.astype(int)
stars_dist.plot.barh(title='# Stars Breakdown', ax=ax_share)
ax_share.set_xlabel('Share of all Ratings (%)')
ax_share.set_ylabel('Number of Stars')

sns.despine()
fig.tight_layout()
plt.show()

#### Years of Membership Breakdown

In [22]:
# Count rows per membership-years value, expressed in thousands.
member_counts = (yelp_reviews.member_yrs
                 .value_counts()
                 .div(1000)
                 .sort_index())

ax = member_counts.plot.bar(title='Years of Membership', rot=0)
ax.set(xlabel='Number of Years', ylabel="Number of Members  ('000)")
sns.despine()
plt.tight_layout()
plt.show()

### Creating Train-Test Split

In [23]:
# Train on a 25% random sample of the pre-2019 reviews. The fixed seed keeps the
# sample, and every artefact derived from it and stored on disk, reproducible
# across kernel restarts.
train = yelp_reviews[yelp_reviews.year < 2019].sample(frac=.25, random_state=42)

# All 2019 reviews are held out as the test set.
test = yelp_reviews[yelp_reviews.year == 2019]

In [24]:
# Report the number of rows in each split, with thousands separators.
print(f'# Training Obs: {len(train):,.0f} | # Test Obs: {len(test):,.0f}')

In [25]:
# Persist both splits so that later steps can reload them from disk.
train.to_parquet(text_features_dir / 'train.parquet')

test.to_parquet(text_features_dir / 'test.parquet')

In [26]:
# Drop the full review frame; the rest of the notebook only uses train and test.
del yelp_reviews

#### Reloading Stored Data

In [27]:
# Read the stored splits back from disk.
train = pd.read_parquet(text_features_dir / 'train.parquet')

test = pd.read_parquet(text_features_dir / 'test.parquet')

### Creating Yelp Review Document-Term Matrix

In [28]:
# Unigram and bigram counts with English stop words removed; the vocabulary is
# capped at 10,000 terms.
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=10_000,
)

# Learn the vocabulary from the training text and build its document-term matrix.
train_dtm = vectorizer.fit_transform(train.text)
train_dtm

In [29]:
# Store the training document-term matrix (reloaded further down as 'train_dtm.npz').
sparse.save_npz(text_features_dir / 'train_dtm', train_dtm)

In [30]:
# Encode the test reviews with the vocabulary fitted on the training text
# (transform only, no refit), then store the result.
test_dtm = vectorizer.transform(test.text)

sparse.save_npz(text_features_dir / 'test_dtm', test_dtm)

#### Reloading Stored Data

In [31]:
# Read both stored document-term matrices back from disk.
train_dtm = sparse.load_npz(text_features_dir / 'train_dtm.npz')

test_dtm = sparse.load_npz(text_features_dir / 'test_dtm.npz')

### Combine Non-Text Features with The Document-Term Matrix

#### One-Hot-Encoding

In [32]:
# Stack the non-text, non-label columns of both splits and tag every row with
# its origin, so that both splits are encoded identically and can be separated
# again afterwards.
non_text_train = train.drop(columns=['text', 'stars']).assign(source='train')
non_text_test = test.drop(columns=['text', 'stars']).assign(source='test')

df = pd.concat([non_text_train, non_text_test])

In [33]:
# Number of distinct values per column decides how each column is treated.
uniques = df.nunique()

# Columns with more than 20 distinct values are cut into up to 10 quantile bins
# (labels=False yields integer bin codes; duplicate bin edges are dropped).
# Columns with 20 or fewer distinct values are passed through unchanged.
binned = pd.concat([(df.loc[:, uniques[uniques > 20].index]
                     .apply(pd.qcut, q=10, labels=False, duplicates='drop')),
                    df.loc[:, uniques[uniques <= 20].index]], axis=1)

# `show_counts` replaces `null_counts`, which pandas deprecated in 1.2 and
# removed in 2.0.
binned.info(show_counts=True)

In [34]:
# One-hot encode every column except the 'source' marker; the first level of
# each column is dropped.
encode_cols = binned.columns.drop('source')
dummies = pd.get_dummies(binned, columns=encode_cols, drop_first=True)

dummies.info()

In [35]:
# Keep the rows that came from the training split and remove the marker column.
is_train = dummies.source.eq('train')
train_dummies = dummies.loc[is_train].drop(columns='source')

train_dummies.info()

#### Training Set

In [36]:
# Convert the training dummy columns to a SciPy CSR sparse matrix of unsigned
# 8-bit integers.
train_numeric = sparse.csr_matrix(train_dummies.astype(np.uint8))

train_numeric.shape

In [37]:
# Append the dummy columns to the right of the document-term matrix.
train_dtm_numeric = sparse.hstack((train_dtm, train_numeric))

train_dtm_numeric.shape

In [38]:
# Store the combined training features (reloaded further down as 'train_dtm_numeric.npz').
sparse.save_npz(text_features_dir / 'train_dtm_numeric', 
                train_dtm_numeric)

#### Repeating for Test Set

In [39]:
# Rows that came from the test split, without the marker column.
test_dummies = dummies[dummies.source=='test'].drop('source', axis=1)

# Use the same uint8 dtype as the training dummy matrix (train_numeric) so
# that both splits are encoded identically.
test_numeric = sparse.csr_matrix(test_dummies.astype(np.uint8))

# Append the dummy columns to the right of the test document-term matrix.
test_dtm_numeric = sparse.hstack((test_dtm, test_numeric))
test_dtm_numeric.shape

In [40]:
# Store the combined test features (reloaded further down as 'test_dtm_numeric.npz').
sparse.save_npz(text_features_dir / 'test_dtm_numeric', test_dtm_numeric)

#### Reloading Stored Data

In [41]:
# Read both combined feature matrices back from disk.
train_dtm_numeric = sparse.load_npz(text_features_dir / 'train_dtm_numeric.npz')

test_dtm_numeric = sparse.load_npz(text_features_dir / 'test_dtm_numeric.npz')

### Benchmark Accuracy

In [42]:
# Per-model test accuracy and fit time, keyed by model name.
accuracy = {}
runtime = {}

# True test ratings; one prediction column per model is added later.
predictions = test.loc[:, ['stars']].copy()

In [43]:
# Baseline: predict the most frequent training rating for every test review.
most_frequent_rating = train.stars.mode().iloc[0]
naive_prediction = np.full_like(predictions.stars, fill_value=most_frequent_rating)

In [44]:
# Test-set accuracy of the constant baseline prediction.
naive_benchmark = accuracy_score(predictions.stars, naive_prediction)

In [45]:
# Display the benchmark accuracy.
naive_benchmark

### Model Evaluation Helper

In [46]:
def evaluate_model(model, X_train, X_test, name, store=False):
    """Fit `model`, predict the test set and record the outcome under `name`.

    The function works on the notebook-level objects `train`, `test`,
    `runtime`, `predictions` and `accuracy`:

    - the fit time in seconds is stored in ``runtime[name]``,
    - the test predictions are stored as column ``predictions[name]``,
    - the test accuracy is stored in ``accuracy[name]``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_train : training features; fitted against ``train.stars``
    X_test : test features; predictions are scored against ``test.stars``
    name : key used for all recorded results (and the file name when stored)
    store : if True, dump the fitted model to ``results/<name>.joblib``
    """
    start = time()
    model.fit(X_train, train.stars)
    runtime[name] = time() - start
    predictions[name] = model.predict(X_test)
    # Key everything by the `name` argument rather than the notebook-level
    # `result` variable, so the function does not depend on a global that
    # merely happens to hold the same value at call time.
    accuracy[name] = accuracy_score(test.stars, predictions[name])
    if store:
        joblib.dump(model, f'results/{name}.joblib')

### Multiclass Naive Bayes

In [47]:
# Multinomial Naive Bayes classifier with scikit-learn's default settings.
nb = MultinomialNB()

#### Text Features

In [48]:
# Key under which this run's runtime, predictions and accuracy are recorded.
result = 'nb_text'

In [49]:
# Fit on the text-only document-term matrix and score on the test set;
# the fitted model is not written to disk (store=False).
evaluate_model(nb, train_dtm, test_dtm, result, store=False)

#### Accuracy

In [50]:
# Test accuracy recorded for the text-only Naive Bayes run.
accuracy[result]

#### Confusion Matrix

In [51]:
# The five possible star ratings, used as class labels and as axis labels.
stars = list(range(1, 6))

# Passing `labels` fixes the matrix at 5x5 in this class order even when a
# rating is absent from both the true and the predicted values; without it the
# matrix could be smaller and no longer match the row/column labels.
# Rows are the true ratings, columns the predicted ratings.
pd.DataFrame(confusion_matrix(test.stars,
                              predictions[result],
                              labels=stars),
             columns=stars,
             index=stars)

#### Text & Numeric Features

In [53]:
# Key under which the combined-feature Naive Bayes results are recorded.
result = 'nb_combined'

In [54]:
# Refit the same `nb` estimator on text plus dummy features and score it on
# the test set; the fitted model is not written to disk (store=False).
evaluate_model(nb, train_dtm_numeric, test_dtm_numeric, result, store=False)

#### Accuracy

In [55]:
# Test accuracy recorded for the combined-feature Naive Bayes run.
accuracy[result]

### Multinomial Logistic Regression

In [56]:
# Grid of 11 values for the logistic regression `C` parameter, spaced by
# powers of ten from 1e-5 to 1e5.
Cs = np.logspace(start=-5, stop=5, num=11)

#### Text Features

In [57]:
# Test accuracy per C value and fit time per iteration (text features only).
log_reg_text_accuracy = {}

log_reg_text_runtime = []

for i, C in enumerate(Cs):
    start = time()
    # Build and fit in one expression; fit() returns the fitted estimator.
    model = LogisticRegression(solver='lbfgs',
                               multi_class='multinomial',
                               C=C).fit(train_dtm, train.stars)
    log_reg_text_runtime.append(time() - start)

    # Score the fitted model on the held-out test reviews.
    test_pred = model.predict(test_dtm)
    log_reg_text_accuracy[C] = accuracy_score(test.stars, test_pred)

    print(f'{C:12.5f}: {log_reg_text_runtime[i]:.2f}s | {log_reg_text_accuracy[C]:.2%}', flush=True)

In [58]:
# Save the accuracy per C value; the comparison section reads this file back.
pd.Series(log_reg_text_accuracy).to_csv(yelp_dir / 'logreg_text.csv')

In [59]:
# Record the best accuracy over all C values.
# NOTE(review): the best C is chosen by test-set accuracy, so this figure is
# an optimistic estimate of out-of-sample performance.
accuracy['lr_text'] = pd.Series(log_reg_text_accuracy).max()

# Average fit time across the C grid, in seconds.
runtime['lr_text'] = np.mean(log_reg_text_runtime)

#### Combined Features

In [60]:
# Test accuracy per C value and fit time per iteration (text + dummy features).
log_reg_comb_accuracy = {}

log_reg_comb_runtime = []

for i, C in enumerate(Cs):
    start = time()
    # Build and fit in one expression; fit() returns the fitted estimator.
    model = LogisticRegression(solver='lbfgs',
                               multi_class='multinomial',
                               C=C).fit(train_dtm_numeric, train.stars)
    log_reg_comb_runtime.append(time() - start)

    # Score the fitted model on the held-out test reviews.
    test_pred = model.predict(test_dtm_numeric)
    log_reg_comb_accuracy[C] = accuracy_score(test.stars, test_pred)

    print(f'{C:12.5f}: {log_reg_comb_runtime[i]:.2f}s | {log_reg_comb_accuracy[C]:.2%}', flush=True)

In [61]:
# Save the accuracy per C value; the comparison section reads this file back.
pd.Series(log_reg_comb_accuracy).to_csv(yelp_dir / 'logreg_combined.csv')

In [62]:
# Record the best accuracy over all C values.
# NOTE(review): the best C is chosen by test-set accuracy, so this figure is
# an optimistic estimate of out-of-sample performance.
accuracy['lr_comb'] = pd.Series(log_reg_comb_accuracy).max()

# Average fit time across the C grid, in seconds.
runtime['lr_comb'] = np.mean(log_reg_comb_runtime)

### Gradient Boosting

In [63]:
# LightGBM training set built from the combined features, cast to float32 CSR.
# Labels are the star ratings shifted down by one via .sub(1); presumably
# stars run 1-5, giving classes 0-4 - confirm against the data.
# NOTE(review): every column, including the n-gram count columns, is declared
# categorical here - confirm that this is intended.
lgb_train = lgb.Dataset(data=train_dtm_numeric.tocsr().astype(np.float32), 
                        label=train.stars.sub(1), 
                        categorical_feature=list(range(train_dtm_numeric.shape[1])))

In [64]:
# Validation set built from the test features with the same label shift;
# `reference=lgb_train` ties it to the training Dataset.
lgb_test = lgb.Dataset(data=test_dtm_numeric.tocsr().astype(np.float32), 
                       label=test.stars.sub(1), 
                       reference=lgb_train)

In [65]:
# LightGBM settings: five-class objective, evaluated by multi-class error rate.
param = dict(objective='multiclass',
             metrics=['multi_error'],
             num_class=5)

In [66]:
# LightGBM 4 removed the `early_stopping_rounds` and `verbose_eval` keywords
# from train(); both behaviours are requested through callbacks instead.
# Releases before 3.3 call the logging callback `print_evaluation`, so fall
# back to that name when `log_evaluation` is not available.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# Train for up to 2000 rounds, stop once the validation metric has not
# improved for 25 rounds, and log the metrics every 25 rounds.
booster = lgb.train(params=param,
                    train_set=lgb_train,
                    num_boost_round=2000,
                    valid_sets=[lgb_train, lgb_test],
                    callbacks=[lgb.early_stopping(stopping_rounds=25),
                               log_evaluation(period=25)])

In [67]:
# Write the trained booster to a text model file; the trailing semicolon
# suppresses the cell output.
booster.save_model((yelp_dir / 'lgb_model.txt').as_posix());

In [68]:
# Predict on the combined test features.
# NOTE(review): despite the name, the result is reduced with argmax over
# axis 1 below, so it presumably holds one score per class per review rather
# than class labels.
y_pred_class = booster.predict(test_dtm_numeric.astype(float))

In [69]:
# Take the highest-scoring class per row and add 1 to map the 0-based class
# index back to a star rating (the labels were built with stars.sub(1)).
accuracy['lgb_comb'] = accuracy_score(test.stars, y_pred_class.argmax(1) + 1)

### Comparison

In [70]:
# Display names for the combined-feature result keys shown in the comparison.
model_map = dict(nb_combined='Naive Bayes',
                 lr_comb='Logistic Regression',
                 lgb_comb='LightGBM')

In [71]:
# Keep only the results that have a display name and re-key them by that name.
accuracy_ = {model_map[key]: score
             for key, score in accuracy.items()
             if key in model_map}

In [72]:
# Reload the stored accuracy-per-C results as Series indexed by C.
# read_csv's `squeeze` keyword was removed in pandas 2.0; squeezing the
# single-column frame afterwards gives the same Series on every pandas version.
log_reg_text = pd.read_csv(yelp_dir / 'logreg_text.csv',
                           index_col=0).squeeze('columns')

log_reg_combined = pd.read_csv(yelp_dir / 'logreg_combined.csv',
                               index_col=0).squeeze('columns')

In [73]:
fig, axes = plt.subplots(ncols=2, figsize=(14, 4))
ax_models, ax_tuning = axes

# Left panel: accuracy per model, with the naive benchmark as a dashed line.
model_scores = pd.Series(accuracy_).sort_values()
model_scores.plot.barh(ax=ax_models, xlim=(.45, .75), title='Accuracy by Model')
ax_models.axvline(naive_benchmark, ls='--', lw=1, c='k')
ax_models.set_xlabel('Accuracy')

# Right panel: logistic regression accuracy across the C grid (log-scaled x-axis),
# one line for the text-only and one for the combined features.
log_reg = log_reg_text.to_frame('text').join(log_reg_combined.to_frame('combined'))
log_reg.plot(logx=True, ax=ax_tuning, title='Logistic Regression - Model Tuning')
ax_tuning.set_xlabel('Regularization')
ax_tuning.set_ylabel('Accuracy')

sns.despine()
fig.tight_layout()
plt.show()

### Textblob for Sentiment Analysis

In [74]:
# Draw one random training review (no seed, so it changes on every run) and print it.
sample_review = train.text.sample(1).iloc[0]

print(sample_review)

In [75]:
# Polarity score that TextBlob assigns to the sampled review.
TextBlob(sample_review).sentiment.polarity

In [76]:
def detect_sentiment(text):
    """Return the TextBlob sentiment polarity of `text`.

    TextBlob documents polarity as a float in [-1.0, 1.0], where negative
    values indicate negative sentiment.
    """
    blob = TextBlob(text)
    return blob.sentiment.polarity

In [77]:
# Score every training review and add the result to `train` in place as a
# 'sentiment' column.
# NOTE(review): this calls TextBlob once per row of `train`, which is
# presumably slow on the full frame.
train['sentiment'] = train.text.apply(detect_sentiment)

In [78]:
# Random subset of up to 100,000 training reviews. Capping the size at
# len(train) avoids the ValueError that sample() raises when asked for more
# rows than exist.
sample_reviews = train[['stars', 'text']].sample(min(100000, len(train)))

In [79]:
# Score the sampled reviews.
# NOTE(review): the plot and filters below read train.sentiment, so these
# sample scores appear to be unused - confirm whether this cell is still needed.
sample_reviews['sentiment'] = sample_reviews.text.apply(detect_sentiment)

In [80]:
# Boxen plot of sentiment polarity per star rating over the full training frame.
sns.boxenplot(x='stars', y='sentiment', data=train);
plt.show()

In [81]:
# Show up to 500 characters per cell so that review text is readable. The
# fully qualified option name is used because pandas resolves shorthand keys
# by pattern matching, which can break if a similarly named option is added.
pd.set_option('display.max_colwidth', 500)

In [82]:
# First few reviews with the lowest possible polarity score of -1.
train.loc[train.sentiment.eq(-1), 'text'].head()

In [83]:
# First five-star review whose polarity is clearly negative (below -0.3).
train.loc[(train.stars == 5) & (train.sentiment < -0.3), 'text'].head(1)

In [84]:
# First one-star review whose polarity is clearly positive (above 0.5).
train.loc[(train.stars == 1) & (train.sentiment > 0.5), 'text'].head(1)

In [85]:
# Restore the default column width. The fully qualified option name is used
# because pandas resolves shorthand keys by pattern matching, which can break
# if a similarly named option is added.
pd.reset_option('display.max_colwidth')