# Review Ranking

This notebook contains:
* EDA of review predictions.
* Analysis of how well the models rank the reviews.

## Imports and Global Settings

In [41]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import r2_score

pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 200)
sns.set_theme(style="whitegrid")

## Load Data

In [2]:
# EC2
# filepath_prefix = "/home/ubuntu/"

# Local
filepath_prefix = "/home/jeff/Documents/Data_Science_Projects/Yelp_Reviews/data/full_data/final_predict/"

In [3]:
train_records_to_load = 5523992 # Total: 5523992
test_records_to_load = 1382379 # Total: 1382379

In [4]:
datatypes = {'target_reg': 'int16',
             'log_reg_pred_proba': 'float32',
             'lin_reg_pred': 'float32',
             'review_stars': 'int16',
             'nb_prob': 'float32',
             'svm_pred': 'float32',
             'ft_prob': 'float32',
             'lda_t1': 'float32',
             'lda_t2': 'float32',
             'lda_t3': 'float32',
             'lda_t4': 'float32',
             'lda_t5': 'float32',
             'grade_level': 'float32',
             'polarity': 'float32',
             'subjectivity': 'float32',
             'word_cnt': 'int16',
             'character_cnt': 'int16'}

columns_to_load = ['review_id', 'user_id', 'business_id', 'date', 'target_clf'] + list(datatypes.keys())

In [5]:
train = pd.read_csv(f"{filepath_prefix}train_rank.csv", nrows=train_records_to_load,
                    true_values=["True"], false_values=["False"], usecols=columns_to_load,
                    dtype=datatypes, parse_dates=['date'], infer_datetime_format=True)
test = pd.read_csv(f"{filepath_prefix}test_rank.csv", nrows=test_records_to_load,
                   true_values=["True"], false_values=["False"], usecols=columns_to_load,
                   dtype=datatypes, parse_dates=['date'], infer_datetime_format=True)

In [11]:
all_data = pd.concat([train, test], ignore_index=True, verify_integrity=True)

## Basic Overview

In [12]:
print(all_data.shape)

(6906371, 22)


In [13]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6906371 entries, 0 to 6906370
Data columns (total 22 columns):
 #   Column              Dtype         
---  ------              -----         
 0   review_id           object        
 1   user_id             object        
 2   business_id         object        
 3   date                datetime64[ns]
 4   target_clf          bool          
 5   target_reg          int16         
 6   log_reg_pred_proba  float32       
 7   lin_reg_pred        float32       
 8   nb_prob             float32       
 9   svm_pred            float32       
 10  ft_prob             float32       
 11  lda_t1              float32       
 12  lda_t2              float32       
 13  lda_t3              float32       
 14  lda_t4              float32       
 15  lda_t5              float32       
 16  grade_level         float32       
 17  polarity            float32       
 18  subjectivity        float32       
 19  word_cnt            int16         
 20  ch

In [14]:
all_data.head()

Unnamed: 0,review_id,user_id,business_id,date,target_clf,target_reg,log_reg_pred_proba,lin_reg_pred,nb_prob,svm_pred,ft_prob,lda_t1,lda_t2,lda_t3,lda_t4,lda_t5,grade_level,polarity,subjectivity,word_cnt,character_cnt,review_stars
0,syrAB11Ayj0qb64M3orNyQ,eJTjh_nIJC7ldCuU-z7W5Q,VJj_xjjWX_UHaFNL91_Duw,2007-04-09 00:41:12,False,0,0.7725,3.85699,0.0,0.805,0.78156,0.0049,0.00489,0.83379,0.15149,0.00493,12.1,0.13819,0.58714,91,552,4
1,ybCCcr1ICVynGJBx0lpBAw,cibQYbrbI4UoEqeD0vpxmA,Qz-R16P6zvgJrRerqdtIaQ,2019-03-17 14:17:40,False,0,0.52248,2.16479,0.885,-0.356,0.61055,0.00686,0.00729,0.97217,0.00684,0.00685,5.3,0.44034,0.66364,57,338,4
2,HBuNpb82_z25gLK2htZjZw,wcgCqmw7mx7cEg5jgrgnEA,y2w6rFaO0XEiG5mFfOsiFA,2019-04-02 22:12:04,True,4,0.56829,2.71898,0.004,-0.014,0.59267,0.00297,0.00297,0.21996,0.51115,0.26294,27.6,0.28628,0.54253,151,764,5
3,RAIaaKEIg9gSJ-B4XcXBwA,ybjy3GVtk25kpZoGc8Nu-Q,eqr5t-Py3oOhIuukz27dEA,2018-12-02 19:25:20,False,0,0.63771,3.04406,0.0,0.892,0.55841,0.57003,0.0015,0.00151,0.42545,0.00151,6.1,0.02428,0.49481,327,1685,1
4,4fbqvddoQTLa7ChLJDYreg,xivmXoWXkWOubz-2ALK_Iw,CH0xyLeK0ixASWBGIEYkkA,2013-07-23 20:28:22,False,0,0.04768,-2.5596,0.0,-3.863,0.25632,0.01018,0.01022,0.95915,0.01019,0.01026,4.6,0.5125,0.67375,37,198,5


In [15]:
all_data.describe()

Unnamed: 0,target_reg,log_reg_pred_proba,lin_reg_pred,nb_prob,svm_pred,ft_prob,lda_t1,lda_t2,lda_t3,lda_t4,lda_t5,grade_level,polarity,subjectivity,word_cnt,character_cnt,review_stars
count,6906371.0,6906371.0,6906371.0,6906371.0,6906371.0,6906371.0,6906371.0,6906371.0,6906371.0,6906371.0,6906371.0,6906371.0,6906371.0,6906371.0,6906371.0,6906371.0,6906371.0
mean,2.17175,0.50572,2.17219,0.38203,-0.00991,0.50915,0.17765,0.21172,0.25091,0.15871,0.18486,13.37884,0.23498,0.55845,112.70522,606.26812,3.73077
std,6.52438,0.20209,1.96208,0.45105,1.4519,0.16511,0.31047,0.36537,0.35534,0.28754,0.30966,19.63479,0.2273,0.1322,104.54801,559.78931,1.45691
min,0.0,0.0,-32.2135,0.0,-31.336,-1e-05,0.0004,0.00041,0.00041,0.00042,0.00042,-15.7,-1.0,0.0,1.0,1.0,1.0
25%,0.0,0.34461,0.87227,0.0,-0.884,0.4011,0.00487,0.00456,0.00513,0.00513,0.00477,5.0,0.10139,0.48214,45.0,244.0,3.0
50%,1.0,0.47114,1.72557,0.02,-0.444,0.53175,0.00983,0.00969,0.01376,0.01028,0.01082,7.1,0.23772,0.55799,81.0,436.0,4.0
75%,2.0,0.64672,2.98055,0.999,0.418,0.63321,0.20636,0.28591,0.48554,0.18649,0.2734,11.7,0.3729,0.63778,144.0,771.0,5.0
max,1457.0,1.0,55.01871,1.0,44.159,1.0,0.99805,0.9981,0.99833,0.9981,0.99714,1330.0,1.0,1.0,3602.0,5000.0,5.0


## Calculate Rankings

In [22]:
all_data['actual_rank'] = all_data.groupby('business_id')['target_reg'].rank(ascending=False, method='min').astype('int16')
all_data['actual_rank_pct'] = all_data.groupby('business_id')['target_reg'].rank(ascending=False, method='min', pct=True).astype('float32')

In [23]:
all_data['clf_pred_rank'] = all_data.groupby('business_id')['log_reg_pred_proba'].rank(ascending=False, method='min').astype('int16')
all_data['clf_pred_rank_pct'] = all_data.groupby('business_id')['log_reg_pred_proba'].rank(ascending=False, method='min', pct=True).astype('float32')

In [24]:
all_data['reg_pred_rank'] = all_data.groupby('business_id')['lin_reg_pred'].rank(ascending=False, method='min').astype('int16')
all_data['reg_pred_rank_pct'] = all_data.groupby('business_id')['lin_reg_pred'].rank(ascending=False, method='min', pct=True).astype('float32')

## Compare Regression vs. Classifcation Predictions for Rankings

### MAE

In [45]:
clf_mae = ((all_data['actual_rank'] - all_data['clf_pred_rank']).abs().sum()) / len(all_data)
reg_mae = ((all_data['actual_rank'] - all_data['reg_pred_rank']).abs().sum()) / len(all_data)
print(f"Classification Rank MAE: {clf_mae:.2f}")
print(f"Regression Rank MAE: {reg_mae:.2f}")

Classification Rank MAE: 81.25
Regression Rank MAE: 81.09


### RMSE

In [47]:
clf_rmse = math.sqrt((((all_data['actual_rank'] - all_data['clf_pred_rank']) ** 2).sum()) / len(all_data))
reg_rmse = math.sqrt((((all_data['actual_rank'] - all_data['reg_pred_rank']) ** 2).sum()) / len(all_data))
print(f"Classification Rank RMSE: {clf_rmse:.2f}")
print(f"Regression Rank RMSE: {reg_rmse:.2f}")

Classification Rank RMSE: 45.55
Regression Rank RMSE: 45.52


### R2 Score

In [49]:
clf_r2 = r2_score(all_data['actual_rank'], all_data['clf_pred_rank'])
reg_r2 = r2_score(all_data['actual_rank'], all_data['reg_pred_rank'])
print(f"Classification Rank R2 Score: {clf_r2:.2f}")
print(f"Regression Rank R2 Score: {reg_r2:.2f}")

Classification Rank R2 Score: 0.95
Regression Rank R2 Score: 0.95


## Recreate Train/Test and Save Data

In [51]:
new_cols = ['review_id',
            'actual_rank', 'actual_rank_pct',
            'clf_pred_rank', 'clf_pred_rank_pct',
            'reg_pred_rank', 'reg_pred_rank_pct']
train_rank = train.merge(all_data[new_cols], how='left', on='review_id', validate='1:1')
test_rank = test.merge(all_data[new_cols], how='left', on='review_id', validate='1:1')

In [55]:
# train_rank.to_csv(f'{filepath_prefix}train_rankings.csv', index=False)
# test_rank.to_csv(f'{filepath_prefix}test_rankings.csv', index=False)