# Baseline Bias Model

The idea is to get a baseline model for some metrics of prediction. This model is basically a fixed effects regression, and it test the effect of each independent variable (user and product) over the response variable (rating) and the interaction of user*product over the response variable. 
The estimation of the model should tell us whether the named effects are statistically significant or not, to then chose a model to predict some ratings and get a prediction power metric.

In [0]:
import pandas as pd
import numpy as np

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
drive_root = "/content/drive/My Drive/Final Project/"

### Read Train and Test Dataframes

In [0]:
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [0]:
train = pd.read_csv(drive_root+"Clean Data/train_set_new.csv", low_memory=False)

In [0]:
train.head()

Unnamed: 0,review_id,user_id,business_id,rating,date
0,Q1sbwvVQXV2734tPgoKj4Q,hG7b0MtEbXx5QzbzE6C_VA,ujmEBvifdJM6h6RLv4wQIg,1.0,2013-05-07 04:34:36
1,2TzJjDVDEuAW6MR5Vuc1ug,n6-Gk65cPZL6Uz8qRm3NYw,WTqjgwHlXbSFevF32_DJVw,5.0,2016-11-09 20:09:03
2,G7XHMxG0bx9oBJNECG4IFg,jlu4CztcSxrKx56ba1a5AQ,3fw2X5bZYeW9xCz_zGhOHg,3.0,2016-05-07 01:21:02
3,8e9HxxLjjqc9ez5ezzN7iQ,d6xvYpyzcfbF_AZ8vMB7QA,zvO-PJCpNk4fgAVUnExYAA,1.0,2010-10-05 19:12:35
4,qrffudO73zsslZbe8B9D3Q,sG_h0dIzTKWa3Q6fmb4u-g,b2jN2mm9Wf3RcrZCgfo1cg,2.0,2015-01-18 14:04:18


In [0]:
train.describe()

Unnamed: 0,rating
count,4252142.0
mean,3.75
std,1.35
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


In [0]:
train.user_id.value_counts()

CxDOIDnH8gp9KXzpBHJYXw    4128
bLbSNkLggFnqwNNzzq-Ijw    2353
PKEzKWv_FktMm2mGPjwd0Q    1821
ELcQDlf69kb-ihJfxZyL0A    1763
DK57YibC5ShBmqQl97CKog    1726
                          ... 
mXimc5-6ty518Nh3UFLzgA       4
qdkkG3t_piHaR4IiAtR2YA       4
p2FRjdnz2z06BCzuu_I0Lg       4
t21dZOntAXHdkrlw80hDCg       4
eM4cjVUyX--72bHZiCM9gA       4
Name: user_id, Length: 286130, dtype: int64

### Calculate user and business bias in Train set

In [0]:
grouped_user = train.groupby("user_id")['rating'].mean()

In [0]:
grouped_user

user_id
---1lKK3aKOuomHnwAkAow   4.01
--0kuuLmuYBe3Rmu0Iycww   3.45
--2HUmLkcNHZp0xw6AMBPg   4.58
--2vR0DIsmQ6WfcSzKWigw   4.12
--3WaS23LcIXtxyFULJHTA   3.93
                         ... 
zzrZ8gFESj5pe-SFZ-h0lw   4.80
zzsmdXHUFBYuKUtPbXWjRA   5.00
zzvV3l9IqTRX7Db8nxThbA   4.25
zzw0Z6-_VDp9ShIRSKIsQw   3.22
zzxZoMmjbUjXcWZzrE3PIw   1.80
Name: rating, Length: 286130, dtype: float64

In [0]:
grouped_bus = train.groupby("business_id")['rating'].mean()

In [0]:
grouped_bus

business_id
--1UhMGODdWsrMastO9DZw   4.25
--6MefnULPED_I942VcFNA   3.30
--7zmmkVg-IMGaXbuVd0SQ   3.79
--8LPVSo5i0Oo61X01sV9A   3.00
--9QQLMTbFzLJ_oT-ON3Xw   3.36
                         ... 
zzvlwkcNR1CCqOPXwuvz2A   4.00
zzwaS0xn1MVEPEf0hNLjew   3.59
zzwhN7x37nyjP0ZM8oiHmw   4.07
zzwicjPC9g246MK2M1ZFBA   3.10
zzzaIBwimxVej4tY6qFOUQ   3.48
Name: rating, Length: 183398, dtype: float64

Calculate global rating mean

In [0]:
global_mean = train.rating.mean()
global_mean

3.7483973959477366

Substract the global mean to each user and business mean to calculate user and business bias

In [0]:
user_bias = (grouped_user - global_mean).to_frame(name='u_bias').reset_index(level=0)
bus_bias = (grouped_bus - global_mean).to_frame(name='b_bias').reset_index(level=0)

In [0]:
user_bias

Unnamed: 0,user_id,u_bias
0,---1lKK3aKOuomHnwAkAow,0.26
1,--0kuuLmuYBe3Rmu0Iycww,-0.29
2,--2HUmLkcNHZp0xw6AMBPg,0.83
3,--2vR0DIsmQ6WfcSzKWigw,0.37
4,--3WaS23LcIXtxyFULJHTA,0.18
...,...,...
286125,zzrZ8gFESj5pe-SFZ-h0lw,1.05
286126,zzsmdXHUFBYuKUtPbXWjRA,1.25
286127,zzvV3l9IqTRX7Db8nxThbA,0.50
286128,zzw0Z6-_VDp9ShIRSKIsQw,-0.53


In [0]:
bus_bias

Unnamed: 0,business_id,b_bias
0,--1UhMGODdWsrMastO9DZw,0.50
1,--6MefnULPED_I942VcFNA,-0.45
2,--7zmmkVg-IMGaXbuVd0SQ,0.04
3,--8LPVSo5i0Oo61X01sV9A,-0.75
4,--9QQLMTbFzLJ_oT-ON3Xw,-0.38
...,...,...
183393,zzvlwkcNR1CCqOPXwuvz2A,0.25
183394,zzwaS0xn1MVEPEf0hNLjew,-0.16
183395,zzwhN7x37nyjP0ZM8oiHmw,0.33
183396,zzwicjPC9g246MK2M1ZFBA,-0.64


Read last review file and make a prediction for this data

In [0]:
last_review = pd.read_csv(drive_root+"Clean Data/last_review.csv", low_memory=False)

In [0]:
#merge test set with bias columns
last_review_pred = pd.merge(pd.merge(last_review,user_bias, how='left',on='user_id'),bus_bias, how='left',on='business_id')

In [0]:
last_review_pred

Unnamed: 0,review_id,user_id,business_id,rating,date,u_bias,b_bias
0,wEJK2PeiS6Au0TOyxTyqUA,---1lKK3aKOuomHnwAkAow,Hqs4YNST_ZHbshwyi4bnsQ,5.00,2018-10-11 23:29:57,0.26,0.65
1,8QLqN2qZPeN6qh4i9tj-rw,--0kuuLmuYBe3Rmu0Iycww,PYe_FDw6QTbTf66WcGE_tw,2.00,2014-04-21 16:58:28,-0.29,0.25
2,dgtVm1qaV3KSzC1Cv7M7kg,--2HUmLkcNHZp0xw6AMBPg,KW9RNyBPmc77f9FsO92qYw,5.00,2018-10-04 02:02:28,0.83,-0.73
3,419OFmkJ4DSc2zkjuxX5hw,--2vR0DIsmQ6WfcSzKWigw,BLIJ-p5wYuAhw6Pp6mh6mw,3.00,2018-01-11 04:24:17,0.37,-0.34
4,6KblMvyPMH__K_eMKhHWOg,--3WaS23LcIXtxyFULJHTA,UKrfUw8quQiQM2N9i1nH0g,4.00,2018-09-03 19:32:11,0.18,0.92
...,...,...,...,...,...,...,...
286125,oOxa0qNQK6RLK-EsasCQzg,zzrZ8gFESj5pe-SFZ-h0lw,P2uNvUI1RCX8RGKFzj4ceA,5.00,2018-08-02 21:26:39,1.05,-2.75
286126,ElFGQ4pTi4geHQ2mYBDzug,zzsmdXHUFBYuKUtPbXWjRA,P6zYuNINXKKm5AZ8P5exmQ,1.00,2018-06-03 22:16:42,1.25,-1.16
286127,3NlTwm1txpcG-ZepmZ66Yg,zzvV3l9IqTRX7Db8nxThbA,LucXqItGj0JZD6EgySOakg,5.00,2018-03-10 03:41:30,0.50,-0.30
286128,_htR70NBrPuJ5sQzm-RAkw,zzw0Z6-_VDp9ShIRSKIsQw,1hfYBwAI7pFz50l5n4JpqQ,5.00,2018-07-22 16:42:02,-0.53,0.82


In [0]:
#check business' with no bias
last_review_pred.isnull().sum()

review_id         0
user_id           0
business_id       0
rating            0
date              0
u_bias            0
b_bias         2913
dtype: int64

In [0]:
#replace Nan business bias with zeros
last_review_pred.b_bias = last_review_pred.b_bias.fillna(0)

In [0]:
last_review_pred.isnull().sum()

review_id      0
user_id        0
business_id    0
rating         0
date           0
u_bias         0
b_bias         0
dtype: int64

Calculate predictions for last review file

In [0]:
#create and calculate predictions for last review
last_review_pred['prediction'] = global_mean+ last_review_pred.u_bias + last_review_pred.b_bias

In [0]:
last_review_pred

Unnamed: 0,review_id,user_id,business_id,rating,date,u_bias,b_bias,prediction
0,wEJK2PeiS6Au0TOyxTyqUA,---1lKK3aKOuomHnwAkAow,Hqs4YNST_ZHbshwyi4bnsQ,5.00,2018-10-11 23:29:57,0.26,0.65,4.66
1,8QLqN2qZPeN6qh4i9tj-rw,--0kuuLmuYBe3Rmu0Iycww,PYe_FDw6QTbTf66WcGE_tw,2.00,2014-04-21 16:58:28,-0.29,0.25,3.71
2,dgtVm1qaV3KSzC1Cv7M7kg,--2HUmLkcNHZp0xw6AMBPg,KW9RNyBPmc77f9FsO92qYw,5.00,2018-10-04 02:02:28,0.83,-0.73,3.85
3,419OFmkJ4DSc2zkjuxX5hw,--2vR0DIsmQ6WfcSzKWigw,BLIJ-p5wYuAhw6Pp6mh6mw,3.00,2018-01-11 04:24:17,0.37,-0.34,3.77
4,6KblMvyPMH__K_eMKhHWOg,--3WaS23LcIXtxyFULJHTA,UKrfUw8quQiQM2N9i1nH0g,4.00,2018-09-03 19:32:11,0.18,0.92,4.85
...,...,...,...,...,...,...,...,...
286125,oOxa0qNQK6RLK-EsasCQzg,zzrZ8gFESj5pe-SFZ-h0lw,P2uNvUI1RCX8RGKFzj4ceA,5.00,2018-08-02 21:26:39,1.05,-2.75,2.05
286126,ElFGQ4pTi4geHQ2mYBDzug,zzsmdXHUFBYuKUtPbXWjRA,P6zYuNINXKKm5AZ8P5exmQ,1.00,2018-06-03 22:16:42,1.25,-1.16,3.84
286127,3NlTwm1txpcG-ZepmZ66Yg,zzvV3l9IqTRX7Db8nxThbA,LucXqItGj0JZD6EgySOakg,5.00,2018-03-10 03:41:30,0.50,-0.30,3.95
286128,_htR70NBrPuJ5sQzm-RAkw,zzw0Z6-_VDp9ShIRSKIsQw,1hfYBwAI7pFz50l5n4JpqQ,5.00,2018-07-22 16:42:02,-0.53,0.82,4.05


With predictions done, calculate RMSE of the model on Test set

In [0]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

rmse = np.sqrt(mean_squared_error(last_review_pred.rating, last_review_pred.prediction))
mae = mean_absolute_error(last_review_pred.rating, last_review_pred.prediction)

print("The RMSE of bias based model is : {}".format(rmse))
print("The MAE of bias based model is : {}".format(mae))

The RMSE of bias based model is : 1.3915867020822716
The MAE of bias based model is : 1.0669430218853058


Save predictions for last review to csv file

In [0]:
predictions_csv = last_review_pred[['review_id','user_id','business_id','rating' ,'prediction']].copy()

In [0]:
predictions_csv.to_csv(drive_root+"Bias Model/biasmodel_lastreview_predictions.csv", index=False)