<a href="https://colab.research.google.com/github/jiangzl2016/yelp-rating-prediction/blob/master/VAE_RecSys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Attach the user's Google Drive to the Colab VM; the CSV files read by the
# data-loading cell live underneath this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
!pip install -U deepctr[gpu]

Requirement already up-to-date: deepctr[gpu] in /usr/local/lib/python3.6/dist-packages (0.7.0)


In [0]:
## data handling
# setup libraries and env
import os
import shutil
import sys

import numpy as np
from scipy import sparse

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sn
sn.set()

import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from deepctr.models import DeepFM, CCPM, FNN, PNN, WDL, MLR, NFM, AFM, DCN, DIN, DIEN, DSIN, xDeepFM, AutoInt, NFFM, FGCNN, FiBiNET
from deepctr.inputs import SparseFeat,get_feature_names


# Load the three pre-split rating samples (train / cross-validation / hold-out)
# from the mounted Drive folder. The first CSV column becomes the row index.
data_dir = '/content/drive/My Drive/final_project_datasets'
training, validation, test = (
  pd.read_csv(data_dir + '/' + file_name, index_col=0)
  for file_name in (
    'ratings_sample_train.csv',
    'ratings_sample_cv.csv',
    'ratings_sample_holdout.csv',
  )
)

In [0]:
# For every split: parse the `date` column into datetimes, then derive an
# `hour` column (hour of day of the review) from it.
for frame in (training, validation, test):
  frame['date'] = pd.to_datetime(frame['date'])
  frame['hour'] = frame['date'].dt.hour

# Preview the hold-out split with the new column.
test.head()

Unnamed: 0,user_id,business_id,rating,date,text,hour
1,n6-Gk65cPZL6Uz8qRm3NYw,hk5wpV-_pi5jmDDVPeG8DA,5.0,2018-09-14 18:50:19,"I highly recommend Arizona Pet Mortuary, David...",18
403,FIk4lQQu1eTe2EpzQ4xhBA,jLxeBgWhLRbII2ACkgH1Sg,4.0,2018-09-30 18:00:41,First time for me to come inside at least! Hav...,18
737,_N7Ndn29bpll_961oPeEfw,O-b5osM0NO4f31dp6_DatQ,3.0,2014-08-01 01:55:23,"I can only comment on their macarons, which I'...",1
797,C_hUvw2z0R-Rv0yZb6QCZA,O19VReN1I2TBrJsbXUAIJg,5.0,2018-10-18 05:39:38,Very good. Probably one of the best restaurant...,5
851,DbccYu3OppWKl21OanZnTg,dTqKpbaiUgEd4rPSzKuWqQ,5.0,2018-10-07 01:33:10,"As someone with a big sweet tooth, this review...",1


In [0]:
# Number of distinct hold-out users that never appear in the training split.
len(set(test.user_id).difference(training.user_id))

0

In [0]:
# Keep only validation / hold-out rows whose business AND user both occur in
# the training split. Ids missing from training have no entry in the
# id -> index mappings built from `training`, so they would become NaN when
# mapped and then break the label encoding fitted on the training columns.
# The original cell filtered on business_id only, and the unseen-user check
# above covers `test` but not `validation`.
# `.copy()` detaches the filtered frames so that later column assignments do
# not write into a slice of the original frame.
test = test.loc[
  test.business_id.isin(training.business_id)
  & test.user_id.isin(training.user_id)
].copy()
validation = validation.loc[
  validation.business_id.isin(training.business_id)
  & validation.user_id.isin(training.user_id)
].copy()

In [0]:
# Assign each distinct training user / business a consecutive integer index,
# numbered in order of first appearance in the training split.
user_mapping = {
  raw_id: index for index, raw_id in enumerate(training.user_id.unique())
}

business_mapping = {
  raw_id: index for index, raw_id in enumerate(training.business_id.unique())
}

In [0]:
# Replace the raw user / business ids with their integer indices in every
# split (training, validation, hold-out). Ids absent from a mapping become NaN.
# NOTE: the mappings are keyed by the raw ids, so running this cell a second
# time on already-mapped columns would turn every value into NaN.
for frame in (training, validation, test):
  frame['user_id'] = frame['user_id'].map(user_mapping)
  frame['business_id'] = frame['business_id'].map(business_mapping)

In [0]:
# Preview the hold-out split after the id columns were replaced by indices.
test.head(n=5)

Unnamed: 0,user_id,business_id,rating,date,text,hour
403,1,16012,4.0,2018-09-30 18:00:41,First time for me to come inside at least! Hav...,18
737,2,4847,3.0,2014-08-01 01:55:23,"I can only comment on their macarons, which I'...",1
797,3,36986,5.0,2018-10-18 05:39:38,Very good. Probably one of the best restaurant...,5
851,4,10745,5.0,2018-10-07 01:33:10,"As someone with a big sweet tooth, this review...",1
854,5,23359,2.0,2017-10-28 17:56:07,This place had the best customer service I've ...,17


In [0]:
# 1. Label-encode the sparse (categorical) features. There are no dense
#    features in this notebook. Each encoder is fitted on the training column
#    only and then applied to all three splits, so validation / hold-out values
#    must already be present in training.
sparse_features = ["user_id", "business_id", "hour"]
target = ['rating']
for feat in sparse_features:
  lbe = LabelEncoder().fit(training[feat])
  for frame in (training, validation, test):
    frame[feat] = lbe.transform(frame[feat])

In [0]:
# 2. Describe every sparse field to DeepCTR: its name, the number of distinct
#    encoded values seen in training, and an embedding size of 8. The same
#    column list feeds both the linear part and the DNN part of the model.
fixlen_feature_columns = []
for feat in sparse_features:
  vocab_size = training[feat].nunique()
  fixlen_feature_columns.append(SparseFeat(feat, vocab_size, embedding_dim=8))

linear_feature_columns = dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [0]:
# 3. Build the model inputs: for each split, a dict mapping every feature name
#    to that column's values as an array.
train_model_input, valid_model_input, test_model_input = (
  {name: frame[name].values for name in feature_names}
  for frame in (training, validation, test)
)

In [0]:
# 4. Define the model: a DeepCTR NFM regressor with two hidden layers of 128
#    units, dropout 0.3, and L2 regularisation of 1e-05 on the embedding, DNN
#    and linear parts.
model = NFM(
  linear_feature_columns,
  dnn_feature_columns,
  task='regression',
  dnn_hidden_units=(128, 128),
  dnn_dropout=0.3,
  l2_reg_embedding=1e-05,
  l2_reg_dnn=1e-05,
  l2_reg_linear=1e-05,
)

In [0]:
# Train with the Adam optimizer on mean squared error, and also report MSE as
# a metric during training.
model.compile(optimizer="adam", loss="mse", metrics=['mse'])

In [0]:
# Fit for 10 epochs in mini-batches of 256, evaluating on the validation split
# after every epoch; `history` keeps the per-epoch losses and metrics.
# NOTE(review): in the recorded run the validation loss is lowest after the
# first epoch and rises in every later epoch while the training loss keeps
# falling, so the final weights are overfit. Consider fewer epochs or early
# stopping.
history = model.fit(
  x=train_model_input,
  y=training[target].values,
  batch_size=256,
  epochs=10,
  verbose=2,
  validation_data=(valid_model_input, validation[target].values),
)

Train on 406042 samples, validate on 25174 samples
Epoch 1/10


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


406042/406042 - 30s - loss: 1.9493 - mse: 1.9412 - val_loss: 1.7994 - val_mse: 1.7827
Epoch 2/10
406042/406042 - 29s - loss: 1.2845 - mse: 1.2568 - val_loss: 1.8162 - val_mse: 1.7786
Epoch 3/10
406042/406042 - 28s - loss: 0.9721 - mse: 0.9267 - val_loss: 1.9982 - val_mse: 1.9452
Epoch 4/10
406042/406042 - 28s - loss: 0.7268 - mse: 0.6708 - val_loss: 2.1509 - val_mse: 2.0906
Epoch 5/10
406042/406042 - 28s - loss: 0.6206 - mse: 0.5590 - val_loss: 2.2590 - val_mse: 2.1942
Epoch 6/10
406042/406042 - 28s - loss: 0.5683 - mse: 0.5027 - val_loss: 2.3703 - val_mse: 2.3021
Epoch 7/10
406042/406042 - 29s - loss: 0.5360 - mse: 0.4673 - val_loss: 2.4009 - val_mse: 2.3296
Epoch 8/10
406042/406042 - 28s - loss: 0.5120 - mse: 0.4405 - val_loss: 2.4647 - val_mse: 2.3910
Epoch 9/10
406042/406042 - 28s - loss: 0.4941 - mse: 0.4202 - val_loss: 2.5188 - val_mse: 2.4428
Epoch 10/10
406042/406042 - 29s - loss: 0.4807 - mse: 0.4046 - val_loss: 2.5270 - val_mse: 2.4490


In [0]:
# Predict ratings for the hold-out split using the trained model.
pred_ans = model.predict(
  x=test_model_input,
  batch_size=256,
)

In [0]:
# Hold-out mean squared error between true and predicted ratings, rounded to
# four decimals for display.
test_mse = mean_squared_error(test[target].values, pred_ans)
print("test MSE", round(test_mse, 4))

test MSE 2.5012


In [0]:
# Display the predicted ratings for the hold-out split.
pred_ans

array([[3.9471264],
       [3.6658187],
       [2.7839057],
       ...,
       [4.43822  ],
       [2.2483878],
       [4.1733575]], dtype=float32)

In [0]:
# How many true hold-out ratings fall below 2, below 3 and below 4
# (cumulative counts, for comparison with the predictions).
tuple((test[target].values < threshold).sum() for threshold in (2, 3, 4))

(3687, 5672, 8015)

In [0]:
# The same cumulative counts (below 2, below 3, below 4) for the predicted
# ratings.
tuple((pred_ans < threshold).sum() for threshold in (2, 3, 4))

(2246, 6599, 14234)