In [45]:
import numpy as np
import pandas as pd

from sklearn import preprocessing as pp
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import roc_curve, auc, roc_auc_score, mean_squared_error

# import lightgbm as lgb

import tensorflow as tf
import keras
from keras import backend as K
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout
from keras.layers import BatchNormalization, Input, Lambda
from keras import regularizers
# from keras.losses import mse, binary_crossentropy

# 데이터 불러오기

In [46]:
# Read the review ratings from the CSV file located in the working directory.
ratingDF = pd.read_csv(filepath_or_buffer="philadelphia_reviews.csv")
# Leave the frame as the cell's final expression so the notebook renders it.
ratingDF

Unnamed: 0,user_id,business_id,stars
0,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5
1,kSMOJwJXuEUqzfmuFncK4A,kxX2SOes4o-D3ZQBkiMRfA,2
2,mqBWACmaHflW4eh_Ofp16Q,kxX2SOes4o-D3ZQBkiMRfA,5
3,Z-xgVb4nM42943m2wbBkFw,kxX2SOes4o-D3ZQBkiMRfA,5
4,2SEoXb6r6hPKrl9V9VzBgA,kxX2SOes4o-D3ZQBkiMRfA,5
...,...,...,...
738683,reFwg-F-MCoRS1pA2YexMg,BEuGzy5gxtGyOEk8iwDfTg,5
738684,dh3UhfqpFSoCitl8WCrdfw,BEuGzy5gxtGyOEk8iwDfTg,5
738685,nXwPLYvazD_Nd2Sar3eSWA,BEuGzy5gxtGyOEk8iwDfTg,5
738686,MlOJCxiiB1TeZN-is2Q4SA,BEuGzy5gxtGyOEk8iwDfTg,5


## 데이터 집계

In [47]:
# Headline counts for the review table: total rows, distinct users and
# distinct businesses.
n_ratings = ratingDF.shape[0]
n_users = len(ratingDF["user_id"].unique())
n_businesses = len(ratingDF["business_id"].unique())
# Mean number of rating rows per distinct user.
aver_ratings_per_user = n_ratings / n_users

print("Num of users:", n_users)
print("Num of businesses:", n_businesses)
print("Num of ratings:", n_ratings)
print("Average of ratings per user:", aver_ratings_per_user)

Num of users: 216944
Num of businesses: 7076
Num of ratings: 738688
Average of ratings per user: 3.4049708680581166


# 인덱스 정수형 변환

## User ID 생성

In [48]:
# Lookup table for users: one row per distinct user_id, paired with a
# 1-based integer id derived from the row position (index + 1).
users = ratingDF["user_id"].unique()
userDF = pd.DataFrame({"originalUserId": users})
userDF = userDF.assign(newUserId=userDF.index + 1)
userDF

Unnamed: 0,originalUserId,newUserId
0,_7bHUi9Uuf5__HHc_Q8guQ,1
1,kSMOJwJXuEUqzfmuFncK4A,2
2,mqBWACmaHflW4eh_Ofp16Q,3
3,Z-xgVb4nM42943m2wbBkFw,4
4,2SEoXb6r6hPKrl9V9VzBgA,5
...,...,...
216939,HzsMA4M0wwJtYpIuy-TKig,216940
216940,xbX64Rq2_xCb4WRO3aYG-g,216941
216941,4rYuARoWNIYSfVyG6zNrmA,216942
216942,WLaCTpXXdrFKlUTXDsp6Tg,216943


## Business ID 생성

In [49]:
# Lookup table for businesses: one row per distinct business_id, paired with
# a 1-based integer id derived from the row position (index + 1).
businesses = ratingDF["business_id"].unique()
businessDF = pd.DataFrame({"originalBusinessId": businesses})
businessDF = businessDF.assign(newBusinessId=businessDF.index + 1)
businessDF

Unnamed: 0,originalBusinessId,newBusinessId
0,kxX2SOes4o-D3ZQBkiMRfA,1
1,04UD14gamNjLY0IDYVhHJg,2
2,RZtGWDLCAtuipwaZ-UfjmQ,3
3,YtSqYv1Q_pOltsVPSx54SA,4
4,eFvzHawVJofxSnD7TgbZtg,5
...,...,...
7071,ruPaJWpkFnyBxsN_LnnmQw,7072
7072,7gNY2fx_8-RFicmecY3RsA,7073
7073,suu00nBOfDsTOjSm_WtV8A,7074
7074,K1SsvIPfFcHniNSPc3IG7g,7075


## 정수 인덱스 추가

In [50]:
# Attach the integer ids from userDF / businessDF to every review row.
#
# This cell overwrites ratingDF, so it must be safe to run more than once.
# Id columns left over from a previous run are dropped first: otherwise the
# merge finds `newUserId` / `newBusinessId` on both sides and pandas renames
# the clashing columns with `_x` / `_y` suffixes, so the expected column
# names disappear.
ratingDF = ratingDF.drop(columns=["newUserId", "newBusinessId"], errors="ignore")

# validate="many_to_one" makes pandas raise a MergeError if a key occurs more
# than once in the lookup table, instead of silently duplicating review rows.
ratingDF = ratingDF.merge(
    userDF,
    left_on="user_id",
    right_on="originalUserId",
    validate="many_to_one",
).drop(columns="originalUserId")  # the join key copy is redundant with user_id

ratingDF = ratingDF.merge(
    businessDF,
    left_on="business_id",
    right_on="originalBusinessId",
    validate="many_to_one",
).drop(columns="originalBusinessId")  # redundant with business_id

ratingDF

Unnamed: 0,user_id,business_id,stars,newUserId,newBusinessId
0,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,1
1,kSMOJwJXuEUqzfmuFncK4A,kxX2SOes4o-D3ZQBkiMRfA,2,2,1
2,mqBWACmaHflW4eh_Ofp16Q,kxX2SOes4o-D3ZQBkiMRfA,5,3,1
3,mqBWACmaHflW4eh_Ofp16Q,kxX2SOes4o-D3ZQBkiMRfA,5,3,1
4,Z-xgVb4nM42943m2wbBkFw,kxX2SOes4o-D3ZQBkiMRfA,5,4,1
...,...,...,...,...,...
738683,BByOqQ8d_pKDH8pz4QvWWA,AuOBDegADgRWYDVpfgsgfw,1,202065,6228
738684,P1L-wYUTpvLEDq9D2TOexQ,AuOBDegADgRWYDVpfgsgfw,1,202066,6228
738685,GRAOH8iUq9B8MtZVjRZl_Q,AuOBDegADgRWYDVpfgsgfw,5,202067,6228
738686,o3UaM3DbI5dbERHYHD9Njw,AuOBDegADgRWYDVpfgsgfw,1,202068,6228
