In [15]:
pip install pandas-gbq --user

Note: you may need to restart the kernel to use updated packages.


In [8]:
import pandas as pd
from pandas.io import gbq
from sklearn.neighbors import NearestNeighbors
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline

In [3]:
!pip install scikit-surprise

Collecting scikit-surprise
  Using cached scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [4]:
import pickle

import joblib
import numpy as np
from joblib import parallel_backend
from scipy import sparse
from sklearn.model_selection import RandomizedSearchCV
from surprise import Dataset, Reader
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import KFold

In [5]:
# SQL text: id and display name of every row in Yelp.business.
queryYelp_bus = """
Select business_id,local_name
from Yelp.business
"""

In [6]:
# SQL text: (user, business, rating) triples from Yelp.Reviews, restricted to businesses
# whose id appears in Yelp.business.
queryYelp_us="""
Select user_id, business_id, rating
from Yelp.Reviews
where business_id in (select business_id from Yelp.business)
"""

In [9]:
# Run the business query on BigQuery (standard SQL dialect) and keep the result as data_bus.
data_bus = pd.read_gbq(queryYelp_bus, dialect='standard')

In [10]:
# Run the reviews query on BigQuery (standard SQL dialect) and keep the result as data_us.
data_us= pd.read_gbq(queryYelp_us, dialect='standard')

In [11]:
data_us

Unnamed: 0,user_id,business_id,rating
0,Hix_0MLiJtCIyhhC-ceBdQ,iZNR8-rqsBL2afDk4Zxe8A,1.0
1,F5lONxVG4eQeJLPcfpSV9Q,QHOim2XPDxt_752IDXvmrA,1.0
2,JBKTNfhVlu4EqbG-WvJ8xA,AM0TA-5mW3-yZ5WrMYYPtw,1.0
3,2UQLhyDMaKsh72HGLbEpWA,MVlXCYKHwuk1Rs4wHELGIA,1.0
4,ALWsmfbAqnRkVnCimrIHIw,4YU-VlpC_DU0EnOjlr4C1w,1.0
...,...,...,...
1483925,bldlSShNjxRmlEhz_ls25w,f3eve2cxUIzyanWnHdMtFQ,5.0
1483926,rHudXub0t_MUXW8nkbAm4Q,WeDn7iD1ckcEg7YttkduWg,5.0
1483927,wAN_AY-VbWGnS-7L5k71aw,Ul7iiiIR7AXcI6ta06SkjQ,5.0
1483928,nWyWNBqB659sV_hshwyKbA,owM_gW2UpuL9U3ZgqhvahA,5.0


In [12]:
# Force the rating column to float dtype (the column of data_us is overwritten).
data_us['rating'] = data_us['rating'].astype(float)

In [13]:
# Keep only the reviews whose rating is 3 or higher.
val = data_us[data_us['rating'] >= 3]

In [14]:
# Wrap the filtered ratings for Surprise on a 1-5 scale. The frame is passed as-is, so its
# columns are expected to be user, item, rating in that order.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(val, reader)

In [15]:
# Hold out 30% of the ratings for evaluation; the fixed seed makes the split (and every metric
# computed on it) repeatable after a kernel restart.
set_entreno, set_prueba = train_test_split(data, test_size=.30, random_state=42)

In [16]:
# Baseline SVD with the library's default hyperparameters; seeded so the random factor
# initialisation, and therefore the fitted model, is repeatable.
modelo_us = SVD(random_state=42)

In [17]:
# Train the baseline model on the training split.
modelo_us.fit(set_entreno)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fcd4e94a2d0>

In [26]:
# Baseline model's predictions for the held-out split.
predictions = modelo_us.test(set_prueba)

In [32]:
# Spot check: baseline estimate for one (user id, business id) pair.
modelo_us.predict('bldlSShNjxRmlEhz_ls25w','f3eve2cxUIzyanWnHdMtFQ')

Prediction(uid='bldlSShNjxRmlEhz_ls25w', iid='f3eve2cxUIzyanWnHdMtFQ', r_ui=None, est=4.826572166033648, details={'was_impossible': False})

In [18]:
usuario = 'bldlSShNjxRmlEhz_ls25w'
rating = 4   # minimum stars: keep the businesses this user rated 4 or 5
# Reviews of the target user at or above the threshold, enriched with the business name.
df_user = (
    data_us
    .loc[(data_us['user_id'] == usuario) & (data_us['rating'] >= rating)]
    .reset_index(drop=True)
    .merge(data_bus, on='business_id', how='inner')
)
df_user

Unnamed: 0,user_id,business_id,rating,local_name
0,bldlSShNjxRmlEhz_ls25w,x2ly0NsSTMmMpljpdIZdYA,5.0,Cadiz Restaurant & Bar
1,bldlSShNjxRmlEhz_ls25w,f3eve2cxUIzyanWnHdMtFQ,5.0,Corazon Cocina


In [40]:
# Candidate list starts as a copy of the full business table.
recomendaciones_usuario = data_bus.copy()
print(recomendaciones_usuario.shape)
recomendaciones_usuario.head()

(30253, 2)


Unnamed: 0,business_id,local_name
0,BJ0Z74sTz9sxRr1R533Inw,Best Rate Home Services
1,dm9UbH8XPAP6735lHRYILA,Ruby's Roof Jamaican Restaurant
2,aVRVBncwdK3GrqLKLFPKAg,Paradise Pizzeria
3,IMn6n4kjpvp0ur_KT6C3pw,Dollar General
4,_WxO_7bJmeeKF-wCcvgeFQ,Seafood Sensations


In [38]:
# Every review row of the target user, whatever the rating.
usuario_vistas = data_us[data_us['user_id'] == usuario]
print(usuario_vistas.shape)
usuario_vistas.head()

(2, 3)


Unnamed: 0,user_id,business_id,rating
1150297,bldlSShNjxRmlEhz_ls25w,x2ly0NsSTMmMpljpdIZdYA,5.0
1483925,bldlSShNjxRmlEhz_ls25w,f3eve2cxUIzyanWnHdMtFQ,5.0


In [44]:
# Anti-join: tag each candidate with its merge origin and keep only those absent from the
# user's reviews. The `_merge` indicator column is still present after this step.
recomendaciones_usuario = (
    recomendaciones_usuario
    .merge(usuario_vistas[['business_id']], how='left', indicator=True)
    .loc[lambda candidatos: candidatos['_merge'] == 'left_only']
)

In [46]:
# Discard the helper `_merge` column.
recomendaciones_usuario = recomendaciones_usuario.drop(columns=['_merge'])

In [49]:
# Score every remaining candidate with the baseline model's estimate for the target user.
recomendaciones_usuario['Estimate_Score'] = recomendaciones_usuario['business_id'].map(
    lambda business_id: modelo_us.predict(usuario, business_id).est
)

In [50]:
# Rank candidates from highest to lowest estimate and preview the top rows.
recomendaciones_usuario = recomendaciones_usuario.sort_values(
    by='Estimate_Score',
    ascending=False,
)
recomendaciones_usuario.head()

Unnamed: 0,business_id,local_name,Estimate_Score
28866,Ku2b7oA9s56mHoflb5lJ2w,Alma Del Mar,5.0
28810,sKl0EzbH76zZL5CFLpdrgA,BRŪ Florida Growler Bar,5.0
28843,CLtf4i6Mm8nj5t2wB9myEg,Gulf Coast Sourdough & Wild Yeast Breads,5.0
28841,dF0N_SLXe6TUa4CfQ42LYA,The 439 Magic Experience,5.0
28838,zCmdpK9TYREr3sO1QO6BCw,Philly Brew Tours by City Brew Tours,5.0


In [51]:
# RMSE of whatever `predictions` currently holds.
# NOTE(review): `predictions` is assigned in two cells of this notebook (from modelo_us.test and
# from modelo_yelp.test), so the figure reported here depends on which of them ran last —
# confirm which model this number is meant to describe.
accuracy.rmse(predictions)

RMSE: 0.6518


0.6517908898319645

In [52]:
# Sweep the number of latent factors and record the mean 3-fold test RMSE for each value.
rmse_test_means = []
factores = [1,2,4,8,16,32,64,128,256]

# One seeded splitter shared by all runs: every value of n_factors is scored on the same folds,
# so differences between rows come from the model rather than from the sampling.
folds = KFold(n_splits=3, random_state=42)

for factor in factores:
    print(factor)
    # Seeded as well, so the factor initialisation does not vary between reruns.
    model = SVD(n_factors=factor, random_state=42)
    cv = cross_validate(model, data, measures=['RMSE'], cv=folds, verbose=True)
    rmse_test_means.append(np.mean(cv['test_rmse']))

1
Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.6473  0.6462  0.6476  0.6470  0.0006  
Fit time          16.68   15.90   16.45   16.35   0.33    
Test time         3.98    3.18    4.75    3.97    0.64    
2
Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.6475  0.6476  0.6465  0.6472  0.0005  
Fit time          17.06   16.69   17.33   17.03   0.26    
Test time         4.96    4.75    3.99    4.57    0.42    
4
Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.6474  0.6472  0.6473  0.6473  0.0001  
Fit time          18.23   17.76   17.75   17.91   0.23    
Test time         4.08    4.28    4.02    4.13    0.11    
8
Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.6467  0.6

In [19]:
# Exhaustive search over SVD hyperparameters, scored by 3-fold cross-validated RMSE using all cores.
# The single-valued 'random_state' entry seeds every candidate model and the seeded KFold fixes
# the folds, so best_score / best_params are repeatable from run to run.
param_grid = {
    'n_factors': [5, 50, 100],
    'n_epochs': [5, 10, 20],
    'lr_all': [0.001, 0.002, 0.005],
    'reg_all': [0.002, 0.02, 0.2],
    'random_state': [42],
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'],
                  cv=KFold(n_splits=3, random_state=42), n_jobs=-1)
gs.fit(data)

In [20]:
# Best cross-validated RMSE found by the search, then the parameter combination that produced it.
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.6474753799705083
{'n_factors': 5, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}


In [21]:
# Build the tuned model directly from the grid-search winner instead of re-typing the values:
# the hand-copied n_epochs=5 / reg_all=0.002 did not match the reported best parameters
# (n_epochs=20, reg_all=0.02).
modelo_yelp = SVD(**gs.best_params['rmse'])

In [27]:
# Rebind modelo_yelp to the estimator the grid search exposes for its best RMSE.
modelo_yelp=gs.best_estimator['rmse']

In [28]:
# Train the tuned model on the training split.
modelo_yelp.fit(set_entreno)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fcd1cba6550>

In [29]:
# Tuned model's predictions for the held-out split (rebinds the notebook-level `predictions`).
predictions = modelo_yelp.test(set_prueba)

In [30]:
predictions[1]

Prediction(uid='8YJ9YKVDohgfVY46ljjaig', iid='XSuO2E30ArnrjH9jquaK5Q', r_ui=5.0, est=4.884173392563544, details={'was_impossible': False})

In [33]:
def get_recommendationyelp(usuario, data_us=data_us, data_bus=data_bus, modelo=None, n=5):
    """Recommend the businesses with the highest predicted rating for one user.

    Businesses the user has already reviewed are excluded; every remaining
    business is scored with the model and the names of the best ``n`` are returned.

    Parameters
    ----------
    usuario : str
        Raw user id, as stored in ``data_us['user_id']``.
    data_us : pandas.DataFrame
        Reviews; the ``user_id`` and ``business_id`` columns are read.
    data_bus : pandas.DataFrame
        Businesses; the ``business_id`` and ``local_name`` columns are read.
    modelo : optional
        Fitted model whose ``predict(user_id, business_id)`` result exposes ``.est``.
        When omitted, the notebook-level ``modelo_yelp`` is used.
    n : int, default 5
        How many business names to return.

    Returns
    -------
    dict
        ``{'restaurantes recomendados': [name, ...]}`` ordered from the highest
        to the lowest estimated rating.
    """
    if modelo is None:
        # Looked up at call time, so a re-trained modelo_yelp is picked up automatically.
        modelo = modelo_yelp

    # Ids of everything this user has reviewed, whatever the rating.
    ids_vistos = data_us.loc[data_us['user_id'] == usuario, 'business_id']

    # Candidates = businesses not yet reviewed; .copy() so adding a column never touches data_bus.
    recomendaciones_usuario = data_bus[~data_bus['business_id'].isin(ids_vistos)].copy()

    recomendaciones_usuario['Estimate_Score'] = recomendaciones_usuario['business_id'].apply(
        lambda business_id: modelo.predict(usuario, business_id).est
    )
    recomendaciones_usuario = recomendaciones_usuario.sort_values('Estimate_Score', ascending=False)
    recomendaciones = recomendaciones_usuario['local_name'].head(n).tolist()
    return {'restaurantes recomendados': recomendaciones}

In [34]:
get_recommendationyelp('8YJ9YKVDohgfVY46ljjaig')

{'restaurantes recomendados': ['Utsav Indian Cuisine',
  "Pedro's Smog Check",
  'Frozen Rolled Icecream',
  'Pomodoro Pizza & Italian Restaurant',
  "Nature's Food Patch Market & Café"]}

In [35]:
# Serialise the tuned model to the local file 'modelo.joblib'.
joblib.dump(modelo_yelp, 'modelo.joblib')

['modelo.joblib']

In [38]:
from google.cloud import storage
import os

In [39]:
# Upload the serialised model to the gs://model-yelp bucket.
# A gs:// URI always uses '/' as its separator, so it is built by plain concatenation rather than
# os.path.join, whose separator depends on the operating system.
model_directory = 'gs://model-yelp'
storage_path = model_directory + '/modelo.joblib'
# NOTE(review): project='' is kept exactly as found — confirm the intended project id.
blob = storage.blob.Blob.from_string(storage_path,
                                    client=storage.Client(project=''))
blob.upload_from_filename("modelo.joblib")

In [45]:
import pyarrow as pa
import pyarrow.parquet as pq

In [46]:
# Convert the reviews DataFrame into an Arrow table.
table_us = pa.Table.from_pandas(data_us)

In [47]:
# Write the reviews table to the local file 'data_us.parquet'.
pq.write_table(table_us, 'data_us.parquet')

In [48]:
# Convert the businesses DataFrame into an Arrow table.
table_bus = pa.Table.from_pandas(data_bus)

In [49]:
# Write the businesses table to the local file 'data_bus.parquet'.
pq.write_table(table_bus, 'data_bus.parquet')

In [50]:
# Upload the reviews Parquet file to the gs://model-yelp bucket.
# A gs:// URI always uses '/' as its separator, so it is built by plain concatenation rather than
# os.path.join, whose separator depends on the operating system.
model_directory = 'gs://model-yelp'
storage_path = model_directory + '/data_us.parquet'
# NOTE(review): project='' is kept exactly as found — confirm the intended project id.
blob = storage.blob.Blob.from_string(storage_path,
                                    client=storage.Client(project=''))
blob.upload_from_filename("data_us.parquet")

In [51]:
# Upload the businesses Parquet file to the gs://model-yelp bucket.
# A gs:// URI always uses '/' as its separator, so it is built by plain concatenation rather than
# os.path.join, whose separator depends on the operating system.
model_directory = 'gs://model-yelp'
storage_path = model_directory + '/data_bus.parquet'
# NOTE(review): project='' is kept exactly as found — confirm the intended project id.
blob = storage.blob.Blob.from_string(storage_path,
                                    client=storage.Client(project=''))
blob.upload_from_filename("data_bus.parquet")