In [1]:
# Importacion de librerias a utilizar
from google.colab import drive
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pickle
import pyarrow.parquet as pq

# Establece conexión con google Drive
drive.mount('/content/drive')

def aplicar_ETL():
  # Variables donde se almacenan las rutas de los archivos crudos
  ruta1='/content/drive/MyDrive/Proyecto Final/Yelp/business.pkl'
  ruta2='/content/drive/MyDrive/Proyecto Final/Yelp/checkin.json'
  ruta3='/content/drive/MyDrive/Proyecto Final/Yelp/review.json'
  ruta4='/content/drive/MyDrive/Proyecto Final/Yelp/tip.json'
  ruta5='/content/drive/MyDrive/Proyecto Final/Yelp/user.parquet'


  # Desarrollo de ETL---------------------------------
  try:
    #  # Lee archivo y devuelve un dataframe de pandas
    business=pd.read_pickle(ruta1)
    business=business.iloc[:,:14]
    # Seleccion de columna importantes
    df_business=business.drop(columns=['is_open','attributes','hours'])
    # Elimina valores nulos
    df_business=df_business.dropna().reset_index(drop=True)
    # Cambio de Tipos
    df_business['latitude']=df_business['latitude'].astype('float')
    df_business['longitude']=df_business['longitude'].astype('float')
    df_business['stars']=df_business['stars'].astype('int')
    df_business['review_count']=df_business['review_count'].astype('int')
    # Lista de estados seleccionados
    estados_seleccionados=['PA','FL','CA','IL']
    # Filtado registros por estados
    df_business=df_business[df_business['state'].isin(estados_seleccionados)].reset_index(drop=True)
    # Trabajaremos con los negocios que contengan mas de 20 reseñas, ya que menos reseñas no seria eficiente para el sistema de recomendación
    df_business=df_business[df_business['review_count']>20].reset_index(drop=True)
    # Filtrado de negocios por rubro gastronómico
    palabras_clave=["Restaurants","Food","Bubble Tea","Coffee & Tea","Bakery","Barbecue","Buffets","Burgers","Cafes","Catering","Cocktail Bars","Desserts","Diners","Food Trucks","Ice Cream & Frozen Yogurt","Juice Bars & Smoothies","Pizza","Seafood","Steakhouses","Sushi Bars","Tapas Bars","Wine Bars","Vegetarian","Vegan","Gluten-Free"]
    filtro_gastronomico=df_business['categories'].str.contains('|'.join(palabras_clave),case=False)
    df_business=df_business[filtro_gastronomico].reset_index(drop=True)
    # Exportar archivo en formato parquet
    df_business.to_parquet('business_procesado.parquet')
  except: pass

  try:
    # Lee archivo y devuelve un dataframe de pandas
    checkin=pd.read_json(ruta2,lines=True)
    # Transforma los datos de la columna "date" en una lista de fechas
    checkin['date']=checkin['date'].str.split(',')
    # Expandir datos
    checkin=checkin.explode('date').reset_index(drop=True)
    # Cruzar con datos de df_business y filtrar unicamente los negocios del rubro gastronomico
    df_merge_checkin_business=pd.merge(checkin,df_business,on='business_id',how='inner')
    df_checkin=df_merge_checkin_business[checkin.columns]
    # Exportar archivo en formato parquet
    df_checkin.to_parquet('checkin_procesado.parquet')
  except: pass

  try:
    # Lee archivo y devuelve un dataframe de pandas
    review=pd.read_json(ruta3,lines=True)
    # Cambiar nombre de variable "stars" para evitar errores al momento de cruzar los datos
    review.rename({'stars':'stars_review'},axis=1,inplace=True)
    # Cruzar con datos del data set con df_biusiness (resultado luego del etl del dataset "business")
    df_review=pd.merge(review,df_business,on='business_id',how='inner')
    df_review=df_review[review.columns]
    # Exportar archivo en formato parquet
    df_review.to_parquet('review_procesado.parquet')
  except: pass

  try:
    tabla_parquet=pq.read_table(ruta5)
    user=tabla_parquet.to_pandas()
    # Lee archivo y devuelve un dataframe de pandas
    # user=pd.read_parquet(ruta5)
    #Eliminar registros duplicados
    df_user=user.drop_duplicates().reset_index(drop=True)
    # Filtrar usuarios con mayor cantidad de reseñas efectuadas (usuarios mas activos en la plataforma es un indicador de fiabilidad del comentario tanto positivo como negativo)
    df_user=df_user[df_user['review_count']>20]
    # Exportar archivo en formato parquet
    df_user.to_parquet('user_procesado.parquet')
  except:pass


  # # Muestra datasets ya procesados
  print('\n---------------------------------------------------------------------Dataset "business" procesado-------------------------------------------------------\n')
  display(df_business)
  print('\n---------------------------------------------------------------------Dataset "checkin" procesado--------------------------------------------------------\n')
  display(df_checkin)
  print('\n---------------------------------------------------------------------Dataset "review" procesado----------------------------------------------------------\n')
  display(df_review)
  print('\n---------------------------------------------------------------------Dataset "user" procesado----------------------------------------------------------\n')
  display(df_user)




Mounted at /content/drive


In [3]:
ruta1='/content/drive/MyDrive/Proyecto Final/Yelp/business.pkl'
business=pd.read_pickle(ruta1)
business=business.iloc[:,:14]
# Seleccion de columna importantes
df_business=business.drop(columns=['is_open','attributes','hours'])
# Elimina valores nulos
df_business=df_business.dropna().reset_index(drop=True)
# Cambio de Tipos
df_business['latitude']=df_business['latitude'].astype('float')
df_business['longitude']=df_business['longitude'].astype('float')
df_business['stars']=df_business['stars'].astype('int')
df_business['review_count']=df_business['review_count'].astype('int')
# Lista de estados seleccionados
estados_seleccionados=['PA','FL','CA','IL']
# Filtado registros por estados
df_business=df_business[df_business['state'].isin(estados_seleccionados)].reset_index(drop=True)
# Trabajaremos con los negocios que contengan mas de 20 reseñas, ya que menos reseñas no seria eficiente para el sistema de recomendación
df_business=df_business[df_business['review_count']>20].reset_index(drop=True)
# Filtrado de negocios por rubro gastronómico
palabras_clave=["Restaurants","Food","Bubble Tea","Coffee & Tea","Bakery","Barbecue","Buffets","Burgers","Cafes","Catering","Cocktail Bars","Desserts","Diners","Food Trucks","Ice Cream & Frozen Yogurt","Juice Bars & Smoothies","Pizza","Seafood","Steakhouses","Sushi Bars","Tapas Bars","Wine Bars","Vegetarian","Vegan","Gluten-Free"]
filtro_gastronomico=df_business['categories'].str.contains('|'.join(palabras_clave),case=False)
df_business=df_business[filtro_gastronomico].reset_index(drop=True)
# Exportar archivo en formato parquet
df_business.to_parquet('business_procesado.parquet')
df_business


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,categories
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4,80,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
1,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4,100,"Food, Delis, Italian, Bakeries, Restaurants"
2,QdN72BWoyFypdGJhhI5r7g,Bar One,767 S 9th St,Philadelphia,PA,19147,39.939825,-75.157447,4,65,"Cocktail Bars, Bars, Italian, Nightlife, Resta..."
3,Mjboz24M9NlBeiOJKLEd_Q,DeSandro on Main,4105 Main St,Philadelphia,PA,19127,40.022466,-75.218314,3,41,"Pizza, Restaurants, Salad, Soup"
4,aPNXGTDkf-4bjhyMBQxqpQ,Craft Hall,901 N Delaware Ave,Philadelphia,PA,19123,39.962582,-75.135657,3,65,"Eatertainment, Arts & Entertainment, Brewpubs,..."
...,...,...,...,...,...,...,...,...,...,...,...
17021,wVxXRFf10zTTAs11nr4xeA,PrimoHoagies,6024 Ridge Ave,Philadelphia,CA,19128,40.032483,-75.214430,3,55,"Restaurants, Specialty Food, Food, Sandwiches,..."
17022,8n93L-ilMAsvwUatarykSg,Kitchen Gia,3716 Spruce St,Philadelphia,PA,19104,39.951018,-75.198240,3,22,"Coffee & Tea, Food, Sandwiches, American (Trad..."
17023,2MAQeAqmD8enCT2ZYqUgIQ,The Melting Pot - Nashville,"166 2nd Ave N, Ste A",Nashville,PA,37201,36.163875,-86.776311,4,204,"Fondue, Beer, Wine & Spirits, Food, Restaurants"
17024,w_4xUt-1AyY2ZwKtnjW0Xg,Bittercreek Alehouse,246 N 8th St,Boise,PA,83702,43.616590,-116.202383,4,998,"Bars, Gastropubs, Sandwiches, Nightlife, Resta..."


In [5]:
df_business.astype(str).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17026 entries, 0 to 17025
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   business_id   17026 non-null  object
 1   name          17026 non-null  object
 2   address       17026 non-null  object
 3   city          17026 non-null  object
 4   state         17026 non-null  object
 5   postal_code   17026 non-null  object
 6   latitude      17026 non-null  object
 7   longitude     17026 non-null  object
 8   stars         17026 non-null  object
 9   review_count  17026 non-null  object
 10  categories    17026 non-null  object
dtypes: object(11)
memory usage: 1.4+ MB


In [None]:
# LLamar a la funcion de ETL

aplicar_ETL()


---------------------------------------------------------------------Dataset "business" procesado-------------------------------------------------------



Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,categories
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4,80,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
1,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4,100,"Food, Delis, Italian, Bakeries, Restaurants"
2,QdN72BWoyFypdGJhhI5r7g,Bar One,767 S 9th St,Philadelphia,PA,19147,39.939825,-75.157447,4,65,"Cocktail Bars, Bars, Italian, Nightlife, Resta..."
3,Mjboz24M9NlBeiOJKLEd_Q,DeSandro on Main,4105 Main St,Philadelphia,PA,19127,40.022466,-75.218314,3,41,"Pizza, Restaurants, Salad, Soup"
4,aPNXGTDkf-4bjhyMBQxqpQ,Craft Hall,901 N Delaware Ave,Philadelphia,PA,19123,39.962582,-75.135657,3,65,"Eatertainment, Arts & Entertainment, Brewpubs,..."
...,...,...,...,...,...,...,...,...,...,...,...
17021,wVxXRFf10zTTAs11nr4xeA,PrimoHoagies,6024 Ridge Ave,Philadelphia,CA,19128,40.032483,-75.214430,3,55,"Restaurants, Specialty Food, Food, Sandwiches,..."
17022,8n93L-ilMAsvwUatarykSg,Kitchen Gia,3716 Spruce St,Philadelphia,PA,19104,39.951018,-75.198240,3,22,"Coffee & Tea, Food, Sandwiches, American (Trad..."
17023,2MAQeAqmD8enCT2ZYqUgIQ,The Melting Pot - Nashville,"166 2nd Ave N, Ste A",Nashville,PA,37201,36.163875,-86.776311,4,204,"Fondue, Beer, Wine & Spirits, Food, Restaurants"
17024,w_4xUt-1AyY2ZwKtnjW0Xg,Bittercreek Alehouse,246 N 8th St,Boise,PA,83702,43.616590,-116.202383,4,998,"Bars, Gastropubs, Sandwiches, Nightlife, Resta..."



---------------------------------------------------------------------Dataset "checkin" procesado--------------------------------------------------------



Unnamed: 0,business_id,date
0,---kPU91CF4Lq2-WlRu9Lw,2020-03-13 21:10:56
1,---kPU91CF4Lq2-WlRu9Lw,2020-06-02 22:18:06
2,---kPU91CF4Lq2-WlRu9Lw,2020-07-24 22:42:27
3,---kPU91CF4Lq2-WlRu9Lw,2020-10-24 21:36:13
4,---kPU91CF4Lq2-WlRu9Lw,2020-12-09 21:23:33
...,...,...
28930,-TboXPMTf45s24FPyD8OAA,2019-10-19 13:04:35
28931,-TboXPMTf45s24FPyD8OAA,2021-04-18 12:53:10
28932,-TboXPMTf45s24FPyD8OAA,2021-05-18 11:46:56
28933,-TboXPMTf45s24FPyD8OAA,2021-05-20 01:55:28



---------------------------------------------------------------------Dataset "review" procesado----------------------------------------------------------



Unnamed: 0,review_id,user_id,business_id,stars_review,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,_ZeMknuYdlQcUqng_Im3yg,yfFzsLmaWF2d4Sr0UNbBgg,LHSTtnW3YHCeUkRDGyJOyw,5,2,0,0,Amazingly amazing wings and homemade bleu chee...,2015-08-07 02:29:16
2,TP0HYy4GqYtrWMlFIhxw3A,YRsNXrlyvjnr7NLSDwen0Q,LHSTtnW3YHCeUkRDGyJOyw,1,4,2,0,We arrived a few minutes early for a 7pm reser...,2016-05-31 01:58:22
3,pUycOfUwM8vqX7KjRRhUEA,59MxRhNVhU9MYndMkz0wtw,gebiRewfieSdtt17PTW6Zg,3,0,0,0,Had a party of 6 here for hibachi. Our waitres...,2016-07-25 07:31:06
4,l3Wk_mvAog6XANIuGQ9C7Q,ZbqSHbgCjzVAqaa7NKWn5A,EQ-TZ2eeD_E0BHuvoaeG5Q,4,0,0,0,"Locals recommended Milktooth, and it's an amaz...",2015-08-19 14:31:45
...,...,...,...,...,...,...,...,...,...
343,ssHxl6hukWdxl3CMgs4zyQ,yEBHkmp0qGax6d-gk0Wr9A,2dlQX5sP9X6Dlm1MmNOlSw,5,0,0,0,Is there a 12-step program to help manage a Re...,2016-02-23 19:45:23
344,8YLmnA2LuDj_XqEkc_GnzQ,4qEBMFM2SiGe3oHCRfyNwQ,tYCok-NtWvg8_k7woeB83w,5,0,0,0,Love love love this place for HH n dinner is g...,2017-08-07 23:48:51
345,o8Amz_DLuluPmF6HD9Lesg,sws7O4Ek6gq6fTRA5d-rWw,g5ogvPhw3PSobtaZFkdEXg,4,0,0,0,Beautifully sweet ending to a sweet trip to NO...,2012-04-28 13:09:12
346,V8dVnEHMdDNxCX6yFFam2Q,HjMKmLryj5emWPXcpBvt6Q,L7i_5DydYEKwPLfcDBRYDA,5,0,0,0,This Chipotle location was great! I wanted to ...,2017-05-13 14:09:53



---------------------------------------------------------------------Dataset "user" procesado----------------------------------------------------------



Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,...,65,55,56,18,232,844,467,467,239,180
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,...,264,184,157,251,1847,7054,3131,3131,1521,1946
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2008-07-25 10:41:00,2086,1010,1003,20092010201120122013,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",52,...,13,10,17,3,66,96,119,119,35,18
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,2005-11-29 04:38:33,512,330,299,200920102011,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",28,...,4,1,6,2,12,16,26,26,10,9
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,2007-01-05 19:40:59,29,15,7,,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA...",1,...,1,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1987828,EhB9cOpW1qxsJzUKh7djOA,Ellen,28,2013-06-04 16:32:54,19,12,3,,,0,...,1,0,0,0,0,0,0,0,0,0
1987833,izloIVOMqvIF4xN5nTipFg,Lex,37,2013-03-30 22:18:39,18,3,0,,,0,...,0,0,0,0,1,2,0,0,0,0
1987854,3LvxaTQ7I9OEdTF52f_S5Q,Jeff,120,2015-04-08 22:04:07,102,8,13,,,0,...,0,1,0,0,2,0,0,0,1,0
1987881,E7Stb54xluW_QabdPRwvog,Sandy,43,2012-04-09 19:47:58,29,4,4,,,0,...,0,0,0,0,1,2,0,0,1,0


In [None]:
# CODIGO ADAPTADO PARA GCP
# Importacion de librerias a utilizar
import pandas as pd
import json
import pickle
import pyarrow.parquet as pq
from pandas.io import gbq

def hello_gcs(event, context):

    def aplicar_ETL():
    # Variables donde se almacenan las rutas de los archivos crudos
    ruta= pd.read_json('gs://' + event['bucket'] + 'business.pkl')


    # Desarrollo de ETL---------------------------------
    try:
        #  # Lee archivo y devuelve un dataframe de pandas
        business=pd.read_pickle(ruta)
        business=business.iloc[:,:14]
        # Seleccion de columna importantes
        df_business=business.drop(columns=['is_open','attributes','hours'])
        # Elimina valores nulos
        df_business=df_business.dropna().reset_index(drop=True)
        # Cambio de Tipos
        df_business['latitude']=df_business['latitude'].astype('float')
        df_business['longitude']=df_business['longitude'].astype('float')
        df_business['stars']=df_business['stars'].astype('int')
        df_business['review_count']=df_business['review_count'].astype('int')
        # Lista de estados seleccionados
        estados_seleccionados=['PA','FL','CA','IL']
        # Filtado registros por estados
        df_business=df_business[df_business['state'].isin(estados_seleccionados)].reset_index(drop=True)
        # Trabajaremos con los negocios que contengan mas de 20 reseñas, ya que menos reseñas no seria eficiente para el sistema de recomendación
        df_business=df_business[df_business['review_count']>20].reset_index(drop=True)
        # Filtrado de negocios por rubro gastronómico
        palabras_clave=["Restaurants","Food","Bubble Tea","Coffee & Tea","Bakery","Barbecue","Buffets","Burgers","Cafes","Catering","Cocktail Bars","Desserts","Diners","Food Trucks","Ice Cream & Frozen Yogurt","Juice Bars & Smoothies","Pizza","Seafood","Steakhouses","Sushi Bars","Tapas Bars","Wine Bars","Vegetarian","Vegan","Gluten-Free"]
        filtro_gastronomico=df_business['categories'].str.contains('|'.join(palabras_clave),case=False)
        df_business=df_business[filtro_gastronomico].reset_index(drop=True)
    except: pass

    df_business.to_gbq(destination_table='datos.prueba',
                                project_id='boreal-phoenix-414821',
                                table_schema=None,
                                if_exists='append', progress_bar=False,  auth_local_webserver=False,  location='us')