In [23]:
import json
import pickle
import pandas as pd
import numpy as np


In [24]:
# Load the Darden reviews from the cleaned-data folder.
path_data = '../data/clean'
df_reviews_darden = pd.read_parquet(f'{path_data}/reviews_darden.parquet')

In [25]:
# Show one random row to check the columns that were loaded.
df_reviews_darden.sample()

Unnamed: 0,name,review_id,user_id,business_id,stars,text,date,day,text_reply,city,state,postal_code,coordinates
6367,LongHorn Steakhouse,WZxBZQrymff4DtylaU780A,6GX4GZtx6C7m60p3rO0uVg,CS_GzUYlEPa6QHTRY224wQ,3.0,"Food was good- SERVICE WAS NOT\n\nOkay, we wen...",2018-09-01,₆ Saturday,,Norristown,FL,19403,"40.1274771937,-75.4040751479"


In [26]:
# Keep only the reviews rated 3 stars or lower, restricted to the columns used
# for labelling, then show one random row of the result.
reviews_darden_negativas = df_reviews_darden.loc[
    df_reviews_darden['stars'] <= 3,
    ['review_id', 'date', 'name', 'stars', 'text'],
]
reviews_darden_negativas.sample()

Unnamed: 0,review_id,date,name,stars,text
384,IveTDlvZ8lzCq-XoyFLwog,2013-09-17,Olive Garden Italian Restaurant,3.0,"Good food and friendly, attentive service. We..."


In [27]:
# Load the pickled model that is later used to label the reviews.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open this file if it comes from a trusted source.
with open('modelo_ML_negativas2.pkl', 'rb') as archivo:
    modelo_ml = pickle.load(archivo)

In [28]:
def etiquetar_reviews(df, modelo_ml) -> pd.DataFrame:
    """Label reviews with the model and summarise the problems per quarter and restaurant.

    Parameters
    ----------
    df : pd.DataFrame
        Reviews to label. The columns ``review_id``, ``date``, ``name`` and
        ``text`` are read; ``df`` itself is not modified.
    modelo_ml
        Object with a ``predict`` method that accepts a list of texts and
        returns one row of flags per text. The positions of each row are
        named, in order, by ``columnas_modelo`` below.

    Returns
    -------
    pd.DataFrame
        One row per quarter (e.g. ``'2018Q3'``) and one column per restaurant
        name. Each cell is a JSON string holding the ``review_ids`` of that
        quarter/restaurant plus the summed flags of every reported problem.
        Quarter/restaurant combinations without reviews are NaN.
    """
    # Name of each position in a model prediction.
    columnas_modelo = [
        'bad_food', 'cost', 'filthy', 'missing_food',
        'order_problem', 'rude_service', 'bad_neighborhood', 'slow_service',
    ]
    # JSON key -> prediction column. 'missing_food' and 'bad_neighborhood'
    # are not reported in the summary.
    claves_resumen = {
        'Bad_Food': 'bad_food',
        'Cost': 'cost',
        'Cleanliness': 'filthy',
        'Order_Problem': 'order_problem',
        'Rude_Service': 'rude_service',
        'Slow_Service': 'slow_service',
    }

    def np_encoder(obj):
        """Let json.dumps serialise NumPy scalars as plain Python values."""
        if isinstance(obj, np.generic):
            return obj.item()
        # Falling through would return None and silently write `null`.
        raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')

    if df.empty:
        # Nothing to label: return an empty table with the usual axis names.
        return pd.DataFrame(
            index=pd.Index([], name='quarter'),
            columns=pd.Index([], name='name'),
        )

    # One batched prediction for all texts instead of one call per review.
    predicciones = np.asarray(modelo_ml.predict(df['text'].tolist()))
    df_problemas = pd.DataFrame(predicciones, index=df.index).rename(
        columns=dict(enumerate(columnas_modelo))
    )

    etiquetadas = pd.concat([df[['review_id', 'name']], df_problemas], axis=1)
    etiquetadas['quarter'] = pd.to_datetime(df['date']).dt.to_period('Q').astype(str)

    # Group by quarter and restaurant and build one JSON summary per group.
    # Iterating over the groups avoids groupby.apply operating on the
    # grouping columns.
    claves = []
    resumenes = []
    for clave, grupo in etiquetadas.groupby(['quarter', 'name']):
        resumen = {'review_ids': grupo['review_id'].to_list()}
        for nombre_json, columna in claves_resumen.items():
            resumen[nombre_json] = grupo[columna].sum()
        claves.append(clave)
        resumenes.append(json.dumps(resumen, default=np_encoder))

    indice = pd.MultiIndex.from_tuples(claves, names=['quarter', 'name'])
    # Pivot restaurants into columns; missing combinations become NaN.
    return pd.Series(resumenes, index=indice, dtype=object).unstack()


In [29]:
# Label the reviews rated 3 stars or lower and summarise them per quarter and restaurant.
df_reviews_etiquetadas = etiquetar_reviews(reviews_darden_negativas, modelo_ml)

  .apply(lambda x:


In [30]:
def reemplazar_nan_con_json(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` where every missing cell holds an all-zero JSON summary.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose cells are JSON strings or NaN. It is not modified.

    Returns
    -------
    pd.DataFrame
        New frame with the same shape and labels; each NaN is replaced by a
        JSON string with an empty ``review_ids`` list and every counter at 0.
    """
    empty_json = json.dumps({
        'review_ids': [],
        'Bad_Food': 0,
        'Cost': 0,
        'Cleanliness': 0,
        'Order_Problem': 0,
        'Rude_Service': 0,
        'Slow_Service': 0
    })
    # No inplace=True: the caller's frame stays as it was, so the input and
    # the filled result are two distinct objects and re-running is safe.
    return df.fillna(value=empty_json)

In [31]:
# Fill quarter/restaurant cells that have no reviews with an all-zero JSON summary.
df_etiquetada = reemplazar_nan_con_json(df_reviews_etiquetadas)

In [32]:
def json_anidado_a_dataframe(columna: pd.Series) -> pd.DataFrame:
    """Expand a Series of JSON strings into a DataFrame with one column per key.

    Parameters
    ----------
    columna : pd.Series
        Each value is a JSON string encoding an object.

    Returns
    -------
    pd.DataFrame
        One row per element of ``columna`` (same index) and one column per
        JSON key. Nested objects are flattened by ``pd.json_normalize``
        (dot-separated column names). An empty Series yields an empty frame.
    """
    registros = columna.apply(json.loads).tolist()
    # A single normalisation over all records replaces one DataFrame per row
    # plus a concat, and also works when there are no records at all.
    df_final = pd.json_normalize(registros)
    df_final.index = columna.index

    return df_final

In [33]:
# Expand each column of JSON strings into its own table and store it in `dfs`
# under the column's name.
dfs = {}
for col in df_etiquetada.columns:
    df = json_anidado_a_dataframe(df_etiquetada[col])
    dfs[col] = df

In [34]:
# List the names for which a table was built.
dfs.keys()

dict_keys(['Bahama Breeze', "Cheddar's Scratch Kitchen", "Eddie V's Prime Seafood", 'LongHorn Steakhouse', 'Olive Garden Italian Restaurant', "Ruth's Chris Steak House", 'Seasons 52', 'Yard House'])

In [35]:
# Print each key of `dfs` with apostrophes and spaces removed, i.e. the form
# used in the output file name below.
for name, df in dfs.items():
    name: str = name.replace("'","").replace(" ","")
    print(name)
    # Parquet export is currently disabled:
    #df.to_parquet(f'ml_labels_{name}.parquet')

BahamaBreeze
CheddarsScratchKitchen
EddieVsPrimeSeafood
LongHornSteakhouse
OliveGardenItalianRestaurant
RuthsChrisSteakHouse
Seasons52
YardHouse


In [73]:
# Inspect ten random rows of the Olive Garden table.
dfs['Olive Garden Italian Restaurant'].sample(10)

Unnamed: 0_level_0,review_ids,Bad_Food,Cost,Cleanliness,Order_Problem,Rude_Service,Slow_Service
quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014Q1,"[r_h7bcqqOVfrhU49DFGAgw, 0YmCCSprFIhIy3YJHHUhZ...",1,0,0,2,4,1
2017Q4,"[inCRkoX79Ilc1q0u3IiyjA, ebjjoDJn9iz_SM0xVfd4X...",7,0,1,2,4,4
2019Q4,"[6v95QXQ-R1AONuRLvU-QHA, lvllT9r7OpCfQR6Y1yJVO...",1,0,0,2,3,2
2011Q3,"[unX-Hkh9yEgZgYBwTKvBNg, nZ--lAwvFU3aFF_tSjewS...",3,0,0,0,3,2
2012Q3,"[j1sHpwfVGzCxJ1PmUm6FXw, oUP29wdp5byJx4UeXfnpM...",1,0,0,1,1,0
2018Q3,"[ktKXkNCKhz_JmU4oDGdF8w, 1sIpyZ9-A4-vii9XvZYJ8...",3,0,2,3,6,0
2010Q2,[W3jYreM-j4sh79zMCVJySA],0,0,0,0,0,0
2012Q1,"[vM7NON64_i4UxR7vH0l_YA, 7R81L8k2r5X8b7q68nvVE...",1,0,0,0,1,2
2021Q1,"[UNO7HEvkW17Ko7YHy7flow, 77KBMsjYXKOwuaBIQEic-...",1,0,0,3,4,0
2008Q4,[lke7HAcU-OgjdAoe-1-3nA],0,0,0,0,0,0
