In [86]:
import json
import pickle
import pandas as pd
import numpy as np


In [87]:
# Load the Darden reviews stored as parquet under the clean-data directory.
path_data = '../data/clean'
df_reviews_darden = pd.read_parquet(f'{path_data}/reviews_darden.parquet')

In [88]:
# Display one randomly chosen row to inspect the available columns.
df_reviews_darden.sample()

Unnamed: 0,name,review_id,user_id,business_id,stars,text,date,day,text_reply,city,state,postal_code,coordinates
5416,Yard House,oItvcrmF-li-V63oFsjSUg,OLGQ7alK4VKl3YdQk6UF5g,Hr_zdqPZ9VkGjKNv4UIQRg,4.0,"Searching Yelp for breweries in KOP, Yard Hous...",2018-09-09,₇ Sunday,,King of Prussia,FL,19406,"40.0879239996,-75.3952012584"


In [89]:
# Reviews rated 3 stars or fewer are treated as negative; keep a subset of columns.
columnas_utiles = ['review_id', 'date', 'name', 'stars', 'text']
es_negativa = df_reviews_darden['stars'] <= 3
reviews_darden_negativas = df_reviews_darden.loc[es_negativa, columnas_utiles]
reviews_darden_negativas.sample()

Unnamed: 0,review_id,date,name,stars,text
1917,5gHOtgpGMsTSOc_8GjOU4Q,2017-10-02,LongHorn Steakhouse,2.0,"What's a fellow to do in this ""Close enough is..."


In [90]:
# Load the pickled model object (it is later used through its `predict` method).
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open model files that come from a trusted source.
with open('modelo_ML_negativas2.pkl', 'rb') as archivo:
    modelo_ml = pickle.load(archivo)

In [91]:
def etiquetar_reviews(df, modelo_ml) -> pd.DataFrame:
    """
    Label every review with the model and count labels per quarter and restaurant.

    Parameters
    ----------
    df : pd.DataFrame
        Reviews to label. Must contain the columns ``date``, ``name`` and
        ``text``.
    modelo_ml : object
        Model exposing ``predict``. It is called once per review with a
        one-element list holding the review text, and the first row of the
        result is read positionally as: bad_food, cost, filthy, missing_food,
        order_problem, rude_service, bad_neighborhood, slow_service.

    Returns
    -------
    pd.DataFrame
        Indexed by quarter string (e.g. ``'2021Q4'``) with one column per
        restaurant ``name``. Each cell is a JSON string holding the summed
        values for Bad_Food, Cost, Cleanliness, Order_Problem, Rude_Service
        and Slow_Service; cells are NaN where a restaurant has no reviews in
        that quarter. An empty input yields an empty DataFrame.

    Raises
    ------
    TypeError
        If an aggregated value cannot be serialised to JSON.
    """
    # Position in the model output -> internal column name.
    columnas_modelo = [
        'bad_food',
        'cost',
        'filthy',
        'missing_food',
        'order_problem',
        'rude_service',
        'bad_neighborhood',
        'slow_service',
    ]
    # Internal column name -> key written to the JSON payload.
    # missing_food and bad_neighborhood are predicted but not reported.
    etiquetas_json = {
        'bad_food': 'Bad_Food',
        'cost': 'Cost',
        'filthy': 'Cleanliness',
        'order_problem': 'Order_Problem',
        'rude_service': 'Rude_Service',
        'slow_service': 'Slow_Service',
    }

    def np_encoder(obj):
        # json cannot serialise NumPy scalars; convert them to native Python values.
        if isinstance(obj, np.generic):
            return obj.item()
        # Fail loudly instead of silently writing `null` for unsupported types.
        raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')

    if len(df) == 0:
        return pd.DataFrame()

    # One prediction per review; each prediction is a sequence of label values.
    problematicas = [modelo_ml.predict([review])[0] for review in df['text']]

    df_problem = pd.DataFrame(problematicas, index=df.index).rename(
        columns=dict(enumerate(columnas_modelo))
    )

    # Grouping keys are attached positionally (to_numpy) so no index alignment is involved.
    fechas = pd.to_datetime(df['date'])
    df_problem['quarter'] = fechas.dt.to_period('Q').astype('str').to_numpy()
    df_problem['name'] = df['name'].to_numpy()

    # Group by quarter and restaurant, summing only the reported label columns
    # (the grouping columns themselves are never passed to the aggregation).
    conteos = df_problem.groupby(['quarter', 'name'])[list(etiquetas_json)].sum()

    # Build one JSON payload per (quarter, restaurant) group.
    payloads = pd.Series(
        [
            json.dumps(dict(zip(etiquetas_json.values(), fila)), default=np_encoder)
            for fila in conteos.to_numpy()
        ],
        index=conteos.index,
        dtype=object,
    )

    # Rows: quarter, columns: restaurant name.
    return payloads.unstack()


In [92]:
# Label the negative reviews and aggregate the labels per quarter and restaurant name.
df_reviews_etiquetadas = etiquetar_reviews(reviews_darden_negativas, modelo_ml)

  .apply(lambda x:


In [93]:
def reemplazar_nan_con_json(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill missing cells with a JSON string in which every label count is zero.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose cells hold JSON strings, with NaN where there is no data.

    Returns
    -------
    pd.DataFrame
        A new frame with every missing cell replaced by the zero-count JSON
        string. The input frame is not modified, so re-running the calling
        cell always starts from the same data.
    """
    # Keys mirror the labels reported per review group; Missing_Food and
    # Bad_Neighborhood are not part of the payload.
    empty_json = json.dumps({
        'Bad_Food': 0,
        'Cost': 0,
        'Cleanliness': 0,
        'Order_Problem': 0,
        'Rude_Service': 0,
        'Slow_Service': 0
    })

    return df.fillna(value=empty_json)

In [94]:
# Give quarter/restaurant cells that have no data an all-zero JSON payload.
df_etiquetada = reemplazar_nan_con_json(df_reviews_etiquetadas)

In [95]:
def json_anidado_a_dataframe(columna: pd.Series) -> pd.DataFrame:
    """
    Expand a Series of JSON strings into a DataFrame with one column per key.

    Parameters
    ----------
    columna : pd.Series
        Series whose values are JSON-encoded strings.

    Returns
    -------
    pd.DataFrame
        One row per element of ``columna`` (same index) and one column per
        JSON key; nested objects are flattened by ``pd.json_normalize``.
        An empty Series yields an empty DataFrame.
    """
    # Parse every cell once and normalise all records in a single call,
    # instead of building one single-row DataFrame per cell and concatenating.
    registros = [json.loads(texto) for texto in columna]
    df_final = pd.json_normalize(registros)
    df_final.index = columna.index

    return df_final

In [96]:
# Build, for every restaurant column, a long table with one row per label
# occurrence: a label counted k times in a quarter contributes k rows.
dfs = {}
for cadena in df_etiquetada.columns:
    conteos = json_anidado_a_dataframe(df_etiquetada[cadena])
    registros = [
        {'Trimestre': trimestre, 'Etiqueta': etiqueta}
        for trimestre, fila in conteos.iterrows()
        for etiqueta, frecuencia in fila.items()
        for _ in range(frecuencia)
    ]
    tabla = pd.DataFrame(registros)
    tabla['Cadena'] = cadena
    dfs[cadena] = tabla

In [97]:
# List the restaurant names for which a table was built.
dfs.keys()

dict_keys(['Bahama Breeze', "Cheddar's Scratch Kitchen", "Eddie V's Prime Seafood", 'LongHorn Steakhouse', 'Olive Garden Italian Restaurant', "Ruth's Chris Steak House", 'Seasons 52', 'Yard House'])

In [98]:
# Combine the per-restaurant tables with a single concat call; growing a
# DataFrame inside a loop re-copies all accumulated rows on every iteration.
# An empty `dfs` still produces an empty DataFrame.
df_out = pd.concat(list(dfs.values()), ignore_index=True) if dfs else pd.DataFrame()

In [99]:
# Show a plain-text view of the combined table.
print(df_out)

     Trimestre       Etiqueta         Cadena
0       2009Q1       Bad_Food  Bahama Breeze
1       2009Q4       Bad_Food  Bahama Breeze
2       2010Q1   Rude_Service  Bahama Breeze
3       2010Q2   Slow_Service  Bahama Breeze
4       2011Q3       Bad_Food  Bahama Breeze
...        ...            ...            ...
1742    2021Q4   Rude_Service     Yard House
1743    2021Q4   Rude_Service     Yard House
1744    2021Q4   Rude_Service     Yard House
1745    2021Q4   Slow_Service     Yard House
1746    2022Q1  Order_Problem     Yard House

[1747 rows x 3 columns]


In [100]:
# Persist the combined table as parquet; the path is relative to the working directory.
df_out.to_parquet('ml_labels_All.parquet')

In [101]:
"""for name, df in dfs.items():
    #df['cadena'] = name
    name: str = name.replace("'","").replace(" ","")
    print(name)
    #df.to_parquet(f'ml_labels_{name}.parquet')"""

'for name, df in dfs.items():\n    #df[\'cadena\'] = name\n    name: str = name.replace("\'","").replace(" ","")\n    print(name)\n    #df.to_parquet(f\'ml_labels_{name}.parquet\')'

In [102]:
#dfs['Olive Garden Italian Restaurant'].sample(1)