In [2]:
import json
import pickle
import pandas as pd
import numpy as np


In [3]:
# Load the cleaned Darden reviews from the parquet export.
path_data = '../data/clean'
ruta_reviews = f'{path_data}/reviews_darden.parquet'
df_reviews_darden = pd.read_parquet(ruta_reviews)

In [4]:
# Display one randomly chosen review as a quick look at the loaded columns.
df_reviews_darden.sample()

Unnamed: 0,name,review_id,user_id,business_id,stars,text,date,day,text_reply,city,state,postal_code,coordinates
5792,Seasons 52,oin05HO5ghbEzXRl-uhhPA,ET8n-r7glWYqZhuR6GcdNw,Mfvk9uEEhdCqj8S2u7dWgQ,3.0,Still a fan of Seasons 52 although sometimes I...,2012-01-13,₅ Friday,,Cherry Hill,PA,8002,"39.9403603389,-75.0264621411"


In [5]:
# Keep the reviews rated 3 stars or lower, restricted to the columns used for labelling.
columnas_etiquetado = ['review_id', 'date', 'name', 'stars', 'text']
es_negativa = df_reviews_darden.stars <= 3
reviews_darden_negativas = df_reviews_darden.loc[es_negativa, columnas_etiquetado]
reviews_darden_negativas.sample()

Unnamed: 0,review_id,date,name,stars,text
6284,YlerfZSVOJSGF2OD4IXrZQ,2018-05-07,Cheddar's Scratch Kitchen,2.0,Visited Cheddars Carrollwood today for Sunday ...


In [6]:
# Load the pickled model that is later passed to etiquetar_reviews.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open('modelo_ML_negativas2.pkl', 'rb') as archivo:
    modelo_ml = pickle.load(archivo)

In [7]:
def etiquetar_reviews(df, modelo_ml) -> pd.DataFrame:
    """
    Label every review with the model and count problems per quarter and restaurant.

    Parameters
    ----------
    df : pd.DataFrame
        Reviews with at least the columns 'text', 'date' and 'name'.
    modelo_ml : object
        Model exposing ``predict``. It is called once per review with a
        one-element list; the first element of the result is read positionally
        as: bad_food, cost, filthy, missing_food, order_problem, rude_service,
        bad_neighborhood, slow_service.
        (assumes numeric 0/1 flags per label -- TODO confirm against the model)

    Returns
    -------
    pd.DataFrame
        Indexed by quarter as a string (e.g. '2012Q1'), one column per
        restaurant name. Each cell is a JSON string with the summed flags for
        the keys Bad_Food, Cost, Cleanliness, Order_Problem, Rude_Service and
        Slow_Service; quarter/restaurant pairs without reviews are NaN.

    Raises
    ------
    KeyError
        If a required column is missing or the model returns fewer labels
        than the reported ones.
    """
    def np_encoder(valor):
        # json cannot serialize NumPy scalars; convert them to native Python values.
        if isinstance(valor, np.generic):
            return valor.item()
        # Fail loudly instead of silently writing null for unsupported types.
        raise TypeError(f'Object of type {type(valor).__name__} is not JSON serializable')

    # Positional model outputs -> internal column names.
    columnas_modelo = {
        0: 'bad_food',
        1: 'cost',
        2: 'filthy',
        3: 'missing_food',
        4: 'order_problem',
        5: 'rude_service',
        6: 'bad_neighborhood',
        7: 'slow_service'
    }
    # Reported JSON key -> internal column. 'missing_food' and
    # 'bad_neighborhood' are predicted but intentionally not reported.
    etiquetas_reporte = {
        'Bad_Food': 'bad_food',
        'Cost': 'cost',
        'Cleanliness': 'filthy',
        'Order_Problem': 'order_problem',
        'Rude_Service': 'rude_service',
        'Slow_Service': 'slow_service'
    }

    predicciones = [modelo_ml.predict([review])[0] for review in df['text']]

    # One row per review, one column per predicted label (built in a single step
    # instead of expanding every prediction with apply(pd.Series)).
    df_problemas = pd.DataFrame(predicciones, index=df.index).rename(columns=columnas_modelo)

    trimestres = pd.to_datetime(df['date']).dt.to_period('Q').astype('str')
    df_problemas = df_problemas.assign(
        quarter=trimestres.to_numpy(),
        name=df['name'].to_numpy()
    )

    # Sum the flags per quarter and restaurant; selecting the label columns
    # keeps the grouping keys out of the aggregation.
    conteos = (
        df_problemas
        .groupby(['quarter', 'name'])[list(etiquetas_reporte.values())]
        .sum()
    )

    # Serialize each (quarter, restaurant) group as a JSON string of label counts.
    json_por_grupo = pd.Series(
        [
            json.dumps(
                {etiqueta: fila[columna] for etiqueta, columna in etiquetas_reporte.items()},
                default=np_encoder
            )
            for _, fila in conteos.iterrows()
        ],
        index=conteos.index,
        dtype=object
    )

    # Quarters as rows, restaurants as columns.
    return json_por_grupo.unstack()


In [8]:
# Label the negative reviews and aggregate the problem counts per quarter and restaurant.
df_reviews_etiquetadas = etiquetar_reviews(reviews_darden_negativas, modelo_ml)

  .apply(lambda x:


In [9]:
def reemplazar_nan_con_json(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace missing cells with an all-zero JSON record of problem counts.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose cells hold JSON strings, with NaN where there is no data.

    Returns
    -------
    pd.DataFrame
        A new frame in which every missing cell holds the JSON string with a
        count of 0 for Bad_Food, Cost, Cleanliness, Order_Problem,
        Rude_Service and Slow_Service. The input frame is not modified.
    """
    # 'Missing_Food' and 'Bad_Neighborhood' are intentionally left out.
    empty_json = json.dumps({
        'Bad_Food': 0,
        'Cost': 0,
        'Cleanliness': 0,
        'Order_Problem': 0,
        'Rude_Service': 0,
        'Slow_Service': 0
    })

    # Return a filled copy rather than mutating the caller's frame, so that
    # re-running cells never sees a silently altered input.
    return df.fillna(value=empty_json)

In [10]:
# Replace the missing cells of the labelled frame with an all-zero JSON record.
df_etiquetada = reemplazar_nan_con_json(df_reviews_etiquetadas)

In [11]:
def json_anidado_a_dataframe(columna: pd.Series) -> pd.DataFrame:
    """
    Expand a Series of JSON strings into a DataFrame with one column per key.

    Parameters
    ----------
    columna : pd.Series
        Series whose values are JSON-encoded objects.

    Returns
    -------
    pd.DataFrame
        One row per element of ``columna`` (same index) and one column per
        JSON key; nested objects are flattened by ``pd.json_normalize``.
        An empty Series yields an empty DataFrame.
    """
    registros = [json.loads(celda) for celda in columna]
    # Normalize all records in one call instead of building and concatenating
    # one single-row frame per cell (which also failed on empty input).
    df_final = pd.json_normalize(registros)
    df_final.index = columna.index

    return df_final

In [12]:
# For every restaurant chain, expand the per-quarter label counts into a long
# frame holding one row per counted occurrence of a label.
dfs = {}
for cadena in df_etiquetada.columns:
    df_conteos = json_anidado_a_dataframe(df_etiquetada[cadena])
    ocurrencias = [
        {'Trimestre': trimestre, 'Etiqueta': etiqueta}
        for trimestre, fila in df_conteos.iterrows()
        for etiqueta, frecuencia in fila.items()
        for _ in range(frecuencia)
    ]
    df_expandido = pd.DataFrame(ocurrencias)
    df_expandido['Cadena'] = cadena
    dfs[cadena] = df_expandido

In [13]:
# List the restaurant chains for which an expanded label frame was built.
dfs.keys()

dict_keys(['Bahama Breeze', "Cheddar's Scratch Kitchen", "Eddie V's Prime Seafood", 'LongHorn Steakhouse', 'Olive Garden Italian Restaurant', "Ruth's Chris Steak House", 'Seasons 52', 'Yard House'])

In [14]:
# Stack the per-chain frames with a single concat; growing a frame inside a
# loop copies all accumulated rows on every iteration. pd.concat rejects an
# empty list, so fall back to an empty frame when there are no chains.
df_out = pd.concat(list(dfs.values()), ignore_index=True) if dfs else pd.DataFrame()

In [20]:
# Print the first 50 rows of the combined frame as plain text.
print(df_out.head(50))

   Trimestre       Etiqueta         Cadena
0     2009Q1       Bad_Food  Bahama Breeze
1     2009Q4       Bad_Food  Bahama Breeze
2     2010Q1   Rude_Service  Bahama Breeze
3     2010Q2   Slow_Service  Bahama Breeze
4     2011Q3       Bad_Food  Bahama Breeze
5     2012Q1       Bad_Food  Bahama Breeze
6     2012Q2       Bad_Food  Bahama Breeze
7     2012Q2    Cleanliness  Bahama Breeze
8     2012Q2   Rude_Service  Bahama Breeze
9     2012Q2   Rude_Service  Bahama Breeze
10    2012Q2   Slow_Service  Bahama Breeze
11    2012Q4       Bad_Food  Bahama Breeze
12    2012Q4       Bad_Food  Bahama Breeze
13    2012Q4   Rude_Service  Bahama Breeze
14    2012Q4   Rude_Service  Bahama Breeze
15    2012Q4   Slow_Service  Bahama Breeze
16    2013Q1       Bad_Food  Bahama Breeze
17    2013Q1       Bad_Food  Bahama Breeze
18    2013Q1       Bad_Food  Bahama Breeze
19    2013Q1   Slow_Service  Bahama Breeze
20    2013Q2       Bad_Food  Bahama Breeze
21    2013Q2       Bad_Food  Bahama Breeze
22    2013Q

In [16]:
# Display 100 randomly chosen rows of the combined frame; no random_state is
# passed, so the selection changes on every run.
df_out.sample(100)

Unnamed: 0,Trimestre,Etiqueta,Cadena
18,2013Q1,Bad_Food,Bahama Breeze
562,2014Q4,Slow_Service,LongHorn Steakhouse
1664,2019Q1,Slow_Service,Yard House
1524,2017Q3,Bad_Food,Seasons 52
1605,2017Q3,Rude_Service,Yard House
...,...,...,...
491,2011Q1,Bad_Food,LongHorn Steakhouse
828,2021Q4,Slow_Service,LongHorn Steakhouse
175,2018Q3,Rude_Service,Bahama Breeze
1180,2019Q2,Bad_Food,Olive Garden Italian Restaurant


In [17]:
#df_out.to_parquet('ml_labels_All.parquet')

In [18]:
"""for name, df in dfs.items():
    #df['cadena'] = name
    name: str = name.replace("'","").replace(" ","")
    print(name)
    #df.to_parquet(f'ml_labels_{name}.parquet')"""

'for name, df in dfs.items():\n    #df[\'cadena\'] = name\n    name: str = name.replace("\'","").replace(" ","")\n    print(name)\n    #df.to_parquet(f\'ml_labels_{name}.parquet\')'

In [19]:
#dfs['Olive Garden Italian Restaurant'].sample(1)