In [1]:
import warnings
import pandas as pd
from ftfy import fix_text
from rapidfuzz import process, fuzz

In [2]:
import plotly.express as px

In [3]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas' SettingWithCopyWarning and deprecations.
warnings.simplefilter('ignore')

In [4]:
# Show every column when a DataFrame is displayed (no truncation).
pd.options.display.max_columns = None
# The matching row limit is intentionally left at its default:
#pd.options.display.max_rows = None

---
## Carga de datos

In [5]:
# Directory holding the processed datasets (parquet format)
path_data = '../../data/clean'
business_file = f'{path_data}/y_business_CLEAN.parquet'
review_file = f'{path_data}/y_review_CLEAN.parquet'

# One frame per dataset: businesses and their reviews
df_business = pd.read_parquet(business_file)
df_review = pd.read_parquet(review_file)

---
---
## Análisis Exploratorio

### Setup

#### Dict target

In [6]:
# Brand names grouped by owning company: Darden and its competitors.
# Several brands appear under more than one spelling so that the name
# variants present in the data can be matched.
brands = dict(
    # Darden brands
    darden=[
        "Olive Garden Italian Restaurant",
        "Olive Garden",
        "LongHorn Steakhouse",
        "Cheddar's Scratch Kitchen",
        "Yard House",
        "The Capital Grille",
        "Seasons 52",
        "Bahama Breeze",
        "Eddie V's",
        "Eddie V's Prime Seafood",
        "Ruth's Chris Steak House",
    ],
    # Bloomin' brands
    bloomin=[
        "Outback Steakhouse",
        "Carrabba's Italian Grill",
        "Bonefish Grill",
        "Fleming's Prime Steakhouse & Wine Bar",
        "Aussie Grill by Outback",
        "Aussie Grill - Brandon",
        "Aussie Grill - Clearwater",
        "Aussie Grill - ",
        "Aussie Grill",
    ],
    # Brinker brands
    brinker=[
        "Chili's",
        "Chili's Grill & Bar",
        "Maggiano's Little Italy",
        "It's Just Wings",
    ],
    # Texas Roadhouse brands
    texasroadhouse=[
        "Texas Roadhouse",
        "Bubba's 33",
    ],
)

In [7]:
# Add a `company` column naming the owner of each brand.
# Invert `brands` (company -> brand names) into a brand name -> company lookup.
dict_invertido = {marca: compania for compania in brands for marca in brands[compania]}
# Exact-match lookup on `name`; names absent from the lookup become NaN.
# NOTE(review): this runs before the `fix_text` clean-up of `name` further
# down, so names with encoding glitches will not be mapped here.
df_business['company'] = df_business['name'].map(dict_invertido)

In [8]:
# Location count per (company, brand) pair among the exactly-mapped businesses
mapped_brands = df_business[['company', 'name']].dropna()
mapped_brands.value_counts()

company         name                           
brinker         Chili's                            15
bloomin         Outback Steakhouse                 12
darden          Olive Garden Italian Restaurant    11
                LongHorn Steakhouse                 9
bloomin         Bonefish Grill                      8
texasroadhouse  Texas Roadhouse                     6
bloomin         Carrabba's Italian Grill            5
darden          Cheddar's Scratch Kitchen           2
                Yard House                          2
                Seasons 52                          2
                Ruth's Chris Steak House            2
brinker         Maggiano's Little Italy             2
darden          Bahama Breeze                       2
                Eddie V's Prime Seafood             1
bloomin         Aussie Grill - Brandon              1
brinker         It's Just Wings                     1
bloomin         Aussie Grill by Outback             1
                Aussie Grill - Cle

### Filtro de brands

In [9]:
# Repair text-decoding errors in the `name` column, one value at a time
df_business['name'] = df_business['name'].map(fix_text)

In [10]:
def fuzzy_match(x, match_to, threshold=90):
    """Return the entry of `match_to` that best fuzzy-matches `x`, or None.

    Parameters
    ----------
    x : str
        Text to look up.
    match_to : collection of str
        Candidate strings to compare against.
    threshold : int or float, default 90
        Minimum `fuzz.WRatio` score (0-100) required to accept the best match.

    Returns
    -------
    The best-scoring candidate when its score is >= `threshold`, else None.
    """
    # `extractOne` returns None when nothing reaches `score_cutoff`, when the
    # query is null, or when there are no candidates; unpacking its result
    # directly would raise TypeError in those cases.
    result = process.extractOne(
        x, match_to, scorer=fuzz.WRatio, score_cutoff=threshold
    )
    if result is None:
        return None
    return result[0]

#### Darden

In [11]:
# Fuzzy-match every business name against the Darden brand list.
# NOTE: `name_match` is a scratch column on `df_business`; each brand-filter
# cell overwrites it.
df_business['name_match'] = df_business['name'].apply(fuzzy_match, match_to=brands['darden'])
# Null means no match. Keep the matched rows and build a new frame without the
# scratch column, instead of dropping in place on a filtered slice (which can
# raise SettingWithCopyWarning).
df_darden = df_business[df_business['name_match'].notnull()].drop(columns='name_match')
df_darden['name'].value_counts()

name
Olive Garden Italian Restaurant    11
LongHorn Steakhouse                 9
Bahama Breeze                       2
Seasons 52                          2
Yard House                          2
Cheddar's Scratch Kitchen           2
Ruth's Chris Steak House            2
Eddie V's Prime Seafood             1
Name: count, dtype: int64

In [12]:
# Matched Darden locations per state
darden_states = df_darden['state']
darden_states.value_counts()

state
PA    15
FL    13
NJ     3
Name: count, dtype: int64

#### Bloomin'

In [13]:
# Fuzzy-match every business name against the Bloomin' brand list.
# NOTE: `name_match` is a scratch column on `df_business`; each brand-filter
# cell overwrites it.
df_business['name_match'] = df_business['name'].apply(fuzzy_match, match_to=brands['bloomin'])
# Null means no match. Keep the matched rows and build a new frame without the
# scratch column, instead of dropping in place on a filtered slice (which can
# raise SettingWithCopyWarning).
df_bloomin = df_business[df_business['name_match'].notnull()].drop(columns='name_match')
df_bloomin['name'].value_counts()

name
Outback Steakhouse                       12
Bonefish Grill                            8
Carrabba's Italian Grill                  5
Aussie Grill by Outback                   1
Aussie Grill - Brandon                    1
Aussie Grill                              1
Fleming's Prime Steakhouse & Wine Bar     1
Aussie Grill - Clearwater                 1
Name: count, dtype: int64

In [14]:
# Matched Bloomin' locations per state
bloomin_states = df_bloomin['state']
bloomin_states.value_counts()

state
FL    12
PA    12
NJ     4
DE     2
Name: count, dtype: int64

#### Brinker

In [15]:
# Fuzzy-match every business name against the Brinker brand list.
# NOTE: `name_match` is a scratch column on `df_business`; each brand-filter
# cell overwrites it.
df_business['name_match'] = df_business['name'].apply(fuzzy_match, match_to=brands['brinker'])
# Null means no match. Keep the matched rows and build a new frame without the
# scratch column, instead of dropping in place on a filtered slice (which can
# raise SettingWithCopyWarning).
df_brinker = df_business[df_business['name_match'].notnull()].drop(columns='name_match')
df_brinker['name'].value_counts()

name
Chili's                    15
Maggiano's Little Italy     2
It's Just Wings             1
Name: count, dtype: int64

In [16]:
# Matched Brinker locations per state
brinker_states = df_brinker['state']
brinker_states.value_counts()

state
PA    10
FL     6
DE     1
NJ     1
Name: count, dtype: int64

#### Texas Roadhouse

In [17]:
# Fuzzy-match every business name against the Texas Roadhouse brand list.
# NOTE: `name_match` is a scratch column on `df_business`; each brand-filter
# cell overwrites it.
df_business['name_match'] = df_business['name'].apply(fuzzy_match, match_to=brands['texasroadhouse'])
# Null means no match. Keep the matched rows and build a new frame without the
# scratch column, instead of dropping in place on a filtered slice (which can
# raise SettingWithCopyWarning).
df_texasroadhouse = df_business[df_business['name_match'].notnull()].drop(columns='name_match')
df_texasroadhouse['name'].value_counts()

name
Texas Roadhouse    6
Name: count, dtype: int64

In [18]:
# Matched Texas Roadhouse locations per state
texasroadhouse_states = df_texasroadhouse['state']
texasroadhouse_states.value_counts()

state
PA    3
FL    3
Name: count, dtype: int64

---
### Filtro de reviews por brand

In [19]:
# Business columns attached to each review; `business_id` is the join key
business_cols = [
    'business_id',
    'city',
    'state',
    'postal_code',
    'coordinates',
]

In [20]:
# Weekday name (e.g. 'Monday') derived from each review's date
review_dates = pd.to_datetime(df_review['date'])
df_review['day'] = review_dates.dt.day_name()

In [21]:
# Prefix each weekday with a subscript digit, Monday first.
# NOTE(review): presumably so that a plain text sort yields weekday order - confirm.
day_labels = {
    'Monday': '₁ Monday',
    'Tuesday': '₂ Tuesday',
    'Wednesday': '₃ Wednesday',
    'Thursday': '₄ Thursday',
    'Friday': '₅ Friday',
    'Saturday': '₆ Saturday',
    'Sunday': '₇ Sunday',
}
df_review['day'] = df_review['day'].replace(day_labels)

#### Darden

In [22]:
# Keep only the reviews whose `business_id` belongs to a Darden location.
filtro_id = df_darden['business_id']
# `.copy()` gives an independent frame, so the column assignment below does not
# write into a slice of `df_review`.
df_darden_review = df_review[df_review['business_id'].isin(filtro_id)].copy()

# Repair text-decoding errors in `text`, then flatten embedded newlines.
# `Series.replace('\n', ' ')` only swaps cells that are exactly '\n';
# `.str.replace` replaces the newline wherever it occurs inside the text.
df_darden_review['text'] = (
    df_darden_review['text']
    .apply(fix_text)
    .str.replace('\n', ' ', regex=False)
)

In [23]:
# Nueva columna para respuestas de reviews
df_darden_review['text_reply'] = ''

In [24]:
# Attach the location attributes of each Darden business to its reviews
darden_locations = df_darden[business_cols]
df_darden_review = df_darden_review.merge(darden_locations, how='left', on='business_id')

# Move `name` to the first position of the frame
nom_col = df_darden_review.pop('name')
df_darden_review.insert(0, 'name', nom_col)

In [25]:
# Darden reviews per state
darden_review_states = df_darden_review['state']
darden_review_states.value_counts()

state
PA    4146
FL    2510
NJ     497
Name: count, dtype: int64

In [26]:
# Review volume over time for Darden, bucketed into 200 bins
hist_kwargs = dict(x='date', nbins=200, title='Cantidad de Reseñas por Fecha - Darden')
fig = px.histogram(df_darden_review, **hist_kwargs)
fig.show()

#### Bloomin'

In [27]:
# Keep only the reviews whose `business_id` belongs to a Bloomin' location.
filtro_id = df_bloomin['business_id']
# `.copy()` gives an independent frame, so the column assignment below does not
# write into a slice of `df_review`.
df_bloomin_review = df_review[df_review['business_id'].isin(filtro_id)].copy()

# Repair text-decoding errors in `text`, then flatten embedded newlines.
# `Series.replace('\n', ' ')` only swaps cells that are exactly '\n';
# `.str.replace` replaces the newline wherever it occurs inside the text.
df_bloomin_review['text'] = (
    df_bloomin_review['text']
    .apply(fix_text)
    .str.replace('\n', ' ', regex=False)
)
df_bloomin_review.sample(1)

Unnamed: 0,name,review_id,user_id,business_id,stars,useful,funny,cool,text,date,day
780552,Bonefish Grill,r6M1-rknb3LSFkzSdVCxXQ,U_MpQN6goYXEEH06-sLTNg,TkupWfgSwBd_h05HAORDJg,5.0,0,0,0,I love it here. I always feel like I'm on a fi...,2016-10-14,₅ Friday


In [28]:
# Attach the location attributes of each Bloomin' business to its reviews
bloomin_locations = df_bloomin[business_cols]
df_bloomin_review = df_bloomin_review.merge(bloomin_locations, how='left', on='business_id')

# Move `name` to the first position of the frame
nom_col = df_bloomin_review.pop('name')
df_bloomin_review.insert(0, 'name', nom_col)

In [29]:
# Review volume over time for Bloomin', bucketed into 200 bins
hist_kwargs = dict(x='date', nbins=200, title='Cantidad de Reseñas por Fecha - Bloomin')
fig = px.histogram(df_bloomin_review, **hist_kwargs)
fig.show()

#### Brinker

In [30]:
# Keep only the reviews whose `business_id` belongs to a Brinker location.
filtro_id = df_brinker['business_id']
# `.copy()` gives an independent frame, so the column assignment below does not
# write into a slice of `df_review`.
df_brinker_review = df_review[df_review['business_id'].isin(filtro_id)].copy()

# Repair text-decoding errors in `text`, then flatten embedded newlines.
# `Series.replace('\n', ' ')` only swaps cells that are exactly '\n';
# `.str.replace` replaces the newline wherever it occurs inside the text.
df_brinker_review['text'] = (
    df_brinker_review['text']
    .apply(fix_text)
    .str.replace('\n', ' ', regex=False)
)
df_brinker_review.sample()

Unnamed: 0,name,review_id,user_id,business_id,stars,useful,funny,cool,text,date,day
783124,Chili's,XNKGnKDrW5qJzaD0ybwQGQ,u8FTX0uDbDewm1RctRBrdg,12SbnXfsthBbBaF2YzhLfA,3.0,0,0,0,I like chili's but it has become mainstream an...,2015-05-31,₇ Sunday


In [31]:
# Attach the location attributes of each Brinker business to its reviews
brinker_locations = df_brinker[business_cols]
df_brinker_review = df_brinker_review.merge(brinker_locations, how='left', on='business_id')

# Move `name` to the first position of the frame
nom_col = df_brinker_review.pop('name')
df_brinker_review.insert(0, 'name', nom_col)

In [32]:
# Review volume over time for Brinker, bucketed into 200 bins
hist_kwargs = dict(x='date', nbins=200, title='Cantidad de Reseñas por Fecha - Brinker')
fig = px.histogram(df_brinker_review, **hist_kwargs)
fig.show()

#### Texas Roadhouse

In [33]:
# Keep only the reviews whose `business_id` belongs to a Texas Roadhouse location.
filtro_id = df_texasroadhouse['business_id']
# `.copy()` gives an independent frame, so the column assignment below does not
# write into a slice of `df_review`.
df_texasroadhouse_review = df_review[df_review['business_id'].isin(filtro_id)].copy()

# Repair text-decoding errors in `text`, then flatten embedded newlines.
# `Series.replace('\n', ' ')` only swaps cells that are exactly '\n';
# `.str.replace` replaces the newline wherever it occurs inside the text.
df_texasroadhouse_review['text'] = (
    df_texasroadhouse_review['text']
    .apply(fix_text)
    .str.replace('\n', ' ', regex=False)
)
df_texasroadhouse_review.sample()

Unnamed: 0,name,review_id,user_id,business_id,stars,useful,funny,cool,text,date,day
153565,Texas Roadhouse,oNOhZdr7HVNrkK6aMCuPTg,yH2uaB3bQ9VOUo-jh44Vhg,q3k9lwaHNqbtiGc-XeHnKg,4.0,0,0,0,The service was great! I liked ghat the manage...,2019-12-28,₆ Saturday


In [34]:
# Attach the location attributes of each Texas Roadhouse business to its reviews
texasroadhouse_locations = df_texasroadhouse[business_cols]
df_texasroadhouse_review = df_texasroadhouse_review.merge(
    texasroadhouse_locations, how='left', on='business_id'
)

# Move `name` to the first position of the frame
nom_col = df_texasroadhouse_review.pop('name')
df_texasroadhouse_review.insert(0, 'name', nom_col)

In [35]:
# Review volume over time for Texas Roadhouse, bucketed into 200 bins
hist_kwargs = dict(x='date', nbins=200, title='Cantidad de Reseñas por Fecha - Texas RH')
fig = px.histogram(df_texasroadhouse_review, **hist_kwargs)
fig.show()

#### Resumen y exportación de filtros aplicados

In [36]:
# Review totals per restaurant group after filtering
resumen_lineas = [
    "Cantidad de Reviews\n",
    f"Darden: {len(df_darden_review)}\n",
    f"Bloomin': {len(df_bloomin_review)}\n",
    f"Brinker: {len(df_brinker_review)}\n",
    f"Texas RH: {len(df_texasroadhouse_review)}",
]
print("".join(resumen_lineas))

Cantidad de Reviews
Darden: 7153
Bloomin': 4230
Brinker: 2422
Texas RH: 1265


---

In [37]:
# [label, frame] pairs, one per restaurant group. The label is only used by the
# (currently commented-out) export loop at the end of the notebook.
dfs = [
    ['darden', df_darden_review],
    ['bloomin', df_bloomin_review],
    ['brinker', df_brinker_review],
    ['texasroadhouse', df_texasroadhouse_review]
]

# Vote-count columns that are not kept in the exported frames
drop_cols = ['useful','funny','cool']

# The drop is in place so the `df_*_review` names see the change.
# `errors='ignore'` lets the cell be re-run: without it the second run raises
# KeyError because the columns are already gone.
for _, df in dfs:
    df.drop(columns=drop_cols, inplace=True, errors='ignore')

In [38]:
# Spot-check one random row of the final Darden review frame
df_darden_review.sample(n=1)

Unnamed: 0,name,review_id,user_id,business_id,stars,text,date,day,text_reply,city,state,postal_code,coordinates
6357,LongHorn Steakhouse,EYV7TskAXKDLnSZe09dZBQ,NorDe24AvFKjODj6YL5w3g,CS_GzUYlEPa6QHTRY224wQ,4.0,I wish we had a Longhorn in North Texas. We w...,2015-10-06,₂ Tuesday,,Norristown,FL,19403,"40.1274771937,-75.4040751479"


## Almacenar a disco

In [39]:
#for nom, df in dfs:
#    df.to_parquet(f'{path_data}/reviews_{nom}.parquet')