In [1]:
import warnings
import pandas as pd

In [2]:
from ftfy import fix_text
from rapidfuzz import process, fuzz

In [3]:
warnings.filterwarnings('ignore')

In [4]:
# Lift the limit on how many columns pandas displays (the row limit is left at its default)
pd.options.display.max_columns = None
# pd.options.display.max_rows = None   # disabled: would lift the row limit as well

---
## Carga de datos

In [5]:
# Directory holding the processed files (parquet format)
path_data = '../../data/clean'

# Resolve both input paths first, then read each table
business_parquet = f'{path_data}/y_business_CLEAN.parquet'
review_parquet = f'{path_data}/y_review_CLEAN.parquet'
df_business = pd.read_parquet(business_parquet)
df_review = pd.read_parquet(review_parquet)

---
---
## Análisis Exploratorio

### Setup

#### Dict target

In [6]:
# Brand names grouped by parent company: Darden plus its competitors.
# Each list is the set of candidate strings used for fuzzy matching of business names.
brands = dict(
    # Darden brands
    darden=[
        "Olive Garden Italian Restaurant",
        "LongHorn Steakhouse",
        "Cheddar's Scratch Kitchen",
        "Yard House",
        "The Capital Grille",   # flagged with '####' by the author - meaning not stated, TODO confirm
        "Seasons 52",
        "Bahama Breeze",
        "Eddie V's",  # flagged with '####' by the author - meaning not stated, TODO confirm
        "Ruth's Chris Steak House",
    ],
    # Bloomin brands
    bloomin=[
        "Outback Steakhouse",
        "Carrabba's Italian Grill",
        "Bonefish Grill",
        "Fleming's Prime Steakhouse & Wine Bar",
        "Aussie Grill",
        "Aussie Grill - Brandon",
    ],
    # Brinker brands
    brinker=[
        "Chili's",
        "Chili's Grill & Bar",
        "Maggiano's Little Italy",
        "It's Just Wings",
    ],
    # Texas Roadhouse brands
    texasroadhouse=[
        "Texas Roadhouse",
        "Bubba's 33",
        # "Jaggers"  # fast food - exclude? (open question left by the author)
    ],
)

### Filtro de brands

In [7]:
# Repair text-decoding errors in the `name` column, one value at a time
nombres = df_business['name']
df_business['name'] = nombres.map(fix_text)

In [8]:
# Fuzzy text lookup helper
def fuzzy_match(x, match_to, threshold=90):
    """Return the entry of ``match_to`` that best fuzzy-matches ``x``.

    Parameters
    ----------
    x : str
        Text to look up.
    match_to : sequence of str
        Candidate strings to compare ``x`` against.
    threshold : int or float, default 90
        Minimum ``fuzz.WRatio`` score a candidate needs to count as a match.

    Returns
    -------
    str or None
        The best-scoring candidate, or ``None`` when no candidate reaches
        ``threshold`` or rapidfuzz produces no result at all.
    """
    # `score_cutoff` makes rapidfuzz discard candidates scoring below the
    # threshold, which is the same acceptance rule as `score >= threshold`.
    result = process.extractOne(
        x, match_to, scorer=fuzz.WRatio, score_cutoff=threshold
    )
    # extractOne returns None rather than a (match, score, index) tuple when
    # nothing qualifies (e.g. an empty candidate list); unpacking it blindly
    # would raise TypeError.
    return None if result is None else result[0]

#### Darden

In [9]:
# Fuzzy-match every business name against the Darden brand list
df_business['name_match'] = df_business['name'].apply(fuzzy_match, match_to=brands['darden'])
# Keep matched rows only (a null `name_match` means no brand matched). The helper
# column is removed with a non-in-place `drop`, so `df_darden` is a new frame
# rather than an in-place edit on a slice of `df_business`.
df_darden = df_business[df_business['name_match'].notnull()].drop(columns='name_match')
df_darden['name'].value_counts()

name
Olive Garden Italian Restaurant    10
LongHorn Steakhouse                 9
Cheddar's Scratch Kitchen           3
Ruth's Chris Steak House            3
Bahama Breeze                       2
Seasons 52                          2
Yard House                          1
Name: count, dtype: int64

#### Bloomin'

In [10]:
# Fuzzy-match every business name against the Bloomin' brand list
df_business['name_match'] = df_business['name'].apply(fuzzy_match, match_to=brands['bloomin'])
# Keep matched rows only (a null `name_match` means no brand matched). The helper
# column is removed with a non-in-place `drop`, so `df_bloomin` is a new frame
# rather than an in-place edit on a slice of `df_business`.
df_bloomin = df_business[df_business['name_match'].notnull()].drop(columns='name_match')
df_bloomin['name'].value_counts()

name
Outback Steakhouse                       13
Bonefish Grill                            7
Carrabba's Italian Grill                  4
Fleming's Prime Steakhouse & Wine Bar     3
Aussie Grill - Brandon                    1
Name: count, dtype: int64

#### Brinker

In [11]:
# Fuzzy-match every business name against the Brinker brand list
df_business['name_match'] = df_business['name'].apply(fuzzy_match, match_to=brands['brinker'])
# Keep matched rows only (a null `name_match` means no brand matched). The helper
# column is removed with a non-in-place `drop`, so `df_brinker` is a new frame
# rather than an in-place edit on a slice of `df_business`.
df_brinker = df_business[df_business['name_match'].notnull()].drop(columns='name_match')
df_brinker['name'].value_counts()

name
Chili's                    20
Maggiano's Little Italy     3
It's Just Wings             1
Chili's Grill & Bar         1
Name: count, dtype: int64

#### Texas Roadhouse

In [12]:
# Fuzzy-match every business name against the Texas Roadhouse brand list
df_business['name_match'] = df_business['name'].apply(fuzzy_match, match_to=brands['texasroadhouse'])
# Keep matched rows only (a null `name_match` means no brand matched). The helper
# column is removed with a non-in-place `drop`, so `df_texasroadhouse` is a new
# frame rather than an in-place edit on a slice of `df_business`.
df_texasroadhouse = df_business[df_business['name_match'].notnull()].drop(columns='name_match')
df_texasroadhouse['name'].value_counts()

name
Texas Roadhouse    6
Bubba's 33         1
Name: count, dtype: int64

---
### Filtro de reviews por brand

In [13]:
# Business-table columns attached to each brand's reviews (joined on `business_id`)
business_cols = [
    'business_id',
    'city',
    'state',
    'postal_code',
    'coordinates',
]

In [14]:
# Add the weekday name of each review's `date` as a new `day` column
fechas_review = df_review['date']
df_review['day'] = fechas_review.dt.day_name()

#### Darden

In [15]:
# Business ids of the Darden locations found above
filtro_id = df_darden['business_id']
# Keep only the reviews for those businesses. `.copy()` gives an independent
# frame, so the column assignment below does not write into a slice of `df_review`.
df_darden_review = df_review[df_review['business_id'].isin(filtro_id)].copy()

# Repair text-decoding errors in the review `text` column
df_darden_review['text'] = df_darden_review['text'].apply(fix_text)
df_darden_review.sample(1)

Unnamed: 0,name,review_id,user_id,business_id,stars,useful,funny,cool,text,date,day
642725,Seasons 52,PvPb2g-ZDHOWcXQZPEgrVw,V74njIb9fS-Ktt3QepblyA,Mfvk9uEEhdCqj8S2u7dWgQ,5.0,2,0,1,One of my favorite restaurants! You can never ...,2019-10-05 02:16:34,Saturday


In [16]:
# Attach the location columns of each business to its reviews (left join on `business_id`).
# NOTE(review): assumes `business_id` is unique in `df_darden`; duplicates would repeat reviews - confirm.
df_darden_review = df_darden_review.merge(
    df_darden[business_cols],
    on='business_id',
    how='left',
)

# Make `name` the first column: take it out, then re-insert it at position 0
nom_col = df_darden_review.pop('name'); df_darden_review.insert(0, 'name', nom_col)

#### Bloomin'

In [17]:
# Business ids of the Bloomin' locations found above
filtro_id = df_bloomin['business_id']
# Keep only the reviews for those businesses. `.copy()` gives an independent
# frame, so the column assignment below does not write into a slice of `df_review`.
df_bloomin_review = df_review[df_review['business_id'].isin(filtro_id)].copy()

# Repair text-decoding errors in the review `text` column
df_bloomin_review['text'] = df_bloomin_review['text'].apply(fix_text)
df_bloomin_review.sample(1)

Unnamed: 0,name,review_id,user_id,business_id,stars,useful,funny,cool,text,date,day
143675,Bonefish Grill,5Il-kQZnOMu8kaM-iWkzvg,8vfjrYvKn2z-ScReGXXu6g,VlTwxhPtKWvWxg6VSS9uyA,2.0,0,0,0,We generally love this place as the food is de...,2016-05-08 21:08:49,Sunday


In [18]:
# Attach the location columns of each business to its reviews (left join on `business_id`).
# NOTE(review): assumes `business_id` is unique in `df_bloomin`; duplicates would repeat reviews - confirm.
df_bloomin_review = df_bloomin_review.merge(
    df_bloomin[business_cols],
    on='business_id',
    how='left',
)

# Make `name` the first column: take it out, then re-insert it at position 0
nom_col = df_bloomin_review.pop('name'); df_bloomin_review.insert(0, 'name', nom_col)

#### Brinker

In [19]:
# Business ids of the Brinker locations found above
filtro_id = df_brinker['business_id']
# Keep only the reviews for those businesses. `.copy()` gives an independent
# frame, so the column assignment below does not write into a slice of `df_review`.
df_brinker_review = df_review[df_review['business_id'].isin(filtro_id)].copy()

# Repair text-decoding errors in the review `text` column
df_brinker_review['text'] = df_brinker_review['text'].apply(fix_text)
df_brinker_review.sample()

Unnamed: 0,name,review_id,user_id,business_id,stars,useful,funny,cool,text,date,day
347344,Chili's,7UaWb6GOuNNws-A4mpYgiA,ArPlxEWHseF0Um2TPnOKLw,U50T86i8wyNWGWxsP7GIRw,1.0,0,0,0,very unorganized no one knew what they were do...,2021-07-19 00:14:14,Monday


In [20]:
# Attach the location columns of each business to its reviews (left join on `business_id`).
# NOTE(review): assumes `business_id` is unique in `df_brinker`; duplicates would repeat reviews - confirm.
df_brinker_review = df_brinker_review.merge(
    df_brinker[business_cols],
    on='business_id',
    how='left',
)

# Make `name` the first column: take it out, then re-insert it at position 0
nom_col = df_brinker_review.pop('name'); df_brinker_review.insert(0, 'name', nom_col)

#### Texas Roadhouse

In [21]:
# Business ids of the Texas Roadhouse locations found above
filtro_id = df_texasroadhouse['business_id']
# Keep only the reviews for those businesses. `.copy()` gives an independent
# frame, so the column assignment below does not write into a slice of `df_review`.
df_texasroadhouse_review = df_review[df_review['business_id'].isin(filtro_id)].copy()

# Repair text-decoding errors in the review `text` column
df_texasroadhouse_review['text'] = df_texasroadhouse_review['text'].apply(fix_text)
df_texasroadhouse_review.sample()

Unnamed: 0,name,review_id,user_id,business_id,stars,useful,funny,cool,text,date,day
26638,Texas Roadhouse,vSbzEcWf36XeS9WfBHVQ5Q,uwNcU-rR5ove6DQIhqPElQ,AEgMDoFmikKJv3wYYKlseA,5.0,0,0,0,Texas Roadhouse is my hubby's family to go res...,2022-01-01 23:54:35,Saturday


In [22]:
# Attach the location columns of each business to its reviews (left join on `business_id`).
# NOTE(review): assumes `business_id` is unique in `df_texasroadhouse`; duplicates would repeat reviews - confirm.
df_texasroadhouse_review = df_texasroadhouse_review.merge(
    df_texasroadhouse[business_cols],
    on='business_id',
    how='left',
)

# Make `name` the first column: take it out, then re-insert it at position 0
nom_col = df_texasroadhouse_review.pop('name'); df_texasroadhouse_review.insert(0, 'name', nom_col)

#### Resumen y exportación de filtros aplicados

In [23]:
# Report how many reviews each company ended up with, one line per company
print(
    "Cantidad de Reviews",
    f"Darden: {len(df_darden_review)}",
    f"Bloomin': {len(df_bloomin_review)}",
    f"Brinker: {len(df_brinker_review)}",
    f"Texas RH: {len(df_texasroadhouse_review)}",
    sep="\n",
)

Cantidad de Reviews
Darden: 6365
Bloomin': 4772
Brinker: 3209
Texas RH: 1372


---

In [24]:
# [label, frame] pairs, one per company; the label is the company key used in file names
dfs = [
    ['darden', df_darden_review],
    ['bloomin', df_bloomin_review],
    ['brinker', df_brinker_review],
    ['texasroadhouse', df_texasroadhouse_review]
]

# Columns removed from every review frame
drop_cols = ['useful','funny','cool']

for _, df in dfs:
    # In-place on purpose: the `df_*_review` names refer to these same objects and
    # must see the change. `errors='ignore'` keeps the cell re-runnable, since a
    # second run would otherwise raise KeyError on the already-removed columns.
    df.drop(columns=drop_cols, inplace=True, errors='ignore')

In [27]:
# Write a 10-row random sample of the Darden reviews to CSV. The fixed seed makes
# the exported rows identical on every run of the notebook.
# NOTE(review): 'reviews_----.csv' looks like a placeholder file name - confirm the intended name.
df_darden_review.sample(10, random_state=42).to_csv('reviews_----.csv')

In [25]:
# Disabled export: when uncommented, writes each frame in `dfs` to 'reviews_<nom>.parquet'.
#for nom, df in dfs:
#    df.to_parquet(f'reviews_{nom}.parquet')