In [1]:
import warnings
import pandas as pd

In [2]:
from ftfy import fix_text
from rapidfuzz import process, fuzz

In [3]:
warnings.filterwarnings('ignore')

In [4]:
# Show every column when a DataFrame is displayed (no column limit).
pd.options.display.max_columns = None
# Uncomment to lift the row limit as well:
# pd.options.display.max_rows = None

---
## Carga de datos

In [None]:
# Directory holding the processed files (parquet format)
path_data = '../../data'
# Read both datasets in order: businesses first, then reviews
df_business, df_review = (
    pd.read_parquet(ruta)
    for ruta in (
        f'{path_data}/y_business_CLEAN.parquet',
        f'{path_data}/y_review_CLEAN.parquet',
    )
)

---
---
## Análisis Exploratorio

### Setup

#### Dict target

In [5]:
# Brand names per restaurant group: Darden and its competitors.
# Entries tagged `####` were flagged by the author; the reason is not recorded here.
brands = dict(
    # Darden brands
    darden=[
        "Olive Garden Italian Restaurant",
        "LongHorn Steakhouse",
        "Cheddar's Scratch Kitchen",
        "Yard House",
        "The Capital Grille",   ####
        "Seasons 52",
        "Bahama Breeze",
        "Eddie V's",  ####
        "Ruth's Chris Steak House",
    ],
    # Bloomin brands
    bloomin=[
        "Outback Steakhouse",
        "Carrabba's Italian Grill",
        "Bonefish Grill",
        "Fleming's Prime Steakhouse & Wine Bar",
        "Aussie Grill",
        "Aussie Grill - Brandon",
    ],
    # Brinker brands
    brinker=[
        "Chili's",
        "Chili's Grill & Bar",
        "Maggiano's Little Italy",
        "It's Just Wings",
    ],
    # Texas Roadhouse brands
    texasroadhouse=[
        "Texas Roadhouse",
        "Bubba's 33",
        # "Jaggers"  # fast food -- exclude?   ####
    ],
)

### Filtro de brands

In [None]:
# Repair text-decoding errors in the `name` column
df_business['name'] = df_business['name'].map(fix_text)

In [10]:
# Fuzzy text lookup against a list of candidate strings
def fuzzy_match(x, match_to, threshold=90):
    """Return the entry of ``match_to`` most similar to ``x``, or ``None``.

    Similarity is scored with rapidfuzz's ``fuzz.WRatio``.

    Parameters
    ----------
    x : str
        Text to look up.
    match_to : sequence of str
        Candidate strings to compare against.
    threshold : int or float, default 90
        Minimum score a candidate must reach to count as a match.

    Returns
    -------
    str or None
        The best-scoring candidate when its score is >= ``threshold``,
        otherwise ``None``.

    Notes
    -----
    NOTE(review): the Darden value counts in this notebook include the bare
    names "Olive", "Grill" and "House", so ``WRatio`` with ``threshold=90``
    appears to accept short substrings of a brand name. A stricter rule may
    be needed -- confirm before relying on the matches.
    """
    # `score_cutoff` lets rapidfuzz discard candidates below the threshold.
    best = process.extractOne(
        x, match_to, scorer=fuzz.WRatio, score_cutoff=threshold
    )
    # extractOne returns None instead of a tuple when no candidate qualifies
    # (e.g. an empty `match_to`); unpacking it directly raised TypeError.
    # Presumably a missing/NaN `x` also yields None -- verify against rapidfuzz.
    if best is None:
        return None
    return best[0]

#### Darden

In [11]:
# Fuzzy-match every business name against the Darden brand list
df_business['name_match'] = df_business['name'].map(
    lambda nombre: fuzzy_match(nombre, match_to=brands['darden'])
)
# Keep only matched rows (a null `name_match` means no match)
df_darden = df_business.dropna(subset=['name_match'])
df_darden['name'].value_counts()

name
Olive Garden Italian Restaurant    47
LongHorn Steakhouse                43
Ruth's Chris Steak House           17
Cheddar's Scratch Kitchen          13
Bahama Breeze                       6
The Capital Grille                  6
Yard House                          6
Seasons 52                          4
Eddie V's Prime Seafood             3
Olive                               1
Grill                               1
House                               1
Name: count, dtype: int64

#### Bloomin'

In [None]:
# Fuzzy-match every business name against the Bloomin brand list
df_business['name_match'] = df_business['name'].map(
    lambda nombre: fuzzy_match(nombre, match_to=brands['bloomin'])
)
# Keep only matched rows (a null `name_match` means no match)
df_bloomin = df_business.dropna(subset=['name_match'])
df_bloomin['name'].value_counts()

#### Brinker

In [None]:
# Fuzzy-match every business name against the Brinker brand list
df_business['name_match'] = df_business['name'].map(
    lambda nombre: fuzzy_match(nombre, match_to=brands['brinker'])
)
# Keep only matched rows (a null `name_match` means no match)
df_brinker = df_business.dropna(subset=['name_match'])
df_brinker['name'].value_counts()

#### Texas Roadhouse

In [None]:
# Fuzzy-match every business name against the Texas Roadhouse brand list
df_business['name_match'] = df_business['name'].map(
    lambda nombre: fuzzy_match(nombre, match_to=brands['texasroadhouse'])
)
# Keep only matched rows (a null `name_match` means no match)
df_texasroadhouse = df_business.dropna(subset=['name_match'])
df_texasroadhouse['name'].value_counts()

---
### Filtro de reviews por brand

#### Darden

In [None]:
# Business ids of the Darden locations selected in `df_darden`
filtro_id = df_darden['business_id']
# Keep only the reviews of those businesses. `.copy()` makes the result an
# independent frame, so later column assignments do not write into a slice
# of `df_review` (which triggers pandas' SettingWithCopyWarning).
df_darden_review = df_review[df_review['business_id'].isin(filtro_id)].copy()
df_darden_review.shape

In [None]:
# Repair text-decoding errors in the `text` column
df_darden_review['text'] = df_darden_review['text'].map(fix_text)

In [None]:
# Attach the business `name` to every review.
# This cell reassigns `df_darden_review` from itself, so drop any `name`
# column left by a previous run first; otherwise a re-run produces
# `name_x`/`name_y` and the `pop('name')` below raises KeyError.
df_darden_review = pd.merge(
    df_darden_review.drop(columns='name', errors='ignore'),
    df_darden[['business_id', 'name']],
    on='business_id',
    how='left',
)

# Move the new `name` column to the first position of the frame
nom_col = df_darden_review.pop('name')
df_darden_review.insert(0, 'name', nom_col)

#### Bloomin'

In [None]:
# Business ids of the Bloomin locations selected in `df_bloomin`
filtro_id = df_bloomin['business_id']
# Keep only the reviews of those businesses. `.copy()` makes the result an
# independent frame, so later column assignments do not write into a slice
# of `df_review` (which triggers pandas' SettingWithCopyWarning).
df_bloomin_review = df_review[df_review['business_id'].isin(filtro_id)].copy()
df_bloomin_review.shape

In [None]:
# Repair text-decoding errors in the `text` column
df_bloomin_review['text'] = df_bloomin_review['text'].map(fix_text)

In [None]:
# Attach the business `name` to every review.
# This cell reassigns `df_bloomin_review` from itself, so drop any `name`
# column left by a previous run first; otherwise a re-run produces
# `name_x`/`name_y` and the `pop('name')` below raises KeyError.
df_bloomin_review = pd.merge(
    df_bloomin_review.drop(columns='name', errors='ignore'),
    df_bloomin[['business_id', 'name']],
    on='business_id',
    how='left',
)

# Move the new `name` column to the first position of the frame
nom_col = df_bloomin_review.pop('name')
df_bloomin_review.insert(0, 'name', nom_col)

#### Brinker

In [None]:
# Business ids of the Brinker locations selected in `df_brinker`
filtro_id = df_brinker['business_id']
# Keep only the reviews of those businesses. `.copy()` makes the result an
# independent frame, so later column assignments do not write into a slice
# of `df_review` (which triggers pandas' SettingWithCopyWarning).
df_brinker_review = df_review[df_review['business_id'].isin(filtro_id)].copy()
df_brinker_review.shape

In [None]:
# Repair text-decoding errors in the `text` column
df_brinker_review['text'] = df_brinker_review['text'].map(fix_text)

In [None]:
# Attach the business `name` to every review.
# This cell reassigns `df_brinker_review` from itself, so drop any `name`
# column left by a previous run first; otherwise a re-run produces
# `name_x`/`name_y` and the `pop('name')` below raises KeyError.
df_brinker_review = pd.merge(
    df_brinker_review.drop(columns='name', errors='ignore'),
    df_brinker[['business_id', 'name']],
    on='business_id',
    how='left',
)

# Move the new `name` column to the first position of the frame
nom_col = df_brinker_review.pop('name')
df_brinker_review.insert(0, 'name', nom_col)

#### Texas Roadhouse

In [None]:
# Business ids of the Texas Roadhouse locations selected in `df_texasroadhouse`
filtro_id = df_texasroadhouse['business_id']
# Keep only the reviews of those businesses. `.copy()` makes the result an
# independent frame, so later column assignments do not write into a slice
# of `df_review` (which triggers pandas' SettingWithCopyWarning).
df_texasroadhouse_review = df_review[df_review['business_id'].isin(filtro_id)].copy()
df_texasroadhouse_review.shape

In [None]:
# Repair text-decoding errors in the `text` column
df_texasroadhouse_review['text'] = df_texasroadhouse_review['text'].map(fix_text)

In [None]:
# Attach the business `name` to every review.
# This cell reassigns `df_texasroadhouse_review` from itself, so drop any
# `name` column left by a previous run first; otherwise a re-run produces
# `name_x`/`name_y` and the `pop('name')` below raises KeyError.
df_texasroadhouse_review = pd.merge(
    df_texasroadhouse_review.drop(columns='name', errors='ignore'),
    df_texasroadhouse[['business_id', 'name']],
    on='business_id',
    how='left',
)

# Move the new `name` column to the first position of the frame
nom_col = df_texasroadhouse_review.pop('name')
df_texasroadhouse_review.insert(0, 'name', nom_col)

#### Resumen y exportación de filtros aplicados

In [None]:
# Number of reviews kept for each restaurant group
resumen = (
    "Cantidad de Reviews\n"
    f"Darden: {df_darden_review.shape[0]}\n"
    f"Bloomin': {df_bloomin_review.shape[0]}\n"
    f"Brinker: {df_brinker_review.shape[0]}\n"
    f"Texas RH: {df_texasroadhouse_review.shape[0]}"
)
print(resumen)

In [None]:
#df_darden_review.to_csv('y_darden_review.csv')
#df_bloomin_review.to_csv('y_bloomin_review.csv')
#df_brinker_review.to_csv('y_brinker_review.csv')
#df_texasroadhouse_review.to_csv('y_texasroadhouse_review.csv')