In [1]:
import warnings
import pandas as pd
from ftfy import fix_text
from rapidfuzz import process, fuzz

In [2]:
warnings.filterwarnings('ignore')

In [3]:
# Remove the limit on the number of columns shown when a DataFrame is displayed.
pd.options.display.max_columns = None
# The row limit is left untouched; uncomment to remove it as well.
# pd.options.display.max_rows = None

---
## Carga de datos

In [4]:
# Directory holding the processed files (parquet format).
path_data = '../../data/clean'

# Build both file locations first, then read each one into its own DataFrame.
business_file = f'{path_data}/y_business_CLEAN.parquet'
review_file = f'{path_data}/y_review_CLEAN.parquet'

df_business = pd.read_parquet(business_file)
df_review = pd.read_parquet(review_file)

---
---
## Análisis Exploratorio

### Setup

#### Dict target

In [5]:
# Brand names of Darden and its competitors, grouped by parent company.
# The order inside each list is kept exactly as originally written.

# Darden brands
_DARDEN_BRANDS = [
    "Olive Garden Italian Restaurant",
    "LongHorn Steakhouse",
    "Cheddar's Scratch Kitchen",
    "Yard House",
    "The Capital Grille",   # NOTE(review): flagged with '####' by the author; reason not recorded
    "Seasons 52",
    "Bahama Breeze",
    "Eddie V's",            # NOTE(review): flagged with '####' by the author; reason not recorded
    "Ruth's Chris Steak House",
]

# Bloomin brands
_BLOOMIN_BRANDS = [
    "Outback Steakhouse",
    "Carrabba's Italian Grill",
    "Bonefish Grill",
    "Fleming's Prime Steakhouse & Wine Bar",
    "Aussie Grill",
    "Aussie Grill - Brandon",
]

# Brinker brands
_BRINKER_BRANDS = [
    "Chili's",
    "Chili's Grill & Bar",
    "Maggiano's Little Italy",
    "It's Just Wings",
]

# Texas Roadhouse brands
_TEXASROADHOUSE_BRANDS = [
    "Texas Roadhouse",
    "Bubba's 33",
    # "Jaggers" is left out for now: it is fast food -- exclude? (open question, flagged '####')
]

# Company key -> list of brand names.
brands = {
    'darden': _DARDEN_BRANDS,
    'bloomin': _BLOOMIN_BRANDS,
    'brinker': _BRINKER_BRANDS,
    'texasroadhouse': _TEXASROADHOUSE_BRANDS,
}

### Filtro de brands

In [6]:
# Repair text-decoding errors in the `name` column by running every value through `fix_text`.
name_fixed = df_business['name'].apply(fix_text)
df_business['name'] = name_fixed

In [7]:
# Fuzzy text lookup helper.
def fuzzy_match(x, match_to, threshold=90):
    """Return the candidate in `match_to` that best matches `x`, or None.

    Parameters
    ----------
    x : str
        Text to look up.
    match_to : sequence of str
        Candidate strings to compare against.
    threshold : int or float, default 90
        Minimum `fuzz.WRatio` score a candidate needs to count as a match.

    Returns
    -------
    str or None
        The best-scoring candidate when its score is >= `threshold`,
        otherwise None.
    """
    # `extractOne` returns None instead of a tuple when no candidate reaches
    # `score_cutoff` (e.g. an empty `match_to`), so the result cannot be
    # unpacked unconditionally.
    # NOTE(review): a null `x` is also believed to yield None here -- confirm
    # against the installed rapidfuzz version.
    best = process.extractOne(
        x,
        match_to,
        scorer=fuzz.WRatio,
        score_cutoff=threshold,
    )
    if best is None:
        return None
    # `best` is a (match, score, index) tuple; only the matched text is needed.
    return best[0]

#### Darden

In [8]:
# Fuzzy-match every business name against the Darden brand list.
df_business['name_match'] = df_business['name'].apply(fuzzy_match, match_to=brands['darden'])
# Keep only the rows with a match (null means no match). `drop(columns=...)`
# returns a new DataFrame, so the helper column is removed without mutating a
# filtered slice of `df_business` in place.
df_darden = df_business[df_business['name_match'].notnull()].drop(columns='name_match')
df_darden['name'].value_counts()

name
Olive Garden Italian Restaurant    11
LongHorn Steakhouse                 9
Bahama Breeze                       2
Seasons 52                          2
Yard House                          2
Cheddar's Scratch Kitchen           2
Ruth's Chris Steak House            2
Eddie V's Prime Seafood             1
Name: count, dtype: int64

#### Bloomin'

In [9]:
# Fuzzy-match every business name against the Bloomin brand list.
df_business['name_match'] = df_business['name'].apply(fuzzy_match, match_to=brands['bloomin'])
# Keep only the rows with a match (null means no match). `drop(columns=...)`
# returns a new DataFrame, so the helper column is removed without mutating a
# filtered slice of `df_business` in place.
df_bloomin = df_business[df_business['name_match'].notnull()].drop(columns='name_match')
df_bloomin['name'].value_counts()

name
Outback Steakhouse                       12
Bonefish Grill                            8
Carrabba's Italian Grill                  5
Aussie Grill by Outback                   1
Aussie Grill - Brandon                    1
Aussie Grill                              1
Fleming's Prime Steakhouse & Wine Bar     1
Aussie Grill - Clearwater                 1
Name: count, dtype: int64

#### Brinker

In [10]:
# Fuzzy-match every business name against the Brinker brand list.
df_business['name_match'] = df_business['name'].apply(fuzzy_match, match_to=brands['brinker'])
# Keep only the rows with a match (null means no match). `drop(columns=...)`
# returns a new DataFrame, so the helper column is removed without mutating a
# filtered slice of `df_business` in place.
df_brinker = df_business[df_business['name_match'].notnull()].drop(columns='name_match')
df_brinker['name'].value_counts()

name
Chili's                    15
Maggiano's Little Italy     2
It's Just Wings             1
Name: count, dtype: int64

#### Texas Roadhouse

In [11]:
# Fuzzy-match every business name against the Texas Roadhouse brand list.
df_business['name_match'] = df_business['name'].apply(fuzzy_match, match_to=brands['texasroadhouse'])
# Keep only the rows with a match (null means no match). `drop(columns=...)`
# returns a new DataFrame, so the helper column is removed without mutating a
# filtered slice of `df_business` in place.
df_texasroadhouse = df_business[df_business['name_match'].notnull()].drop(columns='name_match')
df_texasroadhouse['name'].value_counts()

name
Texas Roadhouse    6
Name: count, dtype: int64

---
### Filtro de reviews por brand

In [12]:
# Business-level columns attached to each review in the merges below;
# `business_id` is the join key.
business_cols = [
    'business_id',
    'city',
    'state',
    'postal_code',
    'coordinates',
]

In [13]:
# Parse the `date` column as datetimes, then store the weekday name of each review in `day`.
review_dates = pd.to_datetime(df_review['date'])
df_review['day'] = review_dates.dt.day_name()

#### Darden

In [14]:
# Business ids present in `df_darden`.
filtro_id = df_darden['business_id']
# Keep the reviews whose `business_id` is in that set. `.copy()` makes the
# result an independent DataFrame, so the column assignment below writes to it
# directly instead of to a filtered slice of `df_review`.
df_darden_review = df_review[df_review['business_id'].isin(filtro_id)].copy()

# Repair text-decoding errors in the `text` column.
df_darden_review['text'] = df_darden_review['text'].apply(fix_text)
df_darden_review.sample(1)

Unnamed: 0,name,review_id,user_id,business_id,stars,useful,funny,cool,text,date,day
373196,Bahama Breeze,kUd28HDda8ArBl97eJFhcA,qpxHZ6yDRbPWmTMY_Px52w,AlH5V97JSAu7AL_xdibMIg,1.0,0,0,0,GO TO APPLEBEES NOT TO THIS PLACE.\n\nI went t...,2019-10-20,Sunday


In [15]:
# Add an empty-string `text_reply` column, to hold replies to the reviews.
df_darden_review = df_darden_review.assign(text_reply='')

In [16]:
# Left-join the business-level columns onto each review by `business_id`.
darden_merged = df_darden_review.merge(df_darden[business_cols], on='business_id', how='left')

# Put the `name` column first and keep every other column in its current order.
darden_col_order = ['name'] + [col for col in darden_merged.columns if col != 'name']
df_darden_review = darden_merged[darden_col_order]

#### Bloomin'

In [17]:
# Business ids present in `df_bloomin`.
filtro_id = df_bloomin['business_id']
# Keep the reviews whose `business_id` is in that set. `.copy()` makes the
# result an independent DataFrame, so the column assignment below writes to it
# directly instead of to a filtered slice of `df_review`.
df_bloomin_review = df_review[df_review['business_id'].isin(filtro_id)].copy()

# Repair text-decoding errors in the `text` column.
df_bloomin_review['text'] = df_bloomin_review['text'].apply(fix_text)
df_bloomin_review.sample(1)

Unnamed: 0,name,review_id,user_id,business_id,stars,useful,funny,cool,text,date,day
99076,Bonefish Grill,0n6GNv0VggfvTkUKi3qQEg,5RHa6ARjqHnjkk2puc5y_w,TD2Ln8rzK-UbGECZkD6V4Q,2.0,5,2,1,So the fam and I gave Bonefish Grill a second ...,2014-06-05,Thursday


In [18]:
# Left-join the business-level columns onto each review by `business_id`.
bloomin_merged = df_bloomin_review.merge(df_bloomin[business_cols], on='business_id', how='left')

# Put the `name` column first and keep every other column in its current order.
bloomin_col_order = ['name'] + [col for col in bloomin_merged.columns if col != 'name']
df_bloomin_review = bloomin_merged[bloomin_col_order]

#### Brinker

In [19]:
# Business ids present in `df_brinker`.
filtro_id = df_brinker['business_id']
# Keep the reviews whose `business_id` is in that set. `.copy()` makes the
# result an independent DataFrame, so the column assignment below writes to it
# directly instead of to a filtered slice of `df_review`.
df_brinker_review = df_review[df_review['business_id'].isin(filtro_id)].copy()

# Repair text-decoding errors in the `text` column.
df_brinker_review['text'] = df_brinker_review['text'].apply(fix_text)
df_brinker_review.sample()

Unnamed: 0,name,review_id,user_id,business_id,stars,useful,funny,cool,text,date,day
712998,Maggiano's Little Italy,e3jSbVQZ77ftu15SSUKzjw,2z-j-VW3f84ojwjtMen3kg,2WGnykxiM-Mp-qIm2u7iAw,5.0,1,0,0,Love this place. I went here while my boyfrien...,2009-06-24,Wednesday


In [20]:
# Left-join the business-level columns onto each review by `business_id`.
brinker_merged = df_brinker_review.merge(df_brinker[business_cols], on='business_id', how='left')

# Put the `name` column first and keep every other column in its current order.
brinker_col_order = ['name'] + [col for col in brinker_merged.columns if col != 'name']
df_brinker_review = brinker_merged[brinker_col_order]

#### Texas Roadhouse

In [21]:
# Business ids present in `df_texasroadhouse`.
filtro_id = df_texasroadhouse['business_id']
# Keep the reviews whose `business_id` is in that set. `.copy()` makes the
# result an independent DataFrame, so the column assignment below writes to it
# directly instead of to a filtered slice of `df_review`.
df_texasroadhouse_review = df_review[df_review['business_id'].isin(filtro_id)].copy()

# Repair text-decoding errors in the `text` column.
df_texasroadhouse_review['text'] = df_texasroadhouse_review['text'].apply(fix_text)
df_texasroadhouse_review.sample()

Unnamed: 0,name,review_id,user_id,business_id,stars,useful,funny,cool,text,date,day
646726,Texas Roadhouse,H-7Gn18-BoBBnTm8HX5s5Q,Bv8KkadmPMSABBTk9SX-CA,OCz2ba1y_lIAonOGU0B7zA,1.0,0,0,0,I contacted Texas Roadhouse corporate customer...,2015-07-02,Thursday


In [22]:
# Left-join the business-level columns onto each review by `business_id`.
texasroadhouse_merged = df_texasroadhouse_review.merge(
    df_texasroadhouse[business_cols], on='business_id', how='left'
)

# Put the `name` column first and keep every other column in its current order.
texasroadhouse_col_order = ['name'] + [col for col in texasroadhouse_merged.columns if col != 'name']
df_texasroadhouse_review = texasroadhouse_merged[texasroadhouse_col_order]

#### Resumen y exportación de filtros aplicados

In [23]:
# Build the per-company review-count report (one line per company), then print it.
resumen = (
    "Cantidad de Reviews\n"
    f"Darden: {df_darden_review.shape[0]}\n"
    f"Bloomin': {df_bloomin_review.shape[0]}\n"
    f"Brinker: {df_brinker_review.shape[0]}\n"
    f"Texas RH: {df_texasroadhouse_review.shape[0]}"
)
print(resumen)

Cantidad de Reviews
Darden: 7153
Bloomin': 4230
Brinker: 2422
Texas RH: 1265


---

In [24]:
# [company key, review DataFrame] pairs; the key is used later to name the exported files.
dfs = [
    ['darden', df_darden_review],
    ['bloomin', df_bloomin_review],
    ['brinker', df_brinker_review],
    ['texasroadhouse', df_texasroadhouse_review],
]

# Vote-count columns that are removed from every review DataFrame.
drop_cols = ['useful', 'funny', 'cool']

# Drop in place on purpose: the `df_*_review` names refer to these same
# objects, so they see the change as well.
for _, df in dfs:
    df.drop(drop_cols, axis=1, inplace=True)

In [25]:
# Write a random sample of 10 Darden reviews to a CSV file.
darden_sample = df_darden_review.sample(n=10)
darden_sample.to_csv('reviews_----.csv')

In [26]:
# Write each company's review DataFrame to its own parquet file, named after the company key.
for nom, df in dfs:
    ruta_salida = f'reviews_{nom}.parquet'
    df.to_parquet(ruta_salida)

In [28]:
# Display one random row of the final Bloomin review table.
df_bloomin_review.sample(n=1)

Unnamed: 0,name,review_id,user_id,business_id,stars,text,date,day,city,state,postal_code,coordinates
926,Outback Steakhouse,pxrdJ8AABI-yhQSyL-Qo2A,ZBfdJlnH8jusi9kBJDSAuw,y5nl-lRDlphGQ9XiwDNmtg,1.0,I enjoy Outback but since a waitress wrote dow...,2021-03-07,Sunday,North Wales,NJ,19454,"40.235486084,-75.2409985192"
