# **ETL `bussines_yelp`**

In [1]:
import os
import pandas as pd # type: ignore
import matplotlib.pyplot as plt # type: ignore
from collections import Counter
import itertools
import json
import re
import ast

In [2]:
business_yelp = pd.read_pickle(r'Yelp\business.pkl')

In [3]:
# Eliminar columnas duplicadas manteniendo la primera aparición
business_yelp = business_yelp.loc[:, ~business_yelp.columns.duplicated()]

# Verificar que se eliminaron los duplicados
print(business_yelp.columns)

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours'],
      dtype='object')


In [4]:
business_yelp = business_yelp.copy()

In [5]:
# Convertir la columna a string (por si tiene valores no string o NaN)
business_yelp['categories'] = business_yelp['categories'].astype(str)

# Filtrar negocios que pertenecen a las categorías "Restaurants", "Food" o "Bar"
business_filtered = business_yelp[business_yelp['categories'].str.contains(r"restaurant|food|\bbar\b", na=False, case=False)]

In [6]:
business_filtered.head(2)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,MO,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [7]:
ciudades = pd.read_csv(r'florida_population.csv')
ciudades.head(2)

Unnamed: 0,Rank,City,Population
0,1,Jacksonville,985843
1,2,Miami,455924


In [8]:
business_filtered = business_filtered.copy()
business_filtered['city'] = business_filtered['city'].str.strip().str.lower()

ciudades = ciudades.copy()
ciudades['City'] = ciudades['City'].str.strip().str.lower()

business_filtered = business_filtered[business_filtered['city'].isin(ciudades['City'])]

In [9]:
business_filtered = business_filtered.copy()
business_filtered['state'] = "FL"

In [10]:
business_filtered.head(2)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,largo,FL,33771,27.916116,-82.760461,4.5,100,0,"{'OutdoorSeating': 'False', 'RestaurantsGoodFo...","Food, Delis, Italian, Bakeries, Restaurants","{'Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'..."
54,0qNpTGTcqPwOLi2hADx4Xw,Charlie's Market,2815 E Sligh Ave,tampa,FL,33610,28.01036,-82.430042,3.0,9,1,"{'BusinessParking': '{'garage': False, 'street...","Food, Grocery, Convenience Stores",


In [11]:
# Asegurar que todos los valores de 'attributes' sean diccionarios
def convert_to_dict(value):
    if isinstance(value, str):  # Si viene como string, convertir a diccionario
        try:
            return ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return {}  # Si hay error, devolver un diccionario vacío
    elif isinstance(value, dict):
        return value  # Si ya es un diccionario, lo dejamos igual
    else:
        return {}  # Si es otro tipo de dato, lo convertimos a diccionario vacío

# Aplicar la conversión
business_filtered['attributes'] = business_filtered['attributes'].apply(convert_to_dict)

# Expandir la columna attributes en múltiples columnas
attributes_expanded = business_filtered['attributes'].apply(pd.Series)

# Concatenar las columnas originales con las nuevas
business_filtered = pd.concat([business_filtered, attributes_expanded], axis=1)

# Eliminar la columna original 'attributes' (opcional)
business_filtered.drop(columns=['attributes'], inplace=True)

# Ver los primeros valores para comprobar
print(business_filtered.head())


               business_id                  name                 address  \
14  0bPLkL0QhhPO5kt1_EXmNQ  Zio's Italian Market           2575 E Bay Dr   
54  0qNpTGTcqPwOLi2hADx4Xw      Charlie's Market        2815 E Sligh Ave   
58  uI9XODGY_2_ieTE6xJ0myw           Roman Forum  10440 N Dale Mabry Hwy   
59  JgpnXv_0XhV3SfbfB50nxw           Joe's Pizza   2038 N Dale Mabry Hwy   
69  KWA2qtdwVEkMpd5soSKaGA   Publix Super Market         12101 Little Rd   

      city state postal_code   latitude  longitude stars review_count  ...  \
14   largo    FL       33771  27.916116 -82.760461   4.5          100  ...   
54   tampa    FL       33610   28.01036 -82.430042   3.0            9  ...   
58   tampa    FL       33618  28.046203 -82.505053   4.0           23  ...   
59   tampa    FL       33607  27.960514 -82.506127   4.0           35  ...   
69  hudson    FL       34667  28.332601 -82.668107   3.5            7  ...   

   BYOB DogsAllowed DriveThru Corkage BYOBCorkage AcceptsInsurance  \
14  

In [12]:
business_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10951 entries, 14 to 150297
Data columns (total 51 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   business_id                 10951 non-null  object
 1   name                        10951 non-null  object
 2   address                     10951 non-null  object
 3   city                        10951 non-null  object
 4   state                       10951 non-null  object
 5   postal_code                 10951 non-null  object
 6   latitude                    10951 non-null  object
 7   longitude                   10951 non-null  object
 8   stars                       10951 non-null  object
 9   review_count                10951 non-null  object
 10  is_open                     10951 non-null  object
 11  categories                  10951 non-null  object
 12  hours                       9593 non-null   object
 13  OutdoorSeating              7728 non-null   objec

In [13]:
business = business_filtered.copy()

Analizamos y expandimos `OutdoorSeating`

In [14]:
business['OutdoorSeating'].value_counts()

OutdoorSeating
True     3901
False    3484
None      343
Name: count, dtype: int64

In [15]:
business['OutdoorSeating'] = business['OutdoorSeating'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analizamos y expandimos `RestaurantsGoodForGroups`

In [16]:
business['RestaurantsGoodForGroups'].value_counts()

RestaurantsGoodForGroups
True     5939
False     920
Name: count, dtype: int64

In [17]:
business['RestaurantsGoodForGroups'] = business['RestaurantsGoodForGroups'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analizamos y expandimos `WiFi`

In [18]:
business['WiFi'].value_counts()

WiFi
u'free'    3024
u'no'      1637
'free'     1319
'no'       1061
u'paid'      32
'paid'       18
None         10
Name: count, dtype: int64

In [19]:
business["WiFi"] = business["WiFi"].astype(str).str.replace("u'", "").str.replace("'", "").str.strip()
print(business["WiFi"].unique())  # Para verificar la limpieza

['no' 'nan' 'free' 'paid' 'None']


In [20]:
business["WiFi"] = business["WiFi"].map({'no': 0,
                                         'nan': 0,
                                         'None': 0,
                                         'free': 1, 
                                         'paid': 2}).fillna(0).astype(int)

Analizamos y expandimos `RestaurantsPriceRange2`

In [21]:
business['RestaurantsPriceRange2'].value_counts()

RestaurantsPriceRange2
2       4552
1       4210
3        227
4         27
None       1
Name: count, dtype: int64

In [22]:
business["RestaurantsPriceRange2"] = business["RestaurantsPriceRange2"].map({'None': 0,}).fillna(0).astype(int)
business['RestaurantsPriceRange2'] = business['RestaurantsPriceRange2'].astype(int)

Analizamos `RestaurantsDelivery`

In [23]:
business['RestaurantsDelivery'].value_counts()

RestaurantsDelivery
True     5524
False    3080
None      570
Name: count, dtype: int64

In [24]:
business['RestaurantsDelivery'] = business['RestaurantsDelivery'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analisis de `RestaurantsAttire`

In [25]:
business['RestaurantsAttire'].value_counts()

RestaurantsAttire
u'casual'    3627
'casual'     2568
u'dressy'      54
'dressy'       30
None            6
'formal'        6
u'formal'       5
Name: count, dtype: int64

In [26]:
business["RestaurantsAttire"] = business["RestaurantsAttire"].astype(str).str.replace("u'", "").str.replace("'", "").str.strip()
print(business["RestaurantsAttire"].unique())  # Para verificar la limpieza

['casual' 'nan' 'dressy' 'None' 'formal']


In [27]:
business["RestaurantsAttire"] = business["RestaurantsAttire"].replace(['nan', 'None'], None)

In [28]:
# Extraer y listar las categorías únicas correctamente
restaurantsattire_list = set()

for row in business["RestaurantsAttire"].dropna():
    if isinstance(row, str):  # Si es un string, agregarlo directamente
        restaurantsattire_list.add(row)

print(f"Cantidad de categorías únicas en 'RestaurantsAttire': {len(restaurantsattire_list)}")
print(f"Categorías encontradas: {restaurantsattire_list}")

Cantidad de categorías únicas en 'RestaurantsAttire': 3
Categorías encontradas: {'dressy', 'formal', 'casual'}


In [29]:
# Convertir NaN a string vacío para evitar errores
business["RestaurantsAttire"] = business["RestaurantsAttire"].fillna("")

# Crear columnas binarias para cada categoría encontrada
for category in restaurantsattire_list:
    business[f"RestaurantsAttire_{category.replace(' ', '_')}"] = business["RestaurantsAttire"].apply(lambda x: 1 if x == category else 0)

# Eliminar la columna original
business.drop(columns=["RestaurantsAttire"], inplace=True)

Analizamos `BusinessAcceptsCreditCards`

In [30]:
business['BusinessAcceptsCreditCards'].value_counts()

BusinessAcceptsCreditCards
True     9668
False     148
None        6
Name: count, dtype: int64

In [31]:
business['BusinessAcceptsCreditCards'] = business['BusinessAcceptsCreditCards'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analizamos `RestaurantsTakeOut`

In [32]:
business['RestaurantsTakeOut'].value_counts()

RestaurantsTakeOut
True     8864
False     685
None      280
Name: count, dtype: int64

In [33]:
business['RestaurantsTakeOut'] = business['RestaurantsTakeOut'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analizamos `Caters`

In [34]:
business['Caters'].value_counts()

Caters
True     4079
False    2710
None        6
Name: count, dtype: int64

In [35]:
business['Caters'] = business['Caters'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analizamos `NoiseLevel`                 

In [36]:
business['NoiseLevel'].value_counts()

NoiseLevel
u'average'      3151
u'quiet'        1029
'average'        975
u'loud'          267
'quiet'          264
u'very_loud'      99
'loud'            73
'very_loud'       15
None               3
Name: count, dtype: int64

In [37]:
business["NoiseLevel"] = business["NoiseLevel"].astype(str).str.replace("u'", "").str.replace("'", "").str.strip()
print(business["NoiseLevel"].unique())  # Para verificar la limpieza

['average' 'nan' 'quiet' 'loud' 'very_loud' 'None']


In [38]:
business["NoiseLevel"] = business["NoiseLevel"].replace(['nan', 'None'], None)

In [39]:
# Extraer y listar las categorías únicas correctamente
noiselevel_list = set()

for row in business["NoiseLevel"].dropna():
    if isinstance(row, str):  # Si es un string, agregarlo directamente
        noiselevel_list.add(row)

print(f"Cantidad de categorías únicas en 'NoiseLevel': {len(noiselevel_list)}")
print(f"Categorías encontradas: {noiselevel_list}")

Cantidad de categorías únicas en 'NoiseLevel': 4
Categorías encontradas: {'quiet', 'very_loud', 'average', 'loud'}


In [40]:
# Convertir NaN a string vacío para evitar errores
business["NoiseLevel"] = business["NoiseLevel"].fillna("")

# Crear columnas binarias para cada categoría encontrada
for category in noiselevel_list:
    business[f"NoiseLevel_{category.replace(' ', '_')}"] = business["NoiseLevel"].apply(lambda x: 1 if x == category else 0)

# Eliminar la columna original
business.drop(columns=["NoiseLevel"], inplace=True)

Analizamos `Ambience`

In [41]:
business['Ambience'].value_counts()

Ambience
{'touristy': False, 'hipster': False, 'romantic': False, 'divey': False, 'intimate': False, 'trendy': False, 'upscale': False, 'classy': False, 'casual': False}         1256
{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': False}          848
{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': True}           694
{'romantic': False, 'intimate': False, 'touristy': False, 'hipster': False, 'divey': False, 'classy': False, 'trendy': False, 'upscale': False, 'casual': True}           580
{'romantic': False, 'intimate': False, 'touristy': False, 'hipster': False, 'divey': False, 'classy': False, 'trendy': False, 'upscale': False, 'casual': False}          478
                                                                                                                         

In [42]:
# Extraer y listar las categorías únicas
ambience_list = set()

for row in business["Ambience"].dropna():
    if isinstance(row, dict):  # Si ya es un diccionario
        ambience_list.update(row.keys())
    elif isinstance(row, str):  # Si es un string que representa un diccionario
        try:
            categories = ast.literal_eval(row)  # Convertir string a dict
            if isinstance(categories, dict):
                ambience_list.update(categories.keys())  # Agregar claves únicas
        except:
            print(f"Error procesando: {row}")  # Para depuración

print(f"Cantidad de categorías únicas en 'Ambience': {len(ambience_list)}")
print(f"Categorías encontradas: {ambience_list}")

Cantidad de categorías únicas en 'Ambience': 9
Categorías encontradas: {'touristy', 'intimate', 'trendy', 'divey', 'casual', 'romantic', 'classy', 'hipster', 'upscale'}


In [43]:
# Convertir strings a diccionarios si es necesario
business["Ambience"] = business["Ambience"].dropna().apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Crear columnas dummies
for category in ambience_list:
    business[f"Ambience_{category}"] = business["Ambience"].apply(lambda x: 1 if isinstance(x, dict) and x.get(category, False) else 0)

# Eliminar la columna original
business.drop(columns=["Ambience"], inplace=True)

Analizamos `GoodForKids`

In [44]:
business['GoodForKids'].value_counts()

GoodForKids
True     6113
False     718
None        2
Name: count, dtype: int64

In [45]:
business['GoodForKids'] = business['GoodForKids'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analizamos `BusinessAcceptsBitcoin`


In [46]:
business['BusinessAcceptsBitcoin'].value_counts()

BusinessAcceptsBitcoin
False    1376
True       34
Name: count, dtype: int64

In [47]:
business['BusinessAcceptsBitcoin'] = business['BusinessAcceptsBitcoin'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analizamos `BusinessParking`


In [48]:
business['BusinessParking'].value_counts()

BusinessParking
{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}        4292
{'garage': False, 'street': False, 'validated': False, 'lot': False, 'valet': False}       2205
{'garage': False, 'street': True, 'validated': False, 'lot': False, 'valet': False}         527
{'garage': False, 'street': True, 'validated': False, 'lot': True, 'valet': False}          392
None                                                                                        162
                                                                                           ... 
{u'valet': False, u'garage': True, u'street': True, u'lot': None, u'validated': False}        1
{'garage': True, 'street': True, 'validated': None, 'lot': None, 'valet': False}              1
{u'valet': False, u'garage': True, u'street': False, u'lot': None, u'validated': False}       1
{}                                                                                            1
{u'valet': False, u'gara

In [49]:
# Extraer y listar las categorías únicas
businessparking_list = set()

for row in business["BusinessParking"].dropna():
    if isinstance(row, dict):  # Si ya es un diccionario
        businessparking_list.update(row.keys())
    elif isinstance(row, str):  # Si es un string que representa un diccionario
        try:
            categories = ast.literal_eval(row)  # Convertir string a dict
            if isinstance(categories, dict):
                businessparking_list.update(categories.keys())  # Agregar claves únicas
        except:
            print(f"Error procesando: {row}")  # Para depuración

print(f"Cantidad de categorías únicas en 'BusinessParking': {len(businessparking_list)}")
print(f"Categorías encontradas: {businessparking_list}")

Cantidad de categorías únicas en 'BusinessParking': 5
Categorías encontradas: {'validated', 'garage', 'street', 'valet', 'lot'}


In [50]:
# Convertir strings a diccionarios si es necesario
business["BusinessParking"] = business["BusinessParking"].dropna().apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Crear columnas dummies
for category in businessparking_list:
    business[f"BusinessParking_{category}"] = business["BusinessParking"].apply(lambda x: 1 if isinstance(x, dict) and x.get(category, False) else 0)

# Eliminar la columna original
business.drop(columns=["BusinessParking"], inplace=True)

Analizamos `BikeParking`

In [51]:
business['BikeParking'].value_counts()

BikeParking
True     5851
False    1766
None        5
Name: count, dtype: int64

In [52]:
business['BikeParking'] = business['BikeParking'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analizamos `RestaurantsTableService`

In [53]:
business['RestaurantsTableService'].value_counts()

RestaurantsTableService
True     2423
False    1247
None        2
Name: count, dtype: int64

In [54]:
business['RestaurantsTableService'] = business['RestaurantsTableService'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analizamos `Alcohol`

In [55]:
business['Alcohol'].value_counts()

Alcohol
u'none'             2119
u'full_bar'         1656
u'beer_and_wine'    1334
'none'               890
'beer_and_wine'      377
'full_bar'           362
None                   4
Name: count, dtype: int64

In [56]:
business["Alcohol"] = business["Alcohol"].astype(str).str.replace("u'", "").str.replace("'", "").str.strip()
print(business["Alcohol"].unique())  # Para verificar la limpieza

['none' 'nan' 'full_bar' 'beer_and_wine' 'None']


In [57]:
# Crear la columna `alcoholic_beverages` que coincide con la tabla de Google
business["alcoholic_beverages"] = business["Alcohol"].apply(lambda x: 1 if x in ["full_bar", "beer_and_wine"] else 0)

# Eliminar la columna original
business.drop(columns=["Alcohol"], inplace=True)

Analizamos `RestaurantsReservations`

In [58]:
business['RestaurantsReservations'].value_counts()

RestaurantsReservations
False    4918
True     2245
None       51
Name: count, dtype: int64

In [59]:
business['RestaurantsReservations'] = business['RestaurantsReservations'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analizamos `GoodForMeal`

In [60]:
business['GoodForMeal'].value_counts()

GoodForMeal
{'dessert': False, 'latenight': False, 'lunch': False, 'dinner': False, 'brunch': False, 'breakfast': False}     1688
{'dessert': False, 'latenight': False, 'lunch': True, 'dinner': True, 'brunch': False, 'breakfast': False}        628
{'dessert': False, 'latenight': False, 'lunch': True, 'dinner': False, 'brunch': False, 'breakfast': False}       378
{'dessert': False, 'latenight': False, 'lunch': False, 'dinner': True, 'brunch': False, 'breakfast': False}       161
{'dessert': None, 'latenight': False, 'lunch': True, 'dinner': True, 'brunch': False, 'breakfast': False}         137
                                                                                                                 ... 
{'dessert': False, 'latenight': None, 'lunch': None, 'dinner': None, 'brunch': False, 'breakfast': False}           1
{'dessert': None, 'latenight': None, 'lunch': True, 'dinner': False, 'brunch': None, 'breakfast': True}             1
{u'breakfast': True, u'brunch': None, u'lunc

In [61]:
# Extraer y listar las categorías únicas
goodformeal_list = set()

for row in business["GoodForMeal"].dropna():
    if isinstance(row, dict):  # Si ya es un diccionario
        goodformeal_list.update(row.keys())
    elif isinstance(row, str):  # Si es un string que representa un diccionario
        try:
            categories = ast.literal_eval(row)  # Convertir string a dict
            if isinstance(categories, dict):
                goodformeal_list.update(categories.keys())  # Agregar claves únicas
        except:
            print(f"Error procesando: {row}")  # Para depuración

print(f"Cantidad de categorías únicas en 'GoodForMeal': {len(goodformeal_list)}")
print(f"Categorías encontradas: {goodformeal_list}")

Cantidad de categorías únicas en 'GoodForMeal': 6
Categorías encontradas: {'dessert', 'brunch', 'lunch', 'breakfast', 'latenight', 'dinner'}


In [62]:
# Convertir strings a diccionarios si es necesario
business["GoodForMeal"] = business["GoodForMeal"].dropna().apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Crear columnas dummies
for category in goodformeal_list:
    business[f"GoodForMeal_{category}"] = business["GoodForMeal"].apply(lambda x: 1 if isinstance(x, dict) and x.get(category, False) else 0)

# Eliminar la columna original
business.drop(columns=["GoodForMeal"], inplace=True)

Analizamos `HasTV`

In [63]:
business['HasTV'].value_counts()

HasTV
True     5719
False    1473
None        5
Name: count, dtype: int64

In [64]:
business['HasTV'] = business['HasTV'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analizamos `WheelchairAccessible`


In [65]:
business['WheelchairAccessible'].value_counts()

WheelchairAccessible
True     2853
False     173
None        2
Name: count, dtype: int64

In [66]:
business['WheelchairAccessible'] = business['WheelchairAccessible'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analizamos `BestNights`


In [67]:
business['BestNights'].value_counts()

BestNights
{'monday': False, 'tuesday': False, 'friday': False, 'wednesday': False, 'thursday': False, 'sunday': False, 'saturday': False}        153
{'monday': False, 'tuesday': False, 'friday': True, 'wednesday': False, 'thursday': True, 'sunday': False, 'saturday': True}            90
{'monday': False, 'tuesday': False, 'friday': True, 'wednesday': False, 'thursday': False, 'sunday': True, 'saturday': True}            58
{'monday': False, 'tuesday': False, 'friday': True, 'wednesday': False, 'thursday': False, 'sunday': False, 'saturday': True}           57
{'monday': False, 'tuesday': False, 'friday': True, 'wednesday': False, 'thursday': False, 'sunday': False, 'saturday': False}          46
                                                                                                                                      ... 
{u'monday': False, u'tuesday': True, u'wednesday': False, u'thursday': False, u'friday': True, u'saturday': True, u'sunday': False}      1
{u'monday': True

In [68]:
# Extraer y listar las categorías únicas
bestnights_list = set()

for row in business["BestNights"].dropna():
    if isinstance(row, dict):  # Si ya es un diccionario
        bestnights_list.update(row.keys())
    elif isinstance(row, str):  # Si es un string que representa un diccionario
        try:
            categories = ast.literal_eval(row)  # Convertir string a dict
            if isinstance(categories, dict):
                bestnights_list.update(categories.keys())  # Agregar claves únicas
        except:
            print(f"Error procesando: {row}")  # Para depuración

print(f"Cantidad de categorías únicas en 'BestNights': {len(bestnights_list)}")
print(f"Categorías encontradas: {bestnights_list}")

Cantidad de categorías únicas en 'BestNights': 7
Categorías encontradas: {'monday', 'tuesday', 'saturday', 'sunday', 'friday', 'thursday', 'wednesday'}


In [69]:
# Convertir strings a diccionarios si es necesario
business["BestNights"] = business["BestNights"].dropna().apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Crear columnas dummies
for category in bestnights_list:
    business[f"BestNights_{category}"] = business["BestNights"].apply(lambda x: 1 if isinstance(x, dict) and x.get(category, False) else 0)

# Eliminar la columna original
business.drop(columns=["BestNights"], inplace=True)

Analizamos `ByAppointmentOnly`


In [70]:
business['ByAppointmentOnly'].value_counts()

ByAppointmentOnly
False    826
True      42
None       1
Name: count, dtype: int64

In [71]:
business['ByAppointmentOnly'] = business['ByAppointmentOnly'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analizamos `GoodForDancing`


In [72]:
business['GoodForDancing'].value_counts()

GoodForDancing
False    561
True     119
Name: count, dtype: int64

In [73]:
business['GoodForDancing'] = business['GoodForDancing'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analizamos `HappyHour`


In [74]:
business['HappyHour'].value_counts()

HappyHour
True     1623
False     833
Name: count, dtype: int64

In [75]:
business['HappyHour'] = business['HappyHour'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analizamos `Smoking`


In [76]:
business['Smoking'].value_counts()

Smoking
u'outdoor'    375
u'no'         220
u'yes'         36
'no'            4
'outdoor'       1
Name: count, dtype: int64

In [77]:
business["Smoking"] = business["Smoking"].astype(str).str.replace("u'", "").str.replace("'", "").str.strip()
print(business["Smoking"].unique())  # Para verificar la limpieza

['nan' 'outdoor' 'no' 'yes']


In [78]:
business["smoking_allowed"] = business["Smoking"].apply(lambda x: 1 if x in ["yes", "outdoor"] else 0)

# Eliminar la columna original
business.drop(columns=["Smoking"], inplace=True)

Analizamos `CoatCheck`

In [79]:
business['CoatCheck'].value_counts()

CoatCheck
False    833
True      12
Name: count, dtype: int64

In [80]:
business['CoatCheck'] = business['CoatCheck'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analizamos `Music`


In [81]:
business['Music'].value_counts()

Music
{'dj': False, 'background_music': False, 'no_music': False, 'jukebox': False, 'live': False, 'video': False, 'karaoke': False}        346
{'dj': False, 'background_music': False, 'no_music': False, 'jukebox': False, 'live': True, 'video': False, 'karaoke': False}         262
{'dj': False}                                                                                                                          44
{'dj': False, 'background_music': False, 'no_music': False, 'jukebox': False, 'live': False, 'video': False, 'karaoke': None}          32
{u'dj': None, u'live': False, u'jukebox': None, u'video': False, u'background_music': False, u'karaoke': None, u'no_music': False}     30
                                                                                                                                     ... 
{'dj': True, 'background_music': False, 'karaoke': False, 'live': False, 'video': True, 'jukebox': False}                               1
{'dj': True, 'background_mus

In [82]:
# Extraer y listar las categorías únicas
music_list = set()

for row in business["Music"].dropna():
    if isinstance(row, dict):  # Si ya es un diccionario
        music_list.update(row.keys())
    elif isinstance(row, str):  # Si es un string que representa un diccionario
        try:
            categories = ast.literal_eval(row)  # Convertir string a dict
            if isinstance(categories, dict):
                music_list.update(categories.keys())  # Agregar claves únicas
        except:
            print(f"Error procesando: {row}")  # Para depuración

print(f"Cantidad de categorías únicas en 'Music': {len(music_list)}")
print(f"Categorías encontradas: {music_list}")

Cantidad de categorías únicas en 'Music': 7
Categorías encontradas: {'video', 'live', 'dj', 'karaoke', 'no_music', 'background_music', 'jukebox'}


In [83]:
# Convertir strings a diccionarios si es necesario
business["Music"] = business["Music"].dropna().apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Crear columnas dummies
for category in music_list:
    business[f"Music_{category}"] = business["Music"].apply(lambda x: 1 if isinstance(x, dict) and x.get(category, False) else 0)

# Eliminar la columna original
business.drop(columns=["Music"], inplace=True)

Analizamos `BYOB`


In [84]:
business['BYOB'].value_counts()

BYOB
False    650
True     136
Name: count, dtype: int64

In [85]:
business['BYOB'] = business['BYOB'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analizamos `DogsAllowed`


In [86]:
business['DogsAllowed'].value_counts()

DogsAllowed
False    1586
True      863
None        3
Name: count, dtype: int64

In [87]:
business['DogsAllowed'] = business['DogsAllowed'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analizamos `DriveThrut`

In [88]:
business['DriveThru'].value_counts()

DriveThru
True     825
False    431
None     174
Name: count, dtype: int64

In [89]:
business['DriveThru'] = business['DriveThru'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analizamos `Corkage`

In [90]:
business['Corkage'].value_counts()

Corkage
False    374
True     207
None       1
Name: count, dtype: int64

In [91]:
business['Corkage'] = business['Corkage'].astype(str).map({'True': 1, 'False': 0, 'None': 0}).fillna(0).astype(int)

Analizamos `BYOBCorkage`

In [92]:
business['BYOBCorkage'].value_counts()

BYOBCorkage
'no'             40
'yes_free'       34
'yes_corkage'     7
u'yes_free'       1
Name: count, dtype: int64

> Esta columna tiene muy pocos datos no nulos, de los cuales la mitad son positivos, por lo que se elimina

Analizamos `AcceptsInsurance`


In [93]:
business['AcceptsInsurance'].value_counts()

AcceptsInsurance
False    10
True      2
Name: count, dtype: int64

> Idem columna `BYOBCorkage`

Analizamos `AgesAllowed`


In [94]:
business['AgesAllowed'].value_counts()

AgesAllowed
u'allages'    12
u'21plus'      1
Name: count, dtype: int64

> Idem columna `BYOBCorkage`

Eliminamos `HairSpecializesIn`, `Open24Hours`, `DietaryRestrictions`

In [95]:
business = business.drop(columns = ['HairSpecializesIn', 
                                    'Open24Hours', 
                                    'DietaryRestrictions', 
                                    'BYOBCorkage', 
                                    'AcceptsInsurance',
                                    'AgesAllowed'])

In [96]:
business.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10951 entries, 14 to 150297
Data columns (total 79 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   business_id                 10951 non-null  object
 1   name                        10951 non-null  object
 2   address                     10951 non-null  object
 3   city                        10951 non-null  object
 4   state                       10951 non-null  object
 5   postal_code                 10951 non-null  object
 6   latitude                    10951 non-null  object
 7   longitude                   10951 non-null  object
 8   stars                       10951 non-null  object
 9   review_count                10951 non-null  object
 10  is_open                     10951 non-null  object
 11  categories                  10951 non-null  object
 12  hours                       9593 non-null   object
 13  OutdoorSeating              10951 non-null  int64

Se eliminan las columnas `hours` y `state`, ya que `hours` no se usa y `state` porque filtramos solo por el estado de Florida (FL)

In [97]:
business = business.drop(columns = ['hours', 'state'])

In [98]:
business.to_csv('business_fl.csv')