# **ETL `metadata_gm`**

In [1]:
import os
import pandas as pd # type: ignore
from collections import Counter
import itertools
import json
import ast
import numpy as np

In [2]:
# Definir la ruta de la carpeta
folder_path = os.path.join("Google Maps", "metadata-sitios")

# Lista para almacenar los DataFrames de cada archivo
df_list = []

# Iterar sobre los archivos del 1 al 11
for i in range(1, 12):  # del 1 al 11
    file_path = os.path.join(folder_path, f"{i}.json")
    
    if os.path.exists(file_path):  # Verificar si el archivo existe
        try:
            df_temp = pd.read_json(file_path, lines=True)  # Cargar el JSON línea por línea
            df_list.append(df_temp)  # Agregar al listado de DataFrames
        except ValueError as e:
            print(f"Error al procesar {file_path}: {e}")

# Concatenar todos los DataFrames en uno solo
metadatos_gm = pd.concat(df_list, ignore_index=True)

In [3]:
metadata = metadatos_gm.copy()

In [4]:
# Asegurar que la columna 'category' contiene listas (si está en formato string, convertirla)
metadata['category'] = metadata['category'].apply(lambda x: eval(x) if isinstance(x, str) else x)

# Aplanar la lista de categorías y contar frecuencia
category_counts = Counter(itertools.chain.from_iterable(metadata['category'].dropna()))

# Convertir a DataFrame
df_categories = pd.DataFrame(category_counts.items(), columns=['Category', 'Frequency'])

# Filtrar categorías que contengan "restaurant" o "food" (sin importar mayúsculas/minúsculas)
filtered_categories = df_categories[
    df_categories["Category"].str.contains(r"restaurant|food|\bbar\b", case=False, na=False)
]

In [5]:
# Convertir las categorías permitidas en un conjunto en minúsculas
allowed_categories = set(filtered_categories["Category"].str.lower().str.strip())

# Separar la columna 'category' en filas individuales (una categoría por fila)
metadata_exploded = metadata.explode("category")

# Convertir a minúsculas y quitar espacios extra para evitar problemas
metadata_exploded["category"] = metadata_exploded["category"].str.lower().str.strip()

# Filtrar solo las filas donde la categoría está en allowed_categories
metadata_filtered = metadata_exploded[metadata_exploded["category"].isin(allowed_categories)]

# Volver a agrupar por gmap_id para restaurar la estructura original
metadata_filtered = metadata_filtered.groupby("gmap_id").first().reset_index()

In [6]:
metadata_filtered.info()

metadata = metadata_filtered.copy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264449 entries, 0 to 264448
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   gmap_id           264449 non-null  object 
 1   name              264449 non-null  object 
 2   address           262834 non-null  object 
 3   description       81601 non-null   object 
 4   latitude          264449 non-null  float64
 5   longitude         264449 non-null  float64
 6   category          264449 non-null  object 
 7   avg_rating        264449 non-null  float64
 8   num_of_reviews    264449 non-null  int64  
 9   price             118326 non-null  object 
 10  hours             234272 non-null  object 
 11  MISC              259331 non-null  object 
 12  state             235441 non-null  object 
 13  relative_results  217914 non-null  object 
 14  url               264449 non-null  object 
dtypes: float64(3), int64(1), object(11)
memory usage: 30.3+ MB


In [7]:
# Diccionario de estados de EE.UU.
state_abbreviations = {
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR", "California": "CA",
    "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE", "Florida": "FL", "Georgia": "GA",
    "Hawaii": "HI", "Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA",
    "Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD",
    "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS", "Missouri": "MO",
    "Montana": "MT", "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ",
    "New Mexico": "NM", "New York": "NY", "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH",
    "Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI", "South Carolina": "SC",
    "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT", "Vermont": "VT",
    "Virginia": "VA", "Washington": "WA", "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY"
}

# Función para separar la columna address
def split_address(df):
    # Eliminar direcciones nulas
    df = df.dropna(subset=['address']).copy()
    
    # Contar la cantidad de comas en cada dirección
    comma_counts = df['address'].str.count(',')
    
    # Casos con exactamente 3 comas (estructura correcta de 4 secciones)
    valid_rows = comma_counts == 3
    split_data = df.loc[valid_rows, 'address'].str.split(',', expand=True)
    df.loc[valid_rows, 'street_address'] = split_data[1].str.strip()
    df.loc[valid_rows, 'city'] = split_data[2].str.strip()
    state_cp = split_data[3].str.strip().str.split(' ', n=1, expand=True)
    df.loc[valid_rows, 'states'] = state_cp[0].copy()
    df.loc[valid_rows, 'CP'] = state_cp[1].copy()
    
    # Casos con más de 3 comas (nombre con comas)
    complex_rows = comma_counts > 3
    split_data = df.loc[complex_rows, 'address'].str.rsplit(',', n=3, expand=True)
    df.loc[complex_rows, 'street_address'] = split_data[1].str.strip()
    df.loc[complex_rows, 'city'] = split_data[2].str.strip()
    state_cp = split_data[3].str.strip().str.split(' ', n=1, expand=True)
    df.loc[complex_rows, 'states'] = state_cp[0].copy()
    df.loc[complex_rows, 'CP'] = state_cp[1].copy()
    
    # Casos con exactamente 2 comas (sin dirección, pero con ciudad y estado/CP)
    partial_rows = comma_counts == 2
    split_data = df.loc[partial_rows, 'address'].str.split(',', expand=True)
    df.loc[partial_rows, 'city'] = split_data[1].str.strip()
    state_cp = split_data[2].str.strip().str.split(' ', n=1, expand=True)
    df.loc[partial_rows, 'states'] = state_cp[0].copy()
    df.loc[partial_rows, 'CP'] = state_cp[1].copy()
    
    # Casos con solo 1 coma (nombre + estado completo con o sin CP)
    state_only_rows = comma_counts == 1
    split_data = df.loc[state_only_rows, 'address'].str.split(',', expand=True)
    df.loc[state_only_rows, 'states'] = split_data[1].str.strip().str.extract(f"({'|'.join(state_abbreviations.keys())})")[0]
    df.loc[state_only_rows, 'states'] = df.loc[state_only_rows, 'states'].map(state_abbreviations)
    df.loc[state_only_rows, 'CP'] = split_data[1].str.extract(r'(\d{5})')[0]
    
    # Eliminar registros con solo una sección en 'address', contienen muchos nulos en sus campos
    df = df[comma_counts > 0]
    
    # Identificar registros que no encajan en ninguna de las estructuras tratadas
    unprocessed_rows = ~valid_rows & ~complex_rows & ~partial_rows & ~state_only_rows
    unprocessed_data = df.loc[unprocessed_rows, ['name', 'address']]
    
    return df, unprocessed_data

# Aplicar la función
metadata, unprocessed_metadata = split_address(metadata)

In [8]:
# Filtrar registros donde el estado es Florida
metadata = metadata[metadata['states'] == 'FL']

In [9]:
metadata_fl = metadata.copy()

metadata_fl.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15794 entries, 1 to 263843
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gmap_id           15794 non-null  object 
 1   name              15794 non-null  object 
 2   address           15794 non-null  object 
 3   description       4521 non-null   object 
 4   latitude          15794 non-null  float64
 5   longitude         15794 non-null  float64
 6   category          15794 non-null  object 
 7   avg_rating        15794 non-null  float64
 8   num_of_reviews    15794 non-null  int64  
 9   price             5881 non-null   object 
 10  hours             13768 non-null  object 
 11  MISC              15507 non-null  object 
 12  state             13818 non-null  object 
 13  relative_results  12078 non-null  object 
 14  url               15794 non-null  object 
 15  street_address    15578 non-null  object 
 16  city              15793 non-null  object 
 1

A partir de la información de `state`, analizamos si el negocio se encuentra abierto o no, y creamos una la columna `is_open`

In [10]:
metadata_fl = metadata_fl.copy()  # Evitar problemas de vistas sobre el DataFrame
metadata_fl['is_open'] = metadata_fl['state'].apply(lambda x: 0 if x == 'Permanently closed' else 1)

Se eliminan todas las columnas que no se van a usar en el análisis y las que ya fueron analizadas.

In [11]:
metadata_fl = metadata_fl.drop(columns = ['address', 
                                          'description', 
                                          'url', 
                                          'relative_results',
                                          'state',
                                          'hours',
                                          'price'])

Renombrar columnas para que coincidan con la tabla de `yelp`

In [12]:
metadata_fl = metadata_fl.rename(columns={'avg_rating': 'stars', 
                                          'num_of_reviews': 'review_count', 
                                          'states': 'state', 
                                          'CP' : 'postal_code',
                                          'category': 'categories'})

Transformación de la columna MISC

In [13]:
# Reemplazar valores nulos en MISC con un diccionario vacío
metadata_fl['MISC'] = metadata_fl['MISC'].fillna('{}')

# Convertir la columna MISC a diccionarios reales si es un string en formato JSON
metadata_fl['MISC'] = metadata_fl['MISC'].apply(lambda x: json.loads(x) if isinstance(x, str) else x)

# Expandir los diccionarios en columnas
misc_expanded = metadata_fl['MISC'].apply(pd.Series)

# Unir las nuevas columnas al DataFrame original
metadata_fl = pd.concat([metadata_fl, misc_expanded], axis=1)

# Eliminar la columna original MISC
metadata_fl = metadata_fl.drop(columns=['MISC'])

La columna `Health & safety` describe las medidas de precaucion y higiene que se tomaron durante la pandemia, por lo que no es información que utilicemos en este proyecto

In [14]:
# eliminar columna  'Health & safety'
metadata_fl = metadata_fl.drop(columns = 'Health & safety')

Análisis y expansión de `Service options`

In [15]:
# Extraer todas las categorías únicas dentro de "Service options"
unique_service_options = set()

for value in metadata_fl["Service options"].dropna():  # Ignorar valores nulos
    if isinstance(value, list):  # Confirmar que es una lista
        clean_values = [x.strip().lower().replace("-", "").replace(" ", "_") for x in value]
        unique_service_options.update(clean_values)  # Añadir al conjunto (elimina duplicados)

# Crear DataFrame con las opciones en formato binario
service_options_df = pd.DataFrame()

for option in unique_service_options:
    service_options_df[option] = metadata_fl["Service options"].apply(
        lambda x: 1 if isinstance(x, list) and option in [i.strip().lower().replace("-", "").replace(" ", "_") for i in x] else 0
    )

# Sumar la cantidad de negocios que tienen cada opción activada
option_counts = service_options_df.sum().sort_values(ascending=False)

In [16]:
# Fusionar categorías en 'delivery'
metadata_fl["delivery"] = metadata_fl["Service options"].apply(
    lambda x: 1 if isinstance(x, list) and any(opt in x for opt in ["Delivery", "Same-day delivery", "No-contact delivery"]) else 0
)

# Fusionar categorías en 'takeout'
metadata_fl["takeout"] = metadata_fl["Service options"].apply(
    lambda x: 1 if isinstance(x, list) and any(opt in x for opt in ["Takeout", "Curbside pickup"]) else 0
)

# Crear columnas individuales sin modificar
metadata_fl["dinein"] = metadata_fl["Service options"].apply(lambda x: 1 if isinstance(x, list) and "Dine-in" in x else 0)
metadata_fl["outdoor_seating"] = metadata_fl["Service options"].apply(lambda x: 1 if isinstance(x, list) and "Outdoor seating" in x else 0)
metadata_fl["drivethrough"] = metadata_fl["Service options"].apply(lambda x: 1 if isinstance(x, list) and "Drive-through" in x else 0)

# Eliminar la columna original "Service options"
metadata_fl = metadata_fl.drop(columns=["Service options"])

Analizamos y expandimos la columna `Popular for`

In [17]:
# Extraer todas las categorías únicas dentro de "Popular for"
unique_popular_for = set()

for value in metadata_fl["Popular for"].dropna():  # Ignorar valores nulos
    if isinstance(value, list):  # Confirmar que es una lista
        clean_values = [x.strip().lower().replace("-", "").replace(" ", "_") for x in value]
        unique_popular_for.update(clean_values)  # Añadir al conjunto (elimina duplicados)

# Crear DataFrame con las opciones en formato binario
popular_for_df = pd.DataFrame()

for option in unique_popular_for:
    popular_for_df[option] = metadata_fl["Popular for"].apply(
        lambda x: 1 if isinstance(x, list) and option in [i.strip().lower().replace("-", "").replace(" ", "_") for i in x] else 0
    )

# Sumar la cantidad de negocios que tienen cada opción activada
option_counts = popular_for_df.sum().sort_values(ascending=False)

In [18]:
print(unique_popular_for)

{'lunch', 'solo_dining', 'dinner', 'good_for_working_on_laptop', 'breakfast'}


In [19]:
# Función para limpiar los valores en "Popular for"
def clean_popular_for(value):
    if isinstance(value, str):
        try:
            value = ast.literal_eval(value)  # Convertir string a lista
        except (ValueError, SyntaxError):
            return ""  # Si falla la conversión, devolver vacío
    
    if isinstance(value, list):
        clean_values = [x.strip().lower().replace("-", "").replace(" ", "_") for x in value]
        return ", ".join(sorted(set(clean_values)))  # Eliminar duplicados y unir
    
    return ""

# Aplicar limpieza
metadata_fl["Popular for"] = metadata_fl["Popular for"].apply(clean_popular_for)

# Expansión en variables dummies
popular_for_dummies = metadata_fl["Popular for"].str.get_dummies(sep=", ")

# Renombrar columnas para mayor claridad
popular_for_dummies.columns = [f"Popular_for_{col}" for col in popular_for_dummies.columns]

# Unir al dataframe original y eliminar la columna original
metadata_fl = pd.concat([metadata_fl, popular_for_dummies], axis=1).drop(columns=["Popular for"])

Analizamos y expandimos la columna `Accessibility`

In [20]:
# Función para extraer todas las categorías únicas en "Accessibility"
def extract_unique_accessibility(series):
    unique_categories = set()
    
    for value in series.dropna():  # Ignorar valores nulos
        try:
            value = ast.literal_eval(value) if isinstance(value, str) else value  # Convertir string a lista
            if isinstance(value, list):  # Si es una lista, agregar los valores
                clean_values = [x.strip().lower().replace("-", "").replace(" ", "_") for x in value]
                unique_categories.update(clean_values)  # Añadir al conjunto (elimina duplicados)
        except (ValueError, SyntaxError):
            pass  # Ignorar errores de conversión
    
    return unique_categories

# Obtener las categorías únicas
unique_accessibility = extract_unique_accessibility(metadata_fl["Accessibility"])


In [21]:
# Función para transformar 'Accessibility' en 'wheelchair_friendly'
def transform_wheelchair_friendly(df, column):
    wheelchair_categories = {
        'wheelchair_rental',
        'wheelchair_accessible_restroom',
        'wheelchair_accessible_elevator',
        'wheelchair_accessible_entrance',
        'wheelchair_accessible_parking_lot',
        'wheelchair_accessible_seating'
    }
    
    def normalize(text):
        return text.strip().lower().replace(" ", "_").replace("-", "_")
    
    def check_wheelchair_friendly(value):
        if isinstance(value, float) and np.isnan(value):
            return 0
        if isinstance(value, list):
            normalized_values = {normalize(item) for item in value}
            return 1 if wheelchair_categories & normalized_values else 0
        return 0
    
    df['wheelchair_friendly'] = df[column].apply(check_wheelchair_friendly)
    df.drop(columns=[column], inplace=True)
    return df

# Aplicar las transformaciones
metadata_fl = transform_wheelchair_friendly(metadata_fl, 'Accessibility')

Analisis y expansión de `Offerings`

In [22]:
# Función para extraer todas las categorías únicas en "Offerings"
def extract_unique_offerings(series):
    unique_categories = set()
    
    for value in series.dropna():  # Ignorar valores nulos
        try:
            value = ast.literal_eval(value) if isinstance(value, str) else value  # Convertir string a lista
            if isinstance(value, list):  # Si es una lista, agregar los valores
                clean_values = [x.strip().lower().replace("-", "").replace(" ", "_") for x in value]
                unique_categories.update(clean_values)  # Añadir al conjunto (elimina duplicados)
        except (ValueError, SyntaxError):
            pass  # Ignorar errores de conversión
    
    return unique_categories

# Obtener las categorías únicas
unique_offerings = extract_unique_offerings(metadata_fl["Offerings"])

In [23]:
# Función para transformar 'Offerings'
def transform_offerings(df, column):
    remove_categories = {
        'buys_used_goods', 'car_wash', 'full_service_gas', 'ethanolfree_gas', 'check_cashing', 'oil_change', 
        'food', 'repair_services', 'service_guarantee', 'assembly_service', 'prepared_foods', 'food_at_bar', 
        'small_plates', 'latenight_food'
    }
    alcohol_beverage = {'hard_liquor', 'alcohol', 'beer', 'cocktails', 'wine', 'happy_hour_drinks'}
    healthy_food = {'salad_bar', 'organic_dishes', 'healthy_options', 'halal_food', 'vegetarian_options', 'organic_products'}
    kids = {"kids'_menu", "kids'_toys"}
    fast_comfort_food = {'comfort_food', 'happy_hour_food', 'quick_bite'}
    separate_categories = {'coffee', 'dancing', 'all_you_can_eat', 'braille_menu'}
    
    def normalize(text):
        return text.strip().lower().replace(" ", "_").replace("-", "_")
    
    def categorize_offerings(value):
        if isinstance(value, float) and np.isnan(value):
            return set()
        try:
            if isinstance(value, str):
                value = ast.literal_eval(value)
            if isinstance(value, (list, np.ndarray, pd.Series)):
                normalized_values = {normalize(item) for item in value}  # Normalizar antes de comparar
                return normalized_values - remove_categories
        except (ValueError, SyntaxError):
            return set()
        return set()
    
    df[column] = df[column].apply(categorize_offerings)
    
    df['alcohol_beverage'] = df[column].apply(lambda x: 1 if alcohol_beverage & x else 0)
    df['healthy_food'] = df[column].apply(lambda x: 1 if healthy_food & x else 0)
    df['kids'] = df[column].apply(lambda x: 1 if kids & x else 0)
    df['fast_comfort_food'] = df[column].apply(lambda x: 1 if fast_comfort_food & x else 0)
    
    for category in separate_categories:
        df[category] = df[column].apply(lambda x: 1 if category in x else 0)
    
    df.drop(columns=[column], inplace=True)
    return df

# Aplicar las transformaciones
metadata_fl = transform_offerings(metadata_fl, 'Offerings')

Analizamos y expandimos `Dining options`

In [24]:
# Crear un conjunto vacío para almacenar las categorías únicas
dining_options_set = set()

# Recorrer la columna y extraer las categorías
metadata_fl["Dining options"].dropna().apply(lambda x: dining_options_set.update(x) if isinstance(x, list) else dining_options_set.add(x))

1         None
9         None
17        None
18        None
19        None
          ... 
191048    None
191051    None
204241    None
204271    None
245363    None
Name: Dining options, Length: 7617, dtype: object

In [25]:
# Función para expandir 'Dining options' en columnas binarias
def expand_dining_options(df, column):
    unique_categories = {
        'Catering', 'Outside food allowed', 'Breakfast', 'Seating', 'Pay ahead', 
        'Lunch', 'Dinner', 'Dessert', 'Counter service'
    }
    
    def parse_dining_options(value):
        if isinstance(value, float) and pd.isna(value):
            return set()
        try:
            if isinstance(value, str):
                value = ast.literal_eval(value)
            if isinstance(value, list):
                return set(value)
        except (ValueError, SyntaxError):
            return set()
        return set()
    
    df[column] = df[column].apply(parse_dining_options)
    
    for category in unique_categories:
        df[category] = df[column].apply(lambda x: 1 if category in x else 0)
    
    df.drop(columns=[column], inplace=True)
    return df

# Función para fusionar 'Popular for' y 'Dining options'
def merge_popular_dining(df):
    # Crear nuevas columnas fusionadas
    df['breakfast'] = df[['Popular_for_breakfast', 'Breakfast']].max(axis=1)
    df['lunch'] = df[['Popular_for_lunch', 'Lunch']].max(axis=1)
    df['dinner'] = df[['Popular_for_dinner', 'Dinner']].max(axis=1)
    df['dessert'] = df['Dessert']  # Mantener Dessert tal cual
    
    # Mantener sin cambios las otras categorías
    keep_columns = [
        'Popular_for_solo_dining', 'Popular_for_good_for_working_on_laptop',
        'Seating', 'Pay ahead', 'Outside food allowed', 'Catering', 'Counter service'
    ]
    
    # Eliminar columnas originales
    df.drop(columns=['Popular_for_breakfast', 'Popular_for_lunch', 'Popular_for_dinner', 'Breakfast', 'Lunch', 'Dinner'], inplace=True)
    
    return df

# Aplicar la expansión y fusión
metadata_fl = expand_dining_options(metadata_fl, 'Dining options')
metadata_fl = merge_popular_dining(metadata_fl)

Análisis y expansión de `Atmosphere`

In [26]:
# Expansión de la columna "Atmosphere"
def expand_atmosphere(df, column):
    df[column] = df[column].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    atmosphere_list = set()
    df[column].dropna().apply(lambda x: atmosphere_list.update(x))

    for category in atmosphere_list:
        df[category.lower().replace(" ", "_")] = df[column].apply(lambda x: 1 if isinstance(x, list) and category in x else 0)

# Aplicar la transformación a "Atmosphere"
expand_atmosphere(metadata_fl, "Atmosphere")

In [27]:
# Función para transformar 'Atmosphere' en categorías agrupadas
def transform_atmosphere(df, column):
    casual_categories = {'Casual', 'Cozy', 'Quiet'}
    formal_categories = {'Upscale', 'Historic'}
    trendy_categories = {'Trending'}
    romantic_categories = {'Romantic'}
    
    def parse_atmosphere(value):
        if isinstance(value, float) and pd.isna(value):
            return set()
        try:
            if isinstance(value, str):
                value = ast.literal_eval(value)
            if isinstance(value, list):
                return set(value)
        except (ValueError, SyntaxError):
            return set()
        return set()
    
    df[column] = df[column].apply(parse_atmosphere)
    
    df['casual'] = df[column].apply(lambda x: 1 if casual_categories & x else 0)
    df['formal'] = df[column].apply(lambda x: 1 if formal_categories & x else 0)
    df['trendy'] = df[column].apply(lambda x: 1 if trendy_categories & x else 0)
    df['romantic'] = df[column].apply(lambda x: 1 if romantic_categories & x else 0)
    
    df.drop(columns=[column], inplace=True)
    return df

# Aplicar la transformación
metadata_fl = transform_atmosphere(metadata_fl, 'Atmosphere')

Análisis y expansión de `Planning`

In [28]:
# Extraer y listar las categorías únicas
planning_list = set()

metadata_fl["Planning"].dropna().apply(lambda x: planning_list.update(ast.literal_eval(x) if isinstance(x, str) else x))

4         None
11519     None
26691     None
70232     None
105491    None
          ... 
191052    None
204241    None
213813    None
221977    None
263843    None
Name: Planning, Length: 3051, dtype: object

In [29]:
# Función para transformar 'Planning' en categorías agrupadas
def transform_planning(df, column):
    reservation_categories = {
        'Brunch reservations recommended', 'Appointments recommended', 'Accepts reservations',
        'Dinner reservations recommended', 'Lunch reservations recommended'
    }
    wait_categories = {'Usually a wait'}
    quick_visit_categories = {'Quick visit'}
    
    def parse_planning(value):
        if isinstance(value, float) and pd.isna(value):
            return set()
        try:
            if isinstance(value, str):
                value = ast.literal_eval(value)
            if isinstance(value, list):
                return set(value)
        except (ValueError, SyntaxError):
            return set()
        return set()
    
    df[column] = df[column].apply(parse_planning)
    
    df['with_reservation'] = df[column].apply(lambda x: 1 if reservation_categories & x else 0)
    df['usually_a_wait'] = df[column].apply(lambda x: 1 if wait_categories & x else 0)
    df['quick_visit'] = df[column].apply(lambda x: 1 if quick_visit_categories & x else 0)
    
    df.drop(columns=[column], inplace=True)
    return df

# Aplicar la transformación
metadata_fl = transform_planning(metadata_fl, 'Planning')

Analizamos y expandimos `From the business`

In [30]:
business_list = set()

# Iterar sobre las listas dentro de la columna
metadata_fl["From the business"].dropna().apply(lambda x: business_list.update(x))

9         None
171803    None
171898    None
171900    None
171944    None
          ... 
190647    None
190677    None
190966    None
190985    None
191013    None
Name: From the business, Length: 702, dtype: object

In [31]:
# Crear columnas binarias
metadata_fl["black_owned"] = metadata_fl["From the business"].apply(lambda x: 1 if isinstance(x, list) and 'Identifies as Black-owned' in x else 0)
metadata_fl["women_led"] = metadata_fl["From the business"].apply(lambda x: 1 if isinstance(x, list) and 'Identifies as women-led' in x else 0)
metadata_fl["veteran_led"] = metadata_fl["From the business"].apply(lambda x: 1 if isinstance(x, list) and 'Identifies as veteran-led' in x else 0)

# Eliminar la columna original
metadata_fl.drop(columns=["From the business"], inplace=True)


Analizamos y expandimos `Highlights`

In [32]:
# Función para extraer todas las categorías únicas en "Highlights"
def extract_unique_highlights(series):
    unique_categories = set()
    
    for value in series.dropna():  # Ignorar valores nulos
        try:
            value = ast.literal_eval(value) if isinstance(value, str) else value  # Convertir string a lista
            if isinstance(value, list):  # Si es una lista, agregar los valores
                clean_values = [x.strip().lower().replace("-", "").replace(" ", "_") for x in value]
                unique_categories.update(clean_values)  # Añadir al conjunto (elimina duplicados)
        except (ValueError, SyntaxError):
            pass  # Ignorar errores de conversión
    
    return unique_categories

# Obtener las categorías únicas
unique_highlights = extract_unique_highlights(metadata_fl["Highlights"])

In [33]:
# Función para transformar 'Highlights'
def transform_highlights(df, column):
    remove_categories = {
        'great_dessert', 'great_tea_selection', 'great_cocktails', 'great_beer_selection', 
        'great_bar_food', 'great_wine_list', 'great_produce', 'active_military_discounts', 
        'great_coffee', 'serves_local_specialty'
    }
    entertainment = {'play_area', 'trivia_night', 'bar_games'}
    live_entertainment = {'live_performances', 'karaoke', 'live_music'}
    lgbtq_friendly = {'transgender_safespace', 'lgbtq_friendly'}
    separate_categories = {'rooftop_seating', 'fast_service', 'sports', 'fireplace'}
    
    def normalize(text):
        return text.strip().lower().replace(" ", "_").replace("-", "_")
    
    def categorize_highlights(value):
        if isinstance(value, float) and np.isnan(value):
            return set()
        try:
            if isinstance(value, str):
                value = ast.literal_eval(value)
            if isinstance(value, (list, np.ndarray, pd.Series)):
                normalized_values = {normalize(item) for item in value}
                return normalized_values - remove_categories
        except (ValueError, SyntaxError):
            return set()
        return set()
    
    df[column] = df[column].apply(categorize_highlights)
    
    df['entertainment'] = df[column].apply(lambda x: 1 if entertainment & x else 0)
    df['live_entertainment'] = df[column].apply(lambda x: 1 if live_entertainment & x else 0)
    df['lgbtq_friendly'] = df[column].apply(lambda x: 1 if lgbtq_friendly & x else 0)
    
    for category in separate_categories:
        df[category] = df[column].apply(lambda x: 1 if category in x else 0)
    
    df.drop(columns=[column], inplace=True)
    return df

# Aplicar las transformaciones
metadata_fl = transform_highlights(metadata_fl, 'Highlights')

Analizamos y expandimos `Crowd`

In [34]:
# Función para extraer todas las categorías únicas en "Crowd"
def extract_unique_crowd(series):
    unique_categories = set()
    
    for value in series.dropna():  # Ignorar valores nulos
        try:
            value = ast.literal_eval(value) if isinstance(value, str) else value  # Convertir string a lista
            if isinstance(value, list):  # Si es una lista, agregar los valores
                clean_values = [x.strip().lower().replace("-", "_").replace(" ", "_") for x in value]
                unique_categories.update(clean_values)  # Añadir al conjunto (elimina duplicados)
        except (ValueError, SyntaxError):
            pass  # Ignorar errores de conversión
    
    return unique_categories

# Obtener las categorías únicas
unique_crowd = extract_unique_crowd(metadata_fl["Crowd"])

In [35]:
# Función para transformar 'Crowd'
def transform_crowd(df, column):
    separate_categories = {'locals', 'tourists', 'college_students', 'family_friendly', 'groups'}
    
    def normalize(text):
        return text.strip().lower().replace(" ", "_").replace("-", "_")
    
    def categorize_crowd(value):
        if isinstance(value, float) and np.isnan(value):
            return set()
        try:
            if isinstance(value, str):
                value = ast.literal_eval(value)
            if isinstance(value, (list, np.ndarray, pd.Series)):
                normalized_values = {normalize(item) for item in value}
                return normalized_values
        except (ValueError, SyntaxError):
            return set()
        return set()
    
    df[column] = df[column].apply(categorize_crowd)
    
    for category in separate_categories:
        df[category] = df[column].apply(lambda x: 1 if category in x else 0)
    
    df.drop(columns=[column], inplace=True)
    return df

# Aplicar las transformaciones
metadata_fl = transform_crowd(metadata_fl, 'Crowd')

Analizamos y expandimos `Amenities`

In [36]:
# Función para extraer todas las categorías únicas en "Amenities"
def extract_unique_amenities(series):
    unique_categories = set()
    
    for value in series.dropna():  # Ignorar valores nulos
        try:
            value = ast.literal_eval(value) if isinstance(value, str) else value  # Convertir string a lista
            if isinstance(value, list):  # Si es una lista, agregar los valores
                clean_values = [x.strip().lower().replace("-", "_").replace(" ", "_") for x in value]
                unique_categories.update(clean_values)  # Añadir al conjunto (elimina duplicados)
        except (ValueError, SyntaxError):
            pass  # Ignorar errores de conversión
    
    return unique_categories

# Obtener las categorías únicas
unique_amenities = extract_unique_amenities(metadata_fl["Amenities"])

In [37]:
# Función para transformar 'Amenities'
def transform_amenities(df, column):
    remove_categories = {'public_restroom', 'stadium_seating'}
    kids_friendly = {'high_chairs', 'good_for_kids'}
    separate_categories = {'gender_neutral_restroom', 'wi_fi', 'bar_onsite'}
    
    def normalize(text):
        return text.strip().lower().replace(" ", "_").replace("-", "_")
    
    def categorize_amenities(value):
        if isinstance(value, float) and np.isnan(value):
            return set()
        try:
            if isinstance(value, str):
                value = ast.literal_eval(value)
            if isinstance(value, (list, np.ndarray, pd.Series)):
                normalized_values = {normalize(item) for item in value}
                return normalized_values - remove_categories
        except (ValueError, SyntaxError):
            return set()
        return set()
    
    df[column] = df[column].apply(categorize_amenities)
    
    df['kids_friendly'] = df[column].apply(lambda x: 1 if kids_friendly & x else 0)
    
    for category in separate_categories:
        df[category] = df[column].apply(lambda x: 1 if category in x else 0)
    
    df.drop(columns=[column], inplace=True)
    return df

# Aplicar las transformaciones
metadata_fl = transform_amenities(metadata_fl, 'Amenities')

Analisis y expansión de `Payments` 

In [38]:
# Función para extraer todas las categorías únicas en "Payments"
def extract_unique_payments(series):
    unique_categories = set()
    
    for value in series.dropna():  # Ignorar valores nulos
        try:
            value = ast.literal_eval(value) if isinstance(value, str) else value  # Convertir string a lista
            if isinstance(value, list):  # Si es una lista, agregar los valores
                clean_values = [x.strip().lower().replace("-", "_").replace(" ", "_") for x in value]
                unique_categories.update(clean_values)  # Añadir al conjunto (elimina duplicados)
        except (ValueError, SyntaxError):
            pass  # Ignorar errores de conversión
    
    return unique_categories

# Obtener las categorías únicas
unique_payments = extract_unique_payments(metadata_fl["Payments"])

In [39]:
# Función para transformar 'Payments'
def transform_payments(df, column):
    separate_categories = {'credit_cards', 'debit_cards', 'nfc_mobile_payments', 'cash_only', 'checks'}
    
    def normalize(text):
        return text.strip().lower().replace(" ", "_").replace("-", "_")
    
    def categorize_payments(value):
        if isinstance(value, float) and np.isnan(value):
            return set()
        try:
            if isinstance(value, str):
                value = ast.literal_eval(value)
            if isinstance(value, (list, np.ndarray, pd.Series)):
                normalized_values = {normalize(item) for item in value}
                return normalized_values
        except (ValueError, SyntaxError):
            return set()
        return set()
    
    df[column] = df[column].apply(categorize_payments)
    
    for category in separate_categories:
        df[category] = df[column].apply(lambda x: 1 if category in x else 0)
    
    df.drop(columns=[column], inplace=True)
    return df

# Aplicar las transformaciones
metadata_fl = transform_payments(metadata_fl, 'Payments')

Analizamos y expandimos `Recycling`

In [40]:
# Función para extraer todas las categorías únicas en "Recycling"
def extract_unique_recycling(series):
    unique_categories = set()
    
    for value in series.dropna():  # Ignorar valores nulos
        try:
            value = ast.literal_eval(value) if isinstance(value, str) else value  # Convertir string a lista
            if isinstance(value, list):  # Si es una lista, agregar los valores
                clean_values = [x.strip().lower().replace("-", "_").replace(" ", "_") for x in value]
                unique_categories.update(clean_values)  # Añadir al conjunto (elimina duplicados)
        except (ValueError, SyntaxError):
            pass  # Ignorar errores de conversión
    
    return unique_categories

# Obtener las categorías únicas
unique_recycling = extract_unique_recycling(metadata_fl["Recycling"])

In [41]:
# Función para transformar 'Recycling'
def transform_recycling(df, column):
    separate_categories = {'plastic_bags', 'glass_bottles'}
    
    def normalize(text):
        return text.strip().lower().replace(" ", "_").replace("-", "_")
    
    def categorize_recycling(value):
        if isinstance(value, float) and np.isnan(value):
            return set()
        try:
            if isinstance(value, str):
                value = ast.literal_eval(value)
            if isinstance(value, (list, np.ndarray, pd.Series)):
                normalized_values = {normalize(item) for item in value}
                return normalized_values
        except (ValueError, SyntaxError):
            return set()
        return set()
    
    df[column] = df[column].apply(categorize_recycling)
    
    for category in separate_categories:
        df[category] = df[column].apply(lambda x: 1 if category in x else 0)
    
    df.drop(columns=[column], inplace=True)
    return df

# Aplicar las transformaciones
metadata_fl = transform_recycling(metadata_fl, 'Recycling')

--- 

In [42]:
# Agrupar columnas combinando valores
metadata_fl['outdoor_seating'] = metadata_fl[['Outside food allowed', 'outdoor_seating']].max(axis=1)
metadata_fl['kids_friendly'] = metadata_fl[['kids', 'kids_friendly']].max(axis=1)
metadata_fl['dessert'] = metadata_fl[['dessert', 'Dessert']].max(axis=1)
metadata_fl['lgbtq_friendly'] = metadata_fl[['lgbtq_friendly', 'gender_neutral_restroom']].max(axis=1)
metadata_fl['recycling'] = metadata_fl[['plastic_bags', 'glass_bottles']].max(axis=1)

# Eliminar las columnas antiguas
metadata_fl.drop(columns=['Outside food allowed', 'kids', 'Dessert', 'gender_neutral_restroom', 'plastic_bags', 'glass_bottles'], inplace=True)

In [43]:
# Normalizar nombres de columnas para consistencia y legibilidad
metadata_fl.rename(columns=lambda x: x.strip().lower().replace(" ", "_").replace("-", "_").replace("popular_for_", ""), inplace=True)

In [44]:
metadata_fl.reset_index(drop=True, inplace=True)

In [45]:
metadata_fl.to_csv('metadatos_fl.csv')