In [13]:
import pandas as pd # type: ignore
import ast
import re

In [14]:
# Mapping of group name -> list of category strings that belong to that group.
#
# NOTE(review): six category strings are listed under two different groups, so
# each of them produces two rows (one per group) in any table built by
# iterating this dictionary:
#   "chinese noodle" (Chinese, Soup & Noodles)
#   "udon noodle"    (Japanese, Soup & Noodles)
#   "ramen"          (Japanese, Soup & Noodles)
#   "pho"            (Vietnamese, Soup & Noodles)
#   "biryani"        (Indian, Rice & Grain-Based)
#   "seafood market" (Seafood, Food Markets & Wholesale)
# Confirm whether a category is meant to map to exactly one group.
#
# NOTE(review): these groups have empty lists and therefore contribute no
# categories: "Hot Dog & Sandwiches", "Tacos & Tex-Mex", "Sushi",
# "Espresso & Coffee", "Bar & Grill", "LGBTQ+ Friendly Bars".
categorias_clasificadas = {
    # Asia
    "Chinese": ["chinese", "taiwanese","chinese noodle", "cantonese", "sichuan", "dim sum", "delivery chinese", "hong kong style fast food", "mandarin"],
    "Japanese": ["japanese", "authentic japanese", "japanese curry", "izakaya", "modern izakaya", "ramen", "sushi", "teppanyaki", "udon noodle", "yakitori"],
    "Korean": ["korean", "korean barbecue"],
    "Thai": ["thai"],
    "Vietnamese": ["vietnamese", "pho"],
    "Pacific & Filipino": ["filipino", "hawaiian"],
    "Indian": ["indian", "modern indian", "indian muslim", "indian sizzler", "kerala", "biryani"],
    "Pakistani & Afghani": ["pakistani", "afghani","bangladeshi"],
    "Middle Eastern": ["middle eastern", "lebanese", "israeli", "falafel", "shawarma", "persian", "turkish", "gyro"],
    "Pan-Asian": ["asian", "asian fusion", "panasian"],

    # Africa and the Middle East
    # (the "Middle Eastern" group itself is defined in the Asia section above)
    "North African": ["moroccan", "egyptian"],
    "West African": ["west african", "cape verdean"],
    "General African": ["african", "ethiopian","south african"],

    # The Americas
    "Mexican": ["mexican", "mexican torta", "taco", "texmex", "burrito"],
    "Centroamerican": ["central american", "costa rican", "guatemalan", "honduran", "nicaraguan", "salvadoran"],
    "Southern Cone": ["argentinian", "chilean", "uruguayan"],
    "Tropical South American": ["brazilian", "colombian", "ecuadorian", "peruvian", "venezuelan"],
    "Caribbean": ["caribbean", "cuban", "dominican", "haitian", "jamaican", "puerto rican"],

    # Europe
    "Western European": ["french", "modern french", "french steakhouse", "belgian", "dutch", "spanish", "portuguese", "italian", "modern european", "tuscan", "northern italian", "southern italian", "neapolitan"],
    "Central European": ["german", "austrian", "swiss", "czech", "polish", "hungarian"],
    "Eastern European": ["eastern european", "bulgarian", "romanian", "russian", "serbian", "georgian", "uzbeki"],
    "Nordic & Scandinavian": ["scandinavian", "swedish", "icelandic"],
    "British & Irish": ["british", "modern british", "english", "irish"],

    # United States and Canada
    "NorthAmerican": ["canadian","american", "new american", "traditional american", "southern us", "southwestern us", "cajun", "creole", "contemporary louisiana", "down home cooking", "floridian", "new england", "californian"],
    
    # Australia
    "Australian": ["australian"],

    # Fast Food, Street Food & Specialties
    "Pizza": ["pizza"],
    "Burgers & Fast Food": ["fast food", "hamburger", "cheesesteak", "chicken wings", "hot dog", "hoagie", "po boys"],
    "Hot Dog & Sandwiches": [],
    "Tacos & Tex-Mex": [],
    "Seafood": ["seafood", "angler fish", "fish chips", "seafood donburi", "seafood farm", "seafood market", "seafood wholesaler"],
    "Poke": ["poke", "poke bar"],
    "Sushi": [],
    "Steakhouse & Grilled": ["chophouse", "barbecue", "mongolian barbecue"],
    "Breakfast & brunch": ["breakfast", "brunch"],

    # Healthy & Specialty Diets
    "Vegetarian & Vegan": ["vegetarian", "vegan", "glutenfree", "organic", "raw food", "tofu"],

    # Bakery, Desserts & Snacks
    "Desserts & Bakeries": ["dessert", "pancake", "sundae"],
    "Fondue & Raclette": ["fondue", "raclette"],
    "Rice & Grain-Based": ["rice", "biryani"],

    # Cafés & Small Restaurants
    # NOTE(review): "or cafe" looks like a leftover fragment rather than a
    # deliberate category name — confirm against the raw category data.
    "Cafés": ["or cafe", "espresso bar", "coffee"],
    "Espresso & Coffee": [],

    # Buffets & Dining Styles
    "Restaurant" : ["restaurant"],
    "Buffet": ["buffet"],
    "Fine Dining": ["fine dining"],

    # Drinks & Bars
    "Bars & Nightlife": ["bar", "bar grill", "cocktail bar", "cider bar", "dart bar", "gay bar", "hookah bar", "karaoke bar", "live music bar", "piano bar", "sports bar", "stand bar", "tiki bar", "wine bar"],
    "Bar & Grill": [],
    "LGBTQ+ Friendly Bars": [],

    # Miscellaneous & Unclassified
    "Food Markets & Wholesale": ["fresh food market", "frozen food store", "frozen food manufacturer", "self service", "wholesale food store", "dried seafood store", "seafood market"],
    "Food Industry & Supply": ["catering food and drink supplier", "food and beverage consultant", "food and beverage exporter", "food bank", "food broker", "food court", "food delivery", "food machinery supplier", "food manufacturer", "food manufacturing supply", "food processing company", "food processing equipment", "food producer", "food products supplier", "food seasoning manufacturer", "food service", "food store", "supply store"],
    "Eyebrow & Beauty Bars": ["eyebrow bar"],

    # Additional groups created
    "Ethnic & Fusion": ["eclectic", "ethnic", "fusion", "panlatin", "nuevo latino"],
    "Family & Traditional": ["family", "traditional", "country food", "soul food"],
    "Snacks & Small Plates": ["snack bar", "small plates", "tapas", "tapas bar"],
    "Soup & Noodles": ["soup", "chinese noodle", "udon noodle", "ramen", "pho"],
    "Unclassified": ["bar pmu", "bar furniture store", "bar stool supplier", "food", "food and drink"],

    # New groups for categories that were previously missing
    "Mediterranean & Greek": ["mediterranean", "greek"],
    "Southeast Asian": ["southeast asian", "cambodian", "laotian", "indonesian", "singaporean"],
    "South Asian": ["south asian", "nepalese"],
    "Latin American": ["latin american", "south american"],
    "Jewish & Kosher": ["jewish", "kosher"],
    "Halal": ["halal"],
    "Health & Organic": ["health food", "health food store", "organic food store"],
    "Chicken & Meat Dishes": ["chicken", "meat dish"],
    "Wok & Stir-Fry": ["wok"],
    "Oyster Bars": ["oyster bar"],
    "Karaoke": ["karaoke"],
    "Delivery & Takeout": ["delivery", "takeout"],
    "Lunch": ["lunch"],
    "Hot Pot": ["hot pot"],
    "Continental": ["continental"],
    "European": ["european"],
    "Armenian": ["armenian"]
}

# Build the category -> group lookup table (one row per (category, group) pair,
# in dictionary order).
# All rows are collected first and the DataFrame is constructed once: growing a
# frame with pd.concat inside a loop re-copies the accumulated rows on every
# iteration, and concatenating onto an initially empty frame can emit a
# FutureWarning in recent pandas releases.
registros = [
    {"category": categoria, "group": grupo}
    for grupo, lista_categorias in categorias_clasificadas.items()
    for categoria in lista_categorias
]
df_categorias_final = pd.DataFrame(registros, columns=["category", "group"])

# Sanity check: every category listed in the dictionary must be present in the
# table. The earlier version compared against the loop variable left over from
# the last iteration (i.e. only the final group's list), so it could never
# report anything.
# NOTE(review): the original comment referred to a "raw" category list; no such
# list is defined in the visible cells, so the dictionary itself is used as the
# reference. Substitute the raw list here if it is available.
categorias_definidas = {
    categoria
    for lista_categorias in categorias_clasificadas.values()
    for categoria in lista_categorias
}
categorias_sin_grupo = categorias_definidas - set(df_categorias_final["category"])
if categorias_sin_grupo:
    print("Categorías sin grupo asignado:", categorias_sin_grupo)

# Surface categories that were assigned to more than one group: they appear as
# several rows in the lookup table and would duplicate rows in any later join
# on "category".
es_repetida = df_categorias_final["category"].duplicated(keep=False)
categorias_multigrupo = sorted(set(df_categorias_final.loc[es_repetida, "category"]))
if categorias_multigrupo:
    print("Categorías asignadas a más de un grupo:", categorias_multigrupo)

# Summary of the resulting table.
# The author originally expected 248 categories and roughly 60 groups; the
# recorded run of this cell printed 252 and 65.
print("Número de categorías únicas:", df_categorias_final["category"].nunique())
print("Número de grupos:", df_categorias_final["group"].nunique())

Número de categorías únicas: 252
Número de grupos: 65


In [15]:
# Give every distinct group a 1-based integer ID, numbered in order of first
# appearance in the table.
grupo_ids = {
    grupo: idx
    for idx, grupo in enumerate(df_categorias_final["group"].unique(), start=1)
}

# Add the ID as the first column. Despite its name, 'category_id' identifies
# the *group*: all categories of the same group share the same value.
# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail on a second run; if the column is already there it is simply
# refreshed in place (keeping its position) instead of being inserted again.
ids_por_fila = df_categorias_final["group"].map(grupo_ids)
if "category_id" in df_categorias_final.columns:
    df_categorias_final["category_id"] = ids_por_fila
else:
    df_categorias_final.insert(0, "category_id", ids_por_fila)

In [16]:
# Write the lookup table to a CSV file in the current working directory;
# index=False leaves the DataFrame's row index out of the file.
df_categorias_final.to_csv(
    "categorias_normalizadas.csv",
    index=False,
)