In [1]:
import json 
from rapidfuzz import process, fuzz
import pandas as pd
from tqdm import tqdm

# Register tqdm's pandas integration so `progress_apply` is available further down.
tqdm.pandas()

# NOTE(review): both paths are absolute and machine-specific; the notebook will
# not run elsewhere until they are made relative/configurable.
# The encoding is stated explicitly so the JSON is not decoded with the
# platform's locale code page (the default on Windows), which would corrupt or
# reject any non-ASCII allergen name.
with open("C:\\Users\\juaji\\AFI\\AA_entregas_afi\\OPEN DATA\\data\\allergens_dict.json", encoding="utf-8") as f:
    allergy_dict = json.load(f)

recipes = pd.read_csv("C:\\Users\\juaji\\AFI\\AA_entregas_afi\\OPEN DATA\\data\\recipes_df.csv")
recipes.head()


Unnamed: 0,category,subcategory,recipe_name,recipe_url,people_served,time,type_of_dish,difficulty,ingredients,Calorías,Proteínas,Grasas,Carbohidratos,Fibra,ingredients_en,ingredients_list,detected_allergies
0,Recetas de aperitivos y tapas,Empanadas,Receta de Empanadas de camarones,https://www.recetasgratis.net/receta-de-empana...,12.0,45.0,Plato principal,Dificultad media,['2½ tazas de harina de trigo integral o harin...,195.0,6.5,10.5,19.2,1.7,['2 ½ cups of whole wheat flour or regular flo...,"['whole wheat flour', 'salt', 'cold butter in ...","['Milk allergy / Lactose intolerance', 'Allium..."
1,Recetas de aperitivos y tapas,Empanadas,Receta de Empanada tucumana de carne,https://www.recetasgratis.net/receta-de-empana...,20.0,60.0,,Dificultad media,"['500 gramos de harina 0000', '100 gramos de g...",286.5,11.1,17.7,21.3,1.1,"['500 grams of flour 0000', '100 grams of fat ...","['flour', 'fat vaccine', 'million water', 'sal...","['Allium Allergy', 'Poultry Allergy']"
2,Recetas de aperitivos y tapas,Empanadas,Receta de Empanada gallega de la abuela,https://www.recetasgratis.net/receta-de-empana...,9.0,60.0,Entrante,Dificultad baja,"['500 gramos de harina común', '150 mililitros...",443.3,12.9,24.7,42.7,2.6,"['500 g common flour', '150 g whole milk', '1 ...","['common flour', 'whole milk', 'salt', 'soft o...","['Allium Allergy', 'Tannin Allergy', 'Milk all..."
3,Recetas de aperitivos y tapas,Empanadas,Receta de Masa para empanadas de camarón,https://www.recetasgratis.net/receta-de-masa-p...,12.0,45.0,Plato principal,Dificultad baja,"['2½ tazas de harina de trigo 350 gramos', '1 ...",262.5,12.2,15.8,21.7,0.8,"['2 ½ cups of wheat flour 350 grams', '1 table...","['wheat flour', 'salt', 'cold butter at cubes'...","['Milk allergy / Lactose intolerance', 'Allium..."
4,Recetas de aperitivos y tapas,Empanadas,Receta de Empanadas de pino chilenas,https://www.recetasgratis.net/receta-de-empana...,6.0,120.0,Cena,Dificultad media,"['500 gramos de harina de trigo', '1 huevo', '...",701.7,21.3,32.0,78.3,3.3,"['500 g wheat flour', '1 egg', '70 g of white ...","['wheat flour', 'egg', 'white wine', 'boiling ...","['Milk allergy / Lactose intolerance', 'Allium..."


In [2]:
import ast

def convert_to_real_list(texto):
    """Coerce a cell value into a Python list of ingredient strings.

    Args:
        texto: Either an actual list/tuple, a missing value (None/NaN/NA), or
            the string representation of a list (e.g. "['a', 'b']").

    Returns:
        list: A new list. Missing values give ``[]``. Strings that are not a
        valid Python list literal are parsed by the manual fallback, which
        strips the surrounding brackets and splits on commas.
    """
    # Containers are handled first: calling pd.isna() on a list returns an
    # array, and using that array in an `if` raises "truth value is ambiguous".
    if isinstance(texto, (list, tuple)):
        return list(texto)

    if pd.api.types.is_scalar(texto) and pd.isna(texto):
        return []

    raw = str(texto)
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the errors literal_eval is documented to raise on malformed
        # input are swallowed; anything else should surface.
        parsed = None

    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)

    # Manual fallback: also used when the literal is a bare scalar such as
    # "'salt'", so callers always get a list rather than a string that would
    # later be iterated character by character.
    # NOTE: this splits on every comma, including commas inside an ingredient.
    texto_limpio = raw.strip('[]')
    return [ing.strip().strip("'\"") for ing in texto_limpio.split(",") if ing.strip()]
    
# Parse every "ingredients_en" cell with convert_to_real_list and store the
# result in "ingredients_list" (replacing the column if it already exists).
recipes["ingredients_list"] = recipes["ingredients_en"].apply(convert_to_real_list)


In [3]:
import re

def clean_ingredient(ingredient):
    """Return a lower-cased, punctuation-light version of an ingredient.

    The value is converted to ``str``, lower-cased, stripped of square
    brackets, quotes and parentheses, and has every run of whitespace
    collapsed to a single space with no leading/trailing blanks.
    """
    lowered = str(ingredient).lower()
    # Drop list/quote/parenthesis characters left over from stringified lists.
    without_marks = lowered.translate(str.maketrans("", "", "[]'\"()"))
    collapsed = re.sub(r"\s+", " ", without_marks)
    return collapsed.strip()

# Overwrite "ingredients_en" with the cleaned form of every entry of
# "ingredients_list". After this line the column holds lists, not the original
# strings read from the CSV.
recipes["ingredients_en"] = recipes["ingredients_list"].apply(
    lambda lst: [clean_ingredient(i) for i in lst]
)


def fuzzy_search_allergens(ingredient, allergy_dict, score_cutoff=90):
    """Return the allergies whose allergen foods fuzzily match an ingredient.

    Args:
        ingredient: Raw ingredient text; it is passed through
            ``clean_ingredient`` before matching.
        allergy_dict: Mapping of allergy name -> iterable of allergen foods.
        score_cutoff: Minimum ``fuzz.WRatio`` score for a food to count as a
            match.

    Returns:
        list: Allergy names (keys of ``allergy_dict``) with at least one
        matching food, sorted so the result is the same on every run.
    """
    ingredient = clean_ingredient(ingredient)  # make sure the text is cleaned
    if not ingredient:
        # Nothing to match against.
        return []

    # Reverse index: allergen food -> allergies that list it. Building it once
    # replaces the nested membership scan over every allergen list, and also
    # removes duplicate foods from the choices so each is scored only once.
    allergies_by_allergen = {}
    for allergy, allergen_list in allergy_dict.items():
        for allergen in allergen_list:
            allergies_by_allergen.setdefault(allergen, set()).add(allergy)

    # Fuzzy match against the whole ingredient string; limit=None keeps every
    # choice that reaches the cutoff.
    matches = process.extract(
        ingredient,
        list(allergies_by_allergen),
        scorer=fuzz.WRatio,
        score_cutoff=score_cutoff,
        limit=None,
    )

    found_allergies = set()
    for match in matches:
        # match[0] is the matched choice, i.e. the allergen food.
        found_allergies |= allergies_by_allergen[match[0]]

    return sorted(found_allergies)

In [4]:
# Spot-check the matcher on a single hand-picked ingredient.
[fuzzy_search_allergens(ing , allergy_dict) for ing in ['cold butter in cubes']]

[['Milk allergy / Lactose intolerance']]

In [5]:
def detect_allergies(ingredient_list):
    """Collect the distinct allergies detected across a list of ingredients.

    Each ingredient is checked with ``fuzzy_search_allergens`` against the
    notebook-level ``allergy_dict``; the union of the results is returned as a
    list with no guaranteed order.
    """
    detected = {
        allergy
        for ingredient in ingredient_list
        for allergy in fuzzy_search_allergens(ingredient, allergy_dict)
    }
    return list(detected)

# Run allergy detection on every recipe's "ingredients_en" value with a progress
# bar, storing the result in "detected_allergies" (replacing any existing column).
recipes["detected_allergies"] = recipes["ingredients_en"].progress_apply(detect_allergies)


100%|██████████| 8438/8438 [00:10<00:00, 775.09it/s] 


In [6]:
# Preview the first rows, including the recomputed "detected_allergies" column.
recipes.head()

Unnamed: 0,category,subcategory,recipe_name,recipe_url,people_served,time,type_of_dish,difficulty,ingredients,Calorías,Proteínas,Grasas,Carbohidratos,Fibra,ingredients_en,ingredients_list,detected_allergies
0,Recetas de aperitivos y tapas,Empanadas,Receta de Empanadas de camarones,https://www.recetasgratis.net/receta-de-empana...,12.0,45.0,Plato principal,Dificultad media,['2½ tazas de harina de trigo integral o harin...,195.0,6.5,10.5,19.2,1.7,[2 ½ cups of whole wheat flour or regular flou...,[2 ½ cups of whole wheat flour or regular flou...,"[Citrus Allergy, Allium Allergy, Nightshade Al..."
1,Recetas de aperitivos y tapas,Empanadas,Receta de Empanada tucumana de carne,https://www.recetasgratis.net/receta-de-empana...,20.0,60.0,,Dificultad media,"['500 gramos de harina 0000', '100 gramos de g...",286.5,11.1,17.7,21.3,1.1,"[500 grams of flour 0000, 100 grams of fat vac...","[500 grams of flour 0000, 100 grams of fat vac...","[Allium Allergy, Poultry Allergy]"
2,Recetas de aperitivos y tapas,Empanadas,Receta de Empanada gallega de la abuela,https://www.recetasgratis.net/receta-de-empana...,9.0,60.0,Entrante,Dificultad baja,"['500 gramos de harina común', '150 mililitros...",443.3,12.9,24.7,42.7,2.6,"[500 g common flour, 150 g whole milk, 1 table...","[500 g common flour, 150 g whole milk, 1 table...","[Allium Allergy, Tannin Allergy, Fish Allergy,..."
3,Recetas de aperitivos y tapas,Empanadas,Receta de Masa para empanadas de camarón,https://www.recetasgratis.net/receta-de-masa-p...,12.0,45.0,Plato principal,Dificultad baja,"['2½ tazas de harina de trigo 350 gramos', '1 ...",262.5,12.2,15.8,21.7,0.8,"[2 ½ cups of wheat flour 350 grams, 1 tablespo...","[2 ½ cups of wheat flour 350 grams, 1 tablespo...","[Gluten Allergy, Allium Allergy, Milk allergy ..."
4,Recetas de aperitivos y tapas,Empanadas,Receta de Empanadas de pino chilenas,https://www.recetasgratis.net/receta-de-empana...,6.0,120.0,Cena,Dificultad media,"['500 gramos de harina de trigo', '1 huevo', '...",701.7,21.3,32.0,78.3,3.3,"[500 g wheat flour, 1 egg, 70 g of white wine,...","[500 g wheat flour, 1 egg, 70 g of white wine,...","[Gluten Allergy, Allium Allergy, Mint Allergy,..."


In [None]:
import re
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from collections import defaultdict

#############################################
# 1. INGREDIENT CLEANING AND NORMALIZATION
#############################################

# Function words dropped during normalization.
# NOTE(review): these are Spanish, while "ingredients_list" appears to hold
# English text at this point -- confirm whether English stopwords are needed.
stopwords = {"de", "la", "el", "a", "al", "con", "sin", "y"}

def normalize_ingredient(text):
    """Build an order-insensitive key for an ingredient string.

    The text is lower-cased, every character that is not a letter is turned
    into a separator, stopwords are removed and the remaining words are sorted
    alphabetically and joined with single spaces.

    Args:
        text: Ingredient text. Any non-``str`` value yields ``""``.

    Returns:
        str: The normalized key, or ``""`` when no word survives.
    """
    if not isinstance(text, str):
        return ""

    lowered = text.lower()

    # Keep every Unicode letter, not just a-z plus Spanish accents, so words
    # such as "gruyère" are not split in two. Digits, fractions like "½",
    # punctuation and whitespace all become separators.
    letters_only = "".join(ch if ch.isalpha() else " " for ch in lowered)

    tokens = [tok for tok in letters_only.split() if tok not in stopwords]

    # Sorting makes "wheat flour" and "flour wheat" share one key; joining an
    # empty list gives "".
    return " ".join(sorted(tokens))

#############################################
# 2. BUILD THE LIST OF ALL CLEAN INGREDIENTS
#############################################
def explode_ingredients_column(df):
    """Return every distinct normalized ingredient found in ``df``.

    Args:
        df: DataFrame with an "ingredients_list" column. Missing cells are
            skipped; list cells are used as-is; any other value is converted
            to ``str`` and split on commas.

    Returns:
        list: Unique non-empty ``normalize_ingredient`` keys in sorted order.
        Sorting keeps the order identical across kernel restarts, unlike the
        iteration order of a set of strings.
    """
    unique_ingredients = set()

    for ing_list in df["ingredients_list"].dropna():
        parts = ing_list if isinstance(ing_list, list) else str(ing_list).split(",")

        for ing in parts:
            norm = normalize_ingredient(ing)
            if norm:
                unique_ingredients.add(norm)

    return sorted(unique_ingredients)


#############################################
# 3. EMBEDDINGS + SEMANTIC CLUSTERING
#############################################


def cluster_ingredients(ingredients, distance_threshold=1.3):
    """Group similar ingredient names and map each one to a canonical name.

    Names are embedded with the "all-MiniLM-L6-v2" sentence-transformer model
    and clustered with Ward-linkage agglomerative clustering cut at
    ``distance_threshold``. The canonical name of a cluster is its shortest
    member, with ties broken alphabetically so the choice does not depend on
    input order.

    Args:
        ingredients: Iterable of ingredient strings.
        distance_threshold: Linkage distance above which clusters are not
            merged.

    Returns:
        dict: ``{ingredient: canonical_ingredient}`` for every input name.
    """
    ingredients = list(ingredients)

    # The clustering step needs at least two samples; with zero or one
    # ingredient each name is trivially its own canonical form.
    if len(ingredients) < 2:
        return {item: item for item in ingredients}

    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(ingredients)

    # n_clusters=None lets distance_threshold decide how many clusters exist.
    # The metric is left at its default because Ward linkage is Euclidean-only.
    clusterer = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        linkage="ward"
    )

    labels = clusterer.fit_predict(embeddings)

    clusters = defaultdict(list)
    for ing, label in zip(ingredients, labels):
        clusters[label].append(ing)

    canonical = {}
    for group in clusters.values():
        base = min(group, key=lambda item: (len(item), item))
        for item in group:
            canonical[item] = base

    return canonical

#############################################
# 4. APPLY THE CANONICAL MAPPING TO THE WHOLE DF
#############################################
def apply_canonical_mapping(df, canonical):
    """Add an "ingredients_normalized" column of canonical ingredient names.

    For every list in "ingredients_list", each entry is normalized with
    ``normalize_ingredient`` and replaced by ``canonical[key]``. Entries whose
    key is not in ``canonical`` are dropped, and duplicates are removed
    keeping first-seen order. Cells that are not lists produce ``[]``.

    The DataFrame is modified in place and also returned.
    """
    def transform_list(ing_list):
        if not isinstance(ing_list, list):
            return []

        # A dict is used as an insertion-ordered set of canonical names.
        seen_in_order = {}
        for raw in ing_list:
            key = normalize_ingredient(raw)
            if key in canonical:
                seen_in_order.setdefault(canonical[key], None)

        return list(seen_in_order)

    normalized_column = df["ingredients_list"].apply(transform_list)
    df["ingredients_normalized"] = normalized_column
    return df



#############################################
# ------- FULL RUN OF THE PROCESS ----------
#############################################

# 1. Extract the individual (normalized, de-duplicated) ingredients
ingredients = explode_ingredients_column(recipes)

# 2. Build the clusters + canonical dictionary
canonical = cluster_ingredients(ingredients, distance_threshold=1.3)

# 3. Apply the normalization to the dataframe (adds "ingredients_normalized")
recipes = apply_canonical_mapping(recipes, canonical)

recipes.head()


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

TypeError: AgglomerativeClustering.__init__() got an unexpected keyword argument 'affinity'