# Web Scraping 

Web Scraping para la obtencion de un corpus y creación de un sistema de recuperación.

### Obtención del Corpus

La página objetivo para el scraping es "allrecipes.com" Una página que contiene una gran variedad de contenido sobre recetas de cocina.

Para el scraping utilizaremos las librerias requests y Beautifulsoup. Requests permite realizar solicitudes HTTP  y Beautifulsoup permite obtener datos de documentos HTML y XML. 

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

Se especifica la url de la página a la cual se va a realizar el scraping

In [2]:
url_base = "https://www.allrecipes.com/ingredients-a-z-6740416#alphabetical-list-a"

Los headers(encabezados) proporcionan información adicional al servidor sobre la solicitud HTTP que se realiza al servidor para obtener el contenido de la página. User-Agent especifica información sobre el cliente que esta haciendo la solicitud. En este caso se especifica que es un navegador. 

In [3]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

In [4]:
#Solicitud a la pagina objetivo
response_base = requests.get(url_base, headers=headers)

#Verificación de si la solicitud fue exitosa
if response_base.status_code == 200:
    print("Solicitud exitosa")
else:
    print(f"Error en la solicitud: {response_base.status_code}")

Solicitud exitosa


In [5]:
#Obtencion del contenido del HTML de la pagina objetivo
soup_base = BeautifulSoup(response_base.content, 'html.parser')

Para obtener los datos por medio de Beautifulsoup se debe realizar una inspeccion previa al HTML de la pagina objetivo e identificar las etiquetas en las cuales se encuentran los datos que queremos obtener.

In [6]:
#Listas para almacenar la información que vamos a extraer
lista_urls = []
lista_titulos = []
lista_descripcion = []
lista_ingredientes = []
lista_pasos = []
lista_categorias = []

In [7]:
#Extracción de los urls de recetas por categorias(pastel, pollo,sopa)
urls_categorias = soup_base.select('.loc.mntl-link-list li a')
#Se recorre cada categoria
for url in urls_categorias:
    #Se obtiene la url de cada categoria
    url_categoria = url.get('href')
    categoria = url.get_text(strip=True)
    #Se realiza la solicitud a la pagina categoria
    response_categoria = requests.get(url_categoria, headers=headers)
    #Se obtiene el contenido de la url de la categoria que contiene varias recetas
    soup_categoria = BeautifulSoup(response_categoria.content, 'html.parser')
    #Extraccion de las urls de recetas en la categoria
    urls_recetas = soup_categoria.select('#mntl-taxonomysc-article-list-group_1-0 div div a')

    #Se recorre cada receta
    for url in urls_recetas:
        #Se obtiene la url de cada receta
        url_receta = url.get('href')
        #Se realiza la solicitud a la pagina receta
        response_receta = requests.get(url_receta, headers=headers)
        #Se obtiene el contenido de la url de la receta 
        soup_receta = BeautifulSoup(response_receta.content, 'html.parser')

        #Extraemos la información que necesitamos
        titulo = soup_receta.select_one('.article-heading.text-headline-400')
        if titulo:
            lista_titulos.append(titulo.get_text(strip=True))
        else:
            lista_titulos.append("No Disponible")

        descripcion = soup_receta.select_one('.article-subheading.text-body-100')
        if descripcion:            
            lista_descripcion.append(descripcion.get_text(strip=True))
        else:
            lista_descripcion.append("No Disponible")
        
        info_ingredientes = soup_receta.select('.mm-recipes-structured-ingredients__list li p')
        ingredientes = ""
        if info_ingredientes:
            for info in info_ingredientes:
                ingredientes = ingredientes + info.get_text(strip=True) + " "
            lista_ingredientes.append(ingredientes)
        else:
            lista_ingredientes.append("No Disponible")
        
        info_pasos = soup_receta.select('#mntl-sc-block_1-0 li p')
        pasos = ""
        if info_pasos:
            for info in info_pasos:
                pasos = pasos + info.get_text(strip=True)+" "
            lista_pasos.append(pasos)
        else:
            lista_pasos.append("No Disponible")        

        lista_urls.append(url_receta)
        lista_categorias.append(categoria)

    #Para saber si estamos extrayendo la informacion imprimiremos el tamaño de cada lista 
    print(f" Categoria: {len(lista_categorias)} Titulo:{len(lista_titulos)} Descripcion:{len(lista_descripcion)} Ingredientes:{len(lista_ingredientes)} Pasos:{len(lista_pasos)} Urls:{len(lista_urls)} ")
    

 Categoria: 64 Titulo:64 Descripcion:64 Ingredientes:64 Pasos:64 Urls:64 
 Categoria: 128 Titulo:128 Descripcion:128 Ingredientes:128 Pasos:128 Urls:128 
 Categoria: 133 Titulo:133 Descripcion:133 Ingredientes:133 Pasos:133 Urls:133 
 Categoria: 197 Titulo:197 Descripcion:197 Ingredientes:197 Pasos:197 Urls:197 
 Categoria: 261 Titulo:261 Descripcion:261 Ingredientes:261 Pasos:261 Urls:261 
 Categoria: 325 Titulo:325 Descripcion:325 Ingredientes:325 Pasos:325 Urls:325 
 Categoria: 389 Titulo:389 Descripcion:389 Ingredientes:389 Pasos:389 Urls:389 
 Categoria: 453 Titulo:453 Descripcion:453 Ingredientes:453 Pasos:453 Urls:453 
 Categoria: 517 Titulo:517 Descripcion:517 Ingredientes:517 Pasos:517 Urls:517 
 Categoria: 581 Titulo:581 Descripcion:581 Ingredientes:581 Pasos:581 Urls:581 
 Categoria: 645 Titulo:645 Descripcion:645 Ingredientes:645 Pasos:645 Urls:645 
 Categoria: 709 Titulo:709 Descripcion:709 Ingredientes:709 Pasos:709 Urls:709 
 Categoria: 773 Titulo:773 Descripcion:773 Ing

Guardamos la data en un archivo csv

In [8]:
df_registro = pd.DataFrame({
    "Categoria": lista_categorias,
    "Nombre":lista_titulos,
    "Descripcion": lista_descripcion,
    "Ingredientes": lista_ingredientes,
    "Pasos":lista_pasos,
    "Links": lista_urls
})
df_registro = df_registro.drop_duplicates(subset="Links")
df_registro.to_csv("AllRecipes-Corpus-v2.csv", encoding='utf-8', index=False)

# Sistema de Recuperacion

In [None]:
import pandas as pd
archivo_csv = './AllRecipes-Corpus-v2.csv'
df = pd.read_csv(archivo_csv)

In [None]:
# Crear la columna 'ID' asignando un ID único a cada categoría
df['ID'] = df['Categoria'].astype('category').cat.codes + 1

# Crear un subíndice incremental para cada fila dentro de una categoría
df['SubID'] = df.groupby('ID').cumcount() + 1

# Combinar 'ID' y 'SubID' para crear el índice compuesto
df['ID_Completo'] = df['ID'].astype(str) + '.' + df['SubID'].astype(str)

# Reorganizar las columnas para que 'ID_Completo' sea la primera
df = df[['ID_Completo'] + [col for col in df.columns if col != 'ID_Completo']]

# Guardar el archivo con los IDs generados
df.to_csv('archivo_con_ids_compuestos.csv', index=False)

In [None]:
df_final=df[['ID','ID_Completo','Categoria','Nombre','Descripcion','Ingredientes','Pasos','Links']]
df_final

Unnamed: 0,ID,ID_Completo,Categoria,Nombre,Descripcion,Ingredientes,Pasos,Links
0,1,1.1,Almond Meal,Quick Almond Flour Pancakes,These almond flour pancakes are a wonderful su...,1cupalmond flour ¼cupwater 2eggs 1tablespoonma...,Gather all ingredients. Dotdash Meredith Food ...,https://www.allrecipes.com/recipe/234702/quick...
1,1,1.2,Almond Meal,90-Second Keto Bread in a Mug,Try keto bread microwaved quickly and easily w...,1tablespoonbutter ⅓cupblanched almond flour 1e...,Microwave butter in a microwave-safe mug until...,https://www.allrecipes.com/recipe/263032/90-se...
2,1,1.3,Almond Meal,Macarons (French Macaroons),"Macarons are made with finely ground almonds, ...",3egg whites ¼cupwhite sugar 1 ⅔cupsconfectione...,Line a baking sheet with a silicone baking mat...,https://www.allrecipes.com/recipe/223234/macar...
3,1,1.4,Almond Meal,Chaffles with Almond Flour,This chaffle recipe uses almond flour to creat...,1largeegg 1tablespoonblanched almond flour ¼te...,"Whisk egg, almond flour, and baking powder tog...",https://www.allrecipes.com/recipe/278328/chaff...
4,1,1.5,Almond Meal,Almond Flour-Blueberry Muffins,"These healthy, tender, almond flour-blueberry ...",2cupsfinely ground almond flour ½teaspoonbakin...,Preheat the oven to 350 degrees F (175 degrees...,https://www.allrecipes.com/recipe/283581/almon...
...,...,...,...,...,...,...,...,...
3523,69,69.44,Wild Rice,Chicken and Mushroom Wild Rice Soup,A slight variation on a recipe I received from...,½cupbutter 1finely chopped onion ½cupchopped c...,Gather the ingredients. Melt butter in a large...,https://www.allrecipes.com/recipe/18448/chicke...
3524,69,69.45,Wild Rice,Ekaterina's Wild Rice and Kale Salad,"Healthy and filling, this salad is perfect for...","3 ¼cupswater, divided ½cupwild rice ½cupbarley...",Bring 2 cups water and wild rice to a boil in ...,https://www.allrecipes.com/recipe/255848/ekate...
3525,69,69.46,Wild Rice,Calico Wild Rice Soup,"This is a very hearty and flavorful soup, I do...",2cupswild rice 6cupswater 4cupschicken broth 1...,In a medium sauce pan cook rice in 6 cups wate...,https://www.allrecipes.com/recipe/13383/calico...
3526,69,69.47,Wild Rice,Instant Pot Chicken and Wild Rice Chowder,"When I was a little girl, my mamaw made a chic...",3tablespoonsbutter ½cupsliced mushrooms 1shall...,Turn on a multi-functional pressure cooker (su...,https://www.allrecipes.com/recipe/277361/insta...


In [None]:
df_final.isna().sum()

Unnamed: 0,0
ID,0
ID_Completo,0
Categoria,0
Nombre,0
Descripcion,0
Ingredientes,0
Pasos,0
Links,0


In [None]:
df_final.loc[:, 'textoLimpio'] = df_final['Descripcion'].str.lower().str.replace('.', '', regex=False).str.replace(',', '', regex=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final.loc[:, 'textoLimpio'] = df_final['Descripcion'].str.lower().str.replace('.', '', regex=False).str.replace(',', '', regex=False)


In [None]:
df_final.head()

Unnamed: 0,ID,ID_Completo,Categoria,Nombre,Descripcion,Ingredientes,Pasos,Links,textoLimpio
0,1,1.1,Almond Meal,Quick Almond Flour Pancakes,These almond flour pancakes are a wonderful su...,1cupalmond flour ¼cupwater 2eggs 1tablespoonma...,Gather all ingredients. Dotdash Meredith Food ...,https://www.allrecipes.com/recipe/234702/quick...,these almond flour pancakes are a wonderful su...
1,1,1.2,Almond Meal,90-Second Keto Bread in a Mug,Try keto bread microwaved quickly and easily w...,1tablespoonbutter ⅓cupblanched almond flour 1e...,Microwave butter in a microwave-safe mug until...,https://www.allrecipes.com/recipe/263032/90-se...,try keto bread microwaved quickly and easily w...
2,1,1.3,Almond Meal,Macarons (French Macaroons),"Macarons are made with finely ground almonds, ...",3egg whites ¼cupwhite sugar 1 ⅔cupsconfectione...,Line a baking sheet with a silicone baking mat...,https://www.allrecipes.com/recipe/223234/macar...,macarons are made with finely ground almonds c...
3,1,1.4,Almond Meal,Chaffles with Almond Flour,This chaffle recipe uses almond flour to creat...,1largeegg 1tablespoonblanched almond flour ¼te...,"Whisk egg, almond flour, and baking powder tog...",https://www.allrecipes.com/recipe/278328/chaff...,this chaffle recipe uses almond flour to creat...
4,1,1.5,Almond Meal,Almond Flour-Blueberry Muffins,"These healthy, tender, almond flour-blueberry ...",2cupsfinely ground almond flour ½teaspoonbakin...,Preheat the oven to 350 degrees F (175 degrees...,https://www.allrecipes.com/recipe/283581/almon...,these healthy tender almond flour-blueberry mu...


In [None]:
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

df_final.loc[:, 'tokens'] = df_final['textoLimpio'].apply(word_tokenize)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
df_final.head()

Unnamed: 0,ID,ID_Completo,Categoria,Nombre,Descripcion,Ingredientes,Pasos,Links,textoLimpio,tokens
0,1,1.1,Almond Meal,Quick Almond Flour Pancakes,These almond flour pancakes are a wonderful su...,1cupalmond flour ¼cupwater 2eggs 1tablespoonma...,Gather all ingredients. Dotdash Meredith Food ...,https://www.allrecipes.com/recipe/234702/quick...,these almond flour pancakes are a wonderful su...,"[these, almond, flour, pancakes, are, a, wonde..."
1,1,1.2,Almond Meal,90-Second Keto Bread in a Mug,Try keto bread microwaved quickly and easily w...,1tablespoonbutter ⅓cupblanched almond flour 1e...,Microwave butter in a microwave-safe mug until...,https://www.allrecipes.com/recipe/263032/90-se...,try keto bread microwaved quickly and easily w...,"[try, keto, bread, microwaved, quickly, and, e..."
2,1,1.3,Almond Meal,Macarons (French Macaroons),"Macarons are made with finely ground almonds, ...",3egg whites ¼cupwhite sugar 1 ⅔cupsconfectione...,Line a baking sheet with a silicone baking mat...,https://www.allrecipes.com/recipe/223234/macar...,macarons are made with finely ground almonds c...,"[macarons, are, made, with, finely, ground, al..."
3,1,1.4,Almond Meal,Chaffles with Almond Flour,This chaffle recipe uses almond flour to creat...,1largeegg 1tablespoonblanched almond flour ¼te...,"Whisk egg, almond flour, and baking powder tog...",https://www.allrecipes.com/recipe/278328/chaff...,this chaffle recipe uses almond flour to creat...,"[this, chaffle, recipe, uses, almond, flour, t..."
4,1,1.5,Almond Meal,Almond Flour-Blueberry Muffins,"These healthy, tender, almond flour-blueberry ...",2cupsfinely ground almond flour ½teaspoonbakin...,Preheat the oven to 350 degrees F (175 degrees...,https://www.allrecipes.com/recipe/283581/almon...,these healthy tender almond flour-blueberry mu...,"[these, healthy, tender, almond, flour-blueber..."


In [None]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
df_final.loc[:, 'tokens'] = df_final['tokens'].apply(lambda tokens: [token for token in tokens if token not in stop_words])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
df_final.head()

Unnamed: 0,ID,ID_Completo,Categoria,Nombre,Descripcion,Ingredientes,Pasos,Links,textoLimpio,tokens
0,1,1.1,Almond Meal,Quick Almond Flour Pancakes,These almond flour pancakes are a wonderful su...,1cupalmond flour ¼cupwater 2eggs 1tablespoonma...,Gather all ingredients. Dotdash Meredith Food ...,https://www.allrecipes.com/recipe/234702/quick...,these almond flour pancakes are a wonderful su...,"[almond, flour, pancakes, wonderful, substitut..."
1,1,1.2,Almond Meal,90-Second Keto Bread in a Mug,Try keto bread microwaved quickly and easily w...,1tablespoonbutter ⅓cupblanched almond flour 1e...,Microwave butter in a microwave-safe mug until...,https://www.allrecipes.com/recipe/263032/90-se...,try keto bread microwaved quickly and easily w...,"[try, keto, bread, microwaved, quickly, easily..."
2,1,1.3,Almond Meal,Macarons (French Macaroons),"Macarons are made with finely ground almonds, ...",3egg whites ¼cupwhite sugar 1 ⅔cupsconfectione...,Line a baking sheet with a silicone baking mat...,https://www.allrecipes.com/recipe/223234/macar...,macarons are made with finely ground almonds c...,"[macarons, made, finely, ground, almonds, conf..."
3,1,1.4,Almond Meal,Chaffles with Almond Flour,This chaffle recipe uses almond flour to creat...,1largeegg 1tablespoonblanched almond flour ¼te...,"Whisk egg, almond flour, and baking powder tog...",https://www.allrecipes.com/recipe/278328/chaff...,this chaffle recipe uses almond flour to creat...,"[chaffle, recipe, uses, almond, flour, create,..."
4,1,1.5,Almond Meal,Almond Flour-Blueberry Muffins,"These healthy, tender, almond flour-blueberry ...",2cupsfinely ground almond flour ½teaspoonbakin...,Preheat the oven to 350 degrees F (175 degrees...,https://www.allrecipes.com/recipe/283581/almon...,these healthy tender almond flour-blueberry mu...,"[healthy, tender, almond, flour-blueberry, muf..."


In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

In [None]:
 #Entrenamiento de Word2Vec
# Crear una lista de listas de palabras (tokens) para entrenar Word2Vec
sentences = df_final['tokens'].tolist()

# Entrenar el modelo Word2Vec
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# 3. Obtener embeddings para cada frase
# Función para promediar los vectores de palabras en cada oración
def obtener_embedding(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if vectors:
        return sum(vectors) / len(vectors)  # Promedio de vectores
    else:
        return [0] * model.vector_size  # Si no hay palabras en el vocabulario, retornar ceros

In [None]:
 #Entrenamiento de Word2Vec
# Crear una lista de listas de palabras (tokens) para entrenar Word2Vec
sentences = df_final['tokens'].tolist()

# Entrenar el modelo Word2Vec
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# 3. Obtener embeddings para cada frase
# Función para promediar los vectores de palabras en cada oración
def obtener_embedding(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if vectors:
        return sum(vectors) / len(vectors)  # Promedio de vectores
    else:
        return [0] * model.vector_size  # Si no hay palabras en el vocabulario, retornar ceros

In [None]:
# Aplicar la función para obtener los embeddings
df_final['embedding'] = df_final['tokens'].apply(lambda x: obtener_embedding(x, model))

df_final

Unnamed: 0,ID,ID_Completo,Categoria,Nombre,Descripcion,Ingredientes,Pasos,Links,textoLimpio,tokens,embedding
0,1,1.1,Almond Meal,Quick Almond Flour Pancakes,These almond flour pancakes are a wonderful su...,1cupalmond flour ¼cupwater 2eggs 1tablespoonma...,Gather all ingredients. Dotdash Meredith Food ...,https://www.allrecipes.com/recipe/234702/quick...,these almond flour pancakes are a wonderful su...,"[almond, flour, pancakes, wonderful, substitut...","[-0.17397314, 0.35485378, 0.0730868, 0.0579810..."
1,1,1.2,Almond Meal,90-Second Keto Bread in a Mug,Try keto bread microwaved quickly and easily w...,1tablespoonbutter ⅓cupblanched almond flour 1e...,Microwave butter in a microwave-safe mug until...,https://www.allrecipes.com/recipe/263032/90-se...,try keto bread microwaved quickly and easily w...,"[try, keto, bread, microwaved, quickly, easily...","[-0.19779213, 0.39884582, 0.079875164, 0.06729..."
2,1,1.3,Almond Meal,Macarons (French Macaroons),"Macarons are made with finely ground almonds, ...",3egg whites ¼cupwhite sugar 1 ⅔cupsconfectione...,Line a baking sheet with a silicone baking mat...,https://www.allrecipes.com/recipe/223234/macar...,macarons are made with finely ground almonds c...,"[macarons, made, finely, ground, almonds, conf...","[-0.16381578, 0.33807915, 0.068985015, 0.05124..."
3,1,1.4,Almond Meal,Chaffles with Almond Flour,This chaffle recipe uses almond flour to creat...,1largeegg 1tablespoonblanched almond flour ¼te...,"Whisk egg, almond flour, and baking powder tog...",https://www.allrecipes.com/recipe/278328/chaff...,this chaffle recipe uses almond flour to creat...,"[chaffle, recipe, uses, almond, flour, create,...","[-0.16592766, 0.33546615, 0.06739347, 0.053851..."
4,1,1.5,Almond Meal,Almond Flour-Blueberry Muffins,"These healthy, tender, almond flour-blueberry ...",2cupsfinely ground almond flour ½teaspoonbakin...,Preheat the oven to 350 degrees F (175 degrees...,https://www.allrecipes.com/recipe/283581/almon...,these healthy tender almond flour-blueberry mu...,"[healthy, tender, almond, flour-blueberry, muf...","[-0.20216085, 0.40848288, 0.08354452, 0.068534..."
...,...,...,...,...,...,...,...,...,...,...,...
3523,69,69.44,Wild Rice,Chicken and Mushroom Wild Rice Soup,A slight variation on a recipe I received from...,½cupbutter 1finely chopped onion ½cupchopped c...,Gather the ingredients. Melt butter in a large...,https://www.allrecipes.com/recipe/18448/chicke...,a slight variation on a recipe i received from...,"[slight, variation, recipe, received, fellow, ...","[-0.18565513, 0.3739121, 0.07516736, 0.0628829..."
3524,69,69.45,Wild Rice,Ekaterina's Wild Rice and Kale Salad,"Healthy and filling, this salad is perfect for...","3 ¼cupswater, divided ½cupwild rice ½cupbarley...",Bring 2 cups water and wild rice to a boil in ...,https://www.allrecipes.com/recipe/255848/ekate...,healthy and filling this salad is perfect for ...,"[healthy, filling, salad, perfect, lunch, ligh...","[-0.23697631, 0.48668578, 0.095420934, 0.07665..."
3525,69,69.46,Wild Rice,Calico Wild Rice Soup,"This is a very hearty and flavorful soup, I do...",2cupswild rice 6cupswater 4cupschicken broth 1...,In a medium sauce pan cook rice in 6 cups wate...,https://www.allrecipes.com/recipe/13383/calico...,this is a very hearty and flavorful soup i do ...,"[hearty, flavorful, soup, believe, enjoy, !, !]","[-0.2670103, 0.53904444, 0.10641797, 0.0919911..."
3526,69,69.47,Wild Rice,Instant Pot Chicken and Wild Rice Chowder,"When I was a little girl, my mamaw made a chic...",3tablespoonsbutter ½cupsliced mushrooms 1shall...,Turn on a multi-functional pressure cooker (su...,https://www.allrecipes.com/recipe/277361/insta...,when i was a little girl my mamaw made a chick...,"[little, girl, mamaw, made, chicken, casserole...","[-0.18128169, 0.36434838, 0.071414016, 0.05183..."


In [None]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Asegúrate de que la columna de embeddings es una lista de numpy arrays
embeddings = np.array(df_final['embedding'].tolist())

# Número de clusters (esto puedes ajustarlo según tus necesidades)
n_clusters = 5

# Aplicar K-Means
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df_final['cluster'] = kmeans.fit_predict(embeddings)

# Mostrar el resultado
df_final.head()


Unnamed: 0,ID,ID_Completo,Categoria,Nombre,Descripcion,Ingredientes,Pasos,Links,textoLimpio,tokens,embedding,cluster
0,1,1.1,Almond Meal,Quick Almond Flour Pancakes,These almond flour pancakes are a wonderful su...,1cupalmond flour ¼cupwater 2eggs 1tablespoonma...,Gather all ingredients. Dotdash Meredith Food ...,https://www.allrecipes.com/recipe/234702/quick...,these almond flour pancakes are a wonderful su...,"[almond, flour, pancakes, wonderful, substitut...","[-0.17397314, 0.35485378, 0.0730868, 0.0579810...",2
1,1,1.2,Almond Meal,90-Second Keto Bread in a Mug,Try keto bread microwaved quickly and easily w...,1tablespoonbutter ⅓cupblanched almond flour 1e...,Microwave butter in a microwave-safe mug until...,https://www.allrecipes.com/recipe/263032/90-se...,try keto bread microwaved quickly and easily w...,"[try, keto, bread, microwaved, quickly, easily...","[-0.19779213, 0.39884582, 0.079875164, 0.06729...",2
2,1,1.3,Almond Meal,Macarons (French Macaroons),"Macarons are made with finely ground almonds, ...",3egg whites ¼cupwhite sugar 1 ⅔cupsconfectione...,Line a baking sheet with a silicone baking mat...,https://www.allrecipes.com/recipe/223234/macar...,macarons are made with finely ground almonds c...,"[macarons, made, finely, ground, almonds, conf...","[-0.16381578, 0.33807915, 0.068985015, 0.05124...",2
3,1,1.4,Almond Meal,Chaffles with Almond Flour,This chaffle recipe uses almond flour to creat...,1largeegg 1tablespoonblanched almond flour ¼te...,"Whisk egg, almond flour, and baking powder tog...",https://www.allrecipes.com/recipe/278328/chaff...,this chaffle recipe uses almond flour to creat...,"[chaffle, recipe, uses, almond, flour, create,...","[-0.16592766, 0.33546615, 0.06739347, 0.053851...",2
4,1,1.5,Almond Meal,Almond Flour-Blueberry Muffins,"These healthy, tender, almond flour-blueberry ...",2cupsfinely ground almond flour ½teaspoonbakin...,Preheat the oven to 350 degrees F (175 degrees...,https://www.allrecipes.com/recipe/283581/almon...,these healthy tender almond flour-blueberry mu...,"[healthy, tender, almond, flour-blueberry, muf...","[-0.20216085, 0.40848288, 0.08354452, 0.068534...",4


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Obtener los centroides de cada cluster
centroides = kmeans.cluster_centers_

# Función para encontrar la similitud coseno entre un vector y el centroide de un cluster
def obtener_similitud_con_centroide(embedding, centroide):
    return cosine_similarity([embedding], [centroide])[0][0]

# Crear un diccionario para almacenar las oraciones más representativas de cada cluster
oraciones_topico = {}

# Iterar sobre cada cluster y encontrar las oraciones más cercanas al centroide
for cluster_id in range(n_clusters):
    # Obtener las oraciones en este cluster
    oraciones_cluster = df_final[df_final['cluster'] == cluster_id]

    # Obtener los embeddings de estas oraciones
    embeddings_cluster = np.array(oraciones_cluster['embedding'].tolist())

    # Calcular la similitud coseno entre cada embedding y el centroide del cluster
    similitudes = [obtener_similitud_con_centroide(embedding, centroides[cluster_id]) for embedding in embeddings_cluster]

    # Encontrar el índice de la oración más cercana al centroide
    indice_max_similitud = np.argmax(similitudes)

    # Obtener la oración más representativa y asociarla con el cluster
    oracion_representativa = oraciones_cluster.iloc[indice_max_similitud]['Descripcion']

    # Guardar la oración más representativa en el diccionario
    oraciones_topico[cluster_id] = oracion_representativa

# Mostrar las oraciones más representativas de cada cluster (tópico)
for cluster_id, oracion in oraciones_topico.items():
    print(f"Cluster {cluster_id}: {oracion}")


Cluster 0: This easy venison recipe is the simple and tasty way I make venison. Like anything else, it is better with fresh ingredients rather than canned, but this is what I had on hand. Wonderful served over brown or wild rice.
Cluster 1: The reason I'm calling this salsa verde and not pesto is because whenever you say pesto, people instantly think of the traditional version with the pine nuts and basil. I'm using the term salsa verde the way it's used around northern California. It's a very generic term for any fresh green sauce, usually starring some type of herb, but also can be made with spinach, arugula, etc.
Cluster 2: If you're like most people, you've probably made the same turkey every Thanksgiving, year after year. Why not jazz up the flavor this time with a little pomegranate molasses? This Middle Eastern syrup is sweet but also deeply flavored with a little sourness. Think of it as a puckered-up version of balsamic vinegar. They're both irresistible. You can make your own

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def obtener_embedding_busqueda(clave, modelo):
    """Genera un embedding para las palabras clave usando el modelo proporcionado."""
    tokens = clave.split()
    vectores = [modelo.wv[word] for word in tokens if word in modelo.wv]
    if len(vectores) > 0:
        return np.mean(vectores, axis=0)
    else:
        return np.zeros(modelo.vector_size)


def buscar_por_topico(clave, modelo, df, top_n=5):
    """
    Busca recetas en el DataFrame basado en la similitud coseno entre embeddings.

    Args:
        clave: Palabras clave de búsqueda.
        modelo: Modelo de word embeddings (e.g., Word2Vec).
        df: DataFrame con los datos y embeddings.
        top_n: Número de resultados a devolver.

    Returns:
        DataFrame con las recetas más relevantes según la búsqueda.
    """
    # Obtener el embedding de la búsqueda
    embedding_busqueda = obtener_embedding_busqueda(clave, modelo)

    # Calcular similitud coseno con los embeddings de las recetas
    embeddings_oraciones = np.array(df['embedding'].tolist())
    similitudes = cosine_similarity([embedding_busqueda], embeddings_oraciones)[0]

    # Agregar la similitud al DataFrame
    df['similitud'] = similitudes

    # Ordenar por similitud y seleccionar los top_n resultados
    resultados = df.sort_values(by='similitud', ascending=False).head(top_n)

    return resultados


def obtener_titulo_por_id(ids, df):
    """
    Filtra las filas del DataFrame que coinciden con los IDs proporcionados.

    Args:
        ids: Lista de IDs a buscar.
        df: DataFrame con los datos de las recetas.

    Returns:
        DataFrame con los campos relevantes.
    """
    return df[df['ID'].isin(ids)][['ID', 'ID_Completo', 'Nombre', 'Links']]

# Realizar la búsqueda
clave_busqueda = "macaroons"  # Palabras clave de búsqueda
resultados_busqueda = buscar_por_topico(clave_busqueda, model, df_final)

# Obtener los IDs de las recetas relevantes
ids_resultados = resultados_busqueda['ID'].tolist()

# Filtrar y obtener los detalles de las recetas
resultados_finales = obtener_titulo_por_id(ids_resultados, df_final)

# Mostrar resultados finales
print(resultados_finales)


      ID ID_Completo                                  Nombre  \
0      1         1.1             Quick Almond Flour Pancakes   
1      1         1.2           90-Second Keto Bread in a Mug   
2      1         1.3             Macarons (French Macaroons)   
3      1         1.4              Chaffles with Almond Flour   
4      1         1.5          Almond Flour-Blueberry Muffins   
...   ..         ...                                     ...   
2578  49       49.56    Schnitzbrot (German Christmas Bread)   
2579  49       49.57  Rogaliki (Polish Jam-Filled Crescents)   
2580  49       49.58       Zwetschekuchen (German Plum Tart)   
2581  49       49.59                             Plum Buckle   
2582  49       49.60                     Zucchini Plum Bread   

                                                  Links  
0     https://www.allrecipes.com/recipe/234702/quick...  
1     https://www.allrecipes.com/recipe/263032/90-se...  
2     https://www.allrecipes.com/recipe/223234/macar...  