# Librerías

In [None]:
import os
import pandas as pd
import numpy as np

from collections import Counter
import re
import pandas as pd
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, TensorDataset

!pip install gliner
from gliner import GLiNER

!pip install sentence-transformers
from sentence_transformers import SentenceTransformer, util
import torch
from tqdm import tqdm

from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

#web scraping
import requests
from bs4 import BeautifulSoup
import time

import ipywidgets as widgets
from IPython.display import display, clear_output



# Procesamiento de datos

In [None]:
# Make sure the notebook runs from inside the project repository, where the
# data files read by the cells below are expected to live.
REPO_NAME = "NLP-TUIA-Garcia-Herrera"
# Nothing to do when the current working directory is already inside the repo.
if REPO_NAME not in os.getcwd():
  # Clone only when no folder with that name exists yet, so re-running the
  # cell reuses the existing checkout instead of cloning again.
  if not os.path.exists(REPO_NAME):
    !git clone https://github.com/juliangg17/NLP-TUIA-Garcia-Herrera.git
  os.chdir(REPO_NAME)

## Funciones

Carga de datos de juegos

In [None]:
# Load the board-game CSV and reshape it into the shared catalogue schema
def get_games(file_path):
    """Read the board-game CSV and return it in the common catalogue layout.

    Args:
        file_path: Path to a comma-separated, UTF-8 file that contains every
            column listed in the ``astype`` mapping below.

    Returns:
        pandas.DataFrame with the columns ``Title``, ``Description``,
        ``minplayers``, ``maxplayers``, ``Genre``, ``People`` (always "-"),
        ``allow_kids`` (1 when the minimum age is below 16, otherwise 0) and
        ``Class`` (always "games").
    """
    df_games = pd.read_csv(file_path, delimiter=",", encoding="utf-8")

    # Nullable dtypes ("Int64", "string") keep missing values as pd.NA instead
    # of failing the cast or silently turning integer columns into floats.
    df_games = df_games.astype({
        "rank": "Int64",
        "game_name": "string",
        "game_href": "string",
        "geek_rating": "float",
        "avg_rating": "float",
        "num_voters": "Int64",
        "description": "string",
        "yearpublished": "Int64",
        "minplayers": "Int64",
        "maxplayers": "Int64",
        "minplaytime": "Int64",
        "maxplaytime": "Int64",
        "minage": "Int64",
        "avgweight": "float",
        "best_num_players": "string",
        "designers": "string",
        "mechanics": "string",
        "categories": "string"
    })

    # Keep only the columns used by the recommender and rename them to the
    # names shared with the film and book catalogues.
    df_games = df_games[
        ["game_name", "description", "minplayers", "maxplayers", "minage", "categories"]
    ].rename(columns={
        "game_name": "Title",
        "description": "Description",
        "categories": "Genre"
    })

    # Games have no associated people; "-" is the placeholder.
    df_games["People"] = "-"

    # Vectorised comparison: a missing minimum age yields pd.NA, which would
    # raise "boolean value of NA is ambiguous" inside a Python `if`. Unknown
    # ages are treated as not suitable for kids (0).
    df_games["allow_kids"] = df_games["minage"].lt(16).fillna(False).astype("int64")

    df_games = df_games.drop(columns=["minage"])

    df_games["Class"] = "games"

    return df_games

Carga de datos de películas

In [None]:
# Load the film CSV and reshape it into the shared catalogue schema
def get_films(file_path):
    """Read the film CSV and return it in the common catalogue layout.

    Args:
        file_path: Path to a comma-separated, UTF-8 file that contains every
            column listed in the ``astype`` mapping below.

    Returns:
        pandas.DataFrame with the columns ``Title``, ``Genre``,
        ``Description``, ``People`` ("<Director>, <Actors>"), ``Class``
        (always "films"), ``minplayers`` (1), ``maxplayers`` (99) and
        ``allow_kids`` (1 when ``Genre`` contains "Family", otherwise 0).
    """
    df_films = pd.read_csv(file_path, delimiter=",", encoding="utf-8")

    df_films = df_films.astype({
        "Rank": "Int64",
        "Title": "string",
        "Genre": "string",
        "Description": "string",
        "Director": "string",
        "Actors": "string",
        "Year": "Int64",
        "Runtime (Minutes)": "Int64",
        "Rating": "float",
        "Votes": "Int64",
        "Revenue (Millions)": "float",
        "Metascore": "Int64"
    })

    # Explicit copy: the column assignments below must act on an independent
    # frame, not on a slice of the one that was just read.
    df_films = df_films[["Title", "Genre", "Description", "Director", "Actors"]].copy()

    # Merge director and cast into the single "People" column.
    df_films["People"] = df_films["Director"] + ", " + df_films["Actors"]
    df_films = df_films.drop(columns=["Director", "Actors"])

    df_films["Class"] = "films"

    # Films place no limit on audience size; these fixed bounds let the
    # player-count filter pass them through.
    df_films["minplayers"] = 1
    df_films["maxplayers"] = 99

    # Literal (non-regex) substring test. na=False maps a missing genre to
    # "not for kids" instead of raising on `"Family" in pd.NA`.
    df_films["allow_kids"] = (
        df_films["Genre"].str.contains("Family", regex=False, na=False).astype("int64")
    )

    return df_films

Web scraping de datos de libros

In [None]:
def scrap_books(num_books=1000, output_file="gutenberg_books_1000.csv"):
    """Scrape Project Gutenberg's top-books ranking into the catalogue schema.

    Downloads the ranking page, takes the first ``num_books`` entries of the
    list that follows the ``books-last1`` heading and, for each entry, opens
    the book page and reads the header/value rows of its ``bibrec`` table.
    Every "Subject" row is collected and joined with ", ".

    Side effects:
        * one HTTP request per book, with a 0.1 s pause between books;
        * the raw scraped rows are written to ``output_file``;
        * the processed catalogue is written to ``books.csv``.

    Args:
        num_books: Maximum number of ranking entries to visit.
        output_file: CSV path for the raw scraped data.

    Returns:
        pandas.DataFrame with the columns ``Title``, ``People``, ``Class``
        ("books"), ``minplayers`` (1), ``maxplayers`` (1), ``allow_kids`` (0),
        ``Description`` and ``Genre``.

    Raises:
        requests.HTTPError: when any page answers with an error status.
        requests.Timeout: when a page does not answer within the timeout.
    """
    url = "https://www.gutenberg.org/browse/scores/top1000.php#books-last1"
    # Without a timeout a stalled connection would block the scrape forever.
    request_timeout = 30

    response = requests.get(url, timeout=request_timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate <h2 id="books-last1"> and the <ol> that follows it.
    books_section = soup.select_one("h2#books-last1")
    books_list = books_section.find_next("ol") if books_section else None
    books_data = []

    if books_list:
        for book in tqdm(books_list.find_all("li")[:num_books], desc="Extrayendo datos de libros"):
            anchor = book.find("a")
            # An entry without a link cannot be followed; skip it instead of
            # crashing on `None["href"]`.
            if anchor is None or not anchor.get("href"):
                continue

            title = book.get_text()
            book_url = f"https://www.gutenberg.org{anchor['href']}"

            book_response = requests.get(book_url, timeout=request_timeout)
            book_response.raise_for_status()
            book_soup = BeautifulSoup(book_response.text, 'html.parser')

            info_table = book_soup.find("table", class_="bibrec")
            book_info = {"Title": title, "URL": book_url}
            subjects = []

            if info_table:
                for row in info_table.find_all("tr"):
                    header = row.find("th")
                    value = row.find("td")
                    if not (header and value):
                        continue
                    header_text = header.get_text(strip=True)
                    value_text = value.get_text(strip=True)
                    # A book has several "Subject" rows; gather them all.
                    # Any other header becomes its own column.
                    if header_text == "Subject":
                        subjects.append(value_text)
                    else:
                        book_info[header_text] = value_text

            book_info["Subject"] = ", ".join(subjects)
            books_data.append(book_info)

            # Small pause between books to avoid overloading the server.
            time.sleep(0.1)

    books_df = pd.DataFrame(books_data)

    # Keep the raw scrape on disk before reshaping it.
    books_df.to_csv(output_file, index=False, encoding="utf-8")

    # errors="ignore": when nothing was scraped the frame has no "URL" column.
    books_df = books_df.drop(columns=['URL'], errors='ignore')
    books_df = books_df.rename(columns={
        'Author': 'People',
        'Summary': 'Description',
        'Subject': 'Genre',
    })
    books_df['Class'] = 'books'
    books_df['minplayers'] = 1
    books_df['maxplayers'] = 1
    books_df['allow_kids'] = 0
    books_df = books_df.reindex(columns=['Title', 'People', 'Class', 'minplayers', 'maxplayers', 'allow_kids', 'Description', 'Genre'])

    # Save the frame built here; the previous code referenced `df_books`,
    # which is not defined inside this function.
    books_df.to_csv('books.csv', index=False)
    return books_df

Carga de datos de libros

In [None]:
# Load the already processed books catalogue from disk
def get_books(file_path):
    """Return the books CSV at ``file_path`` as a DataFrame, without changes.

    The file is read as comma-separated UTF-8 text.
    """
    return pd.read_csv(file_path, encoding="utf-8", delimiter=",")

Procesamiento de todos los datasets

In [None]:
# Merge the three catalogues and derive the searchable "Data" column
def get_all(df_books, df_films, df_games):
    """Concatenate books, films and games into one cleaned catalogue.

    Steps:
        1. Stack the three frames with a fresh 0..n-1 index.
        2. Strip ";" and "," from Title, People, Description and Genre.
        3. Build the lower-cased ``Data`` column as
           "Title- People- Class- Description- Genre".
        4. Drop rows whose ``Title`` repeats (first occurrence is kept).
        5. Normalise ``Genre`` with ``clean_genre``.

    Missing text fields are treated as empty strings when building ``Data``
    and ``Genre``. Without that, one missing field made the whole
    concatenation missing, so ``Data`` ended up as the literal text "nan".

    NOTE(review): embeddings precomputed from an earlier version of ``Data``
    were built from the old text for rows with missing fields; consider
    regenerating them.

    Args:
        df_books, df_films, df_games: Frames that share the columns Title,
            People, Class, Description and Genre.

    Returns:
        pandas.DataFrame with the original columns plus ``Data``.
    """
    df_all = pd.concat([df_books, df_films, df_games], ignore_index=True)

    # Remove the separators from the free-text columns.
    for col in ["Title", "People", "Description", "Genre"]:
        df_all[col] = (
            df_all[col]
            .str.replace(";", "", regex=False)
            .str.replace(",", "", regex=False)
        )

    # Concatenate on NA-free string copies so one missing field cannot wipe
    # out the whole searchable text of a row.
    data_parts = [
        df_all[col].fillna("").astype(str)
        for col in ["Title", "People", "Class", "Description", "Genre"]
    ]
    data = data_parts[0]
    for part in data_parts[1:]:
        data = data + "- " + part
    df_all["Data"] = data.str.lower()

    # .copy() so the Genre assignment below works on an independent frame
    # rather than on a slice of the de-duplicated result.
    df_all = df_all.drop_duplicates(subset=["Title"]).copy()

    # A missing genre becomes "" rather than the text "nan"/"<NA>".
    df_all["Genre"] = df_all["Genre"].fillna("").astype(str).apply(clean_genre)

    return df_all

Limpieza de columna Genre

In [None]:
import re
# Normalise one raw genre string into lower-case, space-separated words
def clean_genre(text):
    """Return a simplified, lower-case version of a genre description.

    Processing order:
        1. drop quotes, brackets and the punctuation ``/ , ; . & :``;
        2. split words glued together in CamelCase ("ActionAdventure");
        3. lower-case the text;
        4. drop the filler words "of", "and", "for" and the numerals
           "i", "ii", "iii";
        5. drop parentheses, digits, question marks and hyphens;
        6. collapse runs of whitespace and trim both ends.

    The filler-word step runs after lower-casing so that "Of" or "II" are
    removed too. Whitespace is normalised last so the removals above cannot
    leave double or trailing spaces behind.

    Args:
        text: Genre description as a ``str``.

    Returns:
        The cleaned string (possibly empty).
    """
    # Quotes, brackets and separator punctuation.
    text = re.sub(r"[\"'\[\]/,;.&:]", "", text)

    # Must happen before lower-casing: the split relies on the capital letter.
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)

    text = text.lower()

    # Filler words and small roman numerals, matched as whole words.
    text = re.sub(r"\b(of|and|for|i|ii|iii)\b", "", text)

    # Parentheses, digits and question marks.
    text = re.sub(r'[()\d?]', '', text)

    # Hyphens are removed without inserting a space ("sci-fi" -> "scifi").
    text = text.replace("-", "")

    # Final whitespace normalisation.
    return re.sub(r"\s+", " ", text).strip()

## Carga de datos

In [None]:
#df_books = scrap_books()

In [None]:
# Load the three source catalogues from the CSV files in the working
# directory and merge them into `df_all`, the frame every later cell uses.
df_games = get_games("bgg_database.csv")
df_films = get_films("IMDB-Movie-Data.csv")
df_books = get_books("books.csv")
df_all = get_all(df_books, df_films, df_games)
# Show the last rows as a quick sanity check of the merged result.
df_all.tail()

Unnamed: 0,Title,People,Class,minplayers,maxplayers,allow_kids,Description,Genre,Data
2995,Tiny Epic Dinosaurs,-,games,1,4,1,In Tiny Epic Dinosaurs breakthroughs in modern...,animals fantasy farming science fiction,tiny epic dinosaurs- -- games- in tiny epic di...
2996,Kingsburg (Second Edition),-,games,2,5,1,The realm of Kingsburg is under attack! Monstr...,city building dice fantasy medieval,kingsburg (second edition)- -- games- the real...
2997,Condottiere,-,games,2,6,1,It is 13th century Italy. Trade is flourishing...,bluffing card game medieval negotiation,condottiere- -- games- it is 13th century ital...
2998,Qwirkle,-,games,2,4,1,The abstract game of Qwirkle consists of 108 w...,abstract strategy,qwirkle- -- games- the abstract game of qwirkl...
2999,Dungeons & Dragons: Castle Ravenloft Board Game,-,games,1,5,1,Castle Ravenloft Boardgame by Bill Slavicsek M...,adventure exploration fantasy fighting horror ...,dungeons & dragons: castle ravenloft board gam...


In [None]:
# Missing-value handling: replace every remaining missing value in `df_all`
# (modified in place) with the placeholder "-".
df_all.fillna('-', inplace=True)
# Count the missing values left per column; every count should be 0.
df_all.isnull().sum()

Unnamed: 0,0
Title,0
People,0
Class,0
minplayers,0
maxplayers,0
allow_kids,0
Description,0
Genre,0
Data,0


# Filtrado parental y por cantidad de jugadores

In [None]:
def parental_filter(df_all, input_kids):
  """Keep only kid-friendly rows when children are present.

  Args:
    df_all: Catalogue with an ``allow_kids`` column (1 = suitable).
    input_kids: "S" when there are children; any other value disables the
      filter.

  Returns:
    The rows with ``allow_kids == 1`` when ``input_kids`` is "S"; otherwise
    ``df_all`` itself, untouched.
  """
  if input_kids != "S":
    return df_all
  suitable_for_kids = df_all['allow_kids'] == 1
  return df_all[suitable_for_kids]

def players_filter(df_all, input_players):
    """Keep the rows that can be enjoyed by ``input_players`` people.

    A row is kept when ``minplayers <= input_players <= maxplayers``.

    Args:
        df_all: Catalogue with ``minplayers`` and ``maxplayers`` columns.
        input_players: Number of participants.

    Returns:
        The matching subset of ``df_all``.
    """
    enough_people = df_all['minplayers'] <= input_players
    not_too_many = df_all['maxplayers'] >= input_players
    return df_all[not_too_many & enough_people]

# Filtrado por estado de ánimo

## Funciones

In [None]:
def train_sentiments(txt_path):
    """Train a mood classifier on sentence embeddings.

    The input file is tab-separated with no header row; each line holds a
    label followed by a sentence. 80 % of the rows train a logistic
    regression on the sentence embeddings, and the remaining 20 % (split
    with ``random_state=42``) are used to print a classification report.

    Args:
        txt_path (str): Path to the tab-separated label/sentence file.

    Returns:
        tuple: ``(modelo, embedder)``, the fitted ``LogisticRegression`` and
        the ``SentenceTransformer`` that produced its input embeddings.
    """
    frases_df = pd.read_csv(txt_path, delimiter="\t", names=["etiqueta", "frase"], encoding="utf-8")

    # Hold out a fifth of the sentences for evaluation (reproducible split).
    frases_train, frases_test, etiquetas_train, etiquetas_test = train_test_split(
        frases_df['frase'],
        frases_df['etiqueta'],
        test_size=0.2,
        random_state=42,
    )

    # Pretrained sentence-embedding model.
    embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

    vectores_train = embedder.encode(frases_train.tolist())
    vectores_test = embedder.encode(frases_test.tolist())

    # Classifier trained on top of the embeddings.
    modelo = LogisticRegression()
    modelo.fit(vectores_train, etiquetas_train)

    # Report quality on the held-out sentences.
    predicciones_test = modelo.predict(vectores_test)
    print("Evaluación del modelo en el conjunto de prueba:")
    print(classification_report(etiquetas_test, predicciones_test))

    return modelo, embedder

def class_sentiment(frase, modelo, embedder):
    """Classify the mood of one sentence.

    Args:
        frase: Sentence to classify.
        modelo: Classifier exposing ``predict`` and ``predict_proba``.
        embedder: Object whose ``encode`` turns a list of sentences into
            embeddings.

    Returns:
        tuple: ``(etiqueta, probabilidad)``. ``probabilidad`` is the highest
        class probability; when it is below 0.65 the label is replaced by
        "neutro".
    """
    vector = embedder.encode([frase])

    prediccion = modelo.predict(vector)[0]
    probabilidad = modelo.predict_proba(vector).max()

    # Low-confidence predictions are reported as neutral.
    etiqueta = "neutro" if probabilidad < 0.65 else prediccion

    return etiqueta, probabilidad

In [None]:
def class_genre(df_all, modelo, embedder):
    """Label every row of the catalogue with the mood of its genre.

    All genres are embedded and classified in a single batch instead of one
    ``encode``/``predict``/``predict_proba`` round-trip per row.

    Args:
        df_all: Catalogue with a ``Genre`` column of strings. It is modified
            in place: the ``Genre_Sentiment`` column is added or overwritten.
        modelo: Classifier exposing ``predict`` and ``predict_proba``.
        embedder: Object whose ``encode`` turns a list of strings into
            embeddings.

    Returns:
        The same ``df_all`` object, now with ``Genre_Sentiment``. The label
        is "neutro" whenever the highest class probability is below 0.65.
    """
    generos = df_all['Genre'].tolist()

    # Nothing to classify: avoid calling the model with an empty batch.
    if not generos:
        df_all['Genre_Sentiment'] = []
        return df_all

    embeddings = embedder.encode(generos)
    predichas = modelo.predict(embeddings)
    # Highest class probability of each row.
    confianzas = modelo.predict_proba(embeddings).max(axis=1)

    # Low-confidence predictions are reported as neutral.
    df_all['Genre_Sentiment'] = [
        "neutro" if confianza < 0.65 else etiqueta
        for etiqueta, confianza in zip(predichas, confianzas)
    ]

    return df_all

In [None]:
def sentiment_filter(df_all, etiqueta):
    """Filter the catalogue according to the user's detected mood.

    * "melancólico": keep only rows whose ``Genre_Sentiment`` is "alegre".
    * "neutro": drop rows whose ``Genre_Sentiment`` is "neutro".
    * anything else (including "alegre"): return ``df_all`` unchanged.

    Args:
        df_all: Catalogue with a ``Genre_Sentiment`` column.
        etiqueta: Mood label of the user.

    Returns:
        The filtered frame, or ``df_all`` itself when no filter applies.
    """
    if etiqueta == 'melancólico':
        solo_alegres = df_all['Genre_Sentiment'] == 'alegre'
        return df_all[solo_alegres]

    if etiqueta == 'neutro':
        sin_neutros = df_all['Genre_Sentiment'] != 'neutro'
        return df_all[sin_neutros]

    return df_all

## Aplicación

In [None]:
# Train the mood classifier and keep both the classifier and its sentence
# embedder as notebook globals; `app` reads them by these exact names.
model_sentiment, embedder_sentiment = train_sentiments("dataset_estado_animo.txt")



Evaluación del modelo en el conjunto de prueba:
              precision    recall  f1-score   support

      alegre       0.84      0.84      0.84        76
 melancólico       0.81      0.81      0.81        64

    accuracy                           0.83       140
   macro avg       0.83      0.83      0.83       140
weighted avg       0.83      0.83      0.83       140



In [None]:
# Usage example: classify a single sentence and print the predicted label
# together with its probability (two decimals).
nueva_frase = "estoy a pleno"
etiqueta, probabilidad = class_sentiment(nueva_frase, model_sentiment, embedder_sentiment)
print(f"Clase: {etiqueta}, Probabilidad: {probabilidad:.2f}")

Clase: alegre, Probabilidad: 0.88


In [None]:
# Label every catalogue row with the mood predicted for its genre.
# class_genre adds the 'Genre_Sentiment' column to `df_all` in place and
# returns the same frame, which is shown as the cell output.
class_genre(df_all, model_sentiment, embedder_sentiment)

Unnamed: 0,Title,People,Class,minplayers,maxplayers,allow_kids,Description,Genre,Data,Genre_Sentiment
0,Frankenstein Or The Modern Prometheus,Shelley Mary Wollstonecraft 1797-1851,books,1,1,0,"""Frankenstein Or The Modern Prometheus"" by Mar...",science fiction horror tales gothic fiction sc...,frankenstein or the modern prometheus- shelley...,melancólico
1,呻吟語,Lü Kun 1536-1618,books,1,1,0,"""呻吟語"" by Kun Lü is a philosophical treatise wr...",conduct life,"呻吟語- lü kun 1536-1618- books- ""呻吟語"" by kun lü ...",alegre
2,Pride and Prejudice,Austen Jane 1775-1817,books,1,1,0,"""Pride and Prejudice"" by Jane Austen is a clas...",england fiction young women fiction love sto...,pride and prejudice- austen jane 1775-1817- bo...,neutro
3,Moby Dick Or The Whale,Melville Herman 1819-1891,books,1,1,0,"""Moby Dick Or The Whale"" by Herman Melville is...",whaling fiction sea stories psychological fic...,moby dick or the whale- melville herman 1819-1...,melancólico
4,Romeo and Juliet,Shakespeare William 1564-1616,books,1,1,0,"""Romeo and Juliet"" by William Shakespeare is a...",vendetta drama youth drama verona italy dra...,romeo and juliet- shakespeare william 1564-161...,melancólico
...,...,...,...,...,...,...,...,...,...,...
2995,Tiny Epic Dinosaurs,-,games,1,4,1,In Tiny Epic Dinosaurs breakthroughs in modern...,animals fantasy farming science fiction,tiny epic dinosaurs- -- games- in tiny epic di...,melancólico
2996,Kingsburg (Second Edition),-,games,2,5,1,The realm of Kingsburg is under attack! Monstr...,city building dice fantasy medieval,kingsburg (second edition)- -- games- the real...,melancólico
2997,Condottiere,-,games,2,6,1,It is 13th century Italy. Trade is flourishing...,bluffing card game medieval negotiation,condottiere- -- games- it is 13th century ital...,melancólico
2998,Qwirkle,-,games,2,4,1,The abstract game of Qwirkle consists of 108 w...,abstract strategy,qwirkle- -- games- the abstract game of qwirkl...,alegre


In [None]:
# Preview what a 'melancólico' user would be offered: only rows whose genre
# was labelled 'alegre'. The result is displayed only; `df_all` is not reassigned.
sentiment_filter(df_all, 'melancólico')

Unnamed: 0,Title,People,Class,minplayers,maxplayers,allow_kids,Description,Genre,Data,Genre_Sentiment
1,呻吟語,Lü Kun 1536-1618,books,1,1,0,"""呻吟語"" by Kun Lü is a philosophical treatise wr...",conduct life,"呻吟語- lü kun 1536-1618- books- ""呻吟語"" by kun lü ...",alegre
7,A Room with a View,Forster E. M. (Edward Morgan) 1879-1970,books,1,1,0,"""A Room with a View"" by E. M. Forster is a nov...",humorous stories england fiction young women ...,a room with a view- forster e. m. (edward morg...,alegre
24,A Modest ProposalFor preventing the children o...,Swift Jonathan 1667-1745,books,1,1,0,"""A Modest Proposal"" by Jonathan Swift is a sat...",political satire english religious satire engl...,a modest proposalfor preventing the children o...,alegre
39,The Prince,Machiavelli Niccolò 1469-1527,books,1,1,0,"""The Prince"" by Niccolò Machiavelli is a polit...",state the early works to political science ...,the prince- machiavelli niccolò 1469-1527- boo...,alegre
47,Don Quijote,Cervantes Saavedra Miguel de 1547-1616,books,1,1,0,"""Don Quijote"" by Miguel de Cervantes Saavedra ...",spain social life customs th century fictio...,don quijote- cervantes saavedra miguel de 1547...,alegre
...,...,...,...,...,...,...,...,...,...,...
2986,Tak,-,games,2,2,1,&quotMy next several hours were spent learning...,abstract strategy,tak- -- games- &quotmy next several hours were...,alegre
2991,Signorie,-,games,2,4,1,Italy during the 15th century was a country fu...,dice renaissance,signorie- -- games- italy during the 15th cent...,alegre
2993,Call to Adventure,-,games,1,4,1,Make your fate! Inspired by character-driven f...,card game fantasy novelbased,call to adventure- -- games- make your fate! i...,alegre
2994,Lanterns: The Harvest Festival,-,games,2,4,1,The harvest is in and the artisans are hard at...,abstract strategy,lanterns: the harvest festival- -- games- the ...,alegre


# Filtrado por NER

## Funciones

In [None]:
# Load the pretrained 'gliner_multi-v2.1' model from Hugging Face. It is kept
# in the global `model_ner`, which `app` and the example cells below use.
model_ner = GLiNER.from_pretrained("urchade/gliner_multi-v2.1")

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

gliner_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.77k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.16G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]



In [None]:
def get_ner(text, model, labels=["person", "location", "actor", "character"], threshold=0.4):
    """Extract named entities from ``text`` with a GLiNER-style model.

    Args:
        text: Input sentence.
        model: Object exposing ``eval()`` and
            ``predict_entities(text, labels, threshold=...)``. The latter is
            expected to return a list of dicts containing at least the keys
            "text" and "label".
        labels: Entity types to look for.
        threshold: Minimum score forwarded to the model.

    Returns:
        pandas.DataFrame with the columns ``text`` and ``label``; it has no
        rows when nothing was detected.
    """
    model.eval()

    # Hand the model its own copy of the label list. The default list is a
    # single object shared by every call, so a callee that modified it would
    # silently change the labels of all later calls.
    entities = model.predict_entities(text, list(labels), threshold=threshold)

    if not entities:
        # Same columns as the non-empty case so callers can treat both alike.
        return pd.DataFrame(columns=['text', 'label'])

    return pd.DataFrame(entities)[['text', 'label']]

In [None]:
def filter_ner(df_all, df_entities, column="Data"):
    """Keep the rows whose text mentions at least one detected entity.

    Matching is a plain substring test of the lower-cased entity text against
    the values of ``column``.

    Args:
        df_all: Catalogue to filter.
        df_entities: Frame with a ``text`` column holding the entity strings.
        column: Name of the column of ``df_all`` to search in.

    Returns:
        ``df_all`` itself when ``df_entities`` is empty (a notice is printed);
        otherwise the subset of rows that mention any entity.
    """
    if df_entities.empty:
        print("No hay entidades para filtrar.")
        return df_all

    # Distinct entity strings, lower-cased for the comparison.
    needles = []
    for raw_entity in df_entities['text'].unique():
        needles.append(raw_entity.lower())

    def mentions_entity(cell):
        # True as soon as one entity appears inside the cell text.
        for needle in needles:
            if needle in cell:
                return True
        return False

    matches = df_all[column].apply(mentions_entity)
    return df_all[matches]

## Aplicación

In [None]:
# Example: extract the named entities from a sample request. The cell output
# is the resulting text/label table.
input_ner = "quiero leer algo de Frankenstein"
get_ner(input_ner,model_ner)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unnamed: 0,text,label
0,Frankenstein,character


In [None]:
# Keep only the catalogue rows whose 'Data' text mentions one of the entities
# detected in `input_ner`. The result is displayed only.
filter_ner(df_all, get_ner(input_ner,model_ner))

Unnamed: 0,Title,People,Class,minplayers,maxplayers,allow_kids,Description,Genre,Data,Genre_Sentiment
0,Frankenstein Or The Modern Prometheus,Shelley Mary Wollstonecraft 1797-1851,books,1,1,0,"""Frankenstein Or The Modern Prometheus"" by Mar...",science fiction horror tales gothic fiction sc...,frankenstein or the modern prometheus- shelley...,melancólico
1326,Victor Frankenstein,Paul McGuigan Daniel Radcliffe James McAvoy Je...,films,1,99,0,Told from Igor's perspective we see the troubl...,drama horror scifi,victor frankenstein- paul mcguigan daniel radc...,melancólico
2205,Horrified,-,games,1,5,1,The stakes have been raised. Imagine living in...,horror miniatures movies tv radio theme,horrified- -- games- the stakes have been rais...,melancólico
2908,Abomination: The Heir of Frankenstein,-,games,2,4,1,It's been twenty years since Victor Frankenste...,horror novelbased science fiction,abomination: the heir of frankenstein- -- game...,melancólico


# Búsqueda semántica por interés

## Funciones

In [None]:
# Batch computation of the embeddings of one text column
def precomputar_embeddings(df, columna, batch_size=64, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Embed every value of ``df[columna]`` with the global ``modelo``.

    The values are cast to ``str``, grouped into batches of ``batch_size``
    (original order preserved) and encoded batch by batch. Each batch result
    is moved to the CPU before being stored.

    Args:
        df: Source DataFrame.
        columna: Name of the column to embed.
        batch_size: Number of texts encoded per step.
        device: Device the model is moved to and encodes on. The default is
            chosen once, when the function is defined.

    Returns:
        torch.Tensor: the batch results concatenated along dimension 0.
    """
    modelo.to(device)
    textos = df[columna].astype(str).tolist()

    # DataLoader is used only to cut the list of texts into ordered batches.
    lotes = DataLoader(textos, batch_size=batch_size, shuffle=False)

    partes = []
    # No gradients are needed for inference.
    with torch.no_grad():
        for lote in tqdm(lotes, desc="Calculando embeddings"):
            vectores = modelo.encode(lote, convert_to_tensor=True, device=device)
            # Move each batch off the GPU right away to free its memory.
            partes.append(vectores.cpu())

    return torch.cat(partes, dim=0)

def guardar_embeddings(embeddings, file_path):
    """Write a tensor of embeddings to ``file_path`` as CSV.

    The CSV has no index column; its header is the positional column numbers.

    Args:
        embeddings: ``torch.Tensor`` on the CPU or on a CUDA device.
        file_path: Destination CSV path.
    """
    # Tensor.cpu() returns the tensor itself when it already lives on the CPU,
    # so a single call covers both the GPU and the CPU case.
    valores = embeddings.cpu().numpy()
    pd.DataFrame(valores).to_csv(file_path, index=False)
    print("Embeddings guardados exitosamente en formato CSV.")

In [None]:
# Rank the catalogue by semantic similarity to a free-text request
def semantic_search(input_text, df_all, embeddings_path='embeddings_data.csv'):
    """Return ``df_all`` sorted by cosine similarity to ``input_text``.

    The embeddings of the catalogue are read from ``embeddings_path`` (one
    row per catalogue row, in the same order). The query is encoded with the
    global ``modelo``.

    Args:
        input_text: Free-text request of the user.
        df_all: Catalogue to rank. It is not modified.
        embeddings_path: CSV with the precomputed embeddings; an optional
            ``label`` column is ignored.

    Returns:
        A new DataFrame with a ``Similitud`` column, sorted from most to
        least similar; ``None`` (after printing a notice) when the embeddings
        file does not exist.

    Raises:
        ValueError: when the embeddings file and ``df_all`` have a different
            number of rows, so similarities cannot be aligned with rows.
    """
    try:
        df_embeddings = pd.read_csv(embeddings_path, delimiter=",", encoding="utf-8")
    except FileNotFoundError:
        print("El archivo de embeddings no se encontró. Asegúrate de que 'embeddings_data.csv' exista en el directorio.")
        return None

    # Only the numeric embedding columns are used.
    if 'label' in df_embeddings.columns:
        df_embeddings = df_embeddings.drop(columns=['label'])

    # Similarities are attached by position, so both sides must line up.
    # Checked before encoding to fail early and with a clear message.
    if len(df_embeddings) != len(df_all):
        raise ValueError(
            f"Embeddings file has {len(df_embeddings)} rows but the catalogue has "
            f"{len(df_all)}; recompute the embeddings for this catalogue."
        )

    embeddings_data = torch.tensor(df_embeddings.values, dtype=torch.float32)

    # The stored embeddings live on the CPU, so the query embedding is moved
    # there too; otherwise the similarity computation can fail with a device
    # mismatch when the model runs on a GPU.
    embeddings_input = modelo.encode([input_text], convert_to_tensor=True).reshape(1, -1).cpu()

    puntuaciones_coseno = util.cos_sim(embeddings_input, embeddings_data)[0].cpu().numpy()

    # assign() builds a new frame, leaving the caller's catalogue untouched.
    df_ordenado = df_all.assign(Similitud=puntuaciones_coseno).sort_values(
        by='Similitud', ascending=False
    )

    return df_ordenado

In [None]:
# Instance of the pretrained multilingual sentence-embedding model. It is a
# notebook global: `precomputar_embeddings` and `semantic_search` look up the
# name `modelo` when they are called, so this cell must run before they are used.
modelo = SentenceTransformer('distiluse-base-multilingual-cased-v1')



In [None]:
#embeddings_data = precomputar_embeddings(df_all, 'Data')
#guardar_embeddings(embeddings_data, 'embeddings_data.csv')

## Aplicación

In [None]:
# Example: rank the whole catalogue by similarity to a sample request, using
# the precomputed embeddings stored in 'embeddings_data.csv'.
input_text = 'Quiero algo de Stars'
semantic_search(input_text, df_all, 'embeddings_data.csv')

Unnamed: 0,Title,People,Class,minplayers,maxplayers,allow_kids,Description,Genre,Data,Genre_Sentiment,Similitud
2811,The Networks,-,games,1,5,1,In The Networks you and your opponents are new...,economic humor movies tv radio theme,the networks- -- games- in the networks you an...,alegre,0.302965
2636,Star Trek: Ascendancy,-,games,3,3,1,Boldly go where no one has gone before. In Sta...,civilization exploration miniatures movies tv ...,star trek: ascendancy- -- games- boldly go whe...,melancólico,0.256019
2155,Star Realms,-,games,2,2,1,Star Realms is a spaceship combat deck-buildin...,card game fighting science fiction,star realms- -- games- star realms is a spaces...,melancólico,0.248929
2907,Among the Stars,-,games,2,4,1,Among the Stars takes place in a war-ravaged g...,card game city building science fiction,among the stars- -- games- among the stars tak...,melancólico,0.241681
2303,Star Realms: Frontiers,-,games,1,4,1,Star Realms: Frontiers is a new standalone Sta...,card game fighting science fiction,star realms: frontiers- -- games- star realms:...,melancólico,0.240142
...,...,...,...,...,...,...,...,...,...,...,...
2205,Horrified,-,games,1,5,1,The stakes have been raised. Imagine living in...,horror miniatures movies tv radio theme,horrified- -- games- the stakes have been rais...,melancólico,-0.083887
115,The Critique of Pure Reason,Kant Immanuel 1724-1804,books,1,1,0,"""The Critique of Pure Reason"" by Immanuel Kant...",knowledge theory causation reason philosophy g...,the critique of pure reason- kant immanuel 172...,alegre,-0.084377
2833,Wallenstein,-,games,3,5,1,Historically Albrecht von Wallenstein duke of ...,economic pike shot territory building wargame,wallenstein- -- games- historically albrecht v...,melancólico,-0.089275
439,Ecce HomoComplete Works Volume Seventeen,Nietzsche Friedrich Wilhelm 1844-1900,books,1,1,0,"""Ecce Homo"" by Friedrich Wilhelm Nietzsche is ...",nietzsche friedrich wilhelm philosophers ger...,ecce homocomplete works volume seventeen- niet...,alegre,-0.090085


# Puesta en marcha

## Funciones

In [None]:
def app(df_all,input_1,input_2,input_3,input_4):
  """Run the full recommendation pipeline for one user request.

  Pipeline: mood classification of ``input_1`` -> semantic ranking of the
  catalogue against ``input_4`` -> entity filter (entities found in
  ``input_4``) -> player-count filter -> parental filter -> mood filter
  (only when no entity was detected) -> top 5 by similarity.

  Relies on the notebook globals ``model_sentiment``, ``embedder_sentiment``
  and ``model_ner``.

  Args:
    df_all: Full catalogue, including ``Genre_Sentiment``.
    input_1: How the user feels (free text).
    input_2: Number of participants.
    input_3: "S" when children are present.
    input_4: What the user feels like doing (free text).

  Returns:
    tuple: ``(df_4, df_2, df_3, df_1, df_output)``, the frame after the
    ranking + entity filter, after the player filter, after the parental
    filter, after the mood step (sorted by similarity), and the final top-5
    table without the helper columns.

  Raises:
    FileNotFoundError: when the precomputed embeddings could not be loaded
      (``semantic_search`` returned ``None``).
  """
  etiqueta, probabilidad = class_sentiment(input_1, model_sentiment, embedder_sentiment)
  print(f"Clase: {etiqueta}, Probabilidad: {probabilidad:.2f}")

  # Rank the activities by similarity to the request.
  df_4 = semantic_search(input_4, df_all, 'embeddings_data.csv')
  if df_4 is None:
    # Without a ranking none of the following steps can run; fail here with a
    # clear message instead of an opaque error on a None value further down.
    raise FileNotFoundError(
      "No se pudieron cargar los embeddings ('embeddings_data.csv'); "
      "no es posible generar recomendaciones."
    )

  # Keep only rows that mention the entities named in the request.
  df_ner = get_ner(input_4,model_ner)
  df_4 = filter_ner(df_4, df_ner)

  # Player-count filter.
  df_2 = players_filter(df_4,input_2)

  # Parental-control filter.
  df_3 = parental_filter(df_2,input_3)

  # The mood filter is skipped when the user asked for something specific
  # (an entity was detected), so the explicit request takes precedence.
  if df_ner.empty:
    df_1 = sentiment_filter(df_3, etiqueta)
  else:
    df_1 = df_3

  # Best matches first, helper columns hidden, top five only.
  df_1 = df_1.sort_values(by='Similitud', ascending=False)
  df_output = df_1.drop(columns=['maxplayers', 'minplayers', 'allow_kids', 'Data', 'Similitud'], errors='ignore')
  df_output = df_output.head(5)
  return df_4,df_2,df_3,df_1,df_output

In [None]:
def recomendador(df_all, app):
    """Display the interactive form that drives the recommender.

    Four inputs (mood, number of people, children present, wish) and a button
    are shown. Pressing the button clears the previous result, calls
    ``app(df_all, input_1, input_2, input_3, input_4)`` with the current
    values and displays the last element of its return value.

    Args:
        df_all: Catalogue handed to ``app``.
        app: Callable returning a 5-tuple whose last item is the table to show.
    """
    def comunes():
        # Fresh style/layout objects for every widget, so none are shared.
        return {
            'style': {'description_width': '150px'},
            'layout': widgets.Layout(width='700px'),
        }

    # One widget per question.
    input_1_widget = widgets.Text(
        description="¿Cómo estás hoy?",
        placeholder="Escribe cómo te sientes",
        **comunes()
    )
    input_2_widget = widgets.IntText(
        description="¿Cuántas personas juegan?",
        placeholder="Número de personas",
        **comunes()
    )
    input_3_widget = widgets.Dropdown(
        options=['S', 'N'],
        description="¿Hay niños? (S/N)",
        **comunes()
    )
    input_4_widget = widgets.Text(
        description="¿De qué tenés ganas?",
        placeholder="Escribe aquí tus deseos",
        **comunes()
    )

    # Area where the result table is rendered.
    result_output = widgets.Output()

    # Submit button.
    button = widgets.Button(description="Procesar", layout=widgets.Layout(width='150px'))

    def on_button_click(b):
        # Everything printed or displayed here goes into the output area.
        with result_output:
            clear_output()  # remove the previous result

            respuestas = (
                input_1_widget.value,
                input_2_widget.value,
                input_3_widget.value,
                input_4_widget.value,
            )

            # Only the final table of the pipeline is shown.
            df_output = app(df_all, *respuestas)[-1]

            print("Resultado de df_output:")
            display(df_output)

    button.on_click(on_button_click)

    # Render the form.
    display(input_1_widget, input_2_widget, input_3_widget, input_4_widget, button, result_output)

# Llamada de ejemplo para iniciar la función
# recomendador(df_all, app)


## Interfaz

In [None]:
# Show the interactive form. Each press of the button runs `app` on `df_all`
# with the values currently entered in the widgets.
recomendador(df_all,app)

Text(value='', description='¿Cómo estás hoy?', layout=Layout(width='700px'), placeholder='Escribe cómo te sien…

IntText(value=0, description='¿Cuántas personas juegan?', layout=Layout(width='700px'), style=DescriptionStyle…

Dropdown(description='¿Hay niños? (S/N)', layout=Layout(width='700px'), options=('S', 'N'), style=DescriptionS…

Text(value='', description='¿De qué tenés ganas?', layout=Layout(width='700px'), placeholder='Escribe aquí tus…

Button(description='Procesar', layout=Layout(width='150px'), style=ButtonStyle())

Output()