# Librerías

In [None]:
import os
import pandas as pd
import numpy as np

from collections import Counter # Import the Counter class
import re
import pandas as pd
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

!pip install gliner
# Importa la clase GLiNER desde la biblioteca gliner
from gliner import GLiNER

!pip install sentence-transformers
from sentence_transformers import SentenceTransformer, util
import torch
from tqdm import tqdm

#web scraping
import requests
from bs4 import BeautifulSoup
import time



# Procesamiento de datos

In [None]:
# Make sure the notebook runs from inside the project repository, where the
# data files are read from with relative paths.
REPO_NAME = "NLP-TUIA-Garcia-Herrera"
# Skip everything when the working directory is already inside the repo.
if REPO_NAME not in os.getcwd():
  # Clone only when no directory with that name exists yet.
  if not os.path.exists(REPO_NAME):
    !git clone https://github.com/juliangg17/NLP-TUIA-Garcia-Herrera.git
  os.chdir(REPO_NAME)

## Funciones

Carga de datos de juegos

In [None]:
# Load the board-game CSV and reshape it into the catalogue's common schema
def get_games(file_path):
    """Read the board-game CSV and return it in the shared catalogue layout.

    Args:
        file_path: Path to a comma-separated, UTF-8 CSV holding the columns
            cast below (rank, game_name, ..., categories).

    Returns:
        DataFrame with columns Title, Description, minplayers, maxplayers,
        Genre, People, allow_kids and Class, in that order.
        ``allow_kids`` is 1 when ``minage`` is below 16 and 0 otherwise;
        a missing ``minage`` counts as 0.
    """
    df_games = pd.read_csv(file_path, delimiter=",", encoding="utf-8")

    # Nullable dtypes so that missing integers do not force float columns.
    df_games = df_games.astype({
        "rank": "Int64",
        "game_name": "string",
        "game_href": "string",
        "geek_rating": "float",
        "avg_rating": "float",
        "num_voters": "Int64",
        "description": "string",
        "yearpublished": "Int64",
        "minplayers": "Int64",
        "maxplayers": "Int64",
        "minplaytime": "Int64",
        "maxplaytime": "Int64",
        "minage": "Int64",
        "avgweight": "float",
        "best_num_players": "string",
        "designers": "string",
        "mechanics": "string",
        "categories": "string"
    })

    # .copy() makes the selection an independent frame, so the column
    # assignments below never act on a view of the full table.
    df_games = df_games[
        ["game_name", "description", "minplayers", "maxplayers", "minage", "categories"]
    ].copy()

    df_games = df_games.rename(columns={
        "game_name": "Title",
        "description": "Description",
        "categories": "Genre"
    })

    # Games carry no people information; "-" is the placeholder.
    df_games["People"] = "-"

    # Vectorised comparison: on a nullable Int64 column a missing age yields
    # <NA>, which a Python `if` cannot evaluate. Missing ages map to 0.
    df_games["allow_kids"] = (df_games["minage"] < 16).fillna(False).astype(int)

    df_games = df_games.drop(columns=["minage"])

    df_games["Class"] = "games"

    return df_games

Carga de datos de películas

In [None]:
# Load the films CSV and reshape it into the catalogue's common schema
def get_films(file_path):
    """Read the films CSV and return it in the shared catalogue layout.

    Args:
        file_path: Path to a comma-separated, UTF-8 CSV holding the columns
            cast below (Rank, Title, ..., Metascore).

    Returns:
        DataFrame with columns Title, Genre, Description, People, Class,
        minplayers, maxplayers and allow_kids, in that order. ``People`` is
        "Director, Actors". ``allow_kids`` is 1 when Genre contains the
        substring "Family" and 0 otherwise; a missing Genre counts as 0.
    """
    df_films = pd.read_csv(file_path, delimiter=",", encoding="utf-8")

    df_films = df_films.astype({
        "Rank": "Int64",
        "Title": "string",
        "Genre": "string",
        "Description": "string",
        "Director": "string",
        "Actors": "string",
        "Year": "Int64",
        "Runtime (Minutes)": "Int64",
        "Rating": "float",
        "Votes": "Int64",
        "Revenue (Millions)": "float",
        "Metascore": "Int64"
    })

    # .copy() makes the selection an independent frame, so the column
    # assignments below never act on a view of the full table.
    df_films = df_films[["Title", "Genre", "Description", "Director", "Actors"]].copy()

    # A missing Director or Actors value propagates to a missing People.
    df_films["People"] = df_films["Director"] + ", " + df_films["Actors"]

    df_films = df_films.drop(columns=["Director", "Actors"])

    df_films["Class"] = "films"

    # Fixed player range given to every film.
    df_films["minplayers"] = 1
    df_films["maxplayers"] = 99

    # Literal (non-regex) substring test; na=False turns a missing Genre into
    # "not family" instead of raising on `"Family" in <NA>`.
    df_films["allow_kids"] = (
        df_films["Genre"].str.contains("Family", regex=False, na=False).astype(int)
    )

    return df_films

Web scraping de datos de libros

In [None]:
def scrap_books(num_books=1000, output_file="gutenberg_books_1000.csv"):
    """Scrape Project Gutenberg's top-1000 list and build the books catalogue.

    For every entry in the list that follows the ``<h2 id="books-last1">``
    heading, the book page is fetched and the rows of its ``bibrec`` table are
    collected. All "Subject" rows are joined into one comma-separated string.

    Args:
        num_books: Maximum number of list entries to visit.
        output_file: CSV path that receives the raw scraped table.

    Returns:
        DataFrame with columns Title, People, Class, minplayers, maxplayers,
        allow_kids, Description and Genre. The same frame is also written to
        'books.csv'.

    Raises:
        requests.HTTPError: If the index page or a book page answers with an
            error status.
        requests.Timeout: If a request exceeds the timeout below.
    """
    url = "https://www.gutenberg.org/browse/scores/top1000.php#books-last1"
    # Without a timeout `requests` can block forever on a stalled connection.
    request_timeout = 30

    response = requests.get(url, timeout=request_timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # The ranking is the first <ol> after the <h2 id="books-last1"> heading.
    books_section = soup.select_one("h2#books-last1")
    books_data = []

    if books_section:
        books_list = books_section.find_next("ol")
        if books_list:
            for book in tqdm(books_list.find_all("li")[:num_books], desc="Extrayendo datos de libros"):
                anchor = book.find("a")
                # An entry without a link cannot be followed; skip it rather
                # than crash on `None["href"]`.
                if anchor is None or not anchor.get("href"):
                    continue

                title = book.get_text()
                book_url = f"https://www.gutenberg.org{anchor['href']}"

                book_response = requests.get(book_url, timeout=request_timeout)
                book_response.raise_for_status()
                book_soup = BeautifulSoup(book_response.text, 'html.parser')

                info_table = book_soup.find("table", class_="bibrec")
                book_info = {"Title": title, "URL": book_url}

                # A book can have several "Subject" rows; gather them all.
                subjects = []

                if info_table:
                    for row in info_table.find_all("tr"):
                        header = row.find("th")
                        value = row.find("td")
                        if header and value:
                            header_text = header.get_text(strip=True)
                            value_text = value.get_text(strip=True)

                            if header_text == "Subject":
                                subjects.append(value_text)
                            else:
                                # Any other header becomes its own column.
                                book_info[header_text] = value_text

                book_info["Subject"] = ", ".join(subjects)

                books_data.append(book_info)

                # Short pause between requests to go easy on the server.
                time.sleep(0.1)

    books_df = pd.DataFrame(books_data)

    # Keep the raw scrape before it is reshaped.
    books_df.to_csv(output_file, index=False, encoding="utf-8")

    # Reshape into the common catalogue schema. errors="ignore" keeps an empty
    # scrape from failing on the missing URL column.
    books_df = books_df.drop(columns=['URL'], errors='ignore')
    books_df = books_df.rename(columns={
        'Author': 'People',
        'Summary': 'Description',
        'Subject': 'Genre',
    })
    books_df['Class'] = 'books'
    books_df['minplayers'] = 1
    books_df['maxplayers'] = 1
    books_df['allow_kids'] = 0
    books_df = books_df.reindex(columns=['Title', 'People', 'Class', 'minplayers', 'maxplayers', 'allow_kids', 'Description', 'Genre'])

    # Write the processed frame (the original referenced an undefined
    # `df_books` here, which raised NameError).
    books_df.to_csv('books.csv', index=False)
    return books_df

Carga de datos de libros

In [None]:
# Read the previously scraped books catalogue back from disk
def get_books(file_path):
    """Load the books CSV (comma-separated, UTF-8) and return it unchanged.

    Args:
        file_path: Path to the CSV file.

    Returns:
        The DataFrame exactly as parsed by ``pd.read_csv``.
    """
    return pd.read_csv(file_path, sep=",", encoding="utf-8")

Procesamiento de todos los datasets

In [None]:
# Merge the three catalogues into a single cleaned DataFrame
def get_all(df_books, df_films, df_games):
    """Concatenate books, films and games and derive the search columns.

    Steps: stack the three frames, strip ";" and "," from the text columns,
    build the lower-cased ``Data`` string
    ("Title- People- Class- Description- Genre"), drop rows with a repeated
    Title, and normalise ``Genre`` with ``clean_genre``.

    Args:
        df_books: Books catalogue.
        df_films: Films catalogue.
        df_games: Games catalogue.

    Returns:
        The combined DataFrame. Its index keeps the labels of the concatenated
        frame, so it has gaps where duplicates were removed.
    """
    df_all = pd.concat([df_books, df_films, df_games], ignore_index=True)

    # Remove list separators from the free-text columns.
    columns_to_clean = ["Title", "People", "Description", "Genre"]
    for col in columns_to_clean:
        df_all[col] = (
            df_all[col]
            .str.replace(";", "", regex=False)
            .str.replace(",", "", regex=False)
        )

    # Build "Data" field by field, treating a missing value as an empty
    # string. Adding NaN to a string yields NaN, so a single missing field
    # (e.g. Description) used to collapse the whole entry into "nan".
    data_fields = ["Title", "People", "Class", "Description", "Genre"]
    data = df_all[data_fields[0]].fillna("")
    for field in data_fields[1:]:
        data = data + "- " + df_all[field].fillna("")
    df_all["Data"] = data.astype(str).str.lower()

    # .copy() so the Genre assignment below acts on an independent frame.
    df_all = df_all.drop_duplicates(subset=["Title"]).copy()

    # A missing genre becomes "" rather than the literal token "nan".
    df_all["Genre"] = df_all["Genre"].fillna("").astype(str).apply(clean_genre)

    return df_all

Limpieza de columna Genre

In [None]:
import re
# Normalise one raw genre/subject string into lower-case, space-separated words
def clean_genre(text):
    """Clean a genre string so it can be split into comparable words.

    The order of the steps matters:
      1. drop quotes, brackets and the punctuation ``/ , ; . & :``;
      2. split words glued together, using the lower-to-upper case boundary
         ("ActionAdventure" -> "Action Adventure"), which must happen before
         lower-casing;
      3. lower-case;
      4. remove the filler words of/and/for and the numerals i/ii/iii, after
         lower-casing so that "And" or "II" are caught as well;
      5. drop parentheses, digits, question marks and hyphens;
      6. collapse whitespace last, so the removals above leave no double or
         trailing spaces.

    Args:
        text: Raw genre string.

    Returns:
        The cleaned, lower-case string.
    """
    # 1. Quotes, brackets and separator punctuation.
    text = re.sub(r"[\"'\[\]/,;.&:]", "", text)

    # 2. Insert a space at every lower->upper case boundary.
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)

    # 3. Lower-case so the word list below matches any capitalisation.
    text = text.lower()

    # 4. Filler words and small roman numerals, matched as whole words.
    text = re.sub(r"\b(of|and|for|i|ii|iii)\b", "", text)

    # 5. Parentheses, digits, question marks and hyphens.
    text = re.sub(r"[()\d?-]", "", text)

    # 6. Collapse the whitespace left behind by every removal above.
    text = re.sub(r"\s+", " ", text).strip()

    return text

## Carga de datos

In [None]:
#df_books = scrap_books()

In [None]:
# Load each catalogue from its CSV file (paths are relative to the working directory).
df_games = get_games("bgg_database.csv")
df_films = get_films("IMDB-Movie-Data.csv")
df_books = get_books("books.csv")
# Merge the three catalogues into the single table used by the rest of the notebook.
df_all = get_all(df_books, df_films, df_games)
# Preview the first rows.
df_all.head()

Unnamed: 0,Title,People,Class,minplayers,maxplayers,allow_kids,Description,Genre,Data
0,Frankenstein Or The Modern Prometheus,Shelley Mary Wollstonecraft 1797-1851,books,1,1,0,"""Frankenstein Or The Modern Prometheus"" by Mar...",science fiction horror tales gothic fiction sc...,frankenstein or the modern prometheus- shelley...
1,呻吟語,Lü Kun 1536-1618,books,1,1,0,"""呻吟語"" by Kun Lü is a philosophical treatise wr...",conduct life,"呻吟語- lü kun 1536-1618- books- ""呻吟語"" by kun lü ..."
2,Pride and Prejudice,Austen Jane 1775-1817,books,1,1,0,"""Pride and Prejudice"" by Jane Austen is a clas...",england fiction young women fiction love sto...,pride and prejudice- austen jane 1775-1817- bo...
3,Moby Dick Or The Whale,Melville Herman 1819-1891,books,1,1,0,"""Moby Dick Or The Whale"" by Herman Melville is...",whaling fiction sea stories psychological fic...,moby dick or the whale- melville herman 1819-1...
4,Romeo and Juliet,Shakespeare William 1564-1616,books,1,1,0,"""Romeo and Juliet"" by William Shakespeare is a...",vendetta drama youth drama verona italy dra...,romeo and juliet- shakespeare william 1564-161...


# Filtrado parental y por cantidad de jugadores

In [None]:
def parental_filter(df_all, input_kids):
    """Keep only kid-friendly rows when children are present.

    Args:
        df_all: Catalogue with an ``allow_kids`` column.
        input_kids: Answer to "are there kids?"; only the exact string "S"
            turns the filter on.

    Returns:
        The rows whose ``allow_kids`` equals 1 when ``input_kids == "S"``;
        otherwise ``df_all`` itself, untouched.
    """
    # Any answer other than "S" leaves the catalogue as it is.
    if input_kids != "S":
        return df_all

    kid_friendly = df_all['allow_kids'] == 1
    return df_all[kid_friendly]

def players_filter(df_all, input_players):
    """Keep the rows that can be played by ``input_players`` people.

    Args:
        df_all: Catalogue with ``minplayers`` and ``maxplayers`` columns.
        input_players: Number of players.

    Returns:
        The rows where ``minplayers <= input_players <= maxplayers``.
    """
    not_too_many = df_all['maxplayers'] >= input_players
    not_too_few = df_all['minplayers'] <= input_players
    return df_all[not_too_many & not_too_few]

# Filtrado por estado de ánimo

## Funciones

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

def train_sentiments(txt_path):
    """Train a mood classifier on sentence embeddings.

    The file is read as tab-separated text without a header, with the label
    in the first column and the sentence in the second. 80% of the rows train
    a logistic regression on top of sentence embeddings; the remaining 20%
    are used to print a classification report.

    Args:
        txt_path (str): Path to the tab-separated label/sentence file.

    Returns:
        modelo (LogisticRegression): The fitted classifier.
        embedder (SentenceTransformer): The sentence-embedding model that
            must be used to encode inputs for ``modelo``.
    """
    # Imported here because the notebook's import cells never bring it in;
    # without it the evaluation step raises NameError on a fresh kernel.
    from sklearn.metrics import classification_report

    df = pd.read_csv(txt_path, delimiter="\t", names=["etiqueta", "frase"], encoding="utf-8")

    # Fixed random_state keeps the split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(df['frase'], df['etiqueta'], test_size=0.2, random_state=42)

    embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

    X_train_embeddings = embedder.encode(X_train.tolist())
    X_test_embeddings = embedder.encode(X_test.tolist())

    modelo = LogisticRegression()
    modelo.fit(X_train_embeddings, y_train)

    # Report the hold-out quality; this does not affect the returned model.
    y_pred = modelo.predict(X_test_embeddings)
    print("Evaluación del modelo en el conjunto de prueba:")
    print(classification_report(y_test, y_pred))

    return modelo, embedder

def class_sentiment(frase, modelo, embedder, threshold=0.65):
    """Classify the mood of one sentence.

    Args:
        frase: Sentence to classify.
        modelo: Fitted classifier exposing ``predict`` and ``predict_proba``.
        embedder: Object whose ``encode`` turns a list of sentences into the
            feature matrix ``modelo`` was trained on.
        threshold: Minimum top-class probability needed to trust the
            prediction. Defaults to 0.65, the cutoff this function has always
            used.

    Returns:
        Tuple ``(etiqueta, probabilidad)``: the predicted label, replaced by
        "neutro" when the highest class probability is below ``threshold``,
        and that highest probability.
    """
    # The model works on a batch, so wrap the sentence in a list.
    frase_embedding = embedder.encode([frase])

    etiqueta = modelo.predict(frase_embedding)[0]
    probabilidad = modelo.predict_proba(frase_embedding).max()

    # A low-confidence prediction is reported as "neutro".
    if probabilidad < threshold:
        etiqueta = "neutro"

    return etiqueta, probabilidad

In [None]:
def genre_sentiment_analysis(df_all):
    """Score every genre word, and every catalogue row, by closeness to "happy".

    Each distinct word of the ``Genre`` column gets a part-of-speech tag
    (spaCy) and the cosine similarity between its mean-pooled transformer
    embedding and that of the word "happy". Words tagged PROPN or X are
    removed from ``Genre``. A row's ``CS_Happy`` is the mean score of its
    remaining genre words (0 when none is left). Both the word scores and the
    row scores are min-max rescaled to [-1, 1].

    Args:
        df_all: Catalogue whose ``Genre`` column holds space-separated words.
            The caller's frame is not modified; a copy is processed.

    Returns:
        Tuple ``(df_frecuencia, df_all)``:
          * df_frecuencia: one row per word with Palabra, Frecuencia, POS and
            CS_Happy, sorted by descending similarity.
          * df_all: copy of the input with the filtered ``Genre`` and the new
            ``CS_Happy`` column, sorted by descending ``CS_Happy``.
    """
    import spacy
    from transformers import AutoTokenizer, AutoModel
    import torch
    from sklearn.metrics.pairwise import cosine_similarity
    from collections import Counter

    def _rescale(valores):
        # Min-max scale to [-1, 1]. When every value is equal (or there is
        # none) the range is 0; return zeros instead of dividing 0 by 0.
        minimo, maximo = valores.min(), valores.max()
        if not maximo > minimo:
            return valores * 0.0
        return 2 * ((valores - minimo) / (maximo - minimo)) - 1

    # Work on a copy so the caller's frame keeps its original Genre column.
    df_all = df_all.copy()

    # Word frequencies over the whole Genre column.
    conteo_palabras = Counter(" ".join(df_all['Genre']).split())
    df_frecuencia = pd.DataFrame(list(conteo_palabras.items()), columns=['Palabra', 'Frecuencia'])
    df_frecuencia = df_frecuencia.sort_values(by='Frecuencia', ascending=False)

    nlp = spacy.load("en_core_web_sm")

    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Part of speech of each word, taken from its first token.
    pos_tags = []
    for palabra in df_frecuencia['Palabra']:
        doc = nlp(palabra)
        pos_tags.append(doc[0].pos_ if doc else None)
    df_frecuencia['POS'] = pos_tags

    def _embed(texto):
        # Mean of the token embeddings; no gradients are needed for inference.
        with torch.no_grad():
            salida = model(**tokenizer(texto, return_tensors="pt"))
        return salida[0].mean(dim=1)

    happy_embedding = _embed("happy")
    df_frecuencia['CS_Happy'] = [
        cosine_similarity(_embed(palabra), happy_embedding).item()
        for palabra in df_frecuencia['Palabra']
    ]
    df_frecuencia = df_frecuencia.sort_values(by='CS_Happy', ascending=False)
    df_frecuencia['CS_Happy'] = _rescale(df_frecuencia['CS_Happy'])

    # Words tagged as proper nouns or "other" are dropped from Genre.
    # A set gives constant-time membership tests.
    palabras_filtradas = set(
        df_frecuencia.loc[df_frecuencia['POS'].isin(['PROPN', 'X']), 'Palabra']
    )
    df_all['Genre'] = df_all['Genre'].apply(
        lambda x: ' '.join(word for word in x.split() if word not in palabras_filtradas)
    )

    # Dictionary lookup instead of scanning df_frecuencia once per word.
    similitud_por_palabra = dict(zip(df_frecuencia['Palabra'], df_frecuencia['CS_Happy']))

    def calcular_promedio_similitud(genre_text):
        similitudes = [
            similitud_por_palabra[palabra]
            for palabra in genre_text.split()
            if palabra in similitud_por_palabra
        ]
        return np.mean(similitudes) if similitudes else 0

    df_all['CS_Happy'] = df_all['Genre'].apply(calcular_promedio_similitud)
    df_all = df_all.sort_values(by='CS_Happy', ascending=False)
    df_all['CS_Happy'] = _rescale(df_all['CS_Happy'])

    return df_frecuencia, df_all

In [None]:
def input_sentiment_analysis(input_text):
    """Rate the sentiment of a sentence on a -1..1 scale and print the result.

    The text is classified with the
    "nlptown/bert-base-multilingual-uncased-sentiment" model, whose label
    starts with a star count; the score is ``(stars - 3) / 2``. The tokenizer
    and model are loaded on every call.

    Args:
        input_text: Sentence to analyse.

    Returns:
        The sentiment score computed from the first prediction.
    """
    model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    bert_tokenizer = BertTokenizer.from_pretrained(model_name)
    bert_model = BertForSequenceClassification.from_pretrained(model_name)
    classifier = pipeline("sentiment-analysis", model=bert_model, tokenizer=bert_tokenizer)

    # Only the first prediction is used.
    top_prediction = classifier(input_text)[0]
    confidence = top_prediction['score']

    # The label's first token is the star count; (stars - 3) / 2 maps 1..5 onto -1..1.
    stars = int(top_prediction['label'].split()[0])
    sentiment_score = (stars - 3) / 2

    print(f"Frase: '{input_text}'")
    print(f"Sentimiento: {sentiment_score:.2f}, Confianza: {confidence:.3f}")
    print()
    return sentiment_score

In [None]:
def sentiment_filter(df_all, etiqueta):
    """Restrict the catalogue according to the user's mood label.

    Args:
        df_all: Catalogue with a ``CS_Happy`` column.
        etiqueta: Mood label.

    Returns:
        * 'melancólico': rows with ``CS_Happy >= 0.5``.
        * 'neutro': rows with ``CS_Happy > 0``.
        * anything else (including 'alegre'): ``df_all`` itself, unfiltered.
    """
    if etiqueta == 'melancólico':
        return df_all[df_all['CS_Happy'] >= 0.5]
    if etiqueta == 'neutro':
        return df_all[df_all['CS_Happy'] > 0]
    # 'alegre' and any unrecognised label: no filtering.
    return df_all

## Aplicación

In [None]:
# Train the mood classifier and keep the embedder it was trained with;
# both are read later as notebook-level globals.
model_sentiment, embedder_sentiment = train_sentiments("dataset_estado_animo.txt")



Evaluación del modelo en el conjunto de prueba:
              precision    recall  f1-score   support

      alegre       0.75      0.84      0.79        68
 melancólico       0.81      0.71      0.76        66

    accuracy                           0.78       134
   macro avg       0.78      0.78      0.77       134
weighted avg       0.78      0.78      0.78       134



In [None]:
# Usage example: classify one sentence and show the label with its probability.
nueva_frase = "en disney"
# `etiqueta` stays defined at notebook level and is reused by a later cell.
etiqueta, probabilidad = class_sentiment(nueva_frase, model_sentiment, embedder_sentiment)
print(f"Clase: {etiqueta}, Probabilidad: {probabilidad:.2f}")

Clase: alegre, Probabilidad: 0.91


In [None]:
# Score a sample sentence with the star-rating model; the result is kept in
# the notebook-level variable `sentiment_score`.
input_sentiment = "me siento un poco triste"
sentiment_score=input_sentiment_analysis(input_sentiment)



Frase: 'me siento un poco triste'
Sentimiento: 0.00, Confianza: 0.497



In [None]:
# Score the genres; note that `df_all` is rebound to the returned frame,
# which is sorted by CS_Happy and carries the new CS_Happy column.
df_frecuencia, df_all = genre_sentiment_analysis(df_all)

In [None]:
# Preview the mood filter using the `etiqueta` obtained in the class_sentiment example cell.
sentiment_filter(df_all, etiqueta)

Unnamed: 0,Title,People,Class,minplayers,maxplayers,allow_kids,Description,Genre,Data,CS_Happy
76,The Kama Sutra of VatsyayanaTranslated From th...,Vatsyayana,books,1,1,0,"""The Kama Sutra of Vatsyayana"" by Vatsyayana i...",love sex,the kama sutra of vatsyayanatranslated from th...,1.000000
198,Perpetual Peace: A Philosophical Essay,Kant Immanuel 1724-1804,books,1,1,0,"""Perpetual Peace: A Philosophical Essay"" by Im...",peace,perpetual peace: a philosophical essay- kant i...,0.969210
2700,Time's Up!,-,games,4,18,1,Time's Up! is a charades-based party game for ...,humor party game,time's up!- -- games- time's up! is a charades...,0.636019
245,The Consolation of Philosophy,Boethius 480-525?,books,1,1,0,"""The Consolation of Philosophy"" by Boethius is...",philosophy religion happiness,the consolation of philosophy- boethius 480-52...,0.593434
825,The Condition of the Working-Class in England ...,Engels Friedrich 1820-1895,books,1,1,0,"""The Condition of the Working-Class in England...",great economic conditions working class great,the condition of the working-class in england ...,0.583702
...,...,...,...,...,...,...,...,...,...,...
690,Fables de La Fontaine,La Fontaine Jean de 1621-1695,books,1,1,0,"""Fables de La Fontaine"" by Jean de La Fontaine...",aesops fables adaptations fables greek adaptat...,fables de la fontaine- la fontaine jean de 162...,-0.909764
537,Palmistry for All,Cheiro 1866-1936,books,1,1,0,"""Palmistry for All"" by Cheiro is a comprehensi...",palmistry,"palmistry for all- cheiro 1866-1936- books- ""p...",-0.927708
977,Principles of Orchestration with Musical Examp...,Rimsky-Korsakov Nikolay 1844-1908,books,1,1,0,"""Principles of Orchestration with Musical Exam...",instrumentation orchestration,principles of orchestration with musical examp...,-0.939240
721,The passing of the great race or The racial ba...,Grant Madison 1865-1937,books,1,1,0,"""The Passing of the Great Race or The Racial B...",ethnology,the passing of the great race or the racial ba...,-0.955194


# Filtrado por NER

## Funciones

In [None]:
# Load the pretrained 'gliner_multi-v2.1' model from Hugging Face
# (kept as the notebook-level `model_ner`).
model_ner = GLiNER.from_pretrained("urchade/gliner_multi-v2.1")

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/1.16G [00:00<?, ?B/s]

gliner_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.77k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]



In [None]:
def get_ner(text, model, labels=["person", "location", "actor", "character"], threshold=0.4):
    """Extract named entities from a text.

    Args:
        text: Text to analyse.
        model: Object exposing ``eval()`` and
            ``predict_entities(text, labels, threshold=...)``.
        labels: Entity labels to look for.
        threshold: Value forwarded to ``predict_entities`` as ``threshold``.

    Returns:
        DataFrame with the columns 'text' and 'label', one row per detected
        entity; an empty DataFrame with those two columns when nothing is
        detected.
    """
    model.eval()
    found = model.predict_entities(text, labels, threshold=threshold)

    # Nothing detected: hand back an empty frame with the expected columns.
    if not found:
        return pd.DataFrame(columns=['text', 'label'])

    # Keep only the entity text and its label.
    return pd.DataFrame(found).loc[:, ['text', 'label']]

In [None]:
def filter_ner(df_all, df_entities, column="Data"):
    """Keep the rows whose text mentions at least one detected entity.

    Entity texts are lower-cased and searched as plain substrings of the
    values in ``column``, so ``column`` is expected to hold lower-case text.

    Args:
        df_all: Catalogue to filter.
        df_entities: DataFrame with a 'text' column holding the entities.
        column: Name of the column of ``df_all`` to search in.

    Returns:
        The matching rows of ``df_all``. When there is no usable entity,
        ``df_all`` itself is returned unfiltered and a notice is printed.
    """
    if df_entities.empty:
        print("No hay entidades para filtrar.")
        return df_all

    # Unique, lower-cased entity texts; missing and blank values are dropped
    # because an empty string is a substring of everything.
    entity_values = [
        str(entity).lower()
        for entity in df_entities['text'].dropna().unique()
        if str(entity).strip()
    ]
    if not entity_values:
        print("No hay entidades para filtrar.")
        return df_all

    def _mentions_entity(cell):
        # Non-string cells (e.g. NaN) cannot contain an entity; treating them
        # as "no match" avoids a TypeError from `entity in cell`.
        return isinstance(cell, str) and any(entity in cell for entity in entity_values)

    # astype(bool) keeps the mask boolean even when df_all has no rows.
    mask = df_all[column].apply(_mentions_entity).astype(bool)
    return df_all[mask]

## Aplicación

In [None]:
# Sample request used to try the entity extraction.
input_ner = "quiero leer algo de Frankenstein"
get_ner(input_ner,model_ner)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unnamed: 0,text,label
0,Frankenstein,character


In [None]:
# Show the catalogue rows whose Data text mentions an entity found in the sample request.
filter_ner(df_all, get_ner(input_ner,model_ner))

Unnamed: 0,Title,People,Class,minplayers,maxplayers,allow_kids,Description,Genre,Data,CS_Happy
2205,Horrified,-,games,1,5,1,The stakes have been raised. Imagine living in...,horror miniatures movies tv radio theme,horrified- -- games- the stakes have been rais...,-0.184678
2908,Abomination: The Heir of Frankenstein,-,games,2,4,1,It's been twenty years since Victor Frankenste...,horror novelbased science fiction,abomination: the heir of frankenstein- -- game...,-0.251569
1326,Victor Frankenstein,Paul McGuigan Daniel Radcliffe James McAvoy Je...,films,1,99,0,Told from Igor's perspective we see the troubl...,drama horror scifi,victor frankenstein- paul mcguigan daniel radc...,-0.266918
0,Frankenstein Or The Modern Prometheus,Shelley Mary Wollstonecraft 1797-1851,books,1,1,0,"""Frankenstein Or The Modern Prometheus"" by Mar...",science fiction horror tales fiction scientist...,frankenstein or the modern prometheus- shelley...,-0.362713


# Búsqueda semántica por interés

## Funciones

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Batch-encode one text column into sentence embeddings
def precomputar_embeddings(df, columna, batch_size=64, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Encode every value of a column with the notebook-level ``modelo``.

    Args:
        df: DataFrame holding the texts.
        columna: Name of the column to encode; values are cast to ``str``.
        batch_size: Number of texts encoded per step.
        device: Device the model is moved to and encodes on. The default is
            chosen once, when the function is defined.

    Returns:
        A CPU tensor with the batches concatenated along the first dimension,
        in the row order of ``df``.
    """
    modelo.to(device)
    textos = df[columna].astype(str).tolist()

    # The DataLoader is only used to cut the list into ordered batches.
    lotes = DataLoader(textos, batch_size=batch_size, shuffle=False)

    partes = []
    with torch.no_grad():  # inference only, no gradients required
        for lote in tqdm(lotes, desc="Calculando embeddings"):
            codificado = modelo.encode(lote, convert_to_tensor=True, device=device)
            # Move each batch to the CPU right away so it does not pile up on the GPU.
            partes.append(codificado.cpu())

    return torch.cat(partes, dim=0)

def guardar_embeddings(embeddings, file_path):
    """Write an embeddings tensor to a CSV file.

    Args:
        embeddings: Tensor to save; it is moved to the CPU first when it
            lives on a CUDA device.
        file_path: Destination CSV path. The file is written without the
            index column.
    """
    if embeddings.is_cuda:
        embeddings = embeddings.cpu()

    pd.DataFrame(embeddings.numpy()).to_csv(file_path, index=False)
    print("Embeddings guardados exitosamente en formato CSV.")

In [None]:
# Rank the catalogue by semantic similarity to a free-text request
def semantic_search(input_text, df_all, embeddings_path='embeddings_data.csv'):
    """Score every catalogue row against ``input_text`` and sort by the score.

    The stored embeddings are loaded from ``embeddings_path``, the request is
    encoded with the notebook-level ``modelo``, and the cosine similarity of
    each stored row is written to a 'Similitud' column.

    Args:
        input_text: Free-text request.
        df_all: Catalogue. It is modified in place: the 'Similitud' column is
            added to (or overwritten on) the frame that is passed in.
        embeddings_path: CSV with one embedding per row; an optional 'label'
            column is ignored.

    Returns:
        ``df_all`` sorted by descending 'Similitud', or ``None`` (after
        printing a notice) when the embeddings file does not exist.

    Raises:
        ValueError: If the file does not hold exactly one embedding per row
            of ``df_all``.
    """
    try:
        df_embeddings = pd.read_csv(embeddings_path, delimiter=",", encoding="utf-8")
    except FileNotFoundError:
        # Name the path that was actually requested, not a fixed file name.
        print(f"El archivo de embeddings no se encontró. Asegúrate de que '{embeddings_path}' exista en el directorio.")
        return None

    # A 'label' column, when present, is not part of the vectors.
    if 'label' in df_embeddings.columns:
        df_embeddings = df_embeddings.drop(columns=['label'])

    # Scores are attached to the rows by position, so the counts must agree.
    if len(df_embeddings) != len(df_all):
        raise ValueError(
            f"El archivo '{embeddings_path}' tiene {len(df_embeddings)} embeddings "
            f"pero el DataFrame tiene {len(df_all)} filas."
        )

    # Encode the request as a 2-D tensor of shape (1, dim).
    embeddings_input = modelo.encode([input_text], convert_to_tensor=True).reshape(1, -1)

    # Put the stored embeddings on the same device as the encoded request;
    # otherwise the similarity fails when the model runs on a GPU.
    embeddings_data = torch.tensor(
        df_embeddings.values, dtype=torch.float32, device=embeddings_input.device
    )

    puntuaciones_coseno = util.cos_sim(embeddings_input, embeddings_data)[0].cpu().numpy()

    # NOTE(review): the scores are matched to rows purely by position, so
    # df_all must be in the same row order as when the embeddings were
    # computed. genre_sentiment_analysis re-sorts the frame by CS_Happy --
    # TODO confirm the embeddings file was generated after that step.
    df_all['Similitud'] = puntuaciones_coseno

    return df_all.sort_values(by='Similitud', ascending=False)

In [None]:
# Instantiate the pretrained multilingual sentence-embedding model.
# `modelo` is read as a global by precomputar_embeddings and semantic_search.
modelo = SentenceTransformer('distiluse-base-multilingual-cased-v1')

modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

In [None]:
#embeddings_data = precomputar_embeddings(df_all, 'Data')
#guardar_embeddings(embeddings_data, 'embeddings_data.csv')

## Aplicación

In [None]:
# Sample request: rank the catalogue by similarity to the text.
input_text = 'Quiero algo de Verne'
semantic_search(input_text, df_all, 'embeddings_data.csv')

Unnamed: 0,Title,People,Class,minplayers,maxplayers,allow_kids,Description,Genre,Data,CS_Happy,Similitud
2303,Star Realms: Frontiers,-,games,1,4,1,Star Realms: Frontiers is a new standalone Sta...,card game fighting science fiction,star realms: frontiers- -- games- star realms:...,0.098464,0.324973
1897,The Fall,Tarsem Singh Lee Pace Catinca Untaru Justine W...,films,1,99,0,In a hospital on the outskirts of 1920s Los An...,adventure comedy drama,the fall- tarsem singh lee pace catinca untaru...,0.019899,0.295733
1893,Sex and the City,Michael Patrick King Sarah Jessica Parker Kim ...,films,1,99,0,A New York writer on sex and love is finally g...,comedy drama romance,sex and the city- michael patrick king sarah j...,0.212994,0.292980
1480,Forgetting Sarah Marshall,Nicholas Stoller Kristen Bell Jason Segel Paul...,films,1,99,0,Devastated Peter takes a Hawaiian vacation in ...,comedy drama romance,forgetting sarah marshall- nicholas stoller kr...,0.212994,0.277124
1599,Megamind,Tom McGrath Will Ferrell Jonah Hill Brad Pitt ...,films,1,99,0,The supervillain Megamind finally defeats his ...,animation action comedy,megamind- tom mcgrath will ferrell jonah hill ...,0.096959,0.270992
...,...,...,...,...,...,...,...,...,...,...,...
224,The Sign of the Four,Doyle Arthur Conan 1859-1930,books,1,1,0,"""The Sign of the Four"" by Arthur Conan Doyle i...",holmes sherlock fictitious character fiction p...,the sign of the four- doyle arthur conan 1859-...,-0.333488,-0.110017
2590,Las Vegas,-,games,2,5,1,Developer Stefan Br&uumlck at alea describes L...,dice,las vegas- -- games- developer stefan br&uumlc...,-0.877478,-0.112842
260,The Junior Classics Volume 1: Fairy and wonder...,Neilson William Allan 1869-1946,books,1,1,0,,childrens literature,,-0.545182,-0.113944
2716,Three Sisters,-,games,1,4,1,Three Sisters is a strategic roll-and-write ga...,dice farming,three sisters- -- games- three sisters is a st...,-0.676159,-0.128148


# Puesta en marcha

## Funciones

In [None]:
def app(df_all,input_1,input_2,input_3,input_4):
    """Run the whole recommendation pipeline for one set of answers.

    Order of the steps: classify the mood, rank by semantic similarity to the
    request, keep rows mentioning the named entities of the request, filter
    by number of players, apply the parental filter, apply the mood filter,
    and sort the result by similarity.

    Uses the notebook-level ``model_sentiment``, ``embedder_sentiment`` and
    ``model_ner``.

    Args:
        df_all: Full catalogue (must contain CS_Happy).
        input_1: Free text describing how the user feels.
        input_2: Number of players.
        input_3: "S" when there are kids, any other value otherwise.
        input_4: Free text describing what the user feels like doing.

    Returns:
        Tuple ``(df_4, df_2, df_3, df_1, df_output)``: the frame after the
        entity filter, after the players filter, after the parental filter,
        after the mood filter (sorted by similarity), and the final frame
        with the helper columns dropped.

    Raises:
        FileNotFoundError: If the embeddings file needed by the semantic
            search is missing.
    """
    etiqueta, probabilidad = class_sentiment(input_1, model_sentiment, embedder_sentiment)
    print(f"Clase: {etiqueta}, Probabilidad: {probabilidad:.2f}")

    # Rank by similarity to the request.
    df_4 = semantic_search(input_4, df_all, 'embeddings_data.csv')
    if df_4 is None:
        # semantic_search returns None when the embeddings file is missing;
        # stop with a clear error instead of failing inside a later filter.
        raise FileNotFoundError("No se pudo cargar 'embeddings_data.csv' para la búsqueda semántica.")

    # Keep the rows that mention the entities named in the request.
    df_4 = filter_ner(df_4, get_ner(input_4,model_ner))

    # Filter by number of players.
    df_2 = players_filter(df_4,input_2)

    # Parental control.
    df_3 = parental_filter(df_2,input_3)

    # Mood filter, driven by the label computed above for this request. (The
    # original passed the unrelated global `sentiment_score`, a number, which
    # never matches a mood label, so the filter was never applied.)
    df_1 = sentiment_filter(df_3, etiqueta)

    # Final ordering and removal of the helper columns.
    df_1 = df_1.sort_values(by='Similitud', ascending=False)
    df_output = df_1.drop(columns=['maxplayers', 'minplayers', 'allow_kids', 'Data', 'CS_Happy', 'Similitud'], errors='ignore')
    return df_4,df_2,df_3,df_1,df_output

## Uso

In [None]:
# 1. How are you today? (free text, used for the mood classification)
input_1 = "en disney"
# 2. How many people are playing?
input_2 = 1
# 3. Are there kids? (S/N)
input_3 = "S"
# 4. What do you feel like doing? (free text, used for the semantic search and NER)
input_4 = 'Quiero jugar a algún juego de cartas grupal'

# Run the pipeline; the intermediate frames are kept for inspection.
df_4,df_2,df_3,df_1,df_output = app(df_all,input_1,input_2,input_3,input_4)

# Display the final recommendations.
df_output

Clase: alegre, Probabilidad: 0.91
No hay entidades para filtrar.


Unnamed: 0,Title,People,Class,Description,Genre
2115,Troyes,-,games,In Troyes (pronounced &quottroah&quot) players...,dice economic medieval
2424,Dead Reckoning,-,games,Dead Reckoning is a game of exploration piracy...,exploration pirates territory building
2933,La Granja: Deluxe Master Set,-,games,La Granja: Deluxe Master Set is an upgraded an...,dice economic farming
2289,Harmonies,-,games,In Harmonies build landscapes by placing color...,animals environmental
2637,Legacy of Yu,-,games,During the reign of Emperor Yao the people of ...,ancient city building
...,...,...,...,...,...
1418,Harry Potter and the Deathly Hallows: Part 1,David Yates Daniel Radcliffe Emma Watson Ruper...,films,As Harry races against time and evil to destro...,adventure family fantasy
2315,Arkham Horror: The Card Game (Revised Edition),-,games,The boundaries between worlds have drawn peril...,adventure card game collectible components exp...
2686,Zombicide: Green Horde,-,games,This sequel to the Zombicide: Black Plague wil...,adventure fantasy fighting horror medieval min...
2129,Age of Steam,-,games,Steam-belching iron horses roar across the wil...,economic postnapoleonic trains transportation
