# Librerías

In [239]:
import os
import pandas as pd
import numpy as np

from collections import Counter # Import the Counter class
import re
import pandas as pd
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

!pip install sentence-transformers
from sentence_transformers import SentenceTransformer, util
import torch
from tqdm import tqdm

#web scraping
import requests
from bs4 import BeautifulSoup
import time



#Procesamiento de datos

In [240]:
REPO_NAME = "NLP-TUIA-Garcia-Herrera"
if REPO_NAME not in os.getcwd():
  if not os.path.exists(REPO_NAME):
    !git clone https://github.com/juliangg17/NLP-TUIA-Garcia-Herrera.git
  os.chdir(REPO_NAME)

## Funciones

Carga de datos de juegos

In [242]:
# Función para procesar el archivo CSV y preparar el DataFrame
def get_games(file_path):
    # Leer el archivo CSV
    df_games = pd.read_csv(file_path, delimiter=",", encoding="utf-8")

    # Cambiar el tipo de datos de las columnas
    df_games = df_games.astype({
        "rank": "Int64",
        "game_name": "string",
        "game_href": "string",
        "geek_rating": "float",
        "avg_rating": "float",
        "num_voters": "Int64",
        "description": "string",
        "yearpublished": "Int64",
        "minplayers": "Int64",
        "maxplayers": "Int64",
        "minplaytime": "Int64",
        "maxplaytime": "Int64",
        "minage": "Int64",
        "avgweight": "float",
        "best_num_players": "string",
        "designers": "string",
        "mechanics": "string",
        "categories": "string"
    })

    # Seleccionar solo las columnas especificadas
    df_games = df_games[["game_name", "description", "minplayers", "maxplayers", "minage", "categories"]]

    # Cambiar los nombres de las columnas
    df_games = df_games.rename(columns={
        "game_name": "Title",
        "description": "Description",
        "categories": "Genre"
    })

    # Agregar columna "allow_kids" basado en la edad mínima
    df_games["allow_kids"] = df_games["minage"].apply(lambda x: 1 if x < 16 else 0)

    # Eliminar la columna "minage"
    df_games = df_games.drop(columns=["minage"])

    # Agregar columna "Class" con valor fijo "games"
    df_games["Class"] = "games"

    # Devolver el DataFrame final
    return df_games

Carga de datos de películas

In [243]:
# Función para procesar el archivo CSV y preparar el DataFrame
def get_films(file_path):
    # Leer el archivo CSV
    df_films = pd.read_csv(file_path, delimiter=",", encoding="utf-8")

    # Cambiar el tipo de datos de las columnas
    df_films = df_films.astype({
        "Rank": "Int64",
        "Title": "string",
        "Genre": "string",
        "Description": "string",
        "Director": "string",
        "Actors": "string",
        "Year": "Int64",
        "Runtime (Minutes)": "Int64",
        "Rating": "float",
        "Votes": "Int64",
        "Revenue (Millions)": "float",
        "Metascore": "Int64"
    })

    # Seleccionar solo las columnas especificadas
    df_films = df_films[["Title", "Genre", "Description", "Director", "Actors"]]

    # Combinar las columnas "Director" y "Actors" en una sola columna "People"
    df_films["People"] = df_films["Director"] + ", " + df_films["Actors"]

    # Eliminar las columnas originales de "Director" y "Actors"
    df_films = df_films.drop(columns=["Director", "Actors"])

    # Agregar columna "Class" con el valor fijo "films"
    df_films["Class"] = "films"

    # Agregar columnas "minplayers" y "maxplayers" con valores fijos
    df_films["minplayers"] = 1
    df_films["maxplayers"] = 99

    # Agregar columna "allow_kids" basado en si "Genre" contiene la palabra "Family"
    df_films["allow_kids"] = df_films["Genre"].apply(lambda x: 1 if "Family" in x else 0)

    # Devolver el DataFrame final
    return df_films

Carga de datos de libros

In [244]:
def get_books(num_books=1000, output_file="gutenberg_books_1000.csv"):
    url = "https://www.gutenberg.org/browse/scores/top1000.php#books-last1"

    # Solicitud GET a la página web
    response = requests.get(url)
    response.raise_for_status()  # Verifica que la solicitud sea exitosa

    # Crear el objeto BeautifulSoup para parsear el HTML
    soup = BeautifulSoup(response.text, 'html.parser')

    # Encontrar el elemento <h2 id="books-last1"> y luego el siguiente <ol> con los libros
    books_section = soup.select_one("h2#books-last1")
    books_data = []

    if books_section:
        books_list = books_section.find_next("ol")
        if books_list:
            for book in tqdm(books_list.find_all("li")[:num_books], desc="Extrayendo datos de libros"):  # Agregar barra de progreso
                title = book.get_text()
                link = book.find("a")["href"]
                book_url = f"https://www.gutenberg.org{link}"  # Completar la URL

                # Entrar a la URL de cada libro para obtener detalles
                book_response = requests.get(book_url)
                book_response.raise_for_status()
                book_soup = BeautifulSoup(book_response.text, 'html.parser')

                # Buscar directamente la tabla con clase "bibrec" y extraer la información
                info_table = book_soup.find("table", class_="bibrec")
                book_info = {"Title": title, "URL": book_url}  # Diccionario para almacenar info del libro

                # Extraer todos los temas (subjects) en una lista
                subjects = []

                if info_table:
                    for row in info_table.find_all("tr"):
                        header = row.find("th")
                        value = row.find("td")
                        if header and value:
                            header_text = header.get_text(strip=True)
                            value_text = value.get_text(strip=True)

                            # Si el encabezado es "Subject", añadir a la lista de subjects
                            if header_text == "Subject":
                                subjects.append(value_text)
                            else:
                                book_info[header_text] = value_text

                # Unir todos los subjects en una sola cadena separada por comas
                book_info["Subject"] = ", ".join(subjects)

                # Agregar el diccionario de info del libro a la lista principal
                books_data.append(book_info)

                # Pausa para evitar sobrecargar el servidor
                time.sleep(0.1)

    # Crear un DataFrame de Pandas con la información de los libros
    books_df = pd.DataFrame(books_data)

    # Guardar el DataFrame en un archivo CSV
    books_df.to_csv(output_file, index=False, encoding="utf-8")

    #print(f"Se ha completado la extracción y el archivo CSV ha sido guardado como '{output_file}'")

    # Procesar el DataFrame
    books_df = books_df.drop(columns=['URL'])
    books_df = books_df.rename(columns={'Author': 'People'})
    books_df['Class'] = 'books'
    books_df['minplayers'] = 1
    books_df['maxplayers'] = 1
    books_df = books_df.rename(columns={'Summary': 'Description'})
    books_df = books_df.rename(columns={'Subject': 'Genre'})
    books_df['allow_kids'] = 0
    books_df = books_df.reindex(columns=['Title', 'People', 'Class', 'minplayers', 'maxplayers', 'allow_kids', 'Description', 'Genre'])
    return books_df

Procesamiento de todos los datasets

In [245]:
# Función para combinar los DataFrames y procesar los datos
def get_all(df_books, df_films, df_games):
    # Combinar las tablas en una sola
    df_all = pd.concat([df_books, df_films, df_games], ignore_index=True)

    # Reemplazar valores en las columnas especificadas
    columns_to_clean = ["Title", "People", "Description", "Genre"]
    for col in columns_to_clean:
        df_all[col] = df_all[col].str.replace(";", "", regex=False)
        df_all[col] = df_all[col].str.replace(",", "", regex=False)

    # Crear la columna "Data" concatenando las otras columnas
    df_all["Data"] = df_all["Title"] + "- " + df_all["People"] + "- " + df_all["Class"] + "- " + df_all["Description"] + "- " + df_all["Genre"]

    # Quitar duplicados basándose en la columna "Title"
    df_all = df_all.drop_duplicates(subset=["Title"])

    # Asegurar que la columna 'Genre' sea de tipo string
    df_all['Genre'] = df_all['Genre'].astype(str)

    # Aplicar la limpieza a la columna 'Genre'
    df_all['Genre'] = df_all['Genre'].apply(clean_genre)

    # Devolver el DataFrame final
    return df_all

Limpieza de columna Genre

In [246]:
import re
# Definir una función para limpiar cada género
def clean_genre(text):
    # Eliminar comillas simples, dobles, corchetes, números romanos, y caracteres especiales
    text = re.sub(r"[\"'\[\]/,;.&:]", "", text)

    # Eliminar palabras específicas (como "of", "and", "for") usando bordes de palabra
    text = re.sub(r"\b(of|and|for|i|ii|iii)\b", "", text)

    # Separar palabras por letras mayúsculas
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)

    # Eliminar espacios extra que pueden quedar
    text = re.sub(r"\s+", " ", text).strip()

    # Remover paréntesis, números y signos de pregunta
    text = re.sub(r'[()\d?]', '', text)

    # Convertir a minúsculas para simplificar la eliminación de palabras específicas
    text = text.lower()

    # Eliminar comillas simples, dobles, corchetes, números romanos, y caracteres especiales
    text = re.sub(r"-", "", text)

    return text

## Carga de datos

In [None]:
df_games = get_games("bgg_database.csv")
df_films = get_films("IMDB-Movie-Data.csv")
df_books = get_books()
df_all = get_all(df_books, df_films, df_games)
df_all.head()

Extrayendo datos de libros: 100%|██████████| 1000/1000 [13:24<00:00,  1.24it/s]


Unnamed: 0,Title,People,Class,minplayers,maxplayers,allow_kids,Description,Genre,Data
0,呻吟語,Lü Kun 1536-1618,books,1,1,0,"""呻吟語"" by Kun Lü is a philosophical treatise wr...",conduct life,"呻吟語- Lü Kun 1536-1618- books- ""呻吟語"" by Kun Lü ..."
1,Frankenstein Or The Modern Prometheus,Shelley Mary Wollstonecraft 1797-1851,books,1,1,0,"""Frankenstein Or The Modern Prometheus"" by Mar...",science fiction horror tales gothic fiction sc...,Frankenstein Or The Modern Prometheus- Shelley...
2,Moby Dick Or The Whale,Melville Herman 1819-1891,books,1,1,0,"""Moby Dick Or The Whale"" by Herman Melville is...",whaling fiction sea stories psychological fic...,Moby Dick Or The Whale- Melville Herman 1819-1...
3,Romeo and Juliet,Shakespeare William 1564-1616,books,1,1,0,"""Romeo and Juliet"" by William Shakespeare is a...",vendetta drama youth drama verona italy dra...,Romeo and Juliet- Shakespeare William 1564-161...
4,Pride and Prejudice,Austen Jane 1775-1817,books,1,1,0,"""Pride and Prejudice"" by Jane Austen is a clas...",england fiction young women fiction love sto...,Pride and Prejudice- Austen Jane 1775-1817- bo...


#Filtrado parental y por cantidad de jugadores

In [None]:
def parental_filter(df_all,input_kids):
  # Filtrar si se requiere permitir contenido para niños
  if input_kids == "S":
    df_all = df_all[df_all['allow_kids'] == 1]
  return df_all

def players_filter(df_all, input_players):
    # Filtrar filas donde el número de jugadores es mayor o igual a min_players y menor o igual a maxplayers
    df_all = df_all[(df_all['maxplayers'] >= input_players) & (df_all['minplayers'] <= input_players)]
    return df_all

#Filtrado por estado de ánimo

##Funciones

In [None]:
def genre_sentiment_analysis(df_all):
  import spacy
  from transformers import AutoTokenizer, AutoModel
  import torch
  from sklearn.metrics.pairwise import cosine_similarity
  from collections import Counter # Import the Counter class

  # Crear un DataFrame con las palabras únicas y su frecuencia
  todas_palabras = " ".join(df_all['Genre']).split()
  conteo_palabras = Counter(todas_palabras)

  # Crear un DataFrame a partir del conteo de palabras
  df_frecuencia = pd.DataFrame(conteo_palabras.items(), columns=['Palabra', 'Frecuencia'])

  # Ordenar el DataFrame por frecuencia de mayor a menor
  df_frecuencia = df_frecuencia.sort_values(by='Frecuencia', ascending=False)

  # Cargando el modelo de spaCy
  nlp = spacy.load("en_core_web_sm")

  # Inicializar el tokenizer y el modelo de embeddings de transformers
  model_name = "sentence-transformers/all-MiniLM-L6-v2"
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModel.from_pretrained(model_name)

  # Usando spaCy para etiquetar cada palabra con su parte de discurso
  pos_tags = []
  embeddings = []
  for palabra in df_frecuencia['Palabra']:
      doc = nlp(palabra)
      pos_tags.append(doc[0].pos_ if doc else None)
      embeddings.append(doc.vector)

  df_frecuencia['POS'] = pos_tags

  # Calcular la similitud de coseno con la palabra "feliz" usando transformers
  happy_embedding = model(**tokenizer("happy", return_tensors="pt"))[0].mean(dim=1).detach()
  cosine_similarities = []

  for palabra in df_frecuencia['Palabra']:
      word_embedding = model(**tokenizer(palabra, return_tensors="pt"))[0].mean(dim=1).detach()
      similarity = cosine_similarity(word_embedding, happy_embedding).item()
      cosine_similarities.append(similarity)

  df_frecuencia['CS_Happy'] = cosine_similarities
  df_frecuencia = df_frecuencia.sort_values(by='CS_Happy', ascending=False)

  # Escalar las similitudes de coseno al rango de -1 a 1
  max_sim = df_frecuencia['CS_Happy'].max()
  min_sim = df_frecuencia['CS_Happy'].min()
  df_frecuencia['CS_Happy'] = df_frecuencia['CS_Happy'].apply(
      lambda x: 2 * ((x - min_sim) / (max_sim - min_sim)) - 1
  )

  # Crear una lista de palabras que sean PROPN, X
  palabras_filtradas = df_frecuencia[(df_frecuencia['POS'].isin(['PROPN', 'X']))]['Palabra'].tolist()

  # Quitar de la columna 'Genre' las palabras filtradas
  df_all['Genre'] = df_all['Genre'].apply(lambda x: ' '.join([word for word in x.split() if word not in palabras_filtradas]))

  # Agregar una columna con el promedio de las similitudes de coseno en cada celda de la columna 'Genre'
  def calcular_promedio_similitud(genre_text):
      palabras = genre_text.split()
      similitudes = [df_frecuencia.loc[df_frecuencia['Palabra'] == palabra, 'CS_Happy'].values[0]
                    for palabra in palabras if not df_frecuencia.loc[df_frecuencia['Palabra'] == palabra].empty]
      return np.mean(similitudes) if similitudes else 0

  df_all['CS_Happy'] = df_all['Genre'].apply(calcular_promedio_similitud)

  df_all = df_all.sort_values(by='CS_Happy', ascending=False)

  # Escalar las similitudes de coseno al rango de -1 a 1
  max_sim = df_all['CS_Happy'].max()
  min_sim = df_all['CS_Happy'].min()
  df_all['CS_Happy'] = df_all['CS_Happy'].apply(
      lambda x: 2 * ((x - min_sim) / (max_sim - min_sim)) - 1
  )

  return df_frecuencia, df_all

In [None]:
def input_sentiment_analysis(input_text):
  # Cargamos el tokenizador y el modelo
  model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
  tokenizer = BertTokenizer.from_pretrained(model_name)
  model = BertForSequenceClassification.from_pretrained(model_name)

  # Creamos un pipeline de clasificación
  nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

  # Función para convertir la clasificación en estrellas a un puntaje de -1 a 1
  def convert_to_score(label):
      stars = int(label.split()[0])  # Extraemos el número de estrellas
      return (stars - 3) / 2  # Escala las estrellas de 1-5 a -1 a 1

  # Obtenemos las predicciones de sentimiento para cada frase y las convertimos
  result = nlp(input_text)
  label = result[0]['label']
  score = result[0]['score']
  sentiment_score = convert_to_score(label)
  print(f"Frase: '{input_text}'")
  print(f"Sentimiento: {sentiment_score:.2f}, Confianza: {score:.3f}")
  print()
  return sentiment_score

In [None]:
def sentiment_filter(df_all, sentiment_score):
  # Filtrar df_all basado en las condiciones de sentimiento
  if sentiment_score <= -0.3:
      df_filtrado = df_all[df_all['CS_Happy'] >= 0.5]
  elif -0.3 < sentiment_score < 0.3:
      df_filtrado = df_all[df_all['CS_Happy'] > 0]
  else:
      df_filtrado = df_all  # No filtrar
  return df_filtrado

##Filtrado

In [None]:
input_sentiment = "me siento un poco triste"
sentiment_score=input_sentiment_analysis(input_sentiment)

Frase: 'me siento un poco triste'
Sentimiento: 0.00, Confianza: 0.497



In [None]:
df_frecuencia, df_all = genre_sentiment_analysis(df_all)



In [None]:
sentiment_filter(df_all, sentiment_score)

Unnamed: 0,Title,People,Class,minplayers,maxplayers,allow_kids,Description,Genre,Data,CS_Happy
66,The Kama Sutra of Vatsyayana,Vatsyayana,books,1,1,0,"""The Kama Sutra of Vatsyayana"" by Vatsyayana i...",love sex,The Kama Sutra of Vatsyayana- Vatsyayana- book...,1.000000
227,Perpetual Peace: A Philosophical Essay,Immanuel Kant,books,1,1,0,"""Perpetual Peace: A Philosophical Essay"" by Im...",peace,Perpetual Peace: A Philosophical Essay- Immanu...,0.969945
2979,Time's Up!,,games,4,18,1,Time's Up! is a charades-based party game for ...,humor party game,,0.644703
183,The Consolation of Philosophy,Boethius,books,1,1,0,"""The Consolation of Philosophy"" by Boethius is...",philosophy religion happiness,The Consolation of Philosophy- Boethius- books...,0.603135
777,The Condition of the Working-Class in England ...,Friedrich Engels,books,1,1,0,"""The Condition of the Working-Class in England...",great economic conditions working class great,The Condition of the Working-Class in England ...,0.593634
...,...,...,...,...,...,...,...,...,...,...
1124,How to Tell a Story and Other Essays,Mark Twain,books,1,1,0,"""How to Tell a Story and Other Essays"" by Mark...",essays short stories storytelling,How to Tell a Story and Other Essays- Mark Twa...,0.000873
299,On Heroes Hero-Worship and the Heroic in History,Thomas Carlyle,books,1,1,0,"""On Heroes Hero-Worship and the Heroic in Hist...",heroes hero worship,On Heroes Hero-Worship and the Heroic in Histo...,0.000591
196,Anthem,Ayn Rand,books,1,1,0,"""Anthem"" by Ayn Rand is a dystopian novel writ...",science fiction psychological fiction love sto...,"Anthem- Ayn Rand- books- ""Anthem"" by Ayn Rand ...",0.000526
2773,Ready Set Bet,,games,2,9,1,In Ready Set Bet you and your friends head to ...,animals party game racing realtime sports,,0.000297


#Búsqueda semántica por interés

## Funciones

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Función optimizada para precomputar los embeddings
def precomputar_embeddings(df, columna, batch_size=64, device='cuda' if torch.cuda.is_available() else 'cpu'):
    modelo.to(device)
    embeddings_column = []
    textos = df[columna].astype(str).tolist()

    # Crear DataLoader para procesamiento por lotes
    dataset = textos
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    with torch.no_grad():  # Desactiva el cálculo de gradientes para mayor eficiencia
        for batch_textos in tqdm(dataloader, desc="Calculando embeddings"):
            batch_embeddings = modelo.encode(batch_textos, convert_to_tensor=True, device=device)
            embeddings_column.append(batch_embeddings.cpu())  # Devuelve a la CPU para evitar ocupar la memoria de la GPU

    return torch.cat(embeddings_column, dim=0)  # Concatenamos y aseguramos forma 2D

# Función para guardar embeddings en un archivo
def guardar_embeddings(embeddings, file_path):
    torch.save(embeddings, file_path)
    print("Embeddings guardados exitosamente.")

# Función para cargar embeddings desde un archivo
def cargar_embeddings(file_path):
    if os.path.exists(file_path):
        return torch.load(file_path, map_location=torch.device('cpu'))
    else:
        raise FileNotFoundError(f"El archivo {file_path} no existe.")

In [None]:
# Función para realizar la búsqueda semántica y devolver el DataFrame ordenado
def semantic_search(input_text, df_all, embeddings_path='embeddings_data.pt'):
    try:
        embeddings_data = cargar_embeddings(embeddings_path)
    except FileNotFoundError:
        print("El archivo de embeddings no se encontró. Asegúrate de guardar los embeddings primero.")
        return

    # Codificamos el input
    embeddings_input = modelo.encode([input_text], convert_to_tensor=True).reshape(1, -1)  # Aseguramos forma 2D

    # Calculamos las puntuaciones de similitud
    puntuaciones_coseno = util.cos_sim(embeddings_input, embeddings_data)[0].cpu().numpy()

    # Añadimos las puntuaciones de similitud al DataFrame
    df_all['Similitud'] = puntuaciones_coseno

    # Ordenamos el DataFrame por la columna de similitud de mayor a menor
    df_ordenado = df_all.sort_values(by='Similitud', ascending=False)

    return df_ordenado

In [None]:
# Función para comparar dos conjuntos de embeddings
def comparar_embeddings(embeddings_1, embeddings_2):
    if embeddings_1.shape != embeddings_2.shape:
        print("Los embeddings tienen formas diferentes:")
        print(f"Forma de embeddings_1: {embeddings_1.shape}")
        print(f"Forma de embeddings_2: {embeddings_2.shape}")
        return False

    # Comparamos los embeddings y mostramos las diferencias si las hay
    diferencia = torch.allclose(embeddings_1, embeddings_2, atol=1e-6)
    if diferencia:
        print("Los embeddings son idénticos.")
    else:
        print("Los embeddings no son idénticos.")
        # Imprimir una diferencia de muestra
        diff_indices = (embeddings_1 - embeddings_2).abs() > 1e-6
        print("Diferencias de muestra:")
        print((embeddings_1 - embeddings_2)[diff_indices])

    return diferencia

In [None]:
# Instancia del modelo preentrenado multilingüe
modelo = SentenceTransformer('distiluse-base-multilingual-cased-v1')



In [None]:
#embeddings_data = precomputar_embeddings(df_all, 'Data')

Calculando embeddings: 100%|██████████| 51/51 [08:47<00:00, 10.33s/it]


In [None]:
guardar_embeddings(embeddings_data, 'embeddings_data.pt')

Embeddings guardados exitosamente.


## Búsqueda semántica

In [None]:
input_text = 'Quiero historia de amor en la selva'
semantic_search(input_text, df_all, 'embeddings_data.pt')

  return torch.load(file_path, map_location=torch.device('cpu'))


Unnamed: 0,Title,People,Class,minplayers,maxplayers,allow_kids,Description,Genre,Data,CS_Happy,Similitud
165,The Jungle Book,Rudyard Kipling,books,1,1,0,"""The Jungle Book"" by Rudyard Kipling is a coll...",short stories animals fiction jungles fiction ...,"The Jungle Book- Rudyard Kipling- books- ""The ...",-0.257864,0.364313
236,The Jungle,Upton Sinclair,books,1,1,0,"""The Jungle"" by Upton Sinclair is a novel writ...",meat industry trade fiction working class fict...,"The Jungle- Upton Sinclair- books- ""The Jungle...",-0.288391,0.349904
525,The Beast in the Jungle,Henry James,books,1,1,0,"""The Beast in the Jungle"" by Henry James is a ...",conduct life fiction fate fatalism fiction man...,"The Beast in the Jungle- Henry James- books- ""...",-0.271277,0.348216
621,Tarzan of the Apes,Edgar Rice Burroughs,books,1,1,0,"""Tarzan of the Apes"" by Edgar Rice Burroughs i...",tarzan fictitious character fiction fiction fa...,Tarzan of the Apes- Edgar Rice Burroughs- book...,-0.271380,0.323291
192,The call of the wild,Jack London,books,1,1,0,"""The Call of the Wild"" by Jack London is a nov...",dogs fiction adventure stories nature stories ...,"The call of the wild- Jack London- books- ""The...",-0.217275,0.321084
...,...,...,...,...,...,...,...,...,...,...,...
1702,300: Rise of an Empire,Noam Murro Sullivan Stapleton Eva Green Lena H...,films,1,99,0,Greek general Themistokles leads the charge ag...,action drama fantasy,300: Rise of an Empire- Noam Murro Sullivan St...,-0.046186,-0.078719
900,Proofs of a Conspiracy against all the Religio...,John Robison,books,1,1,0,"""Proofs of a Conspiracy against all the Religi...",freemasonry early works to early works to,Proofs of a Conspiracy against all the Religio...,0.152001,-0.083474
2069,Sisters,Jason Moore Amy Poehler Tina Fey Maya Rudolph ...,films,1,99,0,Two sisters decide to throw one last house par...,comedy,Sisters- Jason Moore Amy Poehler Tina Fey Maya...,0.193201,-0.083817
1731,Pandorum,Christian Alvart Dennis Quaid Ben Foster Cam G...,films,1,99,0,A pair of crew members aboard a spaceship wake...,action horror mystery,Pandorum- Christian Alvart Dennis Quaid Ben Fo...,0.031729,-0.091445


#Puesta en marcha

##Funciones

In [None]:
def app(df_all,input_1,input_2,input_3,input_4):
  sentiment_score=input_sentiment_analysis(input_1)
  df_4 = semantic_search(input_4, df_all, 'embeddings_data.pt')
  df_2 = players_filter(df_4,input_2)
  df_3 = parental_filter(df_2,input_3)
  df_1 = sentiment_filter(df_3, sentiment_score)
  df_1 = df_1.sort_values(by='Similitud', ascending=False).head(10)
  df_output = df_1.drop(columns=['maxplayers', 'minplayers', 'allow_kids', 'Data', 'CS_Happy', 'Similitud'], errors='ignore')
  return df_1,df_2,df_3,df_4,df_output

##Uso

In [None]:
#1.¿Cómo estás hoy?
input_1 = "me siento un poco triste"
#2.¿Cuántas personas juegan?
input_2 = 1
#3.¿Hay niños? (S/N)
input_3 = "N"
#4.¿De qué tenés ganas?
input_4 = 'Quiero historia de amor en la selva'

In [None]:
df_4,df_2,df_3,df_1,df_output = app(df_all,input_1,input_2,input_3,input_4)



Frase: 'me siento un poco triste'
Sentimiento: 0.00, Confianza: 0.497



  return torch.load(file_path, map_location=torch.device('cpu'))


In [None]:
df_output.head()

Unnamed: 0,Title,People,Class,Description,Genre
1386,The Legend of Tarzan,David Yates Alexander Skarsgård Rory J. Saper ...,films,Tarzan having acclimated to life in London is ...,action adventure drama
2266,Endless Love,Shana Feste Gabriella Wilde Alex Pettyfer Bruc...,films,The story of a privileged girl and a charismat...,drama romance
1726,Hunt for the Wilderpeople,Taika Waititi Sam Neill Julian Dennison Rima T...,films,A national manhunt is ordered for a rebellious...,adventure comedy drama
697,Tales and Stories,Mary Wollstonecraft Shelley,books,"""Tales and Stories"" by Mary Wollstonecraft She...",fiction th century short stories
1098,Bible animals :,J. G. Wood,books,"""Bible Animals: Being a Description of Every L...",animals in the nature in the
