# Merge dataset files

In [None]:
import pandas as pd
import os

Carpeta donde están tus CSV

In [None]:
# Directory that holds the raw per-platform CSV files read below.
DATA_DIR = "data"

Archivos a combinar

In [None]:
# File names (relative to DATA_DIR) of the catalogues to merge. The text before
# the first underscore of each name is used as the platform label when loading.
files = [
    "netflix_titles.csv",
    "amazon_prime_titles.csv",
    "hulu_titles.csv",
    "disney_plus_titles.csv"
]

Lista para guardar los DataFrames

In [None]:
# Accumulates one DataFrame per input file that is found on disk.
dataframes = []

In [None]:
# Load every catalogue that exists on disk, tag its rows with the platform,
# and queue it for concatenation. Missing files are reported and skipped.
for f in files:
    path = os.path.join(DATA_DIR, f)
    if not os.path.exists(path):
        print(f" No se encontró: {path}")
        continue

    df = pd.read_csv(path)
    # Normalise headers per file, *before* the frames are concatenated:
    # pd.concat aligns on exact column labels, so "Title" and "title " would
    # otherwise end up as separate, half-empty columns.
    df.columns = [c.lower().strip() for c in df.columns]
    # Platform label = first underscore-separated token of the file name,
    # e.g. "netflix_titles.csv" -> "Netflix".
    df["platform"] = f.split("_")[0].capitalize()
    dataframes.append(df)

Combinar todos los DataFrames

In [None]:
# Stack all loaded catalogues into a single frame with a fresh 0..n-1 index.
# NOTE(review): pd.concat raises ValueError when `dataframes` is empty, i.e.
# when none of the input files was found — confirm that is acceptable.
merged = pd.concat(dataframes, ignore_index=True)

Normalizar nombres de columnas

In [None]:
# Lower-case and trim every column label so later lookups by name are uniform.
merged.columns = [c.lower().strip() for c in merged.columns]

Columnas que queremos mantener (solo las que existen)

In [None]:
# Columns wanted in the merged output; any that are absent from the data are
# filtered out in the next step rather than causing an error.
keep_cols = ["title", "description", "cast", "director", "country",
             "release_year", "duration", "listed_in", "platform"]

In [None]:
# Restrict the frame to the wanted columns that are actually present,
# preserving the order given in keep_cols.
existing_cols = [name for name in keep_cols if name in merged.columns]
merged = merged.loc[:, existing_cols]

Renombrar listed_in -> genre

In [None]:
# Expose the "listed_in" column under the name "genre", which is the name the
# later embedding and import steps read.
if "listed_in" in merged.columns:
    merged = merged.rename(columns={"listed_in": "genre"})

Limpiar datos vacíos

In [None]:
# Drop rows lacking a title or a description, then keep only the first row
# for each (title, platform) pair.
merged = (
    merged
    .dropna(subset=["title", "description"])
    .drop_duplicates(subset=["title", "platform"])
)

Crear carpeta clean si no existe

In [None]:
# Ensure the output directory exists; exist_ok makes re-runs harmless.
os.makedirs("data/clean", exist_ok=True)

Guardar CSV limpio

In [None]:
# Write the merged catalogue without the row index column.
merged.to_csv("data/clean/tv_shows_complete.csv", index=False)

# Clean dataset

Ruta al CSV combinado

In [None]:
# Path of the merged catalogue written by the merge step.
input_csv = "data/clean/tv_shows_complete.csv"

Cargar CSV

In [None]:
# Load the merged catalogue.
df = pd.read_csv(input_csv)

Detectar columnas sin valores vacíos

In [None]:
# Names of the columns in which every row has a value (no NaN anywhere).
complete_cols = [name for name, is_full in df.notna().all().items() if is_full]

Crear nuevo DataFrame solo con esas columnas

In [None]:
# Keep only the fully populated columns.
# NOTE(review): later steps read "title", "description", "genre",
# "release_year" and "platform"; any of them containing a NaN would be dropped
# here — confirm they are always complete.
df_complete = df[complete_cols]

Crear carpeta clean si no existe

In [None]:
# Ensure the output directory exists; exist_ok makes re-runs harmless.
os.makedirs("data/clean", exist_ok=True)


Guardar CSV limpio solo con columnas completas

In [None]:
# Write the complete-columns-only frame without the row index column.
output_csv = "data/clean/tv_shows_clean.csv"
df_complete.to_csv(output_csv, index=False)

# Generate embeddings

In [None]:
from llama_index.embeddings.ollama import OllamaEmbedding

Configuración

In [None]:
# Input/output locations and embedding backend settings.
INPUT_CSV = "data/clean/tv_shows_clean.csv"
OUTPUT_CSV = "data/clean/tv_shows_embeddings.csv"
MODEL_NAME = "mxbai-embed-large:latest"  # model name passed to OllamaEmbedding
BASE_URL = "http://localhost:11434"  # base URL passed to OllamaEmbedding

Cargar CSV

In [None]:
# Load the cleaned catalogue that will be embedded.
df = pd.read_csv(INPUT_CSV)

Crear modelo de embeddings

In [None]:
# Embedding client configured with MODEL_NAME and BASE_URL.
emb_model = OllamaEmbedding(
    model_name=MODEL_NAME,
    base_url=BASE_URL
)

Función para combinar texto relevante

In [None]:
def text_for_embedding(row):
    """Build the text to embed for one show.

    Args:
        row: Mapping-like object providing ``title``, ``description`` and
            ``genre`` entries.

    Returns:
        The three values rendered as text and separated by single spaces.
    """
    parts = (row['title'], row['description'], row['genre'])
    return " ".join(str(part) for part in parts)

Generar embeddings

In [None]:
# Build one text per row, then embed them all in a single batched call.
# Presumably the result holds one vector per input text, in the same order
# (it is assigned as a column right after) — confirm against the client docs.
texts = [text_for_embedding(row) for _, row in df.iterrows()]
embeddings = emb_model.get_text_embedding_batch(texts, show_progress=True)

Añadir embeddings al DataFrame

In [None]:
# Store each row's embedding alongside its metadata.
df['embedding'] = embeddings

Guardar CSV con embeddings

In [None]:
# Make sure the target directory exists, then write the frame without the row
# index. The embedding column is written as text, so it has to be parsed back
# into a list when this CSV is loaded again.
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
df.to_csv(OUTPUT_CSV, index=False)

# Importing data to Neo4j

In [None]:
from neo4j import GraphDatabase
import ast

Configuración

In [None]:
# Source CSV and Neo4j connection settings.
# NOTE(review): the password is left empty here — supply it locally (or load it
# from the environment) rather than committing a real secret.
CSV_PATH = "data/clean/tv_shows_embeddings.csv"
NEO4J_URI = "neo4j://127.0.0.1:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = ""
NEO4J_DB = "neo4j"

Conectar a Neo4j

In [None]:
# Driver used by the import below; closed explicitly at the end of the import.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

Función para crear nodos

In [None]:
def crear_nodo(tx, show):
    """Create one ``TVShow`` node from the given property mapping.

    Args:
        tx: Object exposing ``run(query, **params)``; the caller hands this
            function to ``session.execute_write``.
        show: Mapping whose entries supply the ``$title``, ``$description``,
            ``$release_year``, ``$genre``, ``$platform`` and ``$embedding``
            query parameters.
    """
    query = """
    CREATE (s:TVShow {
        title: $title,
        description: $description,
        release_year: $release_year,
        genre: $genre,
        platform: $platform,
        embedding: $embedding
    })
    """
    tx.run(query, **show)

Leer CSV

In [None]:
# Load the catalogue with embeddings produced by the previous step.
df = pd.read_csv(CSV_PATH)

Convertir la columna embedding de string a lista si es necesario

In [None]:
# After the CSV round trip each embedding arrives as the text form of a list
# (e.g. "[0.1, 0.2]"); parse those back into Python lists. The check is done
# per element, so an empty frame no longer raises IndexError (the previous
# `.iloc[0]` probe did) and values that are already lists pass through as-is.
# ast.literal_eval only evaluates literals, never arbitrary code.
df['embedding'] = df['embedding'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

Insertar en Neo4j

In [None]:
# Insert every row as its own TVShow node, one write transaction per row.
with driver.session(database=NEO4J_DB) as session:
    for _, row in df.iterrows():
        show = dict(
            title=row['title'],
            description=row['description'],
            # Cast to a plain int before passing it as a query parameter.
            release_year=int(row['release_year']),
            genre=row['genre'],
            platform=row['platform'],
            embedding=row['embedding'],
        )
        session.execute_write(crear_nodo, show)


Cerrar la conexión

In [None]:
# Release the driver's resources once the import is finished.
driver.close()

# Groq recommender

In [None]:
import numpy as np
from neo4j import GraphDatabase

from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core.base.llms.types import ChatMessage

from llama_index.llms.groq import Groq

class GroqRecommender:
    """Recommend TV shows by embedding similarity, worded by a Groq-hosted LLM.

    ``TVShow`` nodes are read from Neo4j, ranked by cosine similarity between
    their stored embedding and the embedding of the user query, and the
    ``top_k`` best matches are handed to the LLM as context for its answer.
    """

    def __init__(
            self,
            neo4j_uri="neo4j://127.0.0.1:7687",
            neo4j_user="neo4j",
            neo4j_password="",
            neo4j_db="neo4j",
            model_name_embed="mxbai-embed-large:latest",
            model_name_llm="llama-3.1-8b-instant",
            groq_api_key="",
            top_k=5,
    ):
        """Open the Neo4j driver and build the embedding and chat clients.

        Args:
            neo4j_uri: URI passed to ``GraphDatabase.driver``.
            neo4j_user: User name for Neo4j authentication.
            neo4j_password: Password for Neo4j authentication.
            neo4j_db: Database name used when opening sessions.
            model_name_embed: Model name passed to ``OllamaEmbedding``.
            model_name_llm: Model name passed to ``Groq``.
            groq_api_key: API key passed to ``Groq``.
            top_k: Number of best-matching shows given to the LLM.
        """
        self.neo4j_uri = neo4j_uri
        self.neo4j_user = neo4j_user
        self.neo4j_password = neo4j_password
        self.neo4j_db = neo4j_db
        self.top_k = top_k

        self.driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))

        self.emb_model = OllamaEmbedding(model_name=model_name_embed)

        self.llm = Groq(model=model_name_llm, api_key=groq_api_key)

    def _coseno_sim(self, a, b):
        """Return the cosine similarity of two vectors as a float.

        A zero-norm vector has no direction, so the similarity is defined as
        0.0 in that case instead of dividing by zero (which would yield NaN
        and make the ranking order unreliable).
        """
        a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
        denominator = np.linalg.norm(a) * np.linalg.norm(b)
        if denominator == 0:
            return 0.0
        return float(np.dot(a, b) / denominator)

    @staticmethod
    def _format_shows(shows):
        """Render shows as one ``- title (genre, platform) — description`` line each."""
        return "\n".join(
            f"- {s['title']} ({s['genre']}, {s['platform']}) — {s['description']}"
            for s in shows
        )

    def _get_shows(self):
        """Fetch every ``TVShow`` node that has a non-empty embedding.

        Returns:
            A list of dicts with ``title``, ``description``, ``genre``,
            ``platform`` and ``embedding`` (a float NumPy array).
        """
        with self.driver.session(database=self.neo4j_db) as session:
            result = session.run("""
                MATCH (s:TVShow)
                RETURN s.title AS title, s.description AS description,
                       s.genre AS genre, s.platform AS platform,
                       s.embedding AS embedding
            """)
            shows = []
            for record in result:
                emb = record.get("embedding")
                # Nodes without an embedding cannot be ranked; skip them.
                if emb:
                    shows.append({
                        "title": record["title"],
                        "description": record["description"],
                        "genre": record["genre"],
                        "platform": record["platform"],
                        "embedding": np.array(emb, dtype=float)
                    })
            return shows

    def recommend(self, query: str) -> str:
        """Answer a user query with show recommendations.

        Args:
            query: Free-text user message.

        Returns:
            The LLM's reply, a fixed message when the database holds no shows,
            or an error message if any step raises. Errors are reported in the
            returned text rather than propagated.
        """
        try:
            query_emb = self.emb_model.get_text_embedding_batch([query])[0]
            shows = self._get_shows()
            if not shows:
                return "No hay series disponibles en la base de datos."

            # Rank by similarity only; the key keeps the show dicts themselves
            # out of the comparison.
            ranked = sorted(
                ((self._coseno_sim(query_emb, s["embedding"]), s) for s in shows),
                key=lambda pair: pair[0],
                reverse=True,
            )
            top_shows = [show for _, show in ranked[:self.top_k]]

            # Build the listing once; it is both logged and placed in the prompt.
            listing = self._format_shows(top_shows)

            print("List by embeddings: ")
            print(listing)

            prompt = f"""
            You are an expert in TV and streaming entertainment.
            Your goal is to help the user discover TV shows based on their message.

            Instructions:
            - If the user ask for any other topic just tell them that you are only gonna help them with tv shows recommendations..
            - Recommendations:
            {listing}

            User Query: "{query}"

            Respond in a warm tone recommending the best matches from the list above.
            """

            messages = [ChatMessage(role="user", content=prompt)]

            respuesta = self.llm.chat(messages)

            print("LLM Response: ")
            print(respuesta.message.content)
            return respuesta.message.content

        except Exception as e:
            return f"Error al generar recomendación: {str(e)}"

    def close(self):
        """Close the Neo4j driver."""
        self.driver.close()

# FastAPI server

In [None]:
import os
import asyncio
import string
import random
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
import uvicorn
import re
from scripts.groq_recommender import GroqRecommender
from scripts.main import process_content

In [None]:
# Application object plus one shared recommender, both created at import time.
app = FastAPI()
llm_recommender = GroqRecommender()

# BASE_DIR is the parent of the directory containing this file; the static
# front-end is served from its "front" sub-directory.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
FRONT_DIR = os.path.join(BASE_DIR, "front")

Almacén temporal de resultados

In [None]:
# In-memory store: link id -> {"status": ..., "message": ...}. Entries are
# created by the /send endpoint and read by /result/{link_id}.
results = {}

In [None]:
def format_bold(text):
    """Convert Markdown-style ``**bold**`` spans into ``<b>bold</b>``.

    Whitespace directly inside the asterisks is dropped. Matching is lazy, so
    several bold spans in one string are converted independently.

    Args:
        text: Text that may contain ``**...**`` spans.

    Returns:
        The text with every such span replaced by a ``<b>`` element.
    """
    return re.sub(r"\*\*\s*(.*?)\s*\*\*", r"<b>\1</b>", text)

In [None]:
def generate_random_id(length=8):
    """Return a random identifier of lowercase letters and digits.

    The identifier is the only thing protecting a stored result from being
    read by someone else, so it is drawn from the operating system's secure
    random source rather than the predictable default generator.

    Args:
        length: Number of characters to generate.

    Returns:
        A string of ``length`` characters from ``[a-z0-9]``.
    """
    alphabet = string.ascii_lowercase + string.digits
    return ''.join(random.SystemRandom().choices(alphabet, k=length))


Endpoint send

In [None]:



# Strong references to in-flight background tasks. The event loop itself only
# keeps weak references, so an otherwise unreferenced task may be garbage
# collected before it finishes.
_background_tasks = set()


@app.post("/send")
async def receive_content(request: Request):
    """Accept a user message and schedule its processing in the background.

    Reads the ``content`` field of the JSON body (empty string if absent),
    registers it under a fresh link id with status ``"not ready"``, starts
    ``process_content`` for that id and immediately returns the polling link.

    Returns:
        JSON of the form ``{"link": "/result/<id>"}``.
    """
    data = await request.json()
    content = data.get("content", "")

    # Draw a new id, retrying on the (unlikely) chance it is already in use so
    # that an existing entry is never overwritten.
    link_id = generate_random_id()
    while link_id in results:
        link_id = generate_random_id()
    results[link_id] = {"status": "not ready", "message": content}

    # Keep the task referenced until it completes, then drop the reference.
    task = asyncio.create_task(process_content(link_id))
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)

    # Hand the polling link back to the client.
    return JSONResponse({"link": f"/result/{link_id}"})

In [None]:
async def process_content(link_id: str):
    """Produce the recommendation for a stored message and mark it ready.

    Args:
        link_id: Key of an existing entry in ``results`` whose ``message``
            holds the user's text.

    On completion the entry's ``message`` is replaced with the HTML-formatted
    reply and its ``status`` is set to ``"ready"``.
    """
    import html  # local import: only this function needs it

    # recommend() is a blocking call; run it in the default thread pool so the
    # event loop stays free to serve other requests (e.g. /result polling).
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(
        None, llm_recommender.recommend, results[link_id]["message"]
    )

    # The reply is turned into HTML below, so neutralise any markup contained
    # in the model output before adding our own <br> and <b> tags.
    response = html.escape(response, quote=False)
    response = response.replace("\n", "<br>")
    response = format_bold(response)
    results[link_id]["status"] = "ready"
    results[link_id]["message"] = response

In [None]:
@app.get("/result/{link_id}")
async def get_result(link_id: str):
    """Report the processing state of a previously submitted message.

    Args:
        link_id: Identifier taken from the link returned by ``/send``.

    Returns:
        A 404 JSON error for an unknown id, ``{"status": "not ready"}`` while
        processing is pending, otherwise the ready status with the message.
    """
    entry = results.get(link_id)
    if not entry:
        return JSONResponse({"error": "invalid link"}, status_code=404)

    if entry["status"] != "not ready":
        return JSONResponse({
            "status": "ready",
            "message": entry["message"]
        })
    return JSONResponse({"status": "not ready"})



In [None]:
# Serve the static front-end from FRONT_DIR at the root path (html=True).
app.mount("/", StaticFiles(directory=FRONT_DIR, html=True), name="front")

# When executed directly, start a local server on 127.0.0.1:8000 with
# auto-reload enabled.
if __name__ == "__main__":
    uvicorn.run("scripts.main:app", host="127.0.0.1", port=8000, reload=True)