# Fase 1. Recolección y organización de los datos

En docs "lineamientos_nomenclatura_y_citacion.md" está la información por si quieren guiarse y revisar.

# Fase 2. Procesamiento de texto

## Extracción del texto

In [None]:
from PyPDF2 import PdfReader
import os, re

# Project data layout: raw PDFs are read from one folder and their plain-text
# dumps are written to another.
BASE_DIR = "data"
RAW_PDF_DIR = os.path.join(BASE_DIR, "apuntes_raw")
OUT_RAW_DIR = os.path.join(BASE_DIR, "apuntes_clean", "raw")

# The destination folder must exist before any file is written.
os.makedirs(OUT_RAW_DIR, exist_ok=True)

# Every entry of the raw folder whose name ends in ".pdf" (case-insensitive).
pdf_files = [name for name in os.listdir(RAW_PDF_DIR) if name.lower().endswith(".pdf")]

for pdf_file in pdf_files:
    stem, _ext = os.path.splitext(pdf_file)
    out_txt_path = os.path.join(OUT_RAW_DIR, stem + ".txt")

    # A page whose extract_text() result is falsy contributes an empty string.
    reader = PdfReader(os.path.join(RAW_PDF_DIR, pdf_file))
    page_texts = [(page.extract_text() or "") for page in reader.pages]

    # Pages are separated by a blank line in the output file.
    with open(out_txt_path, "w", encoding="utf-8") as out_file:
        out_file.write("\n\n".join(page_texts))

print(f"‚úÖ {len(pdf_files)} archivos extra√≠dos correctamente a {OUT_RAW_DIR}")


ModuleNotFoundError: No module named 'PyPDF2'

## Normalización de txt

In [None]:
import os, re, unicodedata

# Folder layout for the normalization step; the output folder is created up front.
BASE_DIR = "data"
CLEAN_IN_DIR  = os.path.join(BASE_DIR, "apuntes_clean", "raw")        # input folder
CLEAN_OUT_DIR = os.path.join(BASE_DIR, "apuntes_clean", "normalized") # output folder
os.makedirs(CLEAN_OUT_DIR, exist_ok=True)

def quitar_tildes_y_reparar_espacios(texto: str) -> str:
    """Strip accents and repair common PDF-extraction artefacts, keeping ñ/Ñ.

    Steps, in order: Unicode decomposition, replacement of typical PDF glyphs
    (NBSP, soft hyphen, fi/fl ligatures, dotless i, stray diacritic signs),
    re-joining of words split by a loose acute accent, conversion of a loose
    tilde before n/N into ñ/Ñ, removal of the remaining combining marks, and
    a light whitespace clean-up.

    Args:
        texto: Raw text, typically extracted from a PDF.

    Returns:
        The normalized text with leading/trailing whitespace removed.
    """
    # 1) Canonical decomposition exposes accents as separate combining marks.
    t = unicodedata.normalize("NFD", texto)
    # NFD also splits ñ/Ñ into n/N + U+0303. Recompose them immediately so the
    # mark-stripping step below cannot degrade them to plain n/N.
    t = t.replace("n\u0303", "\u00F1").replace("N\u0303", "\u00D1")

    # 2) Typical PDF replacements. Non-ASCII characters are written as escapes
    #    so the behavior does not depend on how this file was saved or decoded.
    t = (t.replace("\u00A0", " ")      # NBSP -> regular space
           .replace("\u00AD", "")      # soft hyphen -> nothing
           .replace("\uFB01", "fi").replace("\uFB02", "fl")  # ligatures
           .replace("\u0131", "i")     # dotless i -> i
           .replace("\u02D9", "").replace("`", "").replace("\u00A8", "").replace("\u02C6", ""))

    # 3) Join ONLY around a loose accent: "implementaci ´on" -> "implementacion".
    #    A spacing acute (U+00B4) is always loose. A combining acute (U+0301)
    #    is loose only when whitespace precedes it; one attached directly to
    #    its base letter is a real accent and must not swallow the following
    #    space ("está aquí" must not become "estaaqui").
    t = re.sub(r"([A-Za-z\u00F1\u00D1])(?:\s*\u00B4|\s+\u0301)\s*([A-Za-z\u00F1\u00D1])", r"\1\2", t)

    # 4) Turn a loose tilde (~ or U+02DC) into ñ/Ñ, e.g. "tama ˜no" -> "tamaño".
    #    a) letter + tilde + n/N
    t = re.sub(r"([A-Za-z\u00F1\u00D1])\s*[\u02DC~]\s*([Nn])",
               lambda m: m.group(1) + ("\u00D1" if m.group(2).isupper() else "\u00F1"),
               t)
    #    b) tilde at the start or after whitespace, before n/N + vowel
    t = re.sub(r"(?<!\S)[\u02DC~]\s*([Nn])(?=[aeiou\u00E1\u00E9\u00ED\u00F3\u00FAAEIOU\u00C1\u00C9\u00CD\u00D3\u00DA])",
               lambda m: ("\u00D1" if m.group(1).isupper() else "\u00F1"),
               t)

    # 5) Drop the remaining combining marks (accents). ñ/Ñ are precomposed
    #    letters at this point, so they are not affected.
    t = "".join(c for c in t if unicodedata.category(c) != "Mn")

    # 6) Light clean-up: collapse runs of spaces/tabs, cap blank lines at one.
    t = re.sub(r"[ \t]+", " ", t)
    t = re.sub(r"\n{3,}", "\n\n", t)

    return t.strip()

# Normalize every .txt file of the input folder into the output folder,
# keeping the same file name.
count = 0
txt_names = [name for name in os.listdir(CLEAN_IN_DIR) if name.lower().endswith(".txt")]
for fname in txt_names:
    src_path = os.path.join(CLEAN_IN_DIR, fname)
    dst_path = os.path.join(CLEAN_OUT_DIR, fname)

    # Undecodable bytes are dropped rather than aborting the run.
    with open(src_path, "r", encoding="utf-8", errors="ignore") as src:
        raw = src.read()

    norm = quitar_tildes_y_reparar_espacios(raw)

    with open(dst_path, "w", encoding="utf-8") as dst:
        dst.write(norm)
    count += 1

print(f"‚úÖ {count} archivos corregidos y guardados en {CLEAN_OUT_DIR}")


‚úÖ 46 archivos corregidos y guardados en data\apuntes_clean\normalized


## Segmentación (Párrafos y Ventanas Deslizantes)

In [None]:
import os, re, csv, shutil
from statistics import mean

# Input for both segmentation methods: the normalized text files.
BASE_DIR = "data"
INPUT_DIR = os.path.join(BASE_DIR, "apuntes_clean", "normalized")

# --- Paragraphs ---
PAR_MIN_CHARS = 480      # paragraphs shorter than this are fused with the next one
PAR_MAX_CHARS = 2000     # paragraphs longer than this are re-split
MERGE_TITLES = True      # whether title-like paragraphs are merged with what follows
TITLE_MAX_CHARS = 140    # maximum length for a paragraph to count as title-like

# --- Sliding windows ---
WIN_WORDS = 240          # window size, in whitespace-separated words
WIN_OVERLAP = 0.20       # fraction of the window shared with the next one
# Step between window starts; never below 1 so the window always advances.
WIN_STRIDE = max(1, int(WIN_WORDS * (1 - WIN_OVERLAP)))

# --- Outputs ---
OUT_PAR_DIR = os.path.join(BASE_DIR, "chunks_paragraphs")
OUT_WIN_DIR = os.path.join(BASE_DIR, "chunks_sliding")

# Wipe previous outputs so the indexes stay consistent with the chunk files.
# NOTE: this deletes both output folders recursively on every run.
for d in (OUT_PAR_DIR, OUT_WIN_DIR):
    if os.path.exists(d):
        shutil.rmtree(d)
    os.makedirs(d, exist_ok=True)

IDX_PAR_CSV = os.path.join(OUT_PAR_DIR, "index_paragraphs.csv")
IDX_WIN_CSV = os.path.join(OUT_WIN_DIR, "index_sliding.csv")
SUMMARY_CSV = os.path.join(BASE_DIR, "chunks_summary.csv")

# ===================== UTILIDADES =====================
def read_txt(path):
    """Return the whole file at *path* decoded as UTF-8, dropping undecodable bytes."""
    with open(path, encoding="utf-8", errors="ignore") as src:
        content = src.read()
    return content

def safe_filename_stem(fname):
    """Return the base name of *fname* without directories and without its last extension."""
    stem, _extension = os.path.splitext(os.path.basename(fname))
    return stem

def split_paragraphs(text):
    """Split *text* on blank lines and return the non-empty paragraphs.

    Each paragraph is stripped and its runs of spaces/tabs are collapsed to a
    single space; single newlines inside a paragraph are kept.
    """
    stripped_blocks = (block.strip() for block in re.split(r"\n\s*\n", text))
    return [
        re.sub(r"[ \t]+", " ", block).strip()
        for block in stripped_blocks
        if block
    ]

def split_sentences(p):
    """Split *p* at whitespace that follows '.', '?' or '!' and return the non-empty pieces."""
    sentences = []
    for piece in re.split(r"(?<=[\.\?\!])\s+", p):
        piece = piece.strip()
        if piece:
            sentences.append(piece)
    return sentences

def repartition_long_paragraph(p, max_chars):
    """Break paragraph *p* into pieces of at most *max_chars* characters.

    A paragraph that already fits is returned as a single-item list.
    Otherwise its sentences are packed greedily, joined by one space, and any
    packed block that is still too long is cut into fixed-size slices.
    """
    if len(p) <= max_chars:
        return [p]

    # Greedy packing: keep appending sentences while the block still fits.
    packed = []
    current = ""
    for sentence in split_sentences(p):
        if current and len(current) + 1 + len(sentence) > max_chars:
            packed.append(current.strip())
            current = sentence
        elif current:
            current = current + " " + sentence
        else:
            current = sentence
    if current:
        packed.append(current.strip())

    # A single sentence can exceed the limit on its own; hard-slice those.
    pieces = []
    for block in packed:
        if len(block) > max_chars:
            pieces.extend(
                block[start:start + max_chars].strip()
                for start in range(0, len(block), max_chars)
            )
        else:
            pieces.append(block)
    return pieces

def fuse_short_paragraphs(pars, min_chars, merge_titles, title_max):
    """Merge title-like or short paragraphs with the paragraph that follows.

    Walking left to right: a title-like paragraph (only when *merge_titles* is
    truthy: at most *title_max* characters, no newline, at most 16 words) is
    joined to its successor with a dash separator; otherwise a paragraph
    shorter than *min_chars* is joined to its successor with a space. A merge
    consumes both paragraphs, and the last paragraph is never merged forward.
    """
    fused = []
    pos, total = 0, len(pars)
    while pos < total:
        current = pars[pos]
        has_next = pos + 1 < total
        looks_like_title = merge_titles and (
            len(current) <= title_max and "\n" not in current and len(current.split()) <= 16
        )
        if has_next and looks_like_title:
            fused.append((current + " ‚Äî " + pars[pos + 1]).strip())
            pos += 2
        elif has_next and len(current) < min_chars:
            fused.append((current + " " + pars[pos + 1]).strip())
            pos += 2
        else:
            fused.append(current)
            pos += 1
    return fused

def ensure_dir(d):
    """Create directory *d*, including missing parents, if it does not exist.

    An empty path means the current directory and is treated as a no-op
    instead of raising. `exist_ok=True` already covers the "already there"
    case, so no separate existence check is needed.
    """
    if d:
        os.makedirs(d, exist_ok=True)

def write_chunk(path, text):
    """Write *text*, stripped of surrounding whitespace, to *path* as UTF-8.

    Missing parent directories are created. A bare file name has no parent
    component, so directory creation is skipped in that case instead of
    failing on an empty path.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(text.strip())

def word_tokenize(text):
    """Return the maximal runs of non-whitespace characters of *text*, in order."""
    return [match.group(0) for match in re.finditer(r"\S+", text)]

# ===================== M√âTODO A: P√ÅRRAFOS =====================
# Method A: one chunk per (fused / re-split) paragraph, one folder per document.
par_rows = []
summary_rows = []

files = [f for f in os.listdir(INPUT_DIR) if f.lower().endswith(".txt")]
for fname in sorted(files):
    base = safe_filename_stem(fname)
    out_dir_doc = os.path.join(OUT_PAR_DIR, base)
    ensure_dir(out_dir_doc)

    # Paragraph pipeline: split -> fuse short/title paragraphs -> re-split long ones.
    txt = read_txt(os.path.join(INPUT_DIR, fname))
    fused = fuse_short_paragraphs(split_paragraphs(txt), PAR_MIN_CHARS, MERGE_TITLES, TITLE_MAX_CHARS)
    final_pars = [
        piece
        for par in fused
        for piece in repartition_long_paragraph(par, PAR_MAX_CHARS)
    ]
    lengths = [len(piece) for piece in final_pars]

    # Chunk numbering is 1-based and restarts for every document.
    for idx, chunk in enumerate(final_pars, start=1):
        out_path = os.path.join(out_dir_doc, f"chunk_{idx:04d}.txt")
        write_chunk(out_path, chunk)
        par_rows.append({
            "filename_base": base,
            "method": "paragraphs",
            "chunk_id": f"{base}-p-{idx:04d}",
            # Stored relative to BASE_DIR, always with forward slashes.
            "chunk_path": os.path.relpath(out_path, BASE_DIR).replace("\\","/"),
            "char_len": len(chunk),
            "word_len": len(chunk.split()),
            "paragraph_idx": idx
        })

    n_chunks = len(lengths)
    summary_rows.append({
        "filename_base": base,
        "method": "paragraphs",
        "n_chunks": n_chunks,
        "char_mean": round(mean(lengths), 1) if n_chunks else 0,
        "pct_short_<300": round(100*sum(l<300 for l in lengths)/n_chunks, 1) if n_chunks else 0
    })

# The index is written only when at least one chunk was produced.
if par_rows:
    par_fields = list(par_rows[0])
    with open(IDX_PAR_CSV, "w", newline="", encoding="utf-8") as index_file:
        index_writer = csv.DictWriter(index_file, fieldnames=par_fields)
        index_writer.writeheader()
        index_writer.writerows(par_rows)

# ===================== M√âTODO B: VENTANAS DESLIZANTES =====================
# Method B: fixed-size word windows advancing by WIN_STRIDE words.
win_rows = []
for fname in sorted(files):
    base = safe_filename_stem(fname)
    out_dir_doc = os.path.join(OUT_WIN_DIR, base)
    ensure_dir(out_dir_doc)

    words = word_tokenize(read_txt(os.path.join(INPUT_DIR, fname)))
    n = len(words)
    lengths = []

    # range() yields nothing for an empty document, so such a file simply gets
    # the all-zero summary row appended below.
    for win_id, idx in enumerate(range(0, n, WIN_STRIDE), start=1):
        end = min(n, idx + WIN_WORDS)
        w_chunk = words[idx:end]
        chunk = " ".join(w_chunk).strip()
        if not chunk:
            break

        out_path = os.path.join(out_dir_doc, f"chunk_{win_id:04d}.txt")
        write_chunk(out_path, chunk)

        lengths.append(len(chunk))
        win_rows.append({
            "filename_base": base,
            "method": "sliding",
            "chunk_id": f"{base}-w-{win_id:04d}",
            # Stored relative to BASE_DIR, always with forward slashes.
            "chunk_path": os.path.relpath(out_path, BASE_DIR).replace("\\","/"),
            "char_len": len(chunk),
            "word_len": len(w_chunk),
            "start_word": idx,
            "end_word": end
        })

        # Stop once a window reaches the last word; later starts would only
        # produce shorter suffixes of this window.
        if end == n:
            break

    n_chunks = len(lengths)
    summary_rows.append({
        "filename_base": base,
        "method": "sliding",
        "n_chunks": n_chunks,
        "char_mean": round(mean(lengths), 1) if n_chunks else 0,
        "pct_short_<300": round(100*sum(l<300 for l in lengths)/n_chunks, 1) if n_chunks else 0
    })

# The index is written only when at least one window was produced.
if win_rows:
    win_fields = list(win_rows[0])
    with open(IDX_WIN_CSV, "w", newline="", encoding="utf-8") as index_file:
        index_writer = csv.DictWriter(index_file, fieldnames=win_fields)
        index_writer.writeheader()
        index_writer.writerows(win_rows)

# ===================== RESUMEN =====================
# Per-document statistics for both methods go into a single CSV.
if summary_rows:
    summary_fields = list(summary_rows[0])
    with open(SUMMARY_CSV, "w", newline="", encoding="utf-8") as summary_file:
        summary_writer = csv.DictWriter(summary_file, fieldnames=summary_fields)
        summary_writer.writeheader()
        for summary_row in summary_rows:
            summary_writer.writerow(summary_row)

# Report where every artefact was written.
print("‚úÖ Segmentaci√≥n regenerada con par√°metros ajustados.")
print(f" - √çndice p√°rrafos:  {IDX_PAR_CSV}")
print(f" - √çndice ventanas:  {IDX_WIN_CSV}")
print(f" - Resumen:          {SUMMARY_CSV}")
print(f" - Carpeta chunks A: {OUT_PAR_DIR}")
print(f" - Carpeta chunks B: {OUT_WIN_DIR}")


‚úÖ Segmentaci√≥n regenerada con par√°metros ajustados.
 - √çndice p√°rrafos:  data\chunks_paragraphs\index_paragraphs.csv
 - √çndice ventanas:  data\chunks_sliding\index_sliding.csv
 - Resumen:          data\chunks_summary.csv
 - Carpeta chunks A: data\chunks_paragraphs
 - Carpeta chunks B: data\chunks_sliding


# Fase 3. Tokenización y Embeddings

In [None]:
import os, csv, time, math
from typing import List
from tqdm import tqdm

import chromadb
from chromadb.config import Settings

# (A) -- GENERAL CONFIGURATION
BASE_DIR       = "data"
INDEX_CSV      = os.path.join(BASE_DIR, "chunks_sliding", "index_sliding.csv")   # can be pointed at the paragraphs index instead
PERSIST_DIR    = os.path.join(BASE_DIR, "vectorstores", "chroma_sliding_openai_v1")
COLLECTION_NAME= "ai_apuntes_sliding_openai_v1"

# Embedding provider: "openai" or "local"
PROVIDER       = "openai"       
OPENAI_MODEL   = "text-embedding-3-small"      
LOCAL_MODEL    = "all-MiniLM-L6-v2"           

BATCH_SIZE     = 128            # number of chunks embedded and upserted per batch
MAX_RETRIES    = 5              # attempts per embedding request (rate limits / transient errors)
RETRY_BASE_SEC = 2              # base delay for the exponential backoff, in seconds

# (B) ‚Äî‚Äî SETUP DE EMBEDDINGS 
embed_dims = None

if PROVIDER == "openai":
    from openai import OpenAI

    # Credentials must never be committed with the notebook: the key is read
    # from the environment. Any key that was previously pasted into this file
    # should be treated as compromised and revoked.
    OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
    if not OPENAI_API_KEY:
        raise RuntimeError(
            "Set the OPENAI_API_KEY environment variable before running this cell."
        )

    oai_client = OpenAI(api_key=OPENAI_API_KEY)

    def embed_texts(texts: List[str]) -> List[List[float]]:
        """Embed *texts* with the OpenAI model, retrying with exponential backoff.

        Makes up to MAX_RETRIES attempts, waiting RETRY_BASE_SEC * 2**attempt
        seconds between them; the exception of the last attempt is re-raised.

        Returns:
            One embedding vector per input text, in the order of the response.

        Raises:
            RuntimeError: if MAX_RETRIES is not positive, so no request is made.
        """
        for attempt in range(MAX_RETRIES):
            try:
                resp = oai_client.embeddings.create(
                    model=OPENAI_MODEL,
                    input=texts
                )
            except Exception as e:
                # Out of attempts: propagate instead of announcing a retry
                # that will never happen.
                if attempt == MAX_RETRIES - 1:
                    raise
                wait = RETRY_BASE_SEC * (2 ** attempt)
                print(f"[WARN] Error {e}. Reintentando en {wait}s...")
                time.sleep(wait)
            else:
                return [d.embedding for d in resp.data]
        # Falling through would return None, which callers would pass on as
        # "no embeddings"; fail loudly instead.
        raise RuntimeError("MAX_RETRIES must be at least 1.")

elif PROVIDER == "local":
    from sentence_transformers import SentenceTransformer
    st_model = SentenceTransformer(LOCAL_MODEL)

    def embed_texts(texts: List[str]) -> List[List[float]]:
        """Embed *texts* with the local sentence-transformers model."""
        # convert_to_numpy=True makes encode() return an ndarray, which has
        # .tolist(); with False it returns a Python list of tensors and the
        # .tolist() call fails.
        return st_model.encode(texts, convert_to_numpy=True, normalize_embeddings=False).tolist()

else:
    raise ValueError("PROVIDER debe ser 'openai' o 'local'.")

# (C) ‚Äî‚Äî INICIALIZAR CHROMA PERSISTENTE 
os.makedirs(PERSIST_DIR, exist_ok=True)

client = chromadb.PersistentClient(
    path=PERSIST_DIR,
    settings=Settings(is_persistent=True)
)

# Create or fetch the collection in a single call. This replaces a bare
# `except:` around get_collection(), which also hid unrelated failures
# (including KeyboardInterrupt) behind an attempt to create the collection.
# NOTE(review): the metadata presumably only takes effect when the collection
# is first created; confirm for the installed Chroma version.
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    metadata={"hnsw:space": "cosine"}  # cosine distance
)

# (D) ‚Äî‚Äî UTILIDADES 
def read_index_rows(csv_path: str):
    """Read the CSV at *csv_path* and return its data rows as a list of dicts keyed by header."""
    with open(csv_path, "r", encoding="utf-8", newline="") as index_file:
        return list(csv.DictReader(index_file))

def load_chunk_text(chunk_path: str) -> str:
    """Return the stripped text of a chunk file, resolving relative paths.

    Absolute paths are used as given. A relative path that starts with one of
    the recognized project prefixes is only normalized; any other relative
    path is taken to be relative to the "data" folder.

    Raises:
        FileNotFoundError: if the resolved path does not exist.
    """
    project_prefixes = ("data/", ".\\data\\", ".\\chunks_")

    if os.path.isabs(chunk_path):
        path = chunk_path
    elif chunk_path.startswith(project_prefixes):
        path = os.path.normpath(chunk_path)
    else:
        # e.g. 'chunks_sliding/...' as stored in the index -> 'data/chunks_sliding/...'
        path = os.path.join("data", chunk_path)

    if not os.path.exists(path):
        raise FileNotFoundError(f"No se encontr√≥ el archivo: {path}")

    with open(path, "r", encoding="utf-8", errors="ignore") as chunk_file:
        content = chunk_file.read()
    return content.strip()


# (E) ‚Äî‚Äî CARGAR √çNDICE Y PREPARAR INGESTA 
# Load the chunk index and report how many documents and chunks it lists.
rows = read_index_rows(INDEX_CSV)
print(f"Documentos (unique filename_base): {len(set(r['filename_base'] for r in rows))}")
print(f"Total de chunks en √≠ndice: {len(rows)}")

# (F) -- BATCHED INGESTION WITH EMBEDDINGS
# Pending batch: three parallel lists that flush_batch() reads and then empties.
ids, docs, metas = [], [], []

def flush_batch():
    """Embed the pending batch, upsert it into the collection and clear it.

    Does nothing when the batch is empty. Operates on the module-level
    `ids`, `docs` and `metas` lists.
    """
    if not ids:
        return
    # Compute the embeddings of the current batch
    vecs = embed_texts(docs)
    # upsert: an id that already exists is updated (per the original author's note)
    collection.upsert(ids=ids, documents=docs, metadatas=metas, embeddings=vecs)
    ids.clear(); docs.clear(); metas.clear()

for r in tqdm(rows, desc="Ingestando chunks en Chroma"):
    chunk_id   = r["chunk_id"]              # e.g. <base>-w-0001
    chunk_path = r["chunk_path"]            # path of the chunk file as stored in the index
    text       = load_chunk_text(chunk_path)
    # Chunks whose file is empty after stripping are skipped
    if not text:
        continue

    ids.append(chunk_id)
    docs.append(text)
    # Numeric columns arrive from the CSV as strings; missing ones default to 0
    metas.append({
        "filename_base": r.get("filename_base", ""),
        "method":       r.get("method", "sliding"),
        "chunk_path":   r.get("chunk_path", ""),
        "char_len":     int(r.get("char_len", 0)),
        "word_len":     int(r.get("word_len", 0)),
        "start_word":   int(r.get("start_word", 0)),
        "end_word":     int(r.get("end_word", 0)),
    })

    if len(ids) >= BATCH_SIZE:
        flush_batch()

# final (possibly partial) batch
flush_batch()

print("‚úÖ Embeddings generados e indexados.")
print("üìö Collection:", COLLECTION_NAME, "| count =", collection.count())
print("üíæ Persist dir:", PERSIST_DIR)

Documentos (unique filename_base): 46
Total de chunks en √≠ndice: 386


Ingestando chunks en Chroma: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 386/386 [00:11<00:00, 33.70it/s]


‚úÖ Embeddings generados e indexados.
üìö Collection: ai_apuntes_sliding_openai_v1 | count = 386
üíæ Persist dir: data\vectorstores\chroma_sliding_openai_v1


## Prueba de Query

In [None]:
# This whole cell is a smoke test to check that the embeddings were indexed correctly.

import os

from openai import OpenAI

# The key is read from the environment; a secret pasted into the notebook
# leaks with every copy of the file and must be revoked.
oai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
OPENAI_MODEL = "text-embedding-3-small"

QUESTION = "¬øQu√© es un autoencoder y c√≥mo se entrena?"
TOP_K = 5

# 1) Embed the question with OpenAI
qvec = oai_client.embeddings.create(
    model=OPENAI_MODEL,
    input=QUESTION
).data[0].embedding

# 2) Query the collection for the TOP_K nearest chunks
res = collection.query(
    query_embeddings=[qvec],
    n_results=TOP_K,
    include=["metadatas", "distances", "documents"]
)

# 3) Print each hit: rank, distance, source document, path and a 500-char preview
for rank, (doc, meta, dist) in enumerate(zip(res["documents"][0],
                                             res["metadatas"][0],
                                             res["distances"][0]), start=1):
    print(f"\n#{rank}  dist={dist:.4f}  base={meta.get('filename_base')}  palabras={meta.get('word_len')}")
    print(meta.get("chunk_path"))
    print(doc[:500].replace("\n"," ") + ("..." if len(doc)>500 else ""))



#1  dist=0.3059  base=11_Semana_AI_20251014_3_AlexStevenNaranjoMasƒ±s_EmbeddingsAutoencoders  palabras=240
chunks_sliding/11_Semana_AI_20251014_3_AlexStevenNaranjoMasƒ±s_EmbeddingsAutoencoders/chunk_0005.txt
2. Estructura basica de un Autoencoder. V. AUTOENCODERS(CODIFICADORESAUTOMATICOS) A. Estructura General y Objetivo Encoder‚ÜíEspacio Latente‚ÜíDecoder Aprenden a reconstruir la entrada. Aunque la se√±al de entre- namiento es auto-supervisada (salida = entrada), se consideran tipicamente metodos no supervisados por no requerir etiquetas externas. B. Componentes y Variantes Encoder:reduce espacialidad y comprime informacion (conv +downsampling). Latente:vector/tensor compacto; su tama√±o controla capaci...

#2  dist=0.3154  base=11_Semana_AI_20251014_1_JuanDiegoJimenezValverde_CNNYAutoencoders  palabras=240
chunks_sliding/11_Semana_AI_20251014_1_JuanDiegoJimenezValverde_CNNYAutoencoders/chunk_0007.txt
sonutiles, por ejemplo, en aplicaciones medicas para resaltar fracturas o anomalia

# Fase 4. Herramientas

In [None]:
# ========================= RAG: retrieve + answer =========================
# Exploratory RAG prototype; the plan is to polish it and formalize it as a tool.
import os
import textwrap

from openai import OpenAI

# The key is read from the environment; a secret pasted into the notebook
# leaks with every copy of the file and must be revoked.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
ANSWER_MODEL   = "gpt-4o-mini"

oai = OpenAI(api_key=OPENAI_API_KEY)

def embed_query(q: str, model="text-embedding-3-small"):
    """Return the embedding of query *q* computed with the given OpenAI model."""
    response = oai.embeddings.create(model=model, input=q)
    first_item = response.data[0]
    return first_item.embedding

def retrieve(query: str, top_k: int = 5, as_similarity: bool = True):
    """Return the *top_k* chunks nearest to *query* from the collection.

    Each result is a dict with "text", "meta" and "score". The score is
    `1 - distance` when *as_similarity* is true and the raw distance otherwise.
    """
    hits = collection.query(
        query_embeddings=[embed_query(query)],
        n_results=top_k,
        include=["documents", "metadatas", "distances"]
    )
    # One query embedding was sent, so only the first result list is relevant.
    triples = zip(hits["documents"][0], hits["metadatas"][0], hits["distances"][0])
    return [
        {"text": text, "meta": meta, "score": (1 - distance) if as_similarity else distance}
        for text, meta, distance in triples
    ]

def build_context(chunks):
    """Render retrieved chunks as numbered source blocks separated by blank lines.

    Every block is a header line (source number, document, word count, score)
    followed by the chunk text on one line, shortened to at most 1200 characters.
    """
    def _render(i, c):
        header = f"[Fuente {i}] base={c['meta'].get('filename_base')} | palabras={c['meta'].get('word_len')} | score={c['score']:.3f}"
        one_line = c["text"].replace("\n", " ")
        return header + "\n" + textwrap.shorten(one_line, width=1200, placeholder=" ...")

    return "\n\n".join(_render(i, c) for i, c in enumerate(chunks, 1))

def answer_rag(query: str, top_k: int = 5):
    """Answer *query* with the chat model, using the retrieved chunks as context.

    Returns:
        A tuple `(answer_text, chunks)` with the model's reply and the
        retrieved chunks that were put in the prompt.
    """
    chunks = retrieve(query, top_k=top_k, as_similarity=True)
    context = build_context(chunks)

    system = (
        "Eres un asistente t√©cnico. Responde SOLO usando el CONTEXTO si es suficiente. "
        "Si faltan datos, dilo y sugiere pasos. Devuelve una respuesta clara y breve, con bullets si ayuda."
    )
    user = f"Pregunta:\n{query}\n\nCONTEXTO:\n{context}\n\nInstrucci√≥n: responde citando [Fuente i] cuando corresponda."

    conversation = [
        {"role":"system","content":system},
        {"role":"user","content":user},
    ]
    completion = oai.chat.completions.create(
        model=ANSWER_MODEL,
        messages=conversation,
        temperature=0.2
    )
    answer_text = completion.choices[0].message.content
    return answer_text, chunks

# Example usage: ask one question, then print the answer followed by the
# sources (document, score and chunk path) that were retrieved for it.
query = "¬øQu√© es un autoencoder y c√≥mo se entrena?"
respuesta, fuentes = answer_rag(query, top_k=5)
print("üß† Respuesta RAG:\n", respuesta)
print("\nüîó Fuentes:")
for i, f in enumerate(fuentes, 1):
    print(f"[Fuente {i}] {f['meta'].get('filename_base')}  |  score={f['score']:.3f}  |  {f['meta'].get('chunk_path')}")
# ========================================================================


üß† Respuesta RAG:
 El contexto proporcionado no incluye informaci√≥n sobre el color del cielo. 

**Sugerencia:**
- Puedes investigar sobre la dispersi√≥n de la luz en la atm√≥sfera o consultar fuentes sobre meteorolog√≠a para obtener una respuesta precisa.

üîó Fuentes:
[Fuente 1] 1_Semana_AI_20250807_1_RodolfoDavidAcunaLopez_IntroduccionIA  |  score=0.224  |  chunks_sliding/1_Semana_AI_20250807_1_RodolfoDavidAcunaLopez_IntroduccionIA/chunk_0005.txt
[Fuente 2] 6_Semana_AI_20250911_1_AndreyUrenaBermudez_RegresionLogisticaGradiente  |  score=0.218  |  chunks_sliding/6_Semana_AI_20250911_1_AndreyUrenaBermudez_RegresionLogisticaGradiente/chunk_0001.txt
[Fuente 3] 6_Semana_AI_20250911_2_SahidRojasChacon_VerosimilitudRegresionLogistica  |  score=0.217  |  chunks_sliding/6_Semana_AI_20250911_2_SahidRojasChacon_VerosimilitudRegresionLogistica/chunk_0001.txt
[Fuente 4] 4_Semana_AI_20250826_2_LuisFelipeCalderonPerez_KNNRegresionLineal  |  score=0.207  |  chunks_sliding/4_Semana_AI_20250826_2_

# Fase 5. Perfil, orquestación y memoria del agente LLM

# Fase 6. Aplicación