In [2]:
from neo4j import GraphDatabase
import csv
import pandas as pd, unicodedata, regex, json
from pathlib import Path
from ftfy import fix_text
from pathlib import Path
import pysbd
import regex as re 

ModuleNotFoundError: No module named 'neo4j'

# Ingesta

In [7]:
# Configura tus credenciales y URL de conexión
URI = "bolt://localhost:7687" 
USER = "neo4j"
PASSWORD = "password"

# Crear driver
driver = GraphDatabase.driver(URI, auth=(USER, PASSWORD))

In [6]:
node_properties_query = """
MATCH (n)
UNWIND keys(n) AS prop
RETURN DISTINCT prop AS property_name
"""

with driver.session() as session:
    result = session.run(node_properties_query)
    columns = [record["property_name"] for record in result]

print(columns)


['doi', 'author_count', 'publication_date', 'abstract', 'title', 'scopus_id', 'neo4jImportId', 'affiliation_count', 'pk', 'name', 'country', 'city', 'auth_name', 'citation_count', 'initials', 'current_affiliation', 'first_name', 'last_name', 'updated', 'cursor', 'next_url']


In [8]:

def export_articles_to_csv():
    query = """
    MATCH (a:Article)
    WHERE a.scopus_id IS NOT NULL
      AND a.title IS NOT NULL AND a.title <> ""
      AND a.abstract IS NOT NULL AND a.abstract <> ""
      AND a.doi IS NOT NULL AND a.doi <> ""
    MATCH (au:Author)-[:WROTE]->(a)
    WITH a, collect(DISTINCT au.first_name + " " + au.last_name) AS authors
    WHERE size(authors) > 0
    OPTIONAL MATCH (a)-[:BELONGS_TO]->(af:Affiliation)
    WITH a, authors,
         collect(DISTINCT af.name)    AS affiliations,
         collect(DISTINCT af.city)    AS affiliation_cities,
         collect(DISTINCT af.country) AS affiliation_countries
    RETURN
      a.scopus_id                    AS scopus_id,
      a.title                        AS title,
      a.abstract                     AS abstract,
      a.doi                          AS doi,
      authors                        AS authors,
      affiliations                   AS affiliations,
      affiliation_cities             AS affiliation_cities,
      affiliation_countries          AS affiliation_countries,
      coalesce(a.citation_count, 0)  AS citation_count
    ORDER BY scopus_id
    """

    # Ejecuta consulta y arma DataFrame
    with driver.session() as session:
        rows = [dict(r) for r in session.run(query)]
    df = pd.DataFrame(rows)

    # Une listas con '; ' (evita introducir comas que confundan a quien lo lea a mano)
    def join_list(x):
        return "; ".join(str(v) for v in x if v) if isinstance(x, list) else x

    for col in ["authors", "affiliations", "affiliation_cities", "affiliation_countries"]:
        if col in df.columns:
            df[col] = df[col].map(join_list)

    # Orden de columnas
    df = df[[
        "title", "abstract", "doi", "authors",
        "affiliations", "affiliation_cities", "affiliation_countries",
        "citation_count", "scopus_id"
    ]]

    # Exporta con separador '|'
    # - quoting=QUOTE_MINIMAL: si algún campo contiene el separador '|', Pandas lo pondrá entre comillas.
    # - lineterminator="\n": EOL consistente.
    df.to_csv(
        "scopusdata.csv",
        index=False,
        encoding="utf-8",
        sep="|",
        quoting=csv.QUOTE_MINIMAL,
        lineterminator="\n",
    )
    print("Exportación completada: scopusdata.csv")

if __name__ == "__main__":
    export_articles_to_csv()



Exportación completada: scopusdata.csv


pseudocodigo

# normalizacion

In [None]:

# -------- Configuración --------
INPUT_CSV   = "scopusdata.csv"      # archivo con separador '|'
OUTPUT_PATH = "processed.parquet"   # salida recomendada (parquet)
REMOVE_ISOLATED_NUMBERS = False     # True si quieres quitar números sueltos

# -------- Funciones --------
def normalize_unicode_and_case(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = fix_text(s)                        # corrige codificación/caracteres raros
    s = s.replace("\u00A0", " ")           # NBSP -> espacio normal
    s = unicodedata.normalize("NFC", s)    # Unicode canónica
    s = s.lower()                          # minúsculas
    return s

def strip_non_informative(s: str, remove_numbers: bool = False) -> str:
    # Conserva letras/números/espacios y signos básicos de textos científicos
    s = regex.sub(r"[^\p{L}\p{N}\s\-\.,;:()\[\]/%]", " ", s)
    if remove_numbers:
        # Elimina números aislados; conserva casos como "co2", "iso-9001"
        s = regex.sub(r"\b\d+\b", " ", s)
    s = regex.sub(r"\s+", " ", s).strip()  # espacios
    return s

def normalize_text(s: str, remove_numbers: bool = False) -> str:
    s = normalize_unicode_and_case(s)
    s = strip_non_informative(s, remove_numbers=remove_numbers)
    return s

def safe_convert_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """Convierte a dtypes 'seguros' sin depender de opciones globales."""
    try:
        return df.convert_dtypes(dtype_backend="numpy_nullable")  # pandas nuevos
    except TypeError:
        return df.convert_dtypes()  # pandas más viejos

def sanitize_objects(df: pd.DataFrame) -> pd.DataFrame:
    """Convierte objetos no escalares (listas/dicts) a JSON string."""
    for c in df.columns:
        if df[c].dtype == "object":
            df[c] = df[c].map(
                lambda x: x if isinstance(x, (str, int, float, bool, type(None)))
                else json.dumps(x, ensure_ascii=False)
            )
    return df

def try_save_parquet(df: pd.DataFrame, path: str) -> bool:
    """Intenta guardar con fastparquet, luego pyarrow. Devuelve True si logra parquet."""
    # 1) fastparquet
    try:
        import fastparquet  # noqa: F401
        df.to_parquet(path, index=False, engine="fastparquet", compression="gzip")
        return True
    except Exception:
        pass
    # 2) pyarrow
    try:
        import pyarrow  # noqa: F401
        df.to_parquet(path, index=False, engine="pyarrow")  # compresión por defecto
        return True
    except Exception:
        return False

# -------- Proceso --------
# Lee CSV con separador pipe. Si tu exportación puso comillas cuando había '|',
# pandas las respeta automáticamente.
df = pd.read_csv(INPUT_CSV, sep="|", encoding="utf-8")

# Asegura presencia de columnas requeridas
for col in ["title", "abstract"]:
    if col not in df.columns:
        df[col] = ""

# Normalización SOLO sobre texto analizable
df["title_norm"]    = df["title"].fillna("").map(lambda x: normalize_text(x, REMOVE_ISOLATED_NUMBERS))
df["abstract_norm"] = df["abstract"].fillna("").map(lambda x: normalize_text(x, REMOVE_ISOLATED_NUMBERS))

# Guardar salida con metadatos originales + columnas normalizadas
cols_out = list(df.columns)
for c in ["title_norm", "abstract_norm"]:
    if c not in cols_out:
        cols_out.append(c)

# Copia de salida + saneo de tipos
out = df[cols_out].copy()
out = safe_convert_dtypes(out)

# Asegura strings planos en columnas de texto clave
for c in ["title", "abstract", "title_norm", "abstract_norm"]:
    if c in out.columns:
        out[c] = out[c].astype(str)

# Serializa objetos complejos a JSON para evitar fallos de parquet
out = sanitize_objects(out)

# -------- Guardado robusto --------
parquet_ok = try_save_parquet(out, OUTPUT_PATH)

if parquet_ok:
    print(f"Listo → {Path(OUTPUT_PATH).resolve()}")
else:
    # Fallback a CSV para no perder progreso
    fallback = Path(OUTPUT_PATH).with_suffix(".csv")
    out.to_csv(fallback, index=False, sep="|", encoding="utf-8")
    print("No se pudo escribir Parquet con fastparquet ni pyarrow. "
          f"Se guardó CSV en → {fallback.resolve()}")

# Vista rápida
print(out[["title_norm", "abstract_norm"]].head(3))


Listo → /run/media/alech/backup/Github/tesis/processed.parquet
                                          title_norm  \
0  thou shalt not die in this place : an ethnomet...   
1  use of learning frames in climate change commu...   
2  free access to public ecuadorian universities:...   

                                       abstract_norm  
0  ecuador, located in south america, has a popul...  
1  differences in climate change learning frames ...  
2  a free higher education policy was implemented...  


### deteccion de idioma

In [1]:
import pandas as pd
from pathlib import Path
from langdetect import detect, DetectorFactory

PATH = "processed.parquet"  
OUT  = "processed_lbl.parquet"   

DetectorFactory.seed = 0  # resultados más estables

def detect_lang_safe(text: str) -> str:
    t = (text or "").strip()
    if not t:
        return "und"  # indeterminado
    try:
        return detect(t)
    except Exception:
        return "und"

# --- cargar parquet ---
# intenta fastparquet y luego pyarrow
try:
    df = pd.read_parquet(PATH, engine="fastparquet")
except Exception:
    df = pd.read_parquet(PATH, engine="pyarrow")

# --- elegir fuente para detección
source_col = "abstract_norm" 
if source_col not in df.columns:
    # si no existe ninguna, crea vacía para no romper
    df[source_col] = ""

# --- detectar idioma ---
df["lang"] = df[source_col].map(detect_lang_safe)

# --- guardar ---
try:
    df.to_parquet(OUT, index=False, engine="fastparquet", compression="gzip")
except Exception:
    try:
        df.to_parquet(OUT, index=False, engine="pyarrow")
    except Exception:
        # último recurso: CSV para no perder el trabajo
        Path(OUT).with_suffix(".csv")
        df.to_csv(Path(OUT).with_suffix(".csv"), index=False, sep="|", encoding="utf-8")

print(df[["lang", source_col]].head(5))


KeyboardInterrupt: 

## segmentar oraciones

In [None]:
# -------- Configuración --------
INPUT_PARQUET  = "processed_lbl.parquet"      # entrada
OUTPUT_PARQUET = "processed_sentences.parquet"  # salida
SOURCE_COL     = "abstract_norm"               # columna a segmentar

# -------- Carga robusta --------
def read_parquet_any(path: str) -> pd.DataFrame:
    try:
        return pd.read_parquet(path, engine="fastparquet")
    except Exception:
        return pd.read_parquet(path, engine="pyarrow")

# -------- Segmentadores (ES / EN) --------
seg_es = pysbd.Segmenter(language="es", clean=False)
seg_en = pysbd.Segmenter(language="en", clean=False)

def split_by_lang(text: str, lang: str = "es") -> list[str]:
    if not isinstance(text, str) or not text.strip():
        return []
    lang = (lang or "es").lower()
    seg = seg_en if lang.startswith("en") else seg_es
    try:
        return seg.segment(text.strip())
    except Exception:
        # fallback simple si falla pysbd
        return [text.strip()]

# -------- Proceso --------
df = read_parquet_any(INPUT_PARQUET)

if SOURCE_COL not in df.columns:
    df[SOURCE_COL] = ""

# si tienes columna de idioma, úsala; si no, asume "es"
lang_series = df["lang"] if "lang" in df.columns else ["es"] * len(df)

# segmentar
df["sentences"] = [
    split_by_lang(text, lang)
    for text, lang in zip(df[SOURCE_COL], lang_series)
]

# una oración por fila
out = df.explode("sentences", ignore_index=False)
out = out.rename(columns={"sentences": "sentence"})
out = out.reset_index(names="row_id_original")
out["sentence_idx"] = out.groupby("row_id_original").cumcount()

# columnas finales
keep = []
for c in ["scopus_id", "title", "abstract", "abstract_norm", "lang"]:
    if c in out.columns:
        keep.append(c)
keep += ["row_id_original", "sentence_idx", "sentence"]
out = out[keep]

# -------- Guardar --------
try:
    out.to_parquet(OUTPUT_PARQUET, index=False, engine="fastparquet", compression="gzip")
except Exception:
    try:
        out.to_parquet(OUTPUT_PARQUET, index=False, engine="pyarrow")
    except Exception:
        out.to_csv(Path(OUTPUT_PARQUET).with_suffix(".csv"), index=False, sep="|", encoding="utf-8")

print("Ejemplo:")
print(out[["row_id_original", "sentence_idx", "sentence"]].head(10))
print(f"\nGuardado → {Path(OUTPUT_PARQUET).resolve()}")

Ejemplo:
   row_id_original  sentence_idx  \
0                0             0   
1                0             1   
2                0             2   
3                0             3   
4                0             4   
5                0             5   
6                0             6   
7                0             7   
8                0             8   
9                0             9   

                                            sentence  
0  ecuador, located in south america, has a popul...  
1  according to the national institution of stati...  
2  palliative care and hospice are relatively new...  
3  in ecuador people usually die at home, in hosp...  
4  in 2012, the first ecuadorian hospice was crea...  
5  according to symbolic interactionism theory, r...  
6  symbolic interactionism proposes that human be...  
7  through an ethnomethodological approach, the f...  
8  results emerge from the introspection of real ...  
9  based on a thematic analysis, the followi

## tokenizacion

In [7]:
from gensim.models.phrases import Phrases, Phraser

INPUT_PARQUET  = "processed_sentences.parquet"
OUTPUT_PARQUET = "corpus_token"
SENT_COL = "sentence"

TOKEN_RE = re.compile(r"(?:[^\W_]+(?:[-_][^\W_]+)+|\d+(?:\.\d+)+|[^\W_]+)", re.VERBOSE | re.IGNORECASE | re.UNICODE)
def simple_tokenize(s:str):
    if not isinstance(s, str): return []
    return TOKEN_RE.findall(re.sub(r"\s+", " ", s.strip()))

# 1) Carga y tokeniza todas las oraciones
def read_parquet_any(p):
    try: return pd.read_parquet(p, engine="fastparquet")
    except Exception: return pd.read_parquet(p, engine="pyarrow")

df = read_parquet_any(INPUT_PARQUET)
df["tokens_base"] = df[SENT_COL].map(simple_tokenize)

# 2) Entrena bigramas y (opcional) trigramas
sentences = df["tokens_base"].tolist()

# ⚠️ delimiter must be str if tokens are str
bigram = Phrases(sentences, min_count=5, threshold=10.0, delimiter=" ")
bigram_phraser = Phraser(bigram)

trigram = Phrases(bigram_phraser[sentences], min_count=5, threshold=10.0, delimiter=" ")
trigram_phraser = Phraser(trigram)

# 3) Aplica: pega frases automáticamente (p.ej., aprendizaje_automático)
df["tokens"] = [trigram_phraser[bigram_phraser[toks]] for toks in df["tokens_base"]]
df["tokens_csv"] = df["tokens"].map(lambda xs: ",".join(xs))
df["n_tokens"] = df["tokens"].map(len)

# 4) Guarda
try:
    df.to_parquet(OUTPUT_PARQUET, index=False, engine="fastparquet", compression="gzip")
except Exception:
    try: df.to_parquet(OUTPUT_PARQUET, index=False, engine="pyarrow")
    except Exception: df.to_csv(Path(OUTPUT_PARQUET).with_suffix(".csv"), index=False, sep="|", encoding="utf-8")

print(df[[SENT_COL, "tokens_csv"]].head(8))

                                            sentence  \
0  ecuador, located in south america, has a popul...   
1  according to the national institution of stati...   
2  palliative care and hospice are relatively new...   
3  in ecuador people usually die at home, in hosp...   
4  in 2012, the first ecuadorian hospice was crea...   
5  according to symbolic interactionism theory, r...   
6  symbolic interactionism proposes that human be...   
7  through an ethnomethodological approach, the f...   

                                          tokens_csv  
0  ecuador,located,in south america,has,a,populat...  
1  according to the,national,institution,of,stati...  
2  palliative care,and,hospice,are,relatively new...  
3  in,ecuador,people,usually,die,at home,in,hospi...  
4   in,2012,the,first,ecuadorian,hospice,was created  
5  according to,symbolic,interactionism,theory,re...  
6  symbolic,interactionism,proposes,that,human be...  
7  through,an,ethnomethodological,approach,the,fo...  


### stop words

In [None]:
# === Quitar stopwords sobre df["tokens"] con n-gramas separados por espacio ===
import re, unicodedata, os
import nltk
from nltk.corpus import stopwords as nltk_stop

# Asegura recurso stopwords NLTK
try:
    _ = nltk_stop.words("spanish")
except LookupError:
    nltk.download("stopwords")

# Idiomas a filtrar (ajusta a ["spanish"] si quieres solo ES)
LANGS = ["spanish", "english"]

# Construye set de stopwords
STOPSET = set()
for lang in LANGS:
    try:
        STOPSET |= set(nltk_stop.words(lang))
    except OSError:
        pass

def _norm(s: str) -> str:
    s = s.lower()
    s = unicodedata.normalize("NFKD", s)
    return "".join(ch for ch in s if not unicodedata.combining(ch))

STOPSET_NORM = {_norm(w) for w in STOPSET}

# Frases que no se filtran nunca (escribe aquí con ESPACIOS)
PROTECT_PHRASES = {
    "in south america",
    # añade más si quieres: "public health", "quality of life", ...
}

def is_stop(tok: str) -> bool:
    """
    Mantén n-gramas con contenido: elimina solo si TODAS las partes
    (separadas por espacio o guion) son stopwords; protege frases explícitas.
    """
    if not isinstance(tok, str) or not tok:
        return True  # vacío o no-string -> descartar

    t = _norm(tok).strip()

    # Protección explícita
    if t in PROTECT_PHRASES:
        return False

    # Token simple (sin espacios ni guiones)
    if (" " not in t) and ("-" not in t):
        return t in STOPSET_NORM

    # Token compuesto: separa por espacios o guiones (uno o más)
    parts = [p for p in re.split(r"[ \-]+", t) if p]
    if not parts:
        return True

    # Elimina SOLO si *todas* las partes son stopwords
    return all(p in STOPSET_NORM for p in parts)

def filter_tokens(tokens, min_len=2, drop_numeric=True):
    out = []
    if not isinstance(tokens, (list, tuple)):
        return out
    for t in tokens:
        if not isinstance(t, str) or not t:
            continue
        if drop_numeric and t.isnumeric():
            continue
        if len(t) < min_len:
            continue
        if is_stop(t):
            continue
        out.append(t)
    return out

# --- Aplicar al DF (requiere df["tokens"] como lista de strings) ---
if "tokens" not in df.columns:
    raise KeyError("Se esperaba df['tokens']. Asegúrate de haber generado los n-gramas antes.")

df["tokens_nostop"] = df["tokens"].map(filter_tokens)

# (Opcional) Texto para embeddings (bi-encoder): tokens unidos por espacio
df["text_for_embed"] = df["tokens_nostop"].map(lambda xs: " ".join(xs))

# Vistazo rápido (muestra si existen)
cols_show = [c for c in ["sentence", "tokens", "tokens_nostop", "text_for_embed"] if c in df.columns]
print(df[cols_show].head(8))

# --- Guardar a nuevo archivo para no sobrescribir el original ---
OUT_BASE = "corpus_token_nostop"
parquet_path = f"{OUT_BASE}.parquet"
try:
    df.to_parquet(parquet_path, index=False, engine="fastparquet", compression="gzip")
except Exception:
    try:
        df.to_parquet(parquet_path, index=False, engine="pyarrow")
    except Exception:
        df.to_csv(f"{OUT_BASE}.csv", index=False, sep="|", encoding="utf-8")

print("Guardado en:",
      parquet_path if os.path.exists(parquet_path) else f"{OUT_BASE}.csv")

                                            sentence  \
0  ecuador, located in south america, has a popul...   
1  according to the national institution of stati...   
2  palliative care and hospice are relatively new...   
3  in ecuador people usually die at home, in hosp...   
4  in 2012, the first ecuadorian hospice was crea...   
5  according to symbolic interactionism theory, r...   
6  symbolic interactionism proposes that human be...   
7  through an ethnomethodological approach, the f...   

                                              tokens  \
0  [ecuador, located, in south america, has, a, p...   
1  [according to the, national, institution, of, ...   
2  [palliative care, and, hospice, are, relativel...   
3  [in, ecuador, people, usually, die, at home, i...   
4  [in, 2012, the, first, ecuadorian, hospice, wa...   
5  [according to, symbolic, interactionism, theor...   
6  [symbolic, interactionism, proposes, that, hum...   
7  [through, an, ethnomethodological, approach,

## lemming

In [9]:
# LEMMATIZE text_for_embed (ES/EN) con spaCy en batch
import re, unicodedata, os
import pandas as pd
import spacy


df = pd.read_parquet("corpus_token_nostop.parquet")

assert "text_for_embed" in df.columns, "Falta la columna 'text_for_embed'."

# Carga modelos spaCy (puedes usar *_md/_lg si los tienes)
nlp_es = spacy.load("es_core_news_sm", disable=["parser","ner","textcat"])
nlp_en = spacy.load("en_core_web_sm", disable=["parser","ner","textcat"])

# Heurística ligera para detectar español
SPANISH_CUES = {"de","la","el","los","las","y","en","para","con","por","del","al","un","una","unos","unas","se","su","sus"}
ACCENTS_RE = re.compile(r"[áéíóúñüÁÉÍÓÚÑÜ]")

def _is_spanish_like(text: str) -> bool:
    if not isinstance(text, str):
        return False
    if ACCENTS_RE.search(text):
        return True
    words = [w.lower() for w in re.split(r"\s+", text.strip()) if w]
    # si al menos una palabra típica española aparece -> ES
    return any(w in SPANISH_CUES for w in words)

def _lemma_doc(doc):
    # cuida pronombres (algunos modelos antiguos devuelven "-PRON-")
    toks = []
    for t in doc:
        lem = t.lemma_ if t.lemma_ and t.lemma_ != "-PRON-" else t.text
        toks.append(lem.lower())
    # une con espacios (mismo formato que text_for_embed)
    return " ".join(toks)

# Divide índices por idioma con la heurística
idx_es = df.index[df["text_for_embed"].apply(_is_spanish_like)].tolist()
idx_en = df.index.difference(idx_es).tolist()

# Lematiza en batch por idioma (más rápido que fila a fila)
text_lemma = pd.Series(index=df.index, dtype=object)

if idx_es:
    docs_es = nlp_es.pipe(df.loc[idx_es, "text_for_embed"].fillna(""), batch_size=512, n_process=1)
    for i, doc in zip(idx_es, docs_es):
        text_lemma.loc[i] = _lemma_doc(doc)

if idx_en:
    docs_en = nlp_en.pipe(df.loc[idx_en, "text_for_embed"].fillna(""), batch_size=512, n_process=1)
    for i, doc in zip(idx_en, docs_en):
        text_lemma.loc[i] = _lemma_doc(doc)

# Asigna columna nueva
df["text_lemma"] = text_lemma.fillna("")

# Vista rápida
print(df[["text_for_embed", "text_lemma"]].head(8))

# Guardar (nuevo archivo para no pisar el anterior)
OUT = "corpus_token_nostop_lemma.parquet"
try:
    df.to_parquet(OUT, index=False, engine="fastparquet", compression="gzip")
except Exception:
    try:
        df.to_parquet(OUT, index=False, engine="pyarrow")
    except Exception:
        df.to_csv("corpus_token_nostop_lemma.csv", index=False, sep="|", encoding="utf-8")

print("Guardado en:", OUT if os.path.exists(OUT) else "corpus_token_nostop_lemma.csv")


                                      text_for_embed  \
0  ecuador located in south america population mi...   
1  according to the national institution statisti...   
2  palliative care hospice relatively new concept...   
3  ecuador people usually die at home hospitals n...   
4               first ecuadorian hospice was created   
5  according to symbolic interactionism theory re...   
6  symbolic interactionism proposes human beings ...   
7  ethnomethodological approach following researc...   

                                          text_lemma  
0  ecuador locate in south america population mil...  
1  accord to the national institution statistic e...  
2  palliative care hospice relatively new concept...  
3  ecuador people usually die at home hospital nu...  
4                 first ecuadorian hospice be create  
5  accord to symbolic interactionism theory resea...  
6  symbolic interactionism propose human being ca...  
7  ethnomethodological approach follow research a...  


## chunking 

In [4]:
# ====== CHUNKING *SIEMPRE* DESDE text_lemma (agrupado por scopus_id) ======
import os
import pandas as pd
import numpy as np
from transformers import AutoTokenizer

IN_PARQUET  = os.environ.get("IN_PARQUET", "corpus_token_nostop_lemma.parquet")
OUT_CHUNKS  = os.environ.get("OUT_CHUNKS", "corpus_chunks.parquet")

# --- Parámetros de chunking ---
MAX_TOKENS      = int(os.environ.get("MAX_TOKENS", "300"))      # 200–400 recomendado
OVERLAP_RATIO   = float(os.environ.get("OVERLAP_RATIO", "0.2"))  # 15–30% recomendado
OVERLAP_TOKENS  = int(MAX_TOKENS * OVERLAP_RATIO)
STRIDE          = max(1, MAX_TOKENS - OVERLAP_TOKENS)

# Tokenizer E5 (coherente con embeddings e5*)
TOKENIZER_NAME = "intfloat/multilingual-e5-base"
tok = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# ---------- 0) Carga y validación ----------
df = pd.read_parquet(IN_PARQUET)

assert "scopus_id" in df.columns, "Falta columna 'scopus_id' en el parquet."
assert "text_lemma" in df.columns, "Falta columna 'text_lemma' (se usa siempre)."

# Normaliza tipos / limpieza básica
df["scopus_id"]  = df["scopus_id"].astype(str)
df["text_lemma"] = (
    df["text_lemma"].fillna("").astype(str)
      .str.replace(r"\s+", " ", regex=True).str.strip()
)

# ---------- 1) Construir texto lematizado por documento ----------
# Orden preferido: sentence_idx > row_id_original > orden actual
sort_keys = ["scopus_id"]
if "sentence_idx" in df.columns:
    sort_keys += ["sentence_idx"]
elif "row_id_original" in df.columns:
    sort_keys += ["row_id_original"]

df = df.sort_values(sort_keys, kind="mergesort")

# Texto lematizado consolidado por scopus_id
agg_text = df.groupby("scopus_id")["text_lemma"].apply(
    lambda s: " ".join([t for t in s.astype(str) if t])
).rename("text_for_chunk")

# Solo scopus_id + texto consolidado
doc_df = agg_text.to_frame().reset_index()
doc_df["doc_id"] = np.arange(len(doc_df), dtype="int64")

# Limpieza final del texto
doc_df["text_for_chunk"] = (
    doc_df["text_for_chunk"].fillna("").astype(str)
       .str.replace(r"\s+", " ", regex=True).str.strip()
)
doc_df = doc_df[doc_df["text_for_chunk"].str.len() > 0].reset_index(drop=True)

# ---------- 2) Chunker por tokens ----------
def chunk_text_by_tokens(text: str, max_tokens: int = MAX_TOKENS, stride: int = STRIDE):
    ids = tok.encode(text, add_special_tokens=False)
    n = len(ids)
    if n == 0:
        return []
    chunks = []
    start = 0
    while start < n:
        end = min(start + max_tokens, n)
        sl = ids[start:end]
        chunk_txt = tok.decode(sl, skip_special_tokens=True).strip()
        if chunk_txt:
            chunks.append({
                "start_token": start,
                "end_token": end,
                "token_count": end - start,
                "text_chunk": chunk_txt
            })
        if end == n:
            break
        start += stride
    return chunks

# ---------- 3) Generar filas de chunks (solo campos mínimos + scopus_id) ----------
rows = []
for _, r in doc_df.iterrows():
    doc_id = int(r["doc_id"])
    scid   = str(r["scopus_id"])
    text   = r["text_for_chunk"]
    for j, ch in enumerate(chunk_text_by_tokens(text, MAX_TOKENS, STRIDE)):
        rows.append({
            "doc_id": doc_id,
            "chunk_id": j,
            "chunk_uid": f"{doc_id}-{j}",
            "scopus_id": scid,
            "start_token": ch["start_token"],
            "end_token": ch["end_token"],
            "token_count": ch["token_count"],
            "text_chunk": ch["text_chunk"],
        })

chunks_df = pd.DataFrame(rows)

print(chunks_df.head(8))
print("N docs (únicos scopus_id):", doc_df.shape[0], "| N chunks:", chunks_df.shape[0])

# ---------- 4) Guardar (solo mínimos) ----------
save_cols = [
    "doc_id","chunk_id","chunk_uid","scopus_id",
    "start_token","end_token","token_count","text_chunk"
]
chunks_out = chunks_df[save_cols].copy()

# Parquet (pyarrow preferente)
try:
    chunks_out.to_parquet(OUT_CHUNKS, index=False, engine="pyarrow")
except Exception:
    try:
        chunks_out.to_parquet(OUT_CHUNKS, index=False, engine="fastparquet", compression="gzip")
    except Exception:
        chunks_out.to_csv(OUT_CHUNKS.replace(".parquet", ".csv"), index=False, sep="|", encoding="utf-8")

print("Guardado en:", OUT_CHUNKS)


Token indices sequence length is longer than the specified maximum sequence length for this model (538 > 512). Running this sequence through the model will result in indexing errors


   doc_id  chunk_id chunk_uid    scopus_id  start_token  end_token  \
0       0         0       0-0  85059061481            0        156   
1       1         0       1-0  85061967853            0         94   
2       2         0       2-0  85067792389            0         60   
3       3         0       3-0  85068192726            0        255   
4       4         0       4-0  85069901345            0        173   
5       5         0       5-0  85070472925            0        205   
6       6         0       6-0  85071977997            0        195   
7       7         0       7-0  85072017885            0        169   

   token_count                                         text_chunk  
0          156  ecuador locate in south america population mil...  
1           94  difference climate change learning frame pedag...  
2           60  free high education policy be implement ecuado...  
3          255  this study explore influence family member lif...  
4          173  rapid adoptio

In [10]:
import os, gc, pickle, numpy as np, pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
from typing import Optional

# -------------------- Config --------------------
PARQUET_PATH   = os.environ.get("PARQUET_PATH", "corpus_chunks.parquet")   # <-- solo contiene los chunks
FAISS_PATH     = os.environ.get("FAISS_PATH", "faiss_index_ip.bin")
PKL_MIN_PATH   = os.environ.get("PKL_MIN_PATH", "embeddings_meta_min.pkl")


# Modelo recomendado para CPU
EMB_MODEL        = os.environ.get("EMB_MODEL", "intfloat/multilingual-e5-small")
EMB_MAX_SEQ_LEN  = int(os.environ.get("EMB_MAX_SEQ_LEN", "300"))   # <=512
INIT_BATCH       = int(os.environ.get("BATCH_SIZE", "32"))
MIN_BATCH        = 1

# -------------------- 0) Carga parquet de chunks + limpieza --------------------
try:
    chunks_df = pd.read_parquet(PARQUET_PATH, engine="pyarrow")
except Exception:
    chunks_df = pd.read_parquet(PARQUET_PATH, engine="fastparquet")

required_cols = {"doc_id","chunk_id","start_token","end_token","text_chunk"}
missing = required_cols - set(chunks_df.columns)
if missing:
    raise ValueError(f"Faltan columnas requeridas en {PARQUET_PATH}: {missing}")

# Normaliza tipos
for c in ["doc_id","chunk_id","start_token","end_token"]:
    if chunks_df[c].dtype.kind not in "iu":
        chunks_df[c] = pd.to_numeric(chunks_df[c], errors="coerce").fillna(0).astype("int64")

# Limpieza texto
chunks_df["text_chunk"] = chunks_df["text_chunk"].astype(str).str.strip()
chunks_df = chunks_df[chunks_df["text_chunk"].str.len() > 0].reset_index(drop=True)

# chunk_uid y scopus_id opcional
if "chunk_uid" not in chunks_df.columns:
    chunks_df["chunk_uid"] = chunks_df["doc_id"].astype(str) + "-" + chunks_df["chunk_id"].astype(str)
if "scopus_id" in chunks_df.columns:
    chunks_df["scopus_id"] = chunks_df["scopus_id"].astype(str)

# IDs vectoriales alineados 0..N-1
N = len(chunks_df)
chunks_df["vec_id"] = np.arange(N, dtype="int64")
chunks_df["embedding_model"] = EMB_MODEL

# -------------------- 1) Modelo (CPU) alineado con chunking --------------------
print(f"[INFO] device encode: cpu; model={EMB_MODEL}")
model = SentenceTransformer(EMB_MODEL, device="cpu")
model.max_seq_length = min(EMB_MAX_SEQ_LEN, 512)
print(f"[INFO] model.max_seq_length = {model.max_seq_length}")

# Prefijo E5
passages = ("passage: " + chunks_df["text_chunk"]).tolist()

# -------------------- 2) FAISS (IP con embeddings normalizados -> coseno) --------------------
def make_faiss_index(dim: int):
    print("[INFO] FAISS-CPU")
    return faiss.IndexFlatIP(dim)

def st_encode_cpu(texts, batch_size, normalize=True, to_numpy=True):
    embs = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=False,
        normalize_embeddings=normalize,
        convert_to_numpy=to_numpy
    )
    return np.asarray(embs, dtype="float32")

# -------------------- 3) Bucle por lotes (CPU) --------------------
def stream_encode_and_build(passages, init_bs=INIT_BATCH, min_bs=MIN_BATCH):
    i, bs = 0, init_bs
    index = None
    dim = None

    while i < N:
        j = min(i + bs, N)
        batch = passages[i:j]
        try:
            emb = st_encode_cpu(batch, batch_size=bs, normalize=True, to_numpy=True)

            if dim is None:
                dim = emb.shape[1]
                index = make_faiss_index(dim)

            index.add(emb)

            i = j
            print(f"[PROG] {i}/{N} ({100.0*i/N:.1f}%) - bs={bs}")

        except Exception as e:
            prev_bs = bs
            bs = max(min_bs, bs // 2)
            gc.collect()
            if prev_bs == bs and bs == min_bs:
                raise RuntimeError(f"Fallo persistente en CPU con batch={bs}: {e}") from e
            print(f"[WARN] Error en i={i}. Bajo batch {prev_bs}->{bs} y reintento…")
            continue

    return index, dim

# -------------------- 4) Ejecutar pipeline --------------------
index_cpu, dim = stream_encode_and_build(passages)

# -------------------- 5) Guardar FAISS + PKL (mapa mínimo) --------------------
faiss.write_index(index_cpu, FAISS_PATH)
print(f"[OK] FAISS guardado: {FAISS_PATH} | ntotal={index_cpu.ntotal} | dim={dim}")

# PKL: guardar meta_min con scopus_id si existe
min_cols = ["vec_id","chunk_uid","doc_id","chunk_id","start_token","end_token"]
if "scopus_id" in chunks_df.columns:
    min_cols.append("scopus_id")

with open(PKL_MIN_PATH, "wb") as f:
    pickle.dump({
        "model": EMB_MODEL,
        "device_used": "cpu",
        "dim": dim,
        "meta_min": chunks_df[min_cols].copy()
    }, f, protocol=pickle.HIGHEST_PROTOCOL)
print(f"[OK] PKL (meta_min) guardado: {PKL_MIN_PATH}")


[INFO] device encode: cpu; model=intfloat/multilingual-e5-small
[INFO] model.max_seq_length = 300
[INFO] FAISS-CPU
[PROG] 32/21005 (0.2%) - bs=32
[PROG] 64/21005 (0.3%) - bs=32
[PROG] 96/21005 (0.5%) - bs=32
[PROG] 128/21005 (0.6%) - bs=32
[PROG] 160/21005 (0.8%) - bs=32
[PROG] 192/21005 (0.9%) - bs=32
[PROG] 224/21005 (1.1%) - bs=32
[PROG] 256/21005 (1.2%) - bs=32
[PROG] 288/21005 (1.4%) - bs=32
[PROG] 320/21005 (1.5%) - bs=32
[PROG] 352/21005 (1.7%) - bs=32
[PROG] 384/21005 (1.8%) - bs=32
[PROG] 416/21005 (2.0%) - bs=32
[PROG] 448/21005 (2.1%) - bs=32
[PROG] 480/21005 (2.3%) - bs=32
[PROG] 512/21005 (2.4%) - bs=32
[PROG] 544/21005 (2.6%) - bs=32
[PROG] 576/21005 (2.7%) - bs=32
[PROG] 608/21005 (2.9%) - bs=32
[PROG] 640/21005 (3.0%) - bs=32
[PROG] 672/21005 (3.2%) - bs=32
[PROG] 704/21005 (3.4%) - bs=32
[PROG] 736/21005 (3.5%) - bs=32
[PROG] 768/21005 (3.7%) - bs=32
[PROG] 800/21005 (3.8%) - bs=32
[PROG] 832/21005 (4.0%) - bs=32
[PROG] 864/21005 (4.1%) - bs=32
[PROG] 896/21005 (4.3%) 

## recuperacion

In [5]:
import torch
import os, pickle
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer

# ---- Rutas (ajústalas o usa variables de entorno) ----
PKL_MIN_PATH = os.environ.get("PKL_MIN_PATH", "embeddings_meta_min.pkl")
FAISS_PATH   = os.environ.get("FAISS_PATH", "faiss_index_ip.bin")
SCOPUS_CSV   = os.environ.get("SCOPUS_CSV", "scopusdata.csv")
SCOPUS_SEP   = os.environ.get("SCOPUS_SEP", "|")

# ---- Caches simples ----
_model_cache = None
_meta_min_cache = None
_index_cache = None
_scopus_cache = None

def load_pkl_and_model(emb_max_seq_len=300):
    global _model_cache, _meta_min_cache
    if _model_cache is not None and _meta_min_cache is not None:
        return _model_cache, _meta_min_cache
    with open(PKL_MIN_PATH, "rb") as f:
        pkl = pickle.load(f)

    meta_min = pkl["meta_min"].copy()  # DataFrame: vec_id, chunk_uid, doc_id, chunk_id, (scopus_id), start/end
    _meta_min_cache = meta_min

    model_name = pkl.get("model", "intfloat/multilingual-e5-large")
    model = SentenceTransformer(model_name, device="cuda" if torch.cuda.is_available() else "cpu")
    model.max_seq_length = min(int(emb_max_seq_len), 512)
    _model_cache = model

    print(f"[INFO] Modelo: {model_name} | max_seq_length={model.max_seq_length}")
    print(f"[INFO] meta_min columnas: {list(meta_min.columns)} | filas={len(meta_min)}")
    return _model_cache, _meta_min_cache

def load_faiss():
    global _index_cache
    if _index_cache is None:
        _index_cache = faiss.read_index(FAISS_PATH)
        print(f"[INFO] Índice FAISS cargado: ntotal={_index_cache.ntotal}")
    return _index_cache

def load_scopus_csv():
    global _scopus_cache
    if _scopus_cache is None:
        df = pd.read_csv(SCOPUS_CSV, sep=SCOPUS_SEP)
        if "scopus_id" not in df.columns:
            raise ValueError(f"{SCOPUS_CSV} no tiene columna 'scopus_id'")
        df["scopus_id"] = df["scopus_id"].astype(str)
        _scopus_cache = df
        print(f"[INFO] scoupusdata.csv: filas={len(df)} | cols={len(df.columns)}")
    return _scopus_cache

def e5_encode_query(model, query_text: str):
    return model.encode([f"query: {query_text}"],
                        normalize_embeddings=True,
                        convert_to_numpy=True).astype("float32")

def search_min(query_text: str, topk: int = 100) -> pd.DataFrame:
    """
    Devuelve SOLO el meta mínimo del PKL (sin CSV):
    vec_id, score, chunk_uid, doc_id, chunk_id, (scopus_id si existe), start/end
    """
    model, meta_min = load_pkl_and_model()
    index = load_faiss()

    q = e5_encode_query(model, query_text)
    D, I = index.search(q, topk)
    vec_ids = I[0].tolist()

    hits = meta_min.set_index("vec_id").loc[vec_ids].reset_index()
    hits.insert(1, "score", D[0])

    cols_front = [c for c in ["vec_id","score","chunk_uid","doc_id","chunk_id","scopus_id","start_token","end_token"] if c in hits.columns]
    rest = [c for c in hits.columns if c not in cols_front]
    return hits[cols_front + rest].reset_index(drop=True)

def search_full_scopus(query_text: str, topk: int = 5) -> pd.DataFrame:
    """
    Une el TOP-K con TODAS las columnas de scoupusdata.csv por scopus_id.
    """
    model, meta_min = load_pkl_and_model()
    index = load_faiss()
    sc = load_scopus_csv()

    q = e5_encode_query(model, query_text)
    D, I = index.search(q, topk)
    vec_ids = I[0].tolist()

    hits = meta_min.set_index("vec_id").loc[vec_ids].reset_index()
    hits.insert(1, "score", D[0])

    if "scopus_id" not in hits.columns:
        raise ValueError("meta_min en PKL no contiene 'scopus_id'; no puedo unir con el CSV.")

    out = hits.merge(sc, how="left", on="scopus_id")

    # Orden: primero claves/score/offsets, luego TODO el CSV
    front = [c for c in ["vec_id","score","chunk_uid","doc_id","chunk_id","scopus_id","start_token","end_token"] if c in out.columns]
    csv_cols = [c for c in sc.columns if c not in front]
    return out[front + csv_cols].reset_index(drop=True)

# ====== DEMO RÁPIDA ======
if __name__ == "__main__":
    query = "¿Qué variables clínicas aparecen como más influyentes en la predicción de mortalidad por COVID-19 en Ecuador?"
    print("\n=== TOP-K + TODA la metadata del CSV ===")
    df = search_full_scopus(query, topk=15)
    # Si quieres guardar para revisar en Excel:
    df.to_csv("ground.csv", sep="|", index=False, encoding="utf-8")
    print(df.head(10))



=== TOP-K + TODA la metadata del CSV ===
[INFO] Modelo: intfloat/multilingual-e5-small | max_seq_length=300
[INFO] meta_min columnas: ['vec_id', 'chunk_uid', 'doc_id', 'chunk_id', 'start_token', 'end_token', 'scopus_id'] | filas=21005
[INFO] Índice FAISS cargado: ntotal=21005
[INFO] scoupusdata.csv: filas=19233 | cols=9
   vec_id     score chunk_uid  doc_id  chunk_id    scopus_id  start_token  \
0   14468  0.872843   13207-0   13207         0  85175981768            0   
1    9230  0.871996    8419-0    8419         0  85147783425            0   
2    2449  0.870473    2186-0    2186         0  85119425145            0   
3   17014  0.870369   15545-0   15545         0  85187569644            0   
4   17622  0.868678   16100-1   16100         1  85189720260          240   
5    4111  0.867389    3693-0    3693         0  85127559439            0   
6   10938  0.865886    9989-0    9989         0  85158148579            0   
7   15051  0.862978   13739-0   13739         0  85178875327 

# reranking cross encoder

In [7]:
import os, math
import numpy as np
import pandas as pd
from typing import List, Sequence, Optional, Tuple
from sentence_transformers import CrossEncoder

# ---------- Configuración ----------
CROSS_ENCODER_MODEL = os.environ.get("CROSS_ENCODER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2")
CE_BATCH_SIZE       = int(os.environ.get("CE_BATCH_SIZE", "64"))
W_CE, W_DENSE       = float(os.environ.get("W_CE", "0.7")), float(os.environ.get("W_DENSE", "0.3"))

# columnas de texto (orden de prioridad)
TEXT_COLS = ["title", "abstract"]  # <- ajustado a tu CSV

# ---------- Utilidades ----------
_ce_cache: Optional[CrossEncoder] = None

def get_cross_encoder(model_name: str = CROSS_ENCODER_MODEL) -> CrossEncoder:
    global _ce_cache
    if _ce_cache is None:
        _ce_cache = CrossEncoder(model_name, device="cpu")
        print(f"[INFO] Cross-Encoder cargado: {model_name}")
    return _ce_cache

def _first_nonempty(row: pd.Series, cols: Sequence[str]) -> str:
    """Devuelve el primer texto no vacío según prioridad en 'cols'.
       Si no encuentra, intenta concatenar campos semánticos."""
    for c in cols:
        if c in row and isinstance(row[c], str) and row[c].strip():
            return row[c]
    parts = []
    for c in row.index:
        name = c.lower()
        if any(tok in name for tok in ("title","abstract","summary","keywords","chunk","desc")):
            v = row[c]
            if isinstance(v, str) and v.strip():
                parts.append(v.strip())
    return " ".join(parts)[:4096]  # recorte defensivo

def _build_pairs(query_text: str, df_topk: pd.DataFrame, text_cols: Optional[List[str]]) -> Tuple[List[Tuple[str,str]], List[int]]:
    cols = text_cols or TEXT_COLS
    pairs, idx_map = [], []
    for i, row in df_topk.iterrows():
        txt = _first_nonempty(row, cols)
        pairs.append((query_text, txt if isinstance(txt, str) else ""))
        idx_map.append(i)
    return pairs, idx_map

def _minmax(x: np.ndarray) -> np.ndarray:
    x = np.asarray(x, dtype=np.float32)
    mn, mx = float(np.nanmin(x)), float(np.nanmax(x))
    if not np.isfinite(mn) or not np.isfinite(mx) or (mx - mn) <= 1e-12:
        return np.zeros_like(x, dtype=np.float32)
    return (x - mn) / (mx - mn + 1e-12)

def rerank_with_cross_encoder(query_text: str,
                              df_topk: pd.DataFrame,
                              text_cols: Optional[List[str]] = None,
                              score_dense_col: str = "score",
                              fuse_with_dense: bool = True,
                              batch_size: int = CE_BATCH_SIZE,
                              model_name: str = CROSS_ENCODER_MODEL) -> pd.DataFrame:
    """Reordena df_topk usando un Cross-Encoder y devuelve un nuevo DataFrame ordenado."""
    if df_topk is None or len(df_topk) == 0:
        raise ValueError("df_topk está vacío; ejecuta primero search_full_scopus(query, topk=N).")
    ce = get_cross_encoder(model_name)
    pairs, idx_map = _build_pairs(query_text, df_topk, text_cols)

    # Predicción por lotes
    scores_ce = []
    for start in range(0, len(pairs), batch_size):
        batch = pairs[start:start+batch_size]
        s = ce.predict(batch)  # lista/ndarray de floats
        scores_ce.append(np.asarray(s, dtype=np.float32))
    scores_ce = np.concatenate(scores_ce, axis=0) if scores_ce else np.zeros(len(df_topk), dtype=np.float32)

    # Ensamble y orden
    out = df_topk.copy()
    out.loc[idx_map, "score_ce"] = scores_ce
    ce_norm = _minmax(out["score_ce"].values)

    if fuse_with_dense and score_dense_col in out.columns:
        dense_norm = _minmax(out[score_dense_col].values)
        out["score_dense_norm"] = dense_norm
        out["score_final"] = W_CE * ce_norm + W_DENSE * dense_norm
        order_col = "score_final"
    else:
        out["score_final"] = ce_norm
        order_col = "score_final"

    out = out.sort_values(order_col, ascending=False).reset_index(drop=True)
    return out

# ---------- Ejemplo de uso: ejecuta después de definir tu 'query' ----------
# 1) Recupera candidatos con tu función (puedes subir TOPK para un mejor re-ranking).
query = os.environ.get("QUERY", "turismo en galapagos")
TOPK  = int(os.environ.get("TOPK", "100"))

try:
    df_topk = search_full_scopus(query, topk=TOPK)   # usa tu función definida arriba
except NameError:
    raise RuntimeError("No encuentro search_full_scopus(). Ejecuta primero la celda donde la defines.")

# 2) Reranking con Cross-Encoder (ajusta columnas si conoces tus nombres exactos).
#    Ejemplo: text_cols=["chunk_text"] o ["title","abstract","authkeywords"]
reranked = rerank_with_cross_encoder(
    query_text=query,
    df_topk=df_topk,
    text_cols=None,            # autodetección robusta; cambia a lista explícita si la tienes
    score_dense_col="score",   # 'score' = puntaje FAISS de tu búsqueda
    fuse_with_dense=True
)

# 3) Visualiza y/o guarda
display_cols = [c for c in ["vec_id","score","score_ce","score_final","scopus_id","title","abstract","chunk_text"] if c in reranked.columns]
print("[OK] Top 10 (ya re-rankeado):")
display(reranked.head(10)[display_cols])

[INFO] Cross-Encoder cargado: cross-encoder/ms-marco-MiniLM-L-6-v2
[OK] Top 10 (ya re-rankeado):


Unnamed: 0,vec_id,score,score_ce,score_final,scopus_id,title,abstract
0,5168,0.864254,-3.670709,0.823718,85132240597,“Rethink and reset” tourism in the Galapagos I...,Tourism growth in biodiversity conservation ar...
1,14203,0.84195,-2.71306,0.744398,85174965454,Agrobiodiversity in four Islands of the Galapa...,"The Galapagos Islands, officially annexed to t..."
2,10216,0.848282,-3.679054,0.720161,85151140601,"Residents, conservation, development and touri...",Tourism from its initial stage generates socio...
3,775,0.820772,-1.053562,0.719617,85105310828,"The Galápagos as penal colony: Exile, peonage,...",Transportation to remote islands has been a wa...
4,9580,0.860996,-4.914887,0.718905,85148845556,Imperiled Ecosystems: Galápagos Scrub,The Galápagos Islands are an extraordinary loc...
5,18344,0.84424,-3.452024,0.709393,85191855593,SUSTAINABLE MANAGEMENT APPLIED TO THE HOTEL SE...,This research analyzes the current situation o...
6,5447,0.843322,-3.669117,0.688851,85133488337,Volcanic event management in the Galápagos Isl...,"The volcanoes of Galápagos, Ecuador, are among..."
7,14590,0.855215,-5.482558,0.643393,85176328476,"A Bunch of Books, a Suitcase, and Many Trips b...","The Galapagos Islands (Galapagos Province, Ecu..."
8,15962,0.843961,-4.802355,0.616639,85182709765,The impact of the COVID-19 pandemic on the Gal...,The COVID-19 pandemic's early stages severely ...
9,11028,0.826736,-3.237062,0.611005,85159179257,Tourist planning of an emblematic destination....,The article through the analysis of the constr...


# Contruccion y Generación (LLM)

In [16]:
# -*- coding: utf-8 -*-
import os, re, textwrap, requests
import pandas as pd
from typing import List, Dict, Optional

# =================== Configuración ===================
OLLAMA_HOST       = os.environ.get("OLLAMA_HOST", "http://localhost:11434")
OLLAMA_MODEL      = os.environ.get("OLLAMA_MODEL", "gemma3:4b")
TOP_CONTEXT       = int(os.environ.get("RAG_TOP_CONTEXT", "6"))
MAX_INPUT_CHARS   = int(os.environ.get("RAG_MAX_INPUT_CHARS", "7000"))
MAX_CHUNK_CHARS   = int(os.environ.get("RAG_MAX_CHUNK_CHARS", "900"))
TEMPERATURE       = float(os.environ.get("RAG_TEMPERATURE", "0.2"))
MAX_NEW_TOKENS    = int(os.environ.get("RAG_MAX_NEW_TOKENS", "768"))  # ↑ un poco por seguridad
HTTP_TIMEOUT_SECS = int(os.environ.get("RAG_HTTP_TIMEOUT_SECS", "300"))
DO_TRIM_ABSTRACT  = os.environ.get("RAG_TRIM_ABSTRACT", "1") == "1"

# =================== Utilidades generales ===================
def _safe_str(x) -> str:
    if isinstance(x, str):
        return x
    if x is None:
        return ""
    if isinstance(x, float) and x != x:  # NaN
        return ""
    try:
        return str(x)
    except Exception:
        return ""

def _id_for_row(row: pd.Series) -> str:
    for k in ("doi", "scopus_id", "vec_id"):
        vs = _safe_str(row.get(k)).strip()
        if vs:
            return vs
    return f"row{row.name}"

def _first_nonempty(row: pd.Series, cols: List[str]) -> str:
    for c in cols:
        v = _safe_str(row.get(c, ""))
        if v.strip():
            return v.strip()
    return ""

def _shorten(txt, lim: int) -> str:
    s = _safe_str(txt)
    s = re.sub(r"\s+", " ", s).strip()
    return (s[:lim-3] + "...") if len(s) > lim else s

# ---------- Autores/Año para mención autoral ----------
def _split_authors(raw: str) -> List[str]:
    s = _safe_str(raw)
    if not s.strip(): return []
    s = s.replace("|", ";").replace(" and ", ";")
    parts = [p.strip() for p in s.split(";") if p.strip()]
    if len(parts) <= 1:
        parts = [p.strip() for p in s.split(",") if p.strip()]
    return parts

def _last_name(name: str) -> str:
    n = _safe_str(name).strip()
    if not n: return ""
    if "," in n: return n.split(",")[0].strip()
    tokens = n.split()
    return tokens[-1].strip() if tokens else n

def _format_authors_for_mention(raw: str, max_names: int = 2) -> Optional[str]:
    authors = _split_authors(raw)
    if not authors: return None
    last_names = [_last_name(a) for a in authors if _last_name(a)]
    if not last_names: return None
    if len(last_names) == 1: return last_names[0]
    if len(last_names) == 2: return f"{last_names[0]} y {last_names[1]}"
    return f"{last_names[0]} et al."

def _extract_year(row: pd.Series) -> Optional[str]:
    for c in ["year", "publication_year", "cover_date", "date"]:
        val = _safe_str(row.get(c))
        m = re.search(r"(19|20)\d{2}", val)
        if m: return m.group(0)
    return None

# =================== Construcción de bloques ===================
def build_context_blocks(df_reranked: pd.DataFrame,
                         top_k: int = TOP_CONTEXT,
                         max_chunk_chars: int = MAX_CHUNK_CHARS) -> List[Dict]:
    if df_reranked is None or len(df_reranked) == 0:
        raise ValueError("df_reranked está vacío.")
    cols_title = [c for c in ["title","chunk_title"] if c in df_reranked.columns] or ["title"]
    cols_abs   = [c for c in ["abstract","chunk_text","summary"] if c in df_reranked.columns] or ["abstract"]
    blocks = []
    for i in range(min(top_k, len(df_reranked))):
        row = df_reranked.iloc[i]
        title  = _first_nonempty(row, cols_title) or "Sin título"
        body   = _first_nonempty(row, cols_abs)
        if DO_TRIM_ABSTRACT: body = _shorten(body, max_chunk_chars)
        authors_raw = _safe_str(row.get("authors", ""))
        year  = _extract_year(row)
        doi_raw = _safe_str(row.get("doi", ""))
        blocks.append({
            "cite_id": _id_for_row(row),   # para [n]
            "title": title,
            "text": body,
            "authors_mention": _format_authors_for_mention(authors_raw),
            "authors_raw": authors_raw,    # para 'Fuentes'
            "year": year,
            "doi_raw": doi_raw             # para 'Fuentes'
        })
    return blocks

# =================== Fuentes: DOI y autores ===================
def _doi_url(doi_raw: str) -> str:
    doi = _safe_str(doi_raw).strip()
    if not doi: return "s/d"
    return doi if doi.lower().startswith("http") else f"https://doi.org/{doi}"

def _authors_cite_line(raw_authors: str) -> str:
    names = _split_authors(_safe_str(raw_authors))
    if not names: return "Autor(es) no disponibles"
    return "; ".join([_safe_str(n).strip() for n in names if _safe_str(n).strip()])

def render_fuentes_from_blocks(blocks: List[Dict]) -> str:
    lines = []
    for i, b in enumerate(blocks, start=1):
        autores = _authors_cite_line(b.get("authors_raw", ""))
        titulo  = _safe_str(b.get("title", "Sin título"))
        doiurl  = _doi_url(b.get("doi_raw", ""))
        lines.append(f"[{i}] {autores}; \"{titulo}\"; {doiurl}")
    return "\n".join(lines)

# =================== Prompt con instrucción de mención autoral ===================
def compose_prompt(query: str, blocks: List[Dict], max_chars: int = MAX_INPUT_CHARS) -> str:
    header = (
        "Eres un asistente académico. Responde de forma clara, concisa y basada en evidencia, "
        "en español y estilo APA (7ª). Utiliza exclusivamente los fragmentos proporcionados. "
        "Cada párrafo debe incluir una mención autoral explícita (‘según <autor>’ o ‘de acuerdo con <autor>’) "
        "seguida de la cita en corchetes [n]. Si no hay autor disponible, usa ‘según la fuente [n]’. "
        "No imprimas tu propia sección de 'Fuentes'; solo escribe el cuerpo con citas [n].\n\n"
        f"Pregunta: {query}\n\n"
        "Fragmentos (con autor/año si disponible):\n"
    )
    parts = []
    for i, b in enumerate(blocks, start=1):
        autor_m = b.get("authors_mention") or f"fuente [{i}]"
        anio    = f" ({b['year']})" if b.get("year") else ""
        head    = f"[{i}] {b['title']} — Autor(es): {autor_m}{anio}"
        parts.append(f"{head}\n{_safe_str(b['text'])}\n")

    footer = (
        "\nInstrucciones de redacción:\n"
        f"- Debes usar las referencias [1..{len(blocks)}] tal como están definidas.\n"
        "- No inventes datos fuera de los fragmentos.\n"
        "- Cada párrafo debe incluir al menos una mención autoral y su [n] correspondiente.\n"
        "- No generes la sección 'Fuentes'.\n"
    )
    # Incluimos igualmente las 'Fuentes' en el prompt como guía, pero pedimos explícitamente que NO las imprima.
    fuentes_prompt = render_fuentes_from_blocks(blocks)
    prompt = header + "\n".join(parts) + footer + "\nFuentes (guía, no imprimir):\n" + fuentes_prompt
    return prompt[:max_chars]

# =================== Generación vía servicio HTTP de Ollama (sin fallbacks) ===================
def generate_with_ollama_http(prompt: str,
                              model: str = OLLAMA_MODEL,
                              temperature: float = TEMPERATURE,
                              max_new_tokens: int = MAX_NEW_TOKENS,
                              base_url: str = OLLAMA_HOST,
                              timeout: int = HTTP_TIMEOUT_SECS) -> str:
    url = f"{base_url.rstrip('/')}/api/chat"
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": "Eres un asistente académico que escribe en español (APA 7ª), conciso y basado en evidencia."},
            {"role": "user",   "content": _safe_str(prompt)}
        ],
        "options": {
            "temperature": float(temperature),
            "num_predict": int(max_new_tokens),
            # Cortamos cuando el modelo intente empezar su propia sección de fuentes:
            "stop": ["\nFuentes", "\nFUENTES", "\nReferences", "\nREFERENCIAS"]
        },
        "stream": False
    }
    try:
        r = requests.post(url, json=payload, timeout=timeout)
        r.raise_for_status()
        data = r.json()
    except Exception as e:
        raise RuntimeError(
            "Fallo al invocar el servicio HTTP de Ollama. "
            "Verifica daemon, modelo descargado y conectividad.\nDetalle: {e}"
        )
    content = _safe_str(data.get("message", {}).get("content", ""))
    if not content.strip():
        raise RuntimeError("La respuesta de Ollama está vacía.")
    return content

# =================== Auditoría de citas usadas ===================
def extract_used_refs(answer_text: str, n_max: int) -> List[int]:
    """Extrae números [n] usados en el texto, filtrando por rango 1..n_max y preservando orden de aparición."""
    nums = [int(m.group(1)) for m in re.finditer(r"\[(\d+)\]", _safe_str(answer_text))]
    seen, used = set(), []
    for n in nums:
        if 1 <= n <= n_max and n not in seen:
            used.append(n); seen.add(n)
    return used

def render_used_refs_report(answer_text: str, blocks: List[Dict]) -> str:
    used = extract_used_refs(answer_text, len(blocks))
    if not used:
        return "No se detectaron citas [n] en el texto."
    lines = ["Citas usadas en el texto:"]
    for n in used:
        b = blocks[n-1]
        lines.append(f" - [{n}] {_authors_cite_line(b.get('authors_raw',''))}; \"{_safe_str(b.get('title','Sin título'))}\"; {_doi_url(b.get('doi_raw',''))}")
    # Aviso si el modelo no usó alguna fuente disponible
    missing = [i for i in range(1, len(blocks)+1) if i not in used]
    if missing:
        lines.append(f"No usadas: {missing}")
    return "\n".join(lines)

# =================== Ejecución (ejemplo) ===================
if 'reranked' not in globals():
    raise RuntimeError("No encuentro 'reranked'. Ejecuta primero tu celda de re-ranking.")

query  = os.environ.get("QUERY", "el turismo en las islas galagos que tan bueno es?")
blocks = build_context_blocks(reranked, top_k=TOP_CONTEXT, max_chunk_chars=MAX_CHUNK_CHARS)
prompt = compose_prompt(query, blocks, max_chars=MAX_INPUT_CHARS)

print("=== Prompt (vista previa) ===")
print(textwrap.shorten(prompt, width=1000, placeholder=" ..."))

answer = generate_with_ollama_http(prompt)
print("\n=== Respuesta (LLM - Ollama HTTP) ===\n")
print(answer)

# Resumen de citas usadas por el modelo
print("\n=== Auditoría de citas usadas ===")
print(render_used_refs_report(answer, blocks))


=== Prompt (vista previa) ===
Eres un asistente académico. Responde de forma clara, concisa y basada en evidencia, en español y estilo APA (7ª). Utiliza exclusivamente los fragmentos proporcionados. Cada párrafo debe incluir una mención autoral explícita (‘según <autor>’ o ‘de acuerdo con <autor>’) seguida de la cita en corchetes [n]. Si no hay autor disponible, usa ‘según la fuente [n]’. No imprimas tu propia sección de 'Fuentes'; solo escribe el cuerpo con citas [n]. Pregunta: el turismo en las islas galagos que tan bueno es? Fragmentos (con autor/año si disponible): [1] “Rethink and reset” tourism in the Galapagos Islands: Stakeholders' views on the sustainability of tourism development — Autor(es): Ferri et al. Tourism growth in biodiversity conservation areas presents both challenges and opportunities for sustainability. The COVID-19 pandemic brought both into focus in the Galapagos. This study engages with tourism service providers and regulators in Puerto Ayora, Santa Cruz Islan

# Evaluación

## retriever

In [22]:
# eval_chunk_uid_multi.py
import os
import pandas as pd
from typing import List, Set, Optional, Iterable

# ====== importa tu búsqueda ======
# from tu_modulo import search_min   # Debe devolver DataFrame con columna 'chunk_uid'

K_LIST = [5, 10, 15]
GT_PATH = os.environ.get("GT_PATH", "groundtruth.csv")  # tab-separado por defecto

def _split_list(cell: str) -> List[str]:
    if pd.isna(cell) or not str(cell).strip():
        return []
    s = str(cell).replace("\n", " ").strip()
    parts = [p.strip() for p in s.replace(";", ",").split(",")]
    return [p for p in parts if p]

def recall_at_k(hits: pd.DataFrame, relevant: Set[str], k: int) -> float:
    if "chunk_uid" not in hits.columns or not relevant:
        return 0.0
    top = hits.head(k)
    retrieved = set(top["chunk_uid"].astype(str))
    inter = len(retrieved & relevant)
    denom = min(k, len(relevant))
    return inter / denom if denom > 0 else 0.0

def mrr_at_k(hits: pd.DataFrame, relevant: Set[str], k: int) -> float:
    if "chunk_uid" not in hits.columns or not relevant:
        return 0.0
    top = hits.head(k)["chunk_uid"].astype(str).tolist()
    for i, uid in enumerate(top, start=1):
        if uid in relevant:
            return 1.0 / i
    return 0.0

def evaluate_chunk_uid_multi(gt_path: str = GT_PATH, ks: Iterable[int] = K_LIST) -> pd.DataFrame:
    ks = sorted(set(int(x) for x in ks))
    gt = pd.read_csv(gt_path, sep="\t", dtype=str).fillna("")
    need = {"id", "consulta", "fragmentos"}
    if not need.issubset(gt.columns):
        missing = need - set(gt.columns)
        raise ValueError(f"Faltan columnas en GT: {missing}")

    # contenedores de macro
    macro_recalls = {k: [] for k in ks}
    macro_mrrs    = {k: [] for k in ks}

    rows = []
    for _, r in gt.iterrows():
        qid   = r["id"]
        query = r["consulta"]
        gold  = set(_split_list(r["fragmentos"]))

        hits = search_min(query, topk=max(ks))  # buscamos 1 vez con el mayor k
        if "chunk_uid" not in hits.columns:
            raise ValueError("search_min no devuelve columna 'chunk_uid'.")

        row_out = {"id": qid, "tipo": r.get("tipo", ""), "consulta": query}
        for k in ks:
            rec = recall_at_k(hits, gold, k)
            mrr = mrr_at_k(hits, gold, k)
            row_out[f"recall@{k}"] = rec
            row_out[f"mrr@{k}"]    = mrr
            macro_recalls[k].append(rec)
            macro_mrrs[k].append(mrr)

        rows.append(row_out)

    df = pd.DataFrame(rows)

    # imprimir macro
    print("\n== MACRO ==")
    for k in ks:
        mr = sum(macro_recalls[k]) / len(macro_recalls[k]) if macro_recalls[k] else float("nan")
        mm = sum(macro_mrrs[k]) / len(macro_mrrs[k])       if macro_mrrs[k] else float("nan")
        print(f"Recall@{k}: {mr:.3f} | MRR@{k}: {mm:.3f}")

    return df

if __name__ == "__main__":
    out = evaluate_chunk_uid_multi(GT_PATH, ks=[5,10,15])
    out.to_csv("retrieval_eval_chunk_multi.tsv", sep="\t", index=False)
    print("Guardado: retrieval_eval_chunk_multi.tsv")



== MACRO ==
Recall@5: 0.533 | MRR@5: 0.370
Recall@10: 0.533 | MRR@10: 0.370
Recall@15: 0.567 | MRR@15: 0.370
Guardado: retrieval_eval_chunk_multi.tsv


## augmentation

In [12]:
import os, csv, re, json, requests
import numpy as np
import pandas as pd
from typing import Dict, List, Sequence, Tuple
from sentence_transformers import SentenceTransformer

# --- Config ---
GT_PATH     = os.environ.get("GT_PATH", "groundtruth.csv")  # TSV (tabs)
ID_COL      = "chunkuid"                  # tus resultados deben tener esta columna
TEXT_COLS   = ["title","abstract"]        # o ["chunk_text"]
CTX_K_BASIC = int(os.environ.get("CTX_K_BASIC", "6"))
CTX_K_FULL  = int(os.environ.get("CTX_K_FULL", "6"))
EMB_MODEL   = os.environ.get("EMB_MODEL", "intfloat/multilingual-e5-small")

# Ollama
OLLAMA_HOST  = os.environ.get("OLLAMA_HOST", "http://localhost:11434")
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "gemma3:4b")
_OLLAMA_CHAT_URL = f"{OLLAMA_HOST.rstrip('/')}/api/chat"

# ==================== GT Loader (TSV, columnas exactas) ====================
def load_gt(path: str = GT_PATH, sep: str = "\t", id_sep: str = "|") -> Dict[str, Dict]:
    df = pd.read_csv(path, sep=sep, engine="python",
                     quoting=csv.QUOTE_MINIMAL, encoding="utf-8",
                     keep_default_na=False, on_bad_lines="error")
    required = {"consulta", "fragmentos", "Respuesta-ideal"}
    missing  = required - set(df.columns)
    if missing:
        raise KeyError(f"Faltan columnas {missing}. Presentes: {list(df.columns)}")
    def parse_ids(s: str) -> List[str]:
        return [t.strip() for t in (s or "").split(id_sep) if t.strip()]
    out = {}
    for _, r in df.iterrows():
        q     = str(r["consulta"]).strip()
        rel   = set(parse_ids(str(r["fragmentos"]).strip()))   # chunkuid
        ideal = str(r["Respuesta-ideal"]).strip()
        tipo  = str(r.get("tipo","")).strip()
        out[q] = {"relevant_ids": rel, "ideal_answer": ideal, "tipo": tipo}
    return out

# ==================== Utilidades de contexto ====================
def ensure_chunkuid(df: pd.DataFrame) -> pd.DataFrame:
    if ID_COL in df.columns:
        return df
    for alt in ["chunk_uid","chunk_id","chunkId","vec_id","scopus_id","id"]:
        if alt in df.columns:
            return df.rename(columns={alt: ID_COL})
    raise KeyError("El DataFrame no tiene 'chunkuid' (nib variantes).")

def first_text(row: pd.Series, cols: Sequence[str]) -> str:
    for c in cols:
        if c in row and isinstance(row[c], str) and row[c].strip():
            return row[c]
    for c in row.index:
        if any(t in c.lower() for t in ("title","abstract","summary","chunk","keywords","desc")):
            v = row[c]
            if isinstance(v, str) and v.strip():
                return v
    return ""

def build_context(df: pd.DataFrame, cols: Sequence[str], k_ctx: int) -> List[Dict]:
    return [{"id": str(row[ID_COL]), "text": first_text(row, cols)}
            for _, row in df.head(k_ctx).iterrows()]

# ==================== Cliente LLM (Ollama gemma3:4b) ====================
def llm_call(system_prompt: str, user_prompt: str, *,
             temperature: float = 0.2,
             num_predict: int | None = None,
             timeout: float = 120.0) -> str:
    payload = {
        "model": OLLAMA_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt or ""},
            {"role": "user",   "content": user_prompt or ""},
        ],
        "stream": False,
        "options": { "temperature": temperature }
    }
    if num_predict is not None:
        payload["options"]["num_predict"] = int(num_predict)
    try:
        resp = requests.post(_OLLAMA_CHAT_URL, json=payload, timeout=timeout)
        resp.raise_for_status()
        data = resp.json()
        return (data.get("message") or {}).get("content", "") or ""
    except Exception as e:
        print(f"[WARN] Ollama request failed: {e}")
        return ""

# ==================== Generadores con prompts estrictos ====================
_ID_PATTERN = re.compile(r'\[([A-Za-z0-9_\-:./]+)\]')

def extract_ids(text: str) -> List[str]:
    return _ID_PATTERN.findall(text or "")

def truncate_text(t: str, max_chars: int = 4000) -> str:
    t = t or ""
    return t if len(t) <= max_chars else t[:max_chars]

def gen_llm_only(query: str) -> Dict:
    ans = llm_call(
        "Responde brevemente y con precisión. No cites IDs.",
        f"Pregunta: {query}"
    )
    return {"answer": ans, "cited_ids": []}

def gen_rag_basic(query: str, ctx_docs: List[Dict]) -> Dict:
    # Concatenación directa de top-k (pre-rerank)
    ctx = "\n\n".join(f"[{d['id']}]\n{truncate_text(d['text'])}" for d in ctx_docs)
    prompt = (
        "Lee los CONTEXTOS y responde SOLO con información presente en ellos. "
        "No utilices conocimiento previo. Si no hay suficiente información, responde exactamente: 'No hay evidencia suficiente'. "
        "Incluye entre corchetes los [ID] de los fragmentos realmente usados.\n\n"
        f"CONTEXTOS:\n{ctx}\n\nPREGUNTA:\n{query}"
    )
    ans = llm_call("Eres un asistente factual y literal.", prompt)
    return {"answer": ans, "cited_ids": extract_ids(ans)}

def gen_rag_full(query: str, ctx_docs: List[Dict]) -> Dict:
    # Builder: resumen intermedio por bloques (post-rerank)
    blocks = "\n\n".join(f"[{d['id']}]\n{truncate_text(d['text'])}" for d in ctx_docs)
    summary = llm_call(
        "Eres un sintetizador extractivo. No inventes. Conserva referencias [ID] cuando cites hechos.",
        "Resume con viñetas, agrupando por [ID]. Extrae SOLO hechos verificables; no agregues conocimiento fuera de CONTEXTOS.\n\n"
        f"CONTEXTOS:\n{blocks}"
    )
    prompt = (
        "Usa SOLO la información de los CONTEXTOS siguientes. "
        "Cada afirmación DEBE incluir entre corchetes el [ID] del fragmento donde aparece. "
        "Si no hay información suficiente, responde exactamente: 'No hay evidencia suficiente'."
    )
    ans = llm_call("Eres estricto con el soporte textual; cada afirmación debe acabar con [ID].", prompt)
    return {"answer": ans, "cited_ids": extract_ids(ans)}

# ==================== Embeddings y métricas ====================
_emb = None
def _emb_model():
    global _emb
    if _emb is None:
        _emb = SentenceTransformer(EMB_MODEL, device="cpu")
        print(f"[INFO] Embedder: {EMB_MODEL}")
    return _emb

def _cos_sim(u: np.ndarray, v: np.ndarray) -> float:
    return float(np.clip(np.dot(u, v), -1.0, 1.0))

def _embed_norm(texts: List[str]) -> np.ndarray:
    m = _emb_model()
    return m.encode(texts, normalize_embeddings=True, convert_to_numpy=True)

def exact_match(pred: str, gold: str) -> float:
    a = (pred or "").strip().lower()
    b = (gold or "").strip().lower()
    return 1.0 if a and b and a == b else 0.0

def context_support_score(answer: str, ctx_docs: List[Dict]) -> float:
    """Proxy de fidelidad: similitud entre respuesta y el texto agregado del contexto."""
    if not answer or not ctx_docs:
        return 0.0
    ctx_text = " ".join(d["text"] or "" for d in ctx_docs)
    emb = _emb_model()
    a_vec = emb.encode(answer, normalize_embeddings=True)
    c_vec = emb.encode(ctx_text, normalize_embeddings=True)
    return _cos_sim(a_vec, c_vec)

def gen_metrics(answer: str, ideal_answer: str, cited_ids: List[str],
                rel_ids: set, ctx_docs: List[Dict] | None = None) -> Dict[str, float]:
    em  = exact_match(answer, ideal_answer)
    sim = 0.0
    if ideal_answer.strip():
        v = _embed_norm([answer or "", ideal_answer])[0:2]
        sim = _cos_sim(v[0], v[1])
    cited = set(cited_ids or [])
    cov   = (len(cited & rel_ids)/len(cited)) if cited else 0.0
    hallu = 1.0 if (cited and len(cited & rel_ids) == 0) else 0.0
    faith = context_support_score(answer, ctx_docs or [])
    return {"EM": em, "SemanticSim": sim, "Faithfulness": faith,
            "CitationCoverage": cov, "Hallucination": hallu}

# ==================== Recuperación (para armar contexto) ====================
def retrieve_topk(query: str, topk: int = 200) -> pd.DataFrame:
    df = search_full_scopus(query, topk=topk).reset_index(drop=True)  # <- tu función
    return ensure_chunkuid(df)

def rerank_ce(query: str, df_topk: pd.DataFrame) -> pd.DataFrame:
    df_r = rerank_with_cross_encoder(
        query_text=query, df_topk=df_topk, text_cols=TEXT_COLS,
        score_dense_col="score", fuse_with_dense=True
    )
    return ensure_chunkuid(df_r)

# ==================== Runner: AUGMENTATION mejorado ====================
def run_augmentation_only() -> Tuple[pd.DataFrame, pd.DataFrame]:
    gt = load_gt(GT_PATH)
    rows = []
    for q, info in gt.items():
        rel  = info["relevant_ids"]
        gold = info["ideal_answer"]

        # LLM-only (sin contexto)
        out0 = gen_llm_only(q)
        m0   = gen_metrics(out0["answer"], gold, out0.get("cited_ids",[]), rel, ctx_docs=[])

        # RAG-básico (pre-rerank)
        df_b  = retrieve_topk(q, topk=200)
        ctx_b = build_context(df_b, TEXT_COLS, k_ctx=CTX_K_BASIC)
        out1  = gen_rag_basic(q, ctx_b)
        m1    = gen_metrics(out1["answer"], gold, out1.get("cited_ids",[]), rel, ctx_docs=ctx_b)

        # RAG-full (post-rerank + builder)
        df_f  = rerank_ce(q, df_b)
        ctx_f = build_context(df_f, TEXT_COLS, k_ctx=CTX_K_FULL)
        out2  = gen_rag_full(q, ctx_f)
        m2    = gen_metrics(out2["answer"], gold, out2.get("cited_ids",[]), rel, ctx_docs=ctx_f)

        rows += [
            {"cond":"LLM-only", "consulta":q, **m0},
            {"cond":"RAG-basic","consulta":q, **m1},
            {"cond":"RAG-full", "consulta":q, **m2},
        ]

    df = pd.DataFrame(rows)

    # Macro por condición (incluye Faithfulness)
    macro = (df.groupby("cond")
             .agg({"EM":"mean","SemanticSim":"mean","Faithfulness":"mean",
                   "CitationCoverage":"mean","Hallucination":"mean"})
             .reset_index())

    # Δ vs LLM-only (ganancia de augmentation)
    base = macro.set_index("cond").loc["LLM-only"]
    for cond in ["RAG-basic","RAG-full"]:
        for m in ["EM","SemanticSim","Faithfulness","CitationCoverage","Hallucination"]:
            macro.loc[macro["cond"]==cond, f"Δ_{m}_vs_LLM"] = macro.set_index("cond").loc[cond, m] - base[m]

    print("== AUGMENTATION (MEJORADO): MACRO POR CONDICIÓN ==")
    print(macro.round(3).to_string(index=False))

    os.makedirs("runs_eval", exist_ok=True)
    df.to_csv("runs_eval/augmentation_per_query.csv", index=False)
    macro.to_csv("runs_eval/augmentation_macro.csv", index=False)
    return df, macro

# ==================== Ejecutar ====================
df_aug_per_query, df_aug_macro = run_augmentation_only()


[INFO] Embedder: intfloat/multilingual-e5-small
== AUGMENTATION (MEJORADO): MACRO POR CONDICIÓN ==
     cond  EM  SemanticSim  Faithfulness  CitationCoverage  Hallucination  Δ_EM_vs_LLM  Δ_SemanticSim_vs_LLM  Δ_Faithfulness_vs_LLM  Δ_CitationCoverage_vs_LLM  Δ_Hallucination_vs_LLM
 LLM-only 0.0        0.893         0.000               0.0            0.0          NaN                   NaN                    NaN                        NaN                     NaN
RAG-basic 0.0        0.807         0.778               0.0            0.0          0.0                -0.087                  0.778                        0.0                     0.0
 RAG-full 0.0        0.818         0.834               0.0            0.0          0.0                -0.075                  0.834                        0.0                     0.0


In [22]:
# ==================== AUGMENTATION: EVALUACIÓN SOLO DE CONTEXTO (usa tu recuperador) ====================
# Métricas: Context Recall / Precision / Noise / Compression Ratio / Context Length
# Comparación: RAG-basic (concat top-k pre-rerank) vs RAG-full (builder sobre rerank)
# SIN usar la respuesta del LLM (solo contextos)
# Usa los CHUNKUID desde tu recuperador (meta_min + scopus CSV)

import os, re, csv, math
import numpy as np
import pandas as pd
from typing import Dict, List, Sequence, Tuple, Iterable

# ========= CONFIG =========
GT_PATH       = os.environ.get("GT_PATH", "groundtruth.csv")   # ⚠️ TSV por tabs aunque sea .csv
ID_COL        = os.environ.get("ID_COL", "chunkuid")
TEXT_COLS     = [c.strip() for c in os.environ.get("TEXT_COLS", "title,abstract").split(",")]
CTX_K_BASIC   = int(os.environ.get("CTX_K_BASIC", "6"))
CTX_K_FULL    = int(os.environ.get("CTX_K_FULL", "6"))
SIM_EMB_MODEL = os.environ.get("EMB_MODEL", "intfloat/multilingual-e5-small")
SIM_TAU       = float(os.environ.get("CTX_SIM_TAU", "0.80"))

# ========= FUNCIONES QUE YA DEBES TENER EN TU ENTORNO =========
# - load_pkl_and_model() -> (model, meta_min)   [meta_min con columnas: chunk_uid, scopus_id, ...]
# - load_scopus_csv()     -> DataFrame del CSV   [con columnas: scopus_id, title, abstract, ...]
# - search_full_scopus(query, topk)
# - rerank_with_cross_encoder(query_text, df_topk, text_cols, score_dense_col, fuse_with_dense)
# - first_text(row, TEXT_COLS)
# - ensure_chunkuid(df)   (renombra chunk_uid -> chunkuid si aplica)
# - build_context(df, TEXT_COLS, k_ctx) -> [{'id','text'}, ...]

# ========= GT LOADER =========
def load_gt_context(path: str = GT_PATH, sep: str = "\t", id_sep: str = "|") -> Dict[str, Dict]:
    df = pd.read_csv(path, sep=sep, engine="python", quoting=csv.QUOTE_MINIMAL,
                     encoding="utf-8", keep_default_na=False, on_bad_lines="error")
    req = {"consulta","fragmentos"}
    miss = req - set(df.columns)
    if miss:
        raise KeyError(f"Faltan columnas {miss}. Presentes: {list(df.columns)}")
    def split_ids(s: str) -> List[str]:
        return [t.strip() for t in (s or "").split(id_sep) if t.strip()]
    out = {}
    for _, r in df.iterrows():
        q   = str(r["consulta"]).strip()
        rel = set(split_ids(str(r["fragmentos"]).strip()))  # chunkuid
        out[q] = {"relevant_ids": rel}
    return out

# ========= MAPA chunkuid -> texto (vía tu recuperador) =========
_meta_min_cache: pd.DataFrame | None = None
_scopus_df_cache: pd.DataFrame | None = None
_chunkuid_to_text: Dict[str, str] | None = None

def _warm_retriever_caches():
    """Precarga meta_min (PKL) y scopus CSV usando tus loaders, sin tocar el FAISS."""
    global _meta_min_cache, _scopus_df_cache
    if _meta_min_cache is not None and _scopus_df_cache is not None:
        return
    _, meta_min = load_pkl_and_model()          # tu función: devuelve (model, meta_min)
    sc = load_scopus_csv()                      # tu función: devuelve df del CSV
    _meta_min_cache = meta_min.copy()
    _scopus_df_cache = sc.copy()

def _build_chunkuid_text_map():
    """Construye un dict {chunkuid -> texto} uniendo meta_min (chunk_uid, scopus_id) con scopus CSV (TEXT_COLS)."""
    global _chunkuid_to_text
    if _chunkuid_to_text is not None:
        return
    _warm_retriever_caches()
    mm = _meta_min_cache
    sc = _scopus_df_cache

    # Asegurar columnas canónicas
    if "chunk_uid" not in mm.columns:
        raise KeyError("meta_min no tiene columna 'chunk_uid'")
    if "scopus_id" not in mm.columns:
        raise KeyError("meta_min no tiene columna 'scopus_id'")
    if "scopus_id" not in sc.columns:
        raise KeyError("scopus CSV no tiene columna 'scopus_id'")

    # Unir chunk_uid -> scopus_id -> textos
    mm_small = mm[["chunk_uid","scopus_id"]].drop_duplicates()
    sc_small = sc[["scopus_id"] + [c for c in TEXT_COLS if c in sc.columns]].copy()
    merged = mm_small.merge(sc_small, how="left", on="scopus_id")

    # Construir texto por prioridad de TEXT_COLS
    def _row_text(row: pd.Series) -> str:
        for c in TEXT_COLS:
            if c in row and isinstance(row[c], str) and row[c].strip():
                return row[c]
        # fallback: concatenar cualquier columna de texto que exista
        parts = []
        for c in row.index:
            v = row[c]
            if isinstance(v, str) and v.strip():
                parts.append(v.strip())
        return " ".join(parts)
    merged["__txt__"] = merged.apply(_row_text, axis=1)
    _chunkuid_to_text = dict(zip(merged["chunk_uid"].astype(str), merged["__txt__"].astype(str)))

def text_for_chunkuid(cuid: str) -> str:
    """Devuelve el texto (p.ej. title/abstract) asociado al chunkuid usando tu recuperador."""
    _build_chunkuid_text_map()
    return _chunkuid_to_text.get(str(cuid), "")

# ========= Utilidades: oraciones y tokens =========
_SENT_RE = re.compile(r'(?<=[\.\?\!；;:])\s+|\n+')

def split_sentences(text: str) -> List[str]:
    if not text:
        return []
    parts = [p.strip() for p in _SENT_RE.split(text) if p and p.strip()]
    return [p for p in parts if len(p.split()) >= 3]

def count_tokens(text: str) -> int:
    try:
        import tiktoken
        enc = tiktoken.get_encoding("cl100k_base")
        return len(enc.encode(text))
    except Exception:
        return len(text.split())

def total_tokens(texts: Iterable[str]) -> int:
    return sum(count_tokens(t) for t in texts)

# ========= Embeddings y similitud =========
from sentence_transformers import SentenceTransformer
_emb_model: SentenceTransformer | None = None

def get_emb():
    global _emb_model
    if _emb_model is None:
        _emb_model = SentenceTransformer(SIM_EMB_MODEL, device="cpu")
        print(f"[CTX] Loaded embedder: {SIM_EMB_MODEL}")
    return _emb_model

def embed_sentences(sents: List[str]) -> np.ndarray:
    if not sents:
        return np.zeros((0, 384), dtype=float)
    emb = get_emb()
    return emb.encode(sents, normalize_embeddings=True)

def cosine_sim_matrix(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    if A.size == 0 or B.size == 0:
        return np.zeros((A.shape[0], B.shape[0]))
    return np.clip(A @ B.T, -1.0, 1.0)

# ========= Construcción de contextos (BASIC vs FULL) =========
def build_basic_context_text(df_topk: pd.DataFrame, k_ctx: int) -> str:
    ctx = build_context(df_topk, TEXT_COLS, k_ctx=k_ctx)  # [{'id','text'},...]
    return "\n\n".join(f"[{c['id']}]\n{c['text']}" for c in ctx if c.get("text"))

def build_full_context_text(query: str, df_topk: pd.DataFrame, k_ctx: int) -> str:
    df_r  = rerank_with_cross_encoder(
        query_text=query, df_topk=df_topk, text_cols=TEXT_COLS,
        score_dense_col="score", fuse_with_dense=True
    )
    df_r  = ensure_chunkuid(df_r)
    ctx   = build_context(df_r, TEXT_COLS, k_ctx=k_ctx)
    # Heurístico simple de “builder”: 2 oraciones por chunk
    joined = []
    for c in ctx:
        sents = split_sentences(c.get("text",""))
        joined.append(f"[{c['id']}]\n" + " ".join(sents[:2]))
    return "\n\n".join(joined)

# ========= Métricas de contexto =========
def context_metrics_for(query: str, rel_ids: set, df_topk: pd.DataFrame,
                        mode: str, k_ctx: int, tau: float = SIM_TAU) -> Dict:
    """
    mode: 'basic' (pre-rerank, concat top-k)  |  'full' (post-rerank + builder)
    """
    df_topk = ensure_chunkuid(df_topk)

    if mode == "basic":
        ctx_text = build_basic_context_text(df_topk, k_ctx)
    elif mode == "full":
        ctx_text = build_full_context_text(query, df_topk, k_ctx)
    else:
        raise ValueError("mode must be 'basic' or 'full'")

    # Oraciones del contexto (C)
    C_sents = split_sentences(ctx_text)
    C_tokens = total_tokens(C_sents)

    # Oraciones de evidencia (E) desde GT (usando mapa chunkuid->texto del recuperador)
    E_texts = [text_for_chunkuid(cid) for cid in rel_ids]
    E_sents = []
    for t in E_texts:
        E_sents.extend(split_sentences(t))
    E_tokens = total_tokens(E_sents)

    # Si no hay evidencia GT para la consulta
    if len(E_sents) == 0:
        return {
            "cond": f"RAG-{mode}",
            "consulta": query,
            "E_sents": 0, "C_sents": len(C_sents),
            "context_recall": np.nan,
            "context_precision": np.nan,
            "context_noise": np.nan,
            "context_tokens": C_tokens,
            "evidence_tokens": E_tokens,
            "matched_E": 0, "matched_C": 0,
            "matched_tokens": 0,
            "compression_ratio": 0.0,
            "tau": tau,
        }

    # Similitud C×E
    C_emb = embed_sentences(C_sents)   # |C| x D
    E_emb = embed_sentences(E_sents)   # |E| x D
    S = cosine_sim_matrix(C_emb, E_emb)

    # Matches por umbral
    C_match_any = (S >= tau).any(axis=1) if S.size else np.zeros(len(C_sents), dtype=bool)
    E_match_any = (S >= tau).any(axis=0) if S.size else np.zeros(len(E_sents), dtype=bool)

    matched_C = int(C_match_any.sum())
    matched_E = int(E_match_any.sum())

    # Métricas
    context_recall    = matched_E / len(E_sents)
    context_precision = (matched_C / len(C_sents)) if C_sents else np.nan
    context_noise     = 1.0 - (context_precision if not math.isnan(context_precision) else 0.0)

    # Compresión: tokens de oraciones de C que matchean / tokens totales de C
    matched_tokens = sum(count_tokens(s) for i, s in enumerate(C_sents) if C_match_any[i])
    compression_ratio = (matched_tokens / C_tokens) if C_tokens > 0 else 0.0

    return {
        "cond": f"RAG-{mode}",
        "consulta": query,
        "E_sents": len(E_sents), "C_sents": len(C_sents),
        "context_recall": context_recall,
        "context_precision": context_precision,
        "context_noise": context_noise,
        "context_tokens": C_tokens,
        "evidence_tokens": E_tokens,
        "matched_E": matched_E, "matched_C": matched_C,
        "matched_tokens": matched_tokens,
        "compression_ratio": compression_ratio,
        "tau": tau,
    }

# ========= Runner =========
def run_augmentation_context_only() -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    # calentar caches del recuperador para mapa chunkuid->texto
    _warm_retriever_caches()
    _build_chunkuid_text_map()

    gt = load_gt_context(GT_PATH)
    all_rows = []
    for q, info in gt.items():
        rel = info["relevant_ids"]  # set de chunkuid (evidencia GT)

        # top-k candidatos del recuperador
        df_topk = search_full_scopus(q, topk=max(CTX_K_BASIC, CTX_K_FULL, 50)).reset_index(drop=True)
        df_topk = ensure_chunkuid(df_topk)

        # métricas por condición
        mb = context_metrics_for(q, rel, df_topk, mode="basic", k_ctx=CTX_K_BASIC, tau=SIM_TAU)
        mf = context_metrics_for(q, rel, df_topk, mode="full",  k_ctx=CTX_K_FULL,  tau=SIM_TAU)
        all_rows += [mb, mf]

    df = pd.DataFrame(all_rows)

    # Macro por condición
    agg_cols = ["context_recall","context_precision","context_noise","compression_ratio","context_tokens"]
    macro = (df.groupby("cond")[agg_cols].mean(numeric_only=True).reset_index())

    # Δ (RAG-full − RAG-basic)
    m_basic = macro.set_index("cond").loc["RAG-basic"]
    m_full  = macro.set_index("cond").loc["RAG-full"]
    deltas = {
        "Δ_context_recall":    m_full["context_recall"]    - m_basic["context_recall"],
        "Δ_context_precision": m_full["context_precision"] - m_basic["context_precision"],
        "Δ_context_noise":     m_full["context_noise"]     - m_basic["context_noise"],
        "Δ_compression_ratio": m_full["compression_ratio"] - m_basic["compression_ratio"],
        "Δ_context_tokens":    m_full["context_tokens"]    - m_basic["context_tokens"],
    }
    delta_df = pd.DataFrame([deltas])

    print("== AUGMENTATION (CONTEXT-ONLY, vía recuperador): MACRO ==")
    print(macro.round(3).to_string(index=False))
    print("\n== Δ (RAG-full − RAG-basic) ==")
    print(delta_df.round(3).to_string(index=False))

    os.makedirs("runs_eval", exist_ok=True)
    df.to_csv("runs_eval/context_eval_per_query.csv", index=False)
    macro.to_csv("runs_eval/context_eval_macro.csv", index=False)
    delta_df.to_csv("runs_eval/context_eval_deltas_full_minus_basic.csv", index=False)
    return df, macro, delta_df

# ========== Ejecutar ==========
df_ctx_per_q, df_ctx_macro, df_ctx_deltas = run_augmentation_context_only()


[INFO] Modelo: intfloat/multilingual-e5-small | max_seq_length=300
[INFO] meta_min columnas: ['vec_id', 'chunk_uid', 'doc_id', 'chunk_id', 'start_token', 'end_token', 'scopus_id'] | filas=21005
[CTX] Loaded embedder: intfloat/multilingual-e5-small
== AUGMENTATION (CONTEXT-ONLY, vía recuperador): MACRO ==
     cond  context_recall  context_precision  context_noise  compression_ratio  context_tokens
RAG-basic             1.0              0.975          0.025              0.326          99.200
 RAG-full             1.0              0.978          0.022              0.330          90.067

== Δ (RAG-full − RAG-basic) ==
 Δ_context_recall  Δ_context_precision  Δ_context_noise  Δ_compression_ratio  Δ_context_tokens
              0.0                0.003           -0.003                0.004            -9.133


## generacion

In [19]:
import torch
import os, pickle
import pandas as pd
import faiss
import requests
from sentence_transformers import SentenceTransformer

# ---- Rutas (ajústalas o usa variables de entorno) ----
PKL_MIN_PATH = os.environ.get("PKL_MIN_PATH", "embeddings_meta_min.pkl")
FAISS_PATH   = os.environ.get("FAISS_PATH", "faiss_index_ip.bin")
SCOPUS_CSV   = os.environ.get("SCOPUS_CSV", "scopusdata.csv")
SCOPUS_SEP   = os.environ.get("SCOPUS_SEP", "|")

# ---- Config LLM (Ollama + Gemma3) ----
OLLAMA_HOST   = os.environ.get("OLLAMA_HOST", "http://localhost:11434")
OLLAMA_MODEL  = os.environ.get("OLLAMA_MODEL", "gemma3:4b")
_OLLAMA_CHAT_URL = f"{OLLAMA_HOST.rstrip('/')}/api/chat"

MAX_CTX_CHARS = int(os.environ.get("RAG_MAX_CTX_CHARS", "8000"))

# ---- Caches simples ----
_model_cache = None
_meta_min_cache = None
_index_cache = None
_scopus_cache = None

def load_pkl_and_model(emb_max_seq_len=300):
    global _model_cache, _meta_min_cache
    if _model_cache is not None and _meta_min_cache is not None:
        return _model_cache, _meta_min_cache
    with open(PKL_MIN_PATH, "rb") as f:
        pkl = pickle.load(f)

    meta_min = pkl["meta_min"].copy()  # DataFrame: vec_id, chunk_uid, doc_id, chunk_id, (scopus_id), start/end
    _meta_min_cache = meta_min

    model_name = pkl.get("model", "intfloat/multilingual-e5-large")
    model = SentenceTransformer(model_name, device="cuda" if torch.cuda.is_available() else "cpu")
    model.max_seq_length = min(int(emb_max_seq_len), 512)
    _model_cache = model

    print(f"[INFO] Modelo: {model_name} | max_seq_length={model.max_seq_length}")
    print(f"[INFO] meta_min columnas: {list(meta_min.columns)} | filas={len(meta_min)}")
    return _model_cache, _meta_min_cache

def load_faiss():
    global _index_cache
    if _index_cache is None:
        _index_cache = faiss.read_index(FAISS_PATH)
        print(f"[INFO] Índice FAISS cargado: ntotal={_index_cache.ntotal}")
    return _index_cache

def load_scopus_csv():
    global _scopus_cache
    if _scopus_cache is None:
        df = pd.read_csv(SCOPUS_CSV, sep=SCOPUS_SEP)
        if "scopus_id" not in df.columns:
            raise ValueError(f"{SCOPUS_CSV} no tiene columna 'scopus_id'")
        df["scopus_id"] = df["scopus_id"].astype(str)
        _scopus_cache = df
        print(f"[INFO] scoupusdata.csv: filas={len(df)} | cols={len(df.columns)}")
    return _scopus_cache

def e5_encode_query(model, query_text: str):
    return model.encode(
        [f"query: {query_text}"],
        normalize_embeddings=True,
        convert_to_numpy=True
    ).astype("float32")

def search_min(query_text: str, topk: int = 100) -> pd.DataFrame:
    """
    Devuelve SOLO el meta mínimo del PKL (sin CSV):
    vec_id, score, chunk_uid, doc_id, chunk_id, (scopus_id si existe), start/end
    """
    model, meta_min = load_pkl_and_model()
    index = load_faiss()

    q = e5_encode_query(model, query_text)
    D, I = index.search(q, topk)
    vec_ids = I[0].tolist()

    hits = meta_min.set_index("vec_id").loc[vec_ids].reset_index()
    hits.insert(1, "score", D[0])

    cols_front = [c for c in ["vec_id","score","chunk_uid","doc_id","chunk_id","scopus_id","start_token","end_token"] if c in hits.columns]
    rest = [c for c in hits.columns if c not in cols_front]
    return hits[cols_front + rest].reset_index(drop=True)

def search_full_scopus(query_text: str, topk: int = 5) -> pd.DataFrame:
    """
    Une el TOP-K con TODAS las columnas de scoupusdata.csv por scopus_id.
    """
    model, meta_min = load_pkl_and_model()
    index = load_faiss()
    sc = load_scopus_csv()

    q = e5_encode_query(model, query_text)
    D, I = index.search(q, topk)
    vec_ids = I[0].tolist()

    hits = meta_min.set_index("vec_id").loc[vec_ids].reset_index()
    hits.insert(1, "score", D[0])

    if "scopus_id" not in hits.columns:
        raise ValueError("meta_min en PKL no contiene 'scopus_id'; no puedo unir con el CSV.")

    out = hits.merge(sc, how="left", on="scopus_id")

    # Orden: primero claves/score/offsets, luego TODO el CSV
    front = [c for c in ["vec_id","score","chunk_uid","doc_id","chunk_id","scopus_id","start_token","end_token"] if c in out.columns]
    csv_cols = [c for c in sc.columns if c not in front]
    return out[front + csv_cols].reset_index(drop=True)

# ========= RAG BÁSICO SOLO CON EL RECUPERADOR (k=15) =========

def build_context_from_hits(hits: pd.DataFrame, max_chars: int = MAX_CTX_CHARS) -> str:
    """
    Construye un contexto concatenando título + abstract (u otras columnas).
    Corta a max_chars para no pasar de contexto.
    """
    title_col = "title" if "title" in hits.columns else None
    abs_col   = "abstract" if "abstract" in hits.columns else None

    chunks = []
    for _, row in hits.iterrows():
        parts = []
        if title_col:
            parts.append(f"Título: {row.get(title_col, '')}")
        if abs_col:
            parts.append(f"Resumen: {row.get(abs_col, '')}")
        txt = "\n".join(parts).strip()
        if not txt:
            continue
        chunks.append(txt)

    full_ctx = "\n\n---\n\n".join(chunks)
    if len(full_ctx) > max_chars:
        full_ctx = full_ctx[:max_chars]
    return full_ctx

def answer_with_llm(query_text: str, context: str) -> str:
    """
    Llama al LLM local (Ollama + Gemma3) usando solo el contexto recuperado.
    """
    system_prompt = (
        "Eres un asistente experto en literatura científica. "
        "Responde a la pregunta del usuario usando EXCLUSIVAMENTE la información del contexto. "
        "Si el contexto no contiene suficiente evidencia, di explícitamente que no hay información suficiente."
    )

    user_content = f"""Pregunta del usuario:
{query_text}

Contexto recuperado (fragmentos de artículos científicos):
{context}
"""

    payload = {
        "model": OLLAMA_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": user_content},
        ],
        "stream": False,
    }

    resp = requests.post(_OLLAMA_CHAT_URL, json=payload, timeout=300)
    resp.raise_for_status()
    data = resp.json()

    # Formato típico de Ollama: {"message": {"role": "assistant", "content": "..."}}
    msg = data.get("message", {}).get("content", "").strip()
    if not msg:
        raise RuntimeError(f"Respuesta vacía de Ollama: {data}")
    return msg

def rag_only_retriever(query_text: str, topk: int = 15) -> dict:
    """
    Pipeline RAG mínimo:
    1) Recupera top-k (k=15 por defecto).
    2) Construye contexto.
    3) Llama al LLM (Gemma3 en Ollama).
    Devuelve dict con respuesta + dataframe de hits + contexto.
    """
    hits = search_full_scopus(query_text, topk=topk)
    ctx = build_context_from_hits(hits, max_chars=MAX_CTX_CHARS)
    answer = answer_with_llm(query_text, ctx)
    return {
        "answer": answer,
        "hits": hits,
        "context": ctx,
    }

# ====== DEMO RÁPIDA ======
if __name__ == "__main__":
    query = "¿Cuántos milímetros  de lluvia cayeron en cada barrio del país en cada día de 2024?"

    print("\n=== RAG solo con recuperador (k=15) ===")
    result = rag_only_retriever(query, topk=15)

    print("\n--- RESPUESTA LLM (Gemma3/Ollama) ---\n")
    print(result["answer"])

    # Si quieres revisar los docs usados:
    df = result["hits"]
    df.to_csv("ground.csv", sep="|", index=False, encoding="utf-8")
    print("\nPrimeros 5 artículos recuperados:")
    print(df.head(5)[["scopus_id","title","abstract"]])



=== RAG solo con recuperador (k=15) ===
[INFO] Modelo: intfloat/multilingual-e5-small | max_seq_length=300
[INFO] meta_min columnas: ['vec_id', 'chunk_uid', 'doc_id', 'chunk_id', 'start_token', 'end_token', 'scopus_id'] | filas=21005
[INFO] Índice FAISS cargado: ntotal=21005
[INFO] scoupusdata.csv: filas=19233 | cols=9

--- RESPUESTA LLM (Gemma3/Ollama) ---

No hay información suficiente en el contexto para responder a la pregunta. El texto describe la creación y funcionamiento de redes de parcelas de largo plazo para estudiar bosques tropicales, pero no proporciona datos sobre la cantidad de lluvia que ha caído en cada barrio del país en cada día de 2024.

Primeros 5 artículos recuperados:
     scopus_id                                              title  \
0  85106283218  Taking the pulse of Earth's tropical forests u...   
1  85114871125  Author Correction: The effect of national prot...   
2  85148222225  Water, Sanitation and Socioeconomic Situation ...   
3  85139503798  Una nue

In [32]:
import os, math, pickle, requests
import numpy as np
import pandas as pd
import torch
import faiss
from typing import List, Sequence, Optional, Tuple
from sentence_transformers import SentenceTransformer, CrossEncoder

# ============================================================
#                CONFIG GLOBAL
# ============================================================

# ---- Rutas (ajústalas o usa variables de entorno) ----
PKL_MIN_PATH = os.environ.get("PKL_MIN_PATH", "embeddings_meta_min.pkl")
FAISS_PATH   = os.environ.get("FAISS_PATH", "faiss_index_ip.bin")
SCOPUS_CSV   = os.environ.get("SCOPUS_CSV", "scopusdata.csv")
SCOPUS_SEP   = os.environ.get("SCOPUS_SEP", "|")

# ---- Config LLM (Ollama + Gemma3) ----
OLLAMA_HOST   = os.environ.get("OLLAMA_HOST", "http://localhost:11434")
OLLAMA_MODEL  = os.environ.get("OLLAMA_MODEL", "gemma3:4b")
_OLLAMA_CHAT_URL = f"{OLLAMA_HOST.rstrip('/')}/api/chat"

# ---- Config CrossEncoder (Reranking) ----
CROSS_ENCODER_MODEL = os.environ.get(
    "CROSS_ENCODER_MODEL",
    "cross-encoder/ms-marco-MiniLM-L-6-v2"
)
CE_BATCH_SIZE = int(os.environ.get("CE_BATCH_SIZE", "64"))
W_CE, W_DENSE = float(os.environ.get("W_CE", "0.7")), float(os.environ.get("W_DENSE", "0.3"))

# ---- RAG general ----
MAX_CTX_CHARS = int(os.environ.get("RAG_MAX_CTX_CHARS", "8000"))

# columnas de texto (orden de prioridad)
TEXT_COLS = ["title", "abstract"]  # <- ajustado a tu CSV

# ---- Caches simples ----
_model_cache = None
_meta_min_cache = None
_index_cache = None
_scopus_cache = None
_ce_cache: Optional[CrossEncoder] = None

# ============================================================
#                LOADERs: E5 + FAISS + SCOPUS
# ============================================================

def load_pkl_and_model(emb_max_seq_len=300):
    global _model_cache, _meta_min_cache
    if _model_cache is not None and _meta_min_cache is not None:
        return _model_cache, _meta_min_cache

    with open(PKL_MIN_PATH, "rb") as f:
        pkl = pickle.load(f)

    meta_min = pkl["meta_min"].copy()
    _meta_min_cache = meta_min

    model_name = pkl.get("model", "intfloat/multilingual-e5-large")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SentenceTransformer(model_name, device=device)
    model.max_seq_length = min(int(emb_max_seq_len), 512)
    _model_cache = model

    print(f"[INFO] Modelo bi-encoder: {model_name} | max_seq_length={model.max_seq_length} | device={device}")
    print(f"[INFO] meta_min columnas: {list(meta_min.columns)} | filas={len(meta_min)}")
    return _model_cache, _meta_min_cache

def load_faiss():
    global _index_cache
    if _index_cache is None:
        _index_cache = faiss.read_index(FAISS_PATH)
        print(f"[INFO] Índice FAISS cargado: ntotal={_index_cache.ntotal}")
    return _index_cache

def load_scopus_csv():
    global _scopus_cache
    if _scopus_cache is None:
        df = pd.read_csv(SCOPUS_CSV, sep=SCOPUS_SEP)
        if "scopus_id" not in df.columns:
            raise ValueError(f"{SCOPUS_CSV} no tiene columna 'scopus_id'")
        df["scopus_id"] = df["scopus_id"].astype(str)
        _scopus_cache = df
        print(f"[INFO] scopusdata.csv: filas={len(df)} | cols={len(df.columns)}")
    return _scopus_cache

def e5_encode_query(model, query_text: str):
    return model.encode(
        [f"query: {query_text}"],
        normalize_embeddings=True,
        convert_to_numpy=True
    ).astype("float32")

def search_full_scopus(query_text: str, topk: int = 100) -> pd.DataFrame:
    """
    Recuperación densa (FAISS) + join con CSV.
    Devuelve topk candidatos con TODAS las columnas de scopusdata.csv.
    """
    model, meta_min = load_pkl_and_model()
    index = load_faiss()
    sc = load_scopus_csv()

    q = e5_encode_query(model, query_text)
    D, I = index.search(q, topk)
    vec_ids = I[0].tolist()

    hits = meta_min.set_index("vec_id").loc[vec_ids].reset_index()
    hits.insert(1, "score", D[0])

    if "scopus_id" not in hits.columns:
        raise ValueError("meta_min en PKL no contiene 'scopus_id'; no puedo unir con el CSV.")

    out = hits.merge(sc, how="left", on="scopus_id")

    # Orden: primero claves/score/offsets, luego TODO el CSV
    front = [c for c in ["vec_id","score","chunk_uid","doc_id","chunk_id","scopus_id","start_token","end_token"] if c in out.columns]
    csv_cols = [c for c in sc.columns if c not in front]
    return out[front + csv_cols].reset_index(drop=True)

# ============================================================
#                RERANKING: CROSS-ENCODER
# ============================================================

def get_cross_encoder(model_name: str = CROSS_ENCODER_MODEL) -> CrossEncoder:
    global _ce_cache
    if _ce_cache is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        _ce_cache = CrossEncoder(model_name, device=device)
        print(f"[INFO] Cross-Encoder cargado: {model_name} | device={device}")
    return _ce_cache

def _first_nonempty(row: pd.Series, cols: Sequence[str]) -> str:
    """Devuelve el primer texto no vacío según prioridad en 'cols'.
       Si no encuentra, intenta concatenar campos semánticos."""
    for c in cols:
        if c in row and isinstance(row[c], str) and row[c].strip():
            return row[c]
    parts = []
    for c in row.index:
        name = c.lower()
        if any(tok in name for tok in ("title","abstract","summary","keywords","chunk","desc")):
            v = row[c]
            if isinstance(v, str) and v.strip():
                parts.append(v.strip())
    return " ".join(parts)[:4096]

def _build_pairs(query_text: str, df_topk: pd.DataFrame, text_cols: Optional[List[str]]) -> Tuple[List[Tuple[str,str]], List[int]]:
    cols = text_cols or TEXT_COLS
    pairs, idx_map = [], []
    for i, row in df_topk.iterrows():
        txt = _first_nonempty(row, cols)
        pairs.append((query_text, txt if isinstance(txt, str) else ""))
        idx_map.append(i)
    return pairs, idx_map

def _minmax(x: np.ndarray) -> np.ndarray:
    x = np.asarray(x, dtype=np.float32)
    mn, mx = float(np.nanmin(x)), float(np.nanmax(x))
    if not np.isfinite(mn) or not np.isfinite(mx) or (mx - mn) <= 1e-12:
        return np.zeros_like(x, dtype=np.float32)
    return (x - mn) / (mx - mn + 1e-12)

def rerank_with_cross_encoder(query_text: str,
                              df_topk: pd.DataFrame,
                              text_cols: Optional[List[str]] = None,
                              score_dense_col: str = "score",
                              fuse_with_dense: bool = True,
                              batch_size: int = CE_BATCH_SIZE,
                              model_name: str = CROSS_ENCODER_MODEL) -> pd.DataFrame:
    """Reordena df_topk usando un Cross-Encoder y devuelve un nuevo DataFrame ordenado."""
    if df_topk is None or len(df_topk) == 0:
        raise ValueError("df_topk está vacío; ejecuta primero search_full_scopus(query, topk=N).")

    ce = get_cross_encoder(model_name)
    pairs, idx_map = _build_pairs(query_text, df_topk, text_cols)

    # Predicción por lotes
    scores_ce = []
    for start in range(0, len(pairs), batch_size):
        batch = pairs[start:start+batch_size]
        s = ce.predict(batch)
        scores_ce.append(np.asarray(s, dtype=np.float32))
    scores_ce = np.concatenate(scores_ce, axis=0) if scores_ce else np.zeros(len(df_topk), dtype=np.float32)

    out = df_topk.copy()
    out.loc[idx_map, "score_ce"] = scores_ce
    ce_norm = _minmax(out["score_ce"].values)

    if fuse_with_dense and score_dense_col in out.columns:
        dense_norm = _minmax(out[score_dense_col].values)
        out["score_dense_norm"] = dense_norm
        out["score_final"] = W_CE * ce_norm + W_DENSE * dense_norm
        order_col = "score_final"
    else:
        out["score_final"] = ce_norm
        order_col = "score_final"

    out = out.sort_values(order_col, ascending=False).reset_index(drop=True)
    return out

def search_full_scopus_reranked(query_text: str,
                                topk_dense: int = 100,
                                topk_final: int = 15,
                                text_cols: Optional[List[str]] = None) -> pd.DataFrame:
    """
    1) Recupera topk_dense con FAISS.
    2) Hace reranking con Cross-Encoder.
    3) Devuelve SOLO los topk_final re-rankeados.
    """
    df_topk = search_full_scopus(query_text, topk=topk_dense)
    reranked = rerank_with_cross_encoder(
        query_text=query_text,
        df_topk=df_topk,
        text_cols=text_cols,
        score_dense_col="score",
        fuse_with_dense=True
    )
    return reranked.head(topk_final).reset_index(drop=True)

# ============================================================
#                CONTEXT BUILDER
# ============================================================

def build_context_from_hits(hits: pd.DataFrame, max_chars: int = MAX_CTX_CHARS) -> str:
    """
    Construye un contexto concatenando título + abstract (u otras columnas).
    Corta a max_chars para no pasarse de contexto.
    """
    title_col = "title" if "title" in hits.columns else None
    abs_col   = "abstract" if "abstract" in hits.columns else None

    chunks = []
    for _, row in hits.iterrows():
        parts = []
        if title_col:
            t = str(row.get(title_col, "") or "").strip()
            if t:
                parts.append(f"Título: {t}")
        if abs_col:
            a = str(row.get(abs_col, "") or "").strip()
            if a:
                parts.append(f"Resumen: {a}")
        txt = "\n".join(parts).strip()
        if not txt:
            continue
        chunks.append(txt)

    full_ctx = "\n\n---\n\n".join(chunks)
    if len(full_ctx) > max_chars:
        full_ctx = full_ctx[:max_chars]
    return full_ctx

# ============================================================
#                LLM LOCAL (GEMMA3 / OLLAMA)
# ============================================================

def answer_with_llm(query_text: str, context: str) -> str:
    """
    Llama al LLM local (Ollama + Gemma3) usando solo el contexto recuperado.
    """
    system_prompt = (
        "Eres un asistente experto en literatura científica. "
        "Responde a la pregunta del usuario usando EXCLUSIVAMENTE la información del contexto. "
        "Si el contexto no contiene suficiente evidencia, di explícitamente que no hay información suficiente."
    )

    user_content = f"""Pregunta del usuario:
{query_text}

Contexto recuperado (fragmentos de artículos científicos):
{context}
"""

    payload = {
        "model": OLLAMA_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": user_content},
        ],
        "stream": False,
    }

    resp = requests.post(_OLLAMA_CHAT_URL, json=payload, timeout=300)
    resp.raise_for_status()
    data = resp.json()

    msg = data.get("message", {}).get("content", "").strip()
    if not msg:
        raise RuntimeError(f"Respuesta vacía de Ollama: {data}")
    return msg

# ============================================================
#                PIPELINE RAG COMPLETO (RE-RANK + CONTEXTO)
# ============================================================

def rag_with_rerank(query_text: str,
                    topk_dense: int = 100,
                    topk_ctx: int = 15,
                    text_cols: Optional[List[str]] = None) -> dict:
    """
    Pipeline RAG:
    1) Recuperación densa topk_dense.
    2) Reranking con CrossEncoder.
    3) Selección de topk_ctx para contexto.
    4) Construcción de contexto (Context Builder).
    5) Llamada a Gemma3 en Ollama.
    """
    hits_reranked = search_full_scopus_reranked(
        query_text=query_text,
        topk_dense=topk_dense,
        topk_final=topk_ctx,
        text_cols=text_cols
    )
    ctx = build_context_from_hits(hits_reranked, max_chars=MAX_CTX_CHARS)
    answer = answer_with_llm(query_text, ctx)

    return {
        "answer": answer,
        "hits": hits_reranked,
        "context": ctx,
    }

# ============================================================
#                DEMO RÁPIDA
# ============================================================

if __name__ == "__main__":
    query = "¿Qué especie de manglar domina en estuarios ecuatorianos y qué servicio ecosistémico crítico se cuantifica ante escenarios de aumento del nivel del mar?"
    result = rag_with_rerank(
        query_text=query,
        topk_dense=100,  # candidatos que entran al reranker
        topk_ctx=15,     # context size final para Gemma3
        text_cols=None   # o por ejemplo ["title","abstract"] explícito
    )

    print("\n=== RESPUESTA LLM (Gemma3/Ollama, con reranking) ===\n")
    print(result["answer"])

    # Guardar los docs usados
    df = result["hits"]
    df.to_csv("ground_reranked.csv", sep="|", index=False, encoding="utf-8")
    print("\n[INFO] Primeros 5 artículos re-rankeados:")
    cols_show = [c for c in ["vec_id","score","score_ce","score_final","scopus_id","title","abstract"] if c in df.columns]
    print(df.head(5)[cols_show])


[INFO] Modelo bi-encoder: intfloat/multilingual-e5-small | max_seq_length=300 | device=cuda
[INFO] meta_min columnas: ['vec_id', 'chunk_uid', 'doc_id', 'chunk_id', 'start_token', 'end_token', 'scopus_id'] | filas=21005
[INFO] Índice FAISS cargado: ntotal=21005
[INFO] scopusdata.csv: filas=19233 | cols=9
[INFO] Cross-Encoder cargado: cross-encoder/ms-marco-MiniLM-L-6-v2 | device=cuda

=== RESPUESTA LLM (Gemma3/Ollama, con reranking) ===

Basándonos en la información proporcionada, no hay información suficiente para determinar qué especie de manglar domina en los estuarios ecuatorianos ni qué servicio ecosistémico crítico se cuantifica ante escenarios de aumento del nivel del mar.

Los resúmenes proporcionan información sobre la biodiversidad de manglares en Ecuador (nuevas especies de Pristimantis), la gestión de áreas protegidas de manglares (AUSCEM), la importancia de los manglares para las comunidades locales y la necesidad de más investigación sobre los servicios ecosistémicos de lo