## EXAMEN DE RECUPERACION DE LA INFORMACION

### SEGUNDO BIMESTRE

#### Jorge Rojas

#### CARGA Y EDIT DEL CORPUS

In [1]:
%pip install kagglehub

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import kagglehub

# Descargamos el dataset
path = kagglehub.dataset_download("Cornell-University/arxiv")
print("Path to dataset files:", path)


  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\gboy2\.cache\kagglehub\datasets\Cornell-University\arxiv\versions\244


In [3]:
import json
import os
import random

# Cargamos el JSON
file_path = os.path.join(path, "arxiv-metadata-oai-snapshot.json")  

# Leer JSON 
documents = []
with open(file_path, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        try:
            doc = json.loads(line)
            documents.append(doc)
        except:
            continue

print(f"Total documentos: {len(documents)}")

# Tomar solo el 1%
subset_size = int(len(documents) * 0.01)
subset = random.sample(documents, subset_size)

print(f"Corpus reducido a {len(subset)} documentos")


Total documentos: 2792339
Corpus reducido a 27923 documentos


In [4]:
with open("arxiv_subset.json", "w", encoding="utf-8") as out:
    json.dump(subset, out, indent=2)


#### PREPROCESAMIENTO

In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()  # minúsculas
    text = re.sub(r'[^\w\s]', '', text)  # eliminar puntuación
    tokens = word_tokenize(text)  # tokenización
    tokens = [word for word in tokens if word not in stop_words]  # quitar stopwords
    return " ".join(tokens)


[nltk_data] Downloading package punkt to C:\Users\gboy2/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gboy2/nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [6]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\gboy2/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
import nltk
nltk.data.path.append("C:/Users/gboy2/nltk_data")


In [8]:
import nltk
import shutil
import os

# Borra manualmente si no lo hiciste antes
shutil.rmtree("C:/Users/gboy2/nltk_data/tokenizers/punkt", ignore_errors=True)

# Fuerza nueva descarga
nltk.download('punkt')


[nltk_data] Downloading package punkt to C:\Users\gboy2/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\gboy2/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def preprocess_fallback(text):
    text = re.sub(r'[^\w\s]', '', text.lower())
    tokens = text.split()
    return " ".join([t for t in tokens if t not in ENGLISH_STOP_WORDS])


In [11]:
import json

with open("arxiv_subset.json", "r", encoding="utf-8") as f:
    subset = json.load(f)

print(f" Corpus cargado con {len(subset)} documentos")


 Corpus cargado con 27923 documentos


In [12]:
import re

processed_docs = []
for doc in subset:
    combined_text = doc["title"] + " " + doc["abstract"]
    processed = preprocess_fallback(combined_text)  # usamos la versión sin NLTK
    processed_docs.append(processed)


In [13]:
processed_docs = []
for doc in subset:
    combined_text = doc["title"] + " " + doc["abstract"]
    processed = preprocess_fallback(combined_text)
    processed_docs.append(processed)


In [14]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def preprocess_fallback(text):
    text = re.sub(r'[^\w\s]', '', text.lower())  # quitar puntuación y pasar a minúsculas
    tokens = text.split()
    return " ".join([t for t in tokens if t not in ENGLISH_STOP_WORDS])  # eliminar stopwords

# Inicializar listas
processed_docs = []
doc_ids = []
titles = []
abstracts = []

# Procesar documentos
for doc in subset:
    if "title" in doc and "abstract" in doc:
        combined = f"{doc['title']} {doc['abstract']}"
        cleaned = preprocess_fallback(combined)
        processed_docs.append(cleaned)
        doc_ids.append(doc["id"])
        titles.append(doc["title"])
        abstracts.append(doc["abstract"])


#### INDEXACION TF-IDF

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Crear el vectorizador
tfidf_vectorizer = TfidfVectorizer()

# Aplicar sobre los textos procesados
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_docs)

print(f" TF-IDF generado: {tfidf_matrix.shape[0]} documentos, {tfidf_matrix.shape[1]} términos únicos")


 TF-IDF generado: 27923 documentos, 123169 términos únicos


In [16]:
def search_tfidf(query, top_k=10):
    query_cleaned = preprocess_text(query)
    query_vec = tfidf_vectorizer.transform([query_cleaned])
    scores = (tfidf_matrix @ query_vec.T).toarray().flatten()
    top_indices = scores.argsort()[::-1][:top_k]
    
    results = []
    for idx in top_indices:
        results.append({
            "id": doc_ids[idx],
            "title": titles[idx],
            "score": scores[idx],
            "abstract_snippet": abstracts[idx][:300] + "..."
        })
    return results


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Crear vectorizador y ajustar
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_docs)


In [18]:
def search_tfidf(query, top_k=10):
    query_clean = preprocess_fallback(query)  # usamos el mismo limpiador
    query_vec = tfidf_vectorizer.transform([query_clean])
    scores = (tfidf_matrix @ query_vec.T).toarray().flatten()
    top_indices = scores.argsort()[::-1][:top_k]
    
    results = []
    for idx in top_indices:
        results.append({
            "id": doc_ids[idx],
            "title": titles[idx],
            "score": round(scores[idx], 4),
            "abstract_snippet": abstracts[idx][:300] + "..."
        })
    return results


##### TEST

In [19]:
query = "quantum chromodynamics"
results = search_tfidf(query)

for r in results:
    print(f"🔹 {r['title']} (Score: {r['score']})")
    print(f" {r['abstract_snippet']}\n")


🔹 The Hybrid Monte Carlo Algorithm for Quantum Chromodynamics (Score: 0.3405)
   The Hybrid Monte Carlo (HMC) algorithm currently is the favorite scheme to
simulate quantum chromodynamics including dynamical fermions. In this
talk-which is intended for a non-expert audience--I want to bring together
methodical and practical aspects of the HMC for full QCD simulations. I will
c...

🔹 Simulating quantum field theory with a quantum computer (Score: 0.2618)
   Forthcoming exascale digital computers will further advance our knowledge of
quantum chromodynamics, but formidable challenges will remain. In particular,
Euclidean Monte Carlo methods are not well suited for studying real-time
evolution in hadronic collisions, or the properties of hadronic matter...

🔹 Quantum Fisher information as the measure of Gaussian quantum
  correlation: Role in quantum metrology (Score: 0.2243)
   We have introduced a measure of Gaussian quantum correlations based on
quantum Fisher information. For bipartite

In [20]:
%pip install rank_bm25

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
def simple_tokenize(text):
    text = re.sub(r'[^\w\s]', '', text.lower())
    return [t for t in text.split() if t not in ENGLISH_STOP_WORDS]

tokenized_docs = [simple_tokenize(doc) for doc in processed_docs]


In [22]:
from rank_bm25 import BM25Okapi

bm25_model = BM25Okapi(tokenized_docs)

In [23]:
def search_bm25(query, top_k=10):
    query_tokens = simple_tokenize(query)
    scores = bm25_model.get_scores(query_tokens)
    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]

    results = []
    for idx in top_indices:
        results.append({
            "id": doc_ids[idx],
            "title": titles[idx],
            "score": round(scores[idx], 4),
            "abstract_snippet": abstracts[idx][:300] + "..."
        })
    return results

##### TEST

In [24]:
query = "quantum chromodynamics"
results = search_bm25(query)

for r in results:
    print(f"🔹 {r['title']} (Score: {r['score']:.2f})")
    print(f" {r['abstract_snippet']}\n")


🔹 The Hybrid Monte Carlo Algorithm for Quantum Chromodynamics (Score: 15.32)
   The Hybrid Monte Carlo (HMC) algorithm currently is the favorite scheme to
simulate quantum chromodynamics including dynamical fermions. In this
talk-which is intended for a non-expert audience--I want to bring together
methodical and practical aspects of the HMC for full QCD simulations. I will
c...

🔹 Simulating quantum field theory with a quantum computer (Score: 11.68)
   Forthcoming exascale digital computers will further advance our knowledge of
quantum chromodynamics, but formidable challenges will remain. In particular,
Euclidean Monte Carlo methods are not well suited for studying real-time
evolution in hadronic collisions, or the properties of hadronic matter...

🔹 Instanton infra-red stabilization in the nonperturbative QCD vacuum (Score: 11.47)
   The influence of nonperturbative fields on instantons in quantum
chromodynamics is studied. Nonperturbative vacuum is described in terms of
nonlocal g

In [25]:
%pip install sentence-transformers faiss-cpu

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Cargar modelo
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Generar embeddings del corpus
corpus_embeddings = embedding_model.encode(processed_docs, show_progress_bar=True)

# Crear índice FAISS (distancia L2)
dimension = corpus_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(corpus_embeddings))

print(" FAISS cargado con", index.ntotal, "documentos")

Batches: 100%|██████████| 873/873 [08:37<00:00,  1.69it/s]


 FAISS cargado con 27923 documentos


In [27]:
def search_faiss(query, top_k=10):
    query_vec = embedding_model.encode([query])
    D, I = index.search(np.array(query_vec), top_k)

    results = []
    for idx in I[0]:
        results.append({
            "id": doc_ids[idx],
            "title": titles[idx],
            "score": round(D[0][list(I[0]).index(idx)], 4),
            "abstract_snippet": abstracts[idx][:300] + "..."
        })
    return results


##### TEST

In [28]:
results = search_faiss("quantum chromodynamics")
for r in results:
    print(f"🔹 {r['title']} (Distancia: {r['score']})")
    print(f" {r['abstract_snippet']}\n")


🔹 The Aharonov-Anandan phase of a classical dynamical system seen
  mathematically as a quantum dynamical system (Distancia: 0.9886000156402588)
   It is shown that the non-adiabatic Hannay's angle of an integrable
non-degenerate classical hamiltonian dynamical system may be related to the
Aharonov-Anandan phase it develops when it is looked mathematically as a
quantum dynamical system.
...

🔹 The Second Law of Thermodynamics under Unitary Evolution and External
  Operations (Distancia: 1.0778000354766846)
   A microscopic definition of the thermodynamic entropy in an isolated quantum
system must satisfy (i) additivity, (ii) extensivity and (iii) the second law
of thermodynamics. We show that the diagonal entropy, which is the Shannon
entropy in the energy eigenbasis at each instant of time, meets the ...

🔹 Triple Interference, Non-linear Talbot Effect and Gravitization of the
  Quantum (Distancia: 1.0785000324249268)
   Recently we have discussed a new approach to the problem of quan

#### EVALUACION ENTRE MODELOS

In [29]:
import pandas as pd

# Ejecutar búsqueda con una misma query
query = "quantum chromodynamics"
tfidf_results = search_tfidf(query, top_k=10)
bm25_results = search_bm25(query, top_k=10)
faiss_results = search_faiss(query, top_k=10)

# Extraer IDs
tfidf_ids = [doc["id"] for doc in tfidf_results]
bm25_ids = [doc["id"] for doc in bm25_results]
faiss_ids = [doc["id"] for doc in faiss_results]

# Comparativa por ranking
df = pd.DataFrame({
    "Rank": range(1, 11),
    "TF-IDF_ID": tfidf_ids,
    "BM25_ID": bm25_ids,
    "FAISS_ID": faiss_ids
})

# Coincidencias por posición
df["TFIDF == BM25"] = df["TF-IDF_ID"] == df["BM25_ID"]
df["TFIDF == FAISS"] = df["TF-IDF_ID"] == df["FAISS_ID"]
df["BM25 == FAISS"] = df["BM25_ID"] == df["FAISS_ID"]

# Coincidencias globales 
intersection_tf_bm25 = len(set(tfidf_ids) & set(bm25_ids))
intersection_tf_faiss = len(set(tfidf_ids) & set(faiss_ids))
intersection_bm25_faiss = len(set(bm25_ids) & set(faiss_ids))

print(" Coincidencias en Top-10:")
print(f"TF-IDF & BM25: {intersection_tf_bm25}")
print(f"TF-IDF & FAISS: {intersection_tf_faiss}")
print(f"BM25 & FAISS: {intersection_bm25_faiss}")

df["TFIDF_Title"] = [doc["title"] for doc in tfidf_results] # Modelo base del documento en TF-IDFF


df


 Coincidencias en Top-10:
TF-IDF & BM25: 2
TF-IDF & FAISS: 0
BM25 & FAISS: 0


Unnamed: 0,Rank,TF-IDF_ID,BM25_ID,FAISS_ID,TFIDF == BM25,TFIDF == FAISS,BM25 == FAISS,TFIDF_Title
0,1,hep-lat/9712019,hep-lat/9712019,math-ph/0511087,True,False,False,The Hybrid Monte Carlo Algorithm for Quantum C...
1,2,1811.10085,1811.10085,1303.5471,True,False,False,Simulating quantum field theory with a quantum...
2,3,1406.5144,hep-ph/0211139,2303.15645,False,False,False,Quantum Fisher information as the measure of G...
3,4,1311.2960,cond-mat/0604666,2309.01851,False,False,False,An Axiomatization for Quantum Processes to Uni...
4,5,1503.04216,2010.05827,1609.02265,False,False,False,Searching for quantum speedup in quasistatic q...
5,6,1908.07927,hep-ph/0608122,1004.1214,False,False,False,A Full Quantum Eigensolver for Quantum Chemist...
6,7,2501.10747,1508.01449,1707.05347,False,False,False,Quantifying Quantum Steering with Limited Reso...
7,8,0704.1737,supr-con/9510001,1704.06846,False,False,False,Quantum memory for images - a quantum hologram
8,9,quant-ph/0004045,hep-ph/9606272,hep-th/9701022,False,False,False,Relative entropy in quantum information theory
9,10,2101.08354,2109.05041,1007.1656,False,False,False,Enhancing Generative Models via Quantum Correl...


#### RAG

In [30]:
%pip install openai python-dotenv

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [32]:
from dotenv import load_dotenv
import os
import openai  # ✅ IMPORTAR EL MÓDULO OPENAI

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Verifica (opcional):
print("API Key cargada:", "Sí" if openai.api_key else "No")


API Key cargada: Sí


In [33]:
%pip install --upgrade openai

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [34]:
from openai import OpenAI
from dotenv import load_dotenv
import os

# Cargar clave desde .env
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def generate_rag_response_openai(query, top_k=3):
    top_docs = search_faiss(query, top_k=top_k)
    contexto = "\n\n".join([f"{i+1}. {doc['abstract_snippet']}" for i, doc in enumerate(top_docs)])

    prompt = (
        f"Pregunta del usuario:\n{query}\n\n"
        f"Contexto recuperado desde documentos científicos relevantes:\n{contexto}\n\n"
        f"Con base en el contexto anterior, responde de forma clara, concisa y académica:"
    )

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7
    )

    return response.choices[0].message.content


In [35]:
print(generate_rag_response_openai("quantum chromodynamics"))

Quantum Chromodynamics (QCD) es la teoría cuántica de los campos que describe la interacción fuerte entre quarks y gluones, los constituyentes fundamentales de los hadrones como los protones y neutrones. Esta teoría es parte del Modelo Estándar de la física de partículas y se encarga de explicar fenómenos como la confinamiento de los quarks, la cromodinámica de los gluones y la estructura interna de los hadrones. En el contexto de la física cuántica, QCD es fundamental para comprender las interacciones a nivel subatómico y ha sido objeto de numerosos estudios teóricos y experimentales en el campo de la física de partículas.


##### EVALUACION Y TEST SI LA RESPUESTA USA LA INFORMACION DEL CONTEXTO

In [36]:
from collections import Counter

def evaluar_rag_response(query, top_k=3):
    # 1. Obtener documentos
    top_docs = search_faiss(query, top_k=top_k)
    context = " ".join([doc["abstract_snippet"] for doc in top_docs])

    # 2. Generar respuesta
    respuesta = generate_rag_response_openai(query, top_k=top_k)

    # 3. Palabras clave del contexto
    context_tokens = [t for t in re.sub(r'[^\w\s]', '', context.lower()).split() if t not in ENGLISH_STOP_WORDS]
    respuesta_tokens = [t for t in re.sub(r'[^\w\s]', '', respuesta.lower()).split()]

    # 4. Comparar y contar coincidencias
    context_counter = Counter(context_tokens)
    respuesta_counter = Counter(respuesta_tokens)
    coincidencias = sum((respuesta_counter & context_counter).values())

    print("Consulta:", query)
    print("Palabras clave del contexto:", len(set(context_tokens)))
    print("Palabras en respuesta:", len(respuesta_tokens))
    print("Palabras del contexto usadas en la respuesta:", coincidencias)
    print("\n Respuesta generada:")
    print(respuesta)

    return {
        "coincidencias": coincidencias,
        "longitud_contexto": len(set(context_tokens)),
        "longitud_respuesta": len(respuesta_tokens),
        "respuesta": respuesta
    }


In [37]:
evaluar_rag_response("quantum chromodynamics")

Consulta: quantum chromodynamics
Palabras clave del contexto: 55
Palabras en respuesta: 84
Palabras del contexto usadas en la respuesta: 1

 Respuesta generada:
Quantum Chromodynamics (QCD) es una teoría fundamental de las interacciones fuertes entre quarks y gluones, que son los constituyentes fundamentales de los protones, neutrones y otras partículas subatómicas. En el marco de la física de partículas, QCD describe cómo los quarks y gluones interactúan a través de la fuerza nuclear fuerte, que es mediada por los gluones. Esta teoría es crucial para entender la estructura de los hadrones y fenómenos como la cromodinámica cuántica confinamiento de los quarks dentro de los protones y neutrones.


{'coincidencias': 1,
 'longitud_contexto': 55,
 'longitud_respuesta': 84,
 'respuesta': 'Quantum Chromodynamics (QCD) es una teoría fundamental de las interacciones fuertes entre quarks y gluones, que son los constituyentes fundamentales de los protones, neutrones y otras partículas subatómicas. En el marco de la física de partículas, QCD describe cómo los quarks y gluones interactúan a través de la fuerza nuclear fuerte, que es mediada por los gluones. Esta teoría es crucial para entender la estructura de los hadrones y fenómenos como la cromodinámica cuántica confinamiento de los quarks dentro de los protones y neutrones.'}

#### CONSULTAS queries

In [38]:
with open("queries.txt", "r", encoding="utf-8") as f:
    queries = [line.strip() for line in f if line.strip()]

# Función para extraer solo los IDs del top-k
def extract_ids(results):
    return [r["id"] for r in results]

# Procesar cada consulta
for query in queries:
    print(f" Consulta: {query}")
    print("-" * 100)

    tfidf_results = search_tfidf(query)
    bm25_results = search_bm25(query)
    faiss_results = search_faiss(query)
    rag_response = generate_rag_response_openai(query)

    # Mostrar top 10 resultados
    for name, results in zip(["TF-IDF", "BM25", "FAISS"], [tfidf_results, bm25_results, faiss_results]):
        print(f"\n🔹 {name} - Top 10 documentos:")
        for i, r in enumerate(results, 1):
            print(f"{i}. {r['id']} - {r['title'][:60]}...")

    # Extraer IDs
    tfidf_ids = extract_ids(tfidf_results)
    bm25_ids = extract_ids(bm25_results)
    faiss_ids = extract_ids(faiss_results)

    # Comparaciones
    tfidf_vs_bm25 = list(set(tfidf_ids) & set(bm25_ids))
    tfidf_vs_faiss = list(set(tfidf_ids) & set(faiss_ids))
    bm25_vs_faiss = list(set(bm25_ids) & set(faiss_ids))

    # Mostrar comparaciones
    print("\n Comparación de coincidencias en el top 10:")
    print(f"TF-IDF vs BM25: {len(tfidf_vs_bm25)} en común → {tfidf_vs_bm25}")
    print(f"TF-IDF vs FAISS: {len(tfidf_vs_faiss)} en común → {tfidf_vs_faiss}")
    print(f"BM25 vs FAISS: {len(bm25_vs_faiss)} en común → {bm25_vs_faiss}")

    # Mostrar respuesta RAG
    print("\n Respuesta generada por RAG (CHAT GTP):")
    print(rag_response)

    print("\n" + "=" * 120 + "\n")


 Consulta: diphoton production cross sections
----------------------------------------------------------------------------------------------------

🔹 TF-IDF - Top 10 documentos:
1. 1704.08903 - Status and challenges of neutrino cross sections...
2. nucl-ex/0702050 - Projectile fragmentation reactions and production of nuclei ...
3. nucl-th/0605051 - Random Phase Approximation and neutrino-nucleus cross sectio...
4. 1701.04866 - Cosmic Ray Antiprotons at High Energies...
5. 1603.09354 - Asymmetric Dark Matter Models and the LHC Diphoton Excess...
6. 1510.00299 - Imaging resonances in low-energy NO-He inelastic collisions...
7. 0808.1625 - Exclusive heavy quarkonium + gamma production from e+ e- ann...
8. hep-ph/0111078 - Prospects for New Physics observations in diffractive proces...
9. nucl-ex/9810016 - Measurement of the 6Li(e,e'p) reaction cross sections at low...
10. 0804.0490 - Photon plus Jet Cross Sections at the Tevatron...

🔹 BM25 - Top 10 documentos:
1. 0804.0490 - Photon plus