In [1]:
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
import faiss
import torch
from langchain.text_splitter import CharacterTextSplitter
import spacy
import pandas as pd
import numpy as np
import concurrent.futures
import pickle
from transformers import pipeline
from scipy.stats import skew
from langchain_text_splitters import TokenTextSplitter


pd.set_option('display.max_colwidth', 100)
nlp = spacy.load("es_core_news_sm")
nlp = spacy.load("en_core_web_sm")
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(torch.cuda.current_device()))

NVIDIA GeForce RTX 2070 SUPER


In [2]:
model = SentenceTransformer("nomic-ai/nomic-embed-text-v2-moe", trust_remote_code=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

!!!!!!!!!!!!megablocks not available, using torch.matmul instead


In [3]:
import ir_datasets
dataset = ir_datasets.load("codec/history")

In [4]:
needed_docs_ids = set()
for qrel in dataset.qrels_iter():
    needed_docs_ids.add(qrel.doc_id)
len(needed_docs_ids)

1884

In [5]:
history_docs = []
for doc in dataset.docs_iter():
    if doc.doc_id in needed_docs_ids:
        history_docs.append(doc)
len(history_docs)

1884

In [None]:
with open("translated-docs-codec-history.pkl", "wb") as f:
    pickle.dump(history_docs, f)

In [6]:
with open("docs-codec-history.pkl", "rb") as f:
    history_docs = pickle.load(f)

#### Traduccion

In [7]:
class TranslatedCodecDoc:
    def __init__(self, doc, translated_text: str) -> None:
        self.doc_id = doc.doc_id
        self.text = translated_text
        self.title = doc.title
        self.count = doc.count
        self.index = doc.index
        self.url = doc.url

    @classmethod
    def from_codec_doc(cls, doc, text_translated: str) -> "TranslatedCodecDoc":
        return cls(doc, text_translated)

In [8]:
from deep_translator import GoogleTranslator
translator = GoogleTranslator(source='en', target='es')

In [18]:
d1 = history_docs[0]
nt = translator.translate(d1.text)
tcd = TranslatedCodecDoc.from_codec_doc(d1, nt)

In [None]:
translated_history_docs: list[TranslatedCodecDoc] = []
for i, doc in enumerate(history_docs):
    try:
        doc_spacy = nlp(doc.text)
        new_text = ""
        current_sentences = ""
        for sent in doc_spacy.sents:
            if len(current_sentences) + len(sent.text) > 4500:
                translated_sentences = translator.translate(current_sentences)
                current_sentences = sent.text + " "
                new_text += translated_sentences + " "
            else:
                current_sentences += sent.text + " "

        translated_sentences = translator.translate(current_sentences)
        new_text += translated_sentences + " "
        current_sentences = ""

        translated_history_docs.append(TranslatedCodecDoc.from_codec_doc(doc, new_text))
        print(f"{i+1}/{len(history_docs)}: {doc.doc_id} translated")
    except Exception as e:
        print(f"Error translating doc-{i}/{doc.doc_id}: {e}")

In [9]:
class CodecHistoryStore:
    def __init__(self, docs: list[TranslatedCodecDoc]) -> None:
        self.docs = docs
        self.chunks = []
        self.chunks_ids = []
        self.embeddings = None

        self.__create_chunks()

    def __create_chunks(self):
        text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=50)
        for i, doc in enumerate(self.docs):
            new_chunks = text_splitter.split_text(doc.text)
            self.chunks.extend(new_chunks)
            self.chunks_ids.extend([i] * len(new_chunks))
    
    def doc_from_chunk_id(self, chunk_id: int) -> TranslatedCodecDoc:
        return self.docs[self.chunks_ids[chunk_id]]

    def chunk_from_id(self, chunk_id: int) -> str:
        return self.chunks[chunk_id]

    def generate_embeddings(self):
        model = SentenceTransformer("nomic-ai/nomic-embed-text-v2-moe", trust_remote_code=True)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        self.embeddings = model.encode(self.chunks, convert_to_tensor=True, prompt_name="passage")

In [10]:
# store = CodecHistoryStore(history_docs)
# len(store.chunks), len(store.chunks_ids)

with open("store-with-embeddings-translated-docs-codec-history.pkl", "rb") as f:
    store = pickle.load(f)
store.embeddings.shape

torch.Size([12707, 768])

In [11]:
embeddings_np = store.embeddings.cpu().numpy()
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

In [14]:
faiss.write_index(index, "faiss-index-translated-docs-codec-history.bin")

In [12]:
import os
import torch
from transformers import pipeline
from huggingface_hub import InferenceClient

local_pipe = None
client_inference = None
cache: dict[str, str] = {}

def __init_inference_client():
    global client_inference
    if client_inference is None:
        client_inference = InferenceClient(
            provider="sambanova", api_key=os.getenv("HF_TOKEN")
        )


def __init_pipe():
    global local_pipe
    if local_pipe is None:
        model_id = "meta-llama/Llama-3.2-1B-Instruct"
        local_pipe = pipeline(
            "text-generation",
            model=model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )


def __rewrite_local(q: str, messages: str) -> str:
    __init_pipe()
    global local_pipe

    outputs = local_pipe(
        messages,
        max_new_tokens=256,
    )

    return outputs[0]["generated_text"][-1]["content"]


def __rewrite_inference(q: str, messages: str) -> str:
    __init_inference_client()
    global client_inference

    completion = client_inference.chat.completions.create(
        model="meta-llama/Llama-3.2-1B-Instruct",
        messages=messages,
        max_tokens=256,
    )

    return completion.choices[0].message.content


def rewrite(q: str, use_local_model: bool = False) -> str:
    messages = [
        {
            "role": "system",
            "content": "Eres un experto en historia latinoamericana. Tu tarea es tomar consultas relacionadas con la historia de América Latina y reformularlas para hacerlas más precisas, claras y detalladas. Deberías asegurarte de que la nueva consulta sea más específica, completa y comprensible. Responde solo con la consulta reformulada, sin agregar respuestas completas ni contenido adicional. No uses listas, títulos ni formato Markdown.",
        },
        {"role": "user", "content": q},
    ]
    if q in cache:
        return cache[q]

    cache[q] = __rewrite_local(q, messages) if use_local_model else __rewrite_inference(q, messages)
    return cache[q]

In [64]:
from rerankers import Reranker
from collections import defaultdict
ranker = Reranker('ms-marco-MultiBERT-L-12', lang='es', model_type='flashrank')

Loading FlashRankRanker model ms-marco-MultiBERT-L-12 (this message can be suppressed by setting verbose=0)
Loading model FlashRank model ms-marco-MultiBERT-L-12...


In [65]:
def rerank2(query: str, initial: list[int], top_k=10) -> list[int]:
    already_in = set()
    relevant_chunks: list[tuple[str, str]] = []
    for i in initial:
        doc = store.doc_from_chunk_id(i)
        if doc.doc_id in already_in:
            continue

        already_in.add(doc.doc_id)
        relevant_chunks.append((store.chunk_from_id(i), doc.doc_id))
        if len(relevant_chunks) == top_k:
            break
    
    results = ranker.rank(
        query=query, 
        docs=[r[0] for r in relevant_chunks], 
        doc_ids=[r[1] for r in relevant_chunks],
    )
    
    return {r.document.doc_id: 1 for r in results}

In [66]:
def clean_query(query: str) -> str:
    query = query.replace('"', '').replace("'", '')
    query = query.replace("\n", " ").replace("\r", " ")
    query = query.replace("*", "").replace("-", "").replace("~", "")
    query = query.replace("  ", " ").strip()

    doc = nlp(query)
    sentences = [sent.text.strip() for sent in doc.sents]
    return " ".join([sent for sent in sentences if len(sent) > 0 and sent[-1] == '.']).lower()

def search_testing(query: str, top_k: int = 10) -> None:
    query_embedding = model.encode([clean_query(query)], convert_to_tensor=True, prompt_name="query").cpu().numpy()
    _, indices = index.search(query_embedding, k=top_k*4)

    already_in = set()
    results = {}
    for i in indices[0]:
        doc = store.doc_from_chunk_id(i)
        if doc.doc_id in already_in:
            continue

        already_in.add(doc.doc_id)
        results[doc.doc_id] = 1
        if len(results) == top_k:
            break
    return results

def search_rr_testing(query: str, top_k: int = 10) -> None:
    query_embedding = model.encode([clean_query(query)], convert_to_tensor=True, prompt_name="query").cpu().numpy()
    _, indices = index.search(query_embedding, k=top_k*4)
    return rerank2(query, indices[0], top_k=top_k)

In [67]:
# the query must be in spanish
translator = GoogleTranslator(source='en', target='es')

In [None]:
run_dict = {}
for query in dataset.queries_iter():
    translated_q = translator.translate(query.query)
    translated_q = rewrite(translated_q, use_local_model=True)
    run_dict[query.query_id] = search_rr_testing(translated_q, top_k=10)
run_dict

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'history-1': {'f899046212f09f75e15c4530f33af755': 1,
  'f56811a9a9dbb1341edca016a9d25a0d': 1,
  'd97ef39f722f82d0fc18c64dcd1499f7': 1,
  'e2b08483e51636ca4ec533e9b48f3aa7': 1,
  '6e6e2085f0437a2dfb2d1e8071611358': 1,
  'fd8028c23a7f906b8241d71b60e84979': 1,
  'ce9c7a5ce0199e0e04d9e1e093cc73f1': 1,
  'b72a5b8259e5d8fae6bba41858f7939d': 1,
  'd7d052983a3ccd6ad46064de028f5b83': 1,
  '3424253a2f1ba4115edffa88908c4686': 1,
  '11ad73fa6bfb0dd9a28e0b503ec722e7': 1,
  '06e67eeec0cc8914894d4619c89ed19d': 1,
  '056e3bed6d8393d561dbaf0032070338': 1,
  'ac02ba956685efec1b0949aa28d485a7': 1,
  'f97481e87e75eb73bf8df33a06f795a5': 1,
  '7f877f4402cf244f0f7d432866701d29': 1,
  'ea7b5ed8a83af0de343055b2adf77a78': 1,
  'dcf7bd4bbba492113e1955e503df99f6': 1,
  '29fc8c127b0e6c45e32ba0d49b956430': 1,
  'bb133c459fea80c93086e9f360a2337f': 1,
  'f0fe32e5ea533cd6de3a866df2a7ed39': 1,
  '00c4a6c39b7eee6564ce904e40612799': 1,
  '1a94ba1c5c18e4e3e2628aeb2b0d99ad': 1,
  '9b2a216e2b90ebde0efefad8817979e6': 1,
  '

In [89]:
from ranx import Qrels, Run, evaluate

run = Run(run_dict)
run

DictType[unicode_type,DictType[[unichr x 32],float64]<iv=None>]<iv=None>({history-1: {ecba1df90bd8fc041e198d53da49cf6f: 1.0, a8fe344bd62bb6d6747544a006d823b8: 1.0, da6188f64b86b25544aa5d1875859b8d: 1.0, 31fe1491dc09605f026e63905ac7455b: 1.0, 1fcf98666095385a2914804e1d3aff50: 1.0, 8be1e5ff1522cd86fafdc491314c74b4: 1.0, 0abc471f6100a80a9d723a750e7af9e3: 1.0, 54bff948b8fb944fb146448430c89dcf: 1.0, bcfbab42bf4b2fa4f6eca735fe46663d: 1.0, 4764fe8b2daa5876df2e6eb7da2ca235: 1.0, c202fa3a089dfbac841b0faf6377b399: 1.0, abcfe10c4390f89270f30bd2ff4d25ad: 1.0, 682c52b764a079b0f424bfa55a4a1daf: 1.0, d6eb8227ee8cb0aaac1d1fb40faa2740: 1.0, a768e9c0277f0e4dd07c8c0eab2d8623: 1.0, 3a8fe471ccc8ac184b43289f1467f9f8: 1.0, 8d3101501cbb74f632443220e1698ea8: 1.0, 65e66085fad7e166b2ff633ac378d6b7: 1.0, 81f4b892167045b5e8f771f5a80f023e: 1.0, 5131595913afcafc83f3bfcaf8640e30: 1.0, abbd0ef81d64bf468ad45bfdd7402f50: 1.0, 4d1963f941c9ce61be0216dcb6d1075e: 1.0, 409d464445b1249d19afd307797d44fd: 1.0, 7294587846c6163ab

In [90]:
qrels = Qrels.from_ir_datasets("codec/history")
qrels_dict: dict = qrels.to_dict()

for key in qrels_dict.keys():
    for doc_id in qrels_dict[key]:
        if qrels_dict[key][doc_id] > 0:
            qrels_dict[key][doc_id] = 1

qrels = Qrels.from_dict(qrels_dict)
qrels

DictType[unicode_type,DictType[[unichr x 32],int32]<iv=None>]<iv=None>({history-1: {8be1e5ff1522cd86fafdc491314c74b4: 1, ffe1b263fb145329c9caa4ce14ef7ab5: 1, 2839a866e9e366e89c304ce2aed9cd1d: 1, 06510d0aa4b63058d945732f3122193e: 1, 28d6d1b124f6e90fc28653107efcd122: 1, 06e67eeec0cc8914894d4619c89ed19d: 1, 11ad73fa6bfb0dd9a28e0b503ec722e7: 1, 155b8e617dd862be6f055e97e5d63a40: 1, 1fcf98666095385a2914804e1d3aff50: 1, 58c13668d1e89daaf82b04aa4904da00: 1, fb465a1fcce9215419f5c4e8d3e7712e: 1, 9232c038d6788945e24d5f09f9fb6707: 1, 8d3101501cbb74f632443220e1698ea8: 1, a395644c47c42aaf9e165104130bedb9: 1, 81f4b892167045b5e8f771f5a80f023e: 1, 33b3a8ba9386eed4a4b2390619dcd8a9: 1, 9758b99a710f503216dfc800bc8f5a29: 1, 2148d57f5538b0c9b1fab4d17cf710fd: 1, bd1fa635fd4c4029ad7cd329cd668f2a: 1, 675abd47ee2d6237261161232f0a44e8: 1, 409d464445b1249d19afd307797d44fd: 1, f9455927e97c00b12dd018844a313252: 1, f0fe32e5ea533cd6de3a866df2a7ed39: 1, abcfe10c4390f89270f30bd2ff4d25ad: 1, ac02ba956685efec1b0949aa28d4

Con rerank/top10

In [71]:
evaluate(
  qrels,
  run,
  metrics=["mrr", "ndcg", "recall", "precision", "map", "hit_rate"],
)

{'mrr': 0.9285714285714286,
 'ndcg': 0.24127323696877173,
 'recall': 0.12544670057930163,
 'precision': 0.8428571428571429,
 'map': 0.11665979334121253,
 'hit_rate': 0.9285714285714286}

Sin rerank/top10

In [48]:
evaluate(
  qrels,
  run,
  metrics=["mrr", "ndcg", "recall", "precision", "map", "hit_rate"],
)

{'mrr': 0.8928571428571429,
 'ndcg': 0.23922792441360768,
 'recall': 0.12544670057930163,
 'precision': 0.8428571428571429,
 'map': 0.11662989250780893,
 'hit_rate': 0.9285714285714286}