<a href="https://colab.research.google.com/github/jtlagumbay/cebqa/blob/main/retriever/bm_roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **QA Pipeline**

1. ElasticSeach Indexer
2. BM25 Retriever
3. Fine-tuned XLMR Reader


# Dependencies

In [1]:
pip install elasticsearch transformers datasets evaluate rank_bm25 nltk fuzzywuzzy sentence_transformers


Collecting elasticsearch
  Downloading elasticsearch-9.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting elastic-transport<9,>=8.15.1 (from elasticsearch)
  Downloading elastic_transport-8.17.1-py3-none-any.whl.metadata (3.8 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=202

In [None]:
# pip install --upgrade --no-cache-dir numpy==1.26.4

In [2]:
pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m72.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import json
import os
import subprocess
import time
import pandas as pd
import requests
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer, XLMRobertaTokenizerFast, DPRQuestionEncoder, DPRQuestionEncoderTokenizer, DPRContextEncoder, DPRContextEncoderTokenizer
from datasets import Dataset, load_dataset
import re
from evaluate import load
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
from fuzzywuzzy import fuzz
import nltk
import random
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch
from nltk.tokenize import sent_tokenize
import faiss
import unicodedata

nltk.download('punkt')
nltk.download('punkt_tab')
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
CEBQA_DATASET = "jhoannarica/cebquad_split"
SUPERBALITA_DATASET = "jhoannarica/superbalita_split"
ELASTIC_URL = "https://tender-separately-mudfish.ngrok-free.app"
# ELASTIC_URL = "http://localhost:9200"

DRIVE_ROOT = "/content/drive/My Drive"
# DRIVE_ROOT = "/Users/jhoannaricalagumbay/Library/CloudStorage/GoogleDrive-jtlagumbay@up.edu.ph/My Drive"

READER_FOLDER = "/UP Files/IV - 2nd sem/CMSC 198.1/cebqa_roberta/new-split/xlmr_body-filtered/2025-04-04_05-13"
DPR_FOLDER = "/UP Files/IV - 2nd sem/CMSC 198.1/cebqa_roberta/dpr/2025-04-16_03-37"


CURRENT_MODEL = DRIVE_ROOT + READER_FOLDER + "/model"
CURRENT_TOKENIZER = DRIVE_ROOT + READER_FOLDER + "/tokenizer"
INDEX_NAME = "superbalita"
K = 10
BM25 = "bm25"
FAISS = "faiss"

CEBQA_DPR_MODEL = DRIVE_ROOT + DPR_FOLDER + "/model"
CEBQA_DPR_TOKENIZER = DRIVE_ROOT + DPR_FOLDER + "/tokenizer"
DPR_CONTEXT_ENCODER = "voidful/dpr-ctx_encoder-bert-base-multilingual"

# Indexer

Start ElasticSearch Locally:
1. Start ES docker
2. Start NGROK: `ngrok http --url=tender-separately-mudfish.ngrok-free.app 9200`


In [None]:
headers = {
    "Origin": "https://colab.research.google.com",
     "Content-Type": "application/json",
    "Accept": "application/json"
}

response = requests.options(ELASTIC_URL, headers=headers)
print(response.headers)


In [16]:
es = Elasticsearch([ELASTIC_URL], verify_certs=False, headers=headers, request_timeout=30)
print(es.info())
# try:
#     print(es.transport.perform_request('GET', '/'))
# except Exception as e:
#     print("Error:", e)

  _transport = transport_class(


BadRequestError: BadRequestError(400, 'media_type_header_exception', 'Invalid media-type value on headers [Accept, Content-Type]')

In [6]:
class ElasticSearchIndexer:
    def __init__(self, index_name=INDEX_NAME):
        self.index_name = index_name
        self.es = Elasticsearch(ELASTIC_URL)  # Ensure ES is running
        print(f"Initiating ESIndexer {self.index_name}")

    def create_index(self):
        """ Create an index with a text field for BM25 """
        if not self.es.indices.exists(index=self.index_name):
            self.es.indices.create(index=self.index_name, body={
                "settings": {
                    "number_of_shards": 1,
                    "number_of_replicas": 0
                },
                "mappings": {
                    "properties": {
                        "id": {"type": "keyword"},
                        "title": {"type": "text"},
                        "body": {"type": "text"}
                    }
                }
            })
            print(f"Index '{self.index_name}' created.")

    def index_documents(self, documents):
        """ Bulk index documents into ElasticSearch """
        actions = [
            {
                "_index": self.index_name,
                "_id": doc["id"],  # Use document ID for uniqueness
                "_source": {
                    "id": doc["id"],
                    "title": doc["pseudonymized_title"],
                    "body": doc["pseudonymized_body"]
                }
            }
            for doc in documents
        ]
        bulk(self.es, actions)
        print(f"Indexed {len(documents)} documents.")

    def index_from_csv(self, file_path):
        df = pd.read_csv(file_path)
        documents = df.to_dict(orient="records")  # Convert DataFrame to a list of dicts
        self.index_documents(documents)

    def index_from_huggingface(self, dataset = SUPERBALITA_DATASET):
        dataset_obj = load_dataset(dataset)

        # Create the dataset_dict from the loaded dataset object's splits
        dataset_dict = {
            split_name: split_dataset
            for split_name, split_dataset in dataset_obj.items()
        }

        all_documents = []
        for split_name, split_dataset in dataset_dict.items():
            documents = split_dataset.to_list()
            all_documents.extend(documents)

        self.index_documents(all_documents)




In [None]:
# # Sample usage
# indexer = ElasticSearchIndexer()
# indexer.create_index()
# indexer.index_from_csv("/Users/jhoannaricalagumbay/School/cebqa/dataset/articles_202503120405_author_removed_fixed.csv")

# BM25

In [7]:
class BM25Retriever:
    def __init__(self, index_name = INDEX_NAME):
        print(f"Initiating retriever with index_name: {INDEX_NAME}")
        self.indexer = ElasticSearchIndexer(index_name = INDEX_NAME)
        self.indexer.create_index()
        self.indexer.index_from_huggingface()

    def retrieve(self, query, top_k=3):
        """ Retrieve top-k relevant documents using BM25 """
        print(f"retrieving {top_k} docs for [{query}]")
        response = self.indexer.es.search(index=self.indexer.index_name, body={
            "query": {
                "match": {
                    "body": query
                }
            },
            "size": top_k
        })
        return [hit["_source"] for hit in response["hits"]["hits"]]

    def retrieve_batch(self, queries, top_k=3):
        print(f"Retrieve Batch for {len(queries)} queries")
        """ Retrieve top-k relevant documents for multiple queries using BM25 in batch mode """
        if not isinstance(queries, list):
            raise ValueError("queries should be a list of strings")

        # Multi-search request body
        request_body = ""
        for query in queries:
            safe_question = json.dumps(query)
            request_body += f'{{"index": "{self.indexer.index_name}"}}\n'  # Metadata
            request_body += f'{{"query": {{"match": {{"body": {safe_question}}}}}, "size": {top_k}}}\n'  # Query

        # Send multi-search request
        response = self.indexer.es.msearch(body=request_body)

        # Extract results
        results = []
        for query_response in response["responses"]:
            retrieved_docs = [hit["_source"] for hit in query_response["hits"]["hits"]]
            results.append(retrieved_docs)

        return results  # List of lists, where each sublist contains retrieved documents for a query

    def retrieve_batch_query_dict(self, queries_list, top_k=3):
        print(f"Retrieve Batch Dict for {len(queries_list)} queries")

        """ Retrieve top-k relevant documents for multiple queries using BM25 in batch mode.

        Args:
            queries_list (list): A list of dictionaries, each containing 'id' and 'question'.
            top_k (int): Number of top relevant documents to retrieve per query.

        Returns:
            dict: A dictionary where keys are query IDs and values are lists of retrieved documents.
        """
        if not isinstance(queries_list, list) or not all(isinstance(q, dict) and 'id' in q and 'question' in q for q in queries_list):
            raise ValueError("queries_list should be a list of dictionaries with 'id' and 'question' keys")

        # Multi-search request body
        request_body = ""
        query_ids = []  # To track IDs in order
        for query in queries_list:
            safe_question = json.dumps(query["question"])
            query_ids.append(query["id"])
            request_body += f'{{"index": "{self.index_name}"}}\n'  # Metadata
            request_body += f'{{"query": {{"match": {{"body": {safe_question}}}}}, "size": {top_k}}}\n'  # Query

        # Send multi-search request
        response = self.indexer.es.msearch(body=request_body)

        # Extract results and associate with query IDs
        results = []
        for i, query_response in enumerate(response["responses"]):
            retrieved_docs = [hit["_source"] for hit in query_response["hits"]["hits"]]
            results.append({
                "query_id": str(query_ids[i]),
                "top_docs": retrieved_docs
            })

        return results  # Dictionary format: {id: retrieved_docs}



In [8]:
# Sample usage
retriever = BM25Retriever()
query = ['Unsa ang giingon ni Gobernador Abalayan nga mabuhat ra "with a united country"?']
top_docs = retriever.retrieve_batch(query)
print("Retrieved Documents:", top_docs)

Initiating retriever with index_name: superbalita
Initiating ESIndexer superbalita


BadRequestError: BadRequestError(400, 'None')

# FAISS Indexer

In [8]:
class FAISSIndexer:
    def __init__(self, index_file="faiss_index.idx", model_name="sentence-transformers/all-MiniLM-L6-v2", use_fine_tuned = False):
        self.index_file = index_file
        self.model = SentenceTransformer(model_name)
        print("Loading DPR multilingual context encoder...")
        self.dpr_tokenizer = DPRContextEncoderTokenizer.from_pretrained(DPR_CONTEXT_ENCODER)
        self.dpr_model = DPRContextEncoder.from_pretrained(DPR_CONTEXT_ENCODER)
        self.use_fine_tuned = use_fine_tuned
        self.index = None
        self.documents = []  # Store original text
        # self.index_from_csv()
        self.index_from_huggingface()
        print("FAISS Indexer initialized.")


    def encode_contexts_with_dpr(self):
        """Encode contexts using DPR multilingual context encoder and re-index."""
        # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        device = torch.device("cpu")
        self.dpr_model.to(device)
        self.dpr_model.eval()

        with torch.no_grad():
            inputs = self.dpr_tokenizer(
                self.documents,
                return_tensors="pt",
                truncation=True,
                padding="max_length",
                max_length=512
            ).to(device)

            embeddings = self.dpr_model(**inputs).pooler_output.cpu().numpy()

            if not embeddings.flags['C_CONTIGUOUS']:
                embeddings = np.ascontiguousarray(embeddings)

            faiss.normalize_L2(embeddings)
            return embeddings

    def create_index(self, d):
        """Create a new FAISS index."""
        self.index = faiss.IndexFlatL2(d)
        print(f"Created FAISS index with dimension {d}.")

    def index_documents(self, documents):
        """Index documents into FAISS."""
        self.documents = [doc['pseudonymized_body'] for doc in documents]
        self.article_ids = [doc['id'] for doc in documents]
        self.titles = [doc['pseudonymized_title'] for doc in documents]

        if self.use_fine_tuned:
            embeddings = self.encode_contexts_with_dpr()
        else:
            embeddings = self.model.encode(self.documents, convert_to_numpy=True)

        d = embeddings.shape[1]

        if self.index is None:
            self.create_index(d)

        self.index.add(embeddings)
        print(f"Indexed {len(documents)} documents into FAISS.")
        self.save_index()

    def index_from_csv(self, file_path):
        """Load documents from a CSV file and index them."""
        df = pd.read_csv(file_path)
        documents = df.to_dict(orient="records")
        self.index_documents(documents)

    def index_from_huggingface(self, dataset = SUPERBALITA_DATASET):
        dataset_obj = load_dataset(dataset)

        # Create the dataset_dict from the loaded dataset object's splits
        dataset_dict = {
            split_name: split_dataset
            for split_name, split_dataset in dataset_obj.items()
        }
        all_documents = []
        for split_name, split_dataset in dataset_dict.items():
            documents = split_dataset.to_list()
            all_documents.extend(documents)

        self.index_documents(all_documents)

    def save_index(self):
        """Save FAISS index to disk."""
        faiss.write_index(self.index, self.index_file)
        print(f"FAISS index saved to {self.index_file}.")

    def load_index(self):
        """Load FAISS index from disk."""
        self.index = faiss.read_index(self.index_file)
        print(f"FAISS index loaded from {self.index_file}.")



In [9]:
class FAISSRetriever:
    def __init__(
            self,
            q_encoder = DPRQuestionEncoder.from_pretrained(CEBQA_DPR_MODEL),
            q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(CEBQA_DPR_TOKENIZER),
            index_file="faiss_index.idx",
            top_k=3,
            # device="cuda" if torch.cuda.is_available() else "cpu",
            device = "cpu",
            use_fine_tuned = False
        ):
        print("Initializing FAISS Retriever")
        self.indexer = FAISSIndexer(index_file=index_file, use_fine_tuned=use_fine_tuned)
        self.top_k = top_k
        self.use_fine_tuned = use_fine_tuned
        self.q_encoder = q_encoder
        self.q_tokenizer = q_tokenizer
        self.device = device

        # Move model to the appropriate device
        self.q_encoder.to(device)
        self.q_encoder.eval()

    def encode_query(self, query):
        """Encode the query using the fine-tuned question encoder."""
        with torch.no_grad():
            inputs = self.q_tokenizer(
                query,
                return_tensors="pt",
                padding="max_length",
                truncation=True,
                max_length=64
            ).to(self.device)

            embeddings = self.q_encoder(**inputs).pooler_output.cpu().numpy()

            # Ensure embeddings are C-contiguous for FAISS
            if not embeddings.flags['C_CONTIGUOUS']:
                embeddings = np.ascontiguousarray(embeddings)

            # Normalize embeddings for cosine similarity
            faiss.normalize_L2(embeddings)

            return embeddings

    def retrieve(self, query):
        """Retrieve top-k relevant documents using FAISS."""
        if self.use_fine_tuned:
            query_embedding = self.encode_query([query])
        else:
            query_embedding = self.indexer.model.encode([query], convert_to_numpy=True)

        D, I = self.indexer.index.search(query_embedding, self.top_k)
        print(D, I)
        results = [
            {
                "rank": rank + 1,
                "score": float(D[0][rank]),
                "id": self.indexer.article_ids[idx],
                "title": self.indexer.titles[idx],
                "body": self.indexer.documents[idx],
            }
            for rank, idx in enumerate(I[0]) if idx < len(self.indexer.documents)
        ]
        return results

    def retrieve_batch(self, queries):

        print(f"processing {len(queries)}")
        """Retrieve top-k relevant documents for multiple queries."""
        questions = [query["question"] for query in queries]
        if self.use_fine_tuned:
            query_embeddings = self.encode_query([query])
        else:
            query_embeddings = self.indexer.model.encode(questions, convert_to_numpy=True)
        D, I = self.indexer.index.search(query_embeddings, self.top_k)
        print(f"done {len(D)}")
        results = []
        for query_idx, query in enumerate(queries):
            print(f"query idx: {query_idx}")
            retrieved_docs = [
                {
                    "rank": rank + 1,
                    "score": float(D[query_idx][rank]),
                    "text": self.indexer.documents[idx]
                }
                for rank, idx in enumerate(I[query_idx]) if idx < len(self.indexer.documents)
            ]
            results.append({"query": query, "top_docs": retrieved_docs})
        return results


In [None]:
# # Initialize the FAISS indexer
# indexer = FAISSIndexer(index_file="faiss_index.idx")

# Index documents from a CSV file
# # Save the FAISS index for later use
# indexer.save_index()

# indexer = FAISSIndexer(index_file="faiss_index.idx")
# indexer.load_index()

In [None]:
# Initialize the retriever with the loaded indexer
# retriever = FAISSRetriever(index_file="faiss_index.idx", top_k=K)

# # Retrieve relevant documents for a single query
# query = "kanus-a ang palarong pambansa??"
# results = retriever.retrieve(query)

# # Print retrieved documents
# print(results)


# Retrieve relevant documents for a single query
# query = [{"question": "kanus-a ang palarong pambansa?"}, {"question":"Kinsa ang hepe sa Cebu Police?"}]
# results = retriever.retrieve_batch(query)

# # Print retrieved documents
# for res in results:
#     print(res)


# Reader

In [10]:
class Reader:
    def __init__(
        self,
        model_path = CURRENT_MODEL,
        tokenizer_path = CURRENT_TOKENIZER
      ):
        print(f"Initiating reader with model: {model_path}")
        model_best = AutoModelForQuestionAnswering.from_pretrained(model_path)
        tokenizer_best = AutoTokenizer.from_pretrained(tokenizer_path)

        # device = torch.device("mps")
        # device="cuda" if torch.cuda.is_available() else "cpu"
        device = "cpu"
        self.qa_pipeline = pipeline(
            "question-answering",
            model=model_best,
            tokenizer=tokenizer_best,
            device=device
            )

    def extract_answer_batch(self, queries_list, top_docs):
        print(f"Extracting batch answer for {len(queries_list)} queries")
        qa_dataset = Dataset.from_dict({
          "question": [queries_list["question"] for doc in top_docs['top_docs']] ,
          "context": [doc['body'] for doc in top_docs['top_docs']]
        })

        return self.qa_pipeline(qa_dataset)

    def extract_answer(self, question, documents, num_chunks = 1, overlap = 0.3):
        print(f"extracting answer for {question}")
        """ Find the best answer from retrieved documents while keeping metadata """
        best_result = None
        best_score = 0

        for doc in documents:
            if num_chunks == 1:
                contexts = [doc["body"]]
            else:
                contexts = self.chunk_text(doc["body"],  num_chunks, overlap)

            for context in contexts:
            #   print(question)
            #   print(context)
              result = self.qa_pipeline(question=question, context = context)
              if result["score"] > best_score:
                  best_result = {
                      "article_id": doc["id"],
                      "title": doc["title"],
                      "body": doc["body"],
                      "answer": result["answer"],
                      "score": result["score"]
                  }
                  best_score = result["score"]

        return best_result

    def chunk_text(self, text, chunk_size=3, overlap=0.5):
        sentences = sent_tokenize(text)  # Tokenize text into sentences
        step = int(chunk_size * (1 - overlap))  # Overlapping step

        chunks = []
        for i in range(0, len(sentences), step):
            chunk = sentences[i:i + chunk_size]
            if not chunk:
                continue
            chunks.append(" ".join(chunk))

        return chunks

# QA Pipeline

In [11]:
class QA:
    def __init__(
        self,
        model_path = CURRENT_MODEL,
        tokenizer_path = CURRENT_TOKENIZER,
        dataset = CEBQA_DATASET,
        indexer_type = BM25,
        index_name = INDEX_NAME,
        k = K,
        sample = None,
        isRandom = False,
        overlap = 0.0,
        num_chunks = 1,
        use_fine_tuned = False
      ):
        reader = Reader(model_path=model_path, tokenizer_path=tokenizer_path)

        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.reader = reader
        self.tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")
        test_dataset = load_dataset(dataset)["test"]
        self.dataset = test_dataset.filter(self.filter_incomplete_examples) \
            .map(self.normalize_row, batched=True) \
            .map(self.tokenize_train_function, batched=True)\
            .filter(self.decode_error)
        self.sentence_transformer = SentenceTransformer("all-MiniLM-L6-v2")
        self.k = k
        self.overlap = overlap
        self.num_chunks = num_chunks
        self.sample = sample
        self.isRandom = isRandom
        self.index_name = index_name
        self.indexer_type = indexer_type

        if sample is not None and isRandom:
            indices = random.sample(range(len(self.dataset)), sample)
            self.dataset = self.dataset.select(indices)
        elif sample is not None and not isRandom:
            self.dataset = self.dataset.select(range(sample))

        print(f"Initiating QA Pipeline.")
        print(f"QA model {self.model_path}")
        print(f"QA tokenizer {self.tokenizer_path}")
        print(f"QA reader {self.reader}")
        print(f"QA dataset {len(self.dataset)}")
        print(f"QA k {self.k}")
        print(f"QA overlap {self.overlap}")
        print(f"QA num_chunks {self.num_chunks}")
        print(f"QA sample {self.sample}")
        print(f"QA isRandom {self.isRandom}")
        print(f"QA index_name {self.index_name}")
        print(f"QA indexer {self.index_name}")
        self.queries = [
            {
                "id": item['id'],
                "article_id": item['article_id'],
                "question": item['question'],
                "context": {
                    "text": item['context'],
                    "start": item['context_start']
                },
                "answer": {
                    "text": item['answer'],
                    "start": item['answer_start']
                }
            }
            for item in self.dataset
        ]

        if indexer_type == BM25:
            self.retriever = BM25Retriever(index_name=index_name)
            self.run_top_docs_batch_bm25()
        else:
            self.retriever = FAISSRetriever(top_k=self.k, use_fine_tuned = use_fine_tuned)
            self.run_top_docs_batch_faiss()
        print(f"QA retriever {self.retriever}")


    def run_top_docs_batch_bm25(self):
        self.top_docs = self.retriever.retrieve_batch_query_dict(
            queries_list = self.queries,
            top_k=self.k
        )

        return self.top_docs

    def run_top_docs_batch_faiss(self):
        docs = []
        for item in self.dataset:
            result = self.retriever.retrieve(item["question"])
            doc = {
                "query_id": item["id"],
                "top_docs": result
            }
            docs.append(doc)

        self.top_docs = docs
        return self.top_docs

    def run(self):
        start_time = time.time()
        date_now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))
        print(f"QA run for {self.model_path} on {date_now}")

        results = []

        for index, query in enumerate(self.queries):
          print(f"{index} processing {query['id']}")
          docs = self.top_docs[index]['top_docs']
        #   print([f"{doc}\n" for doc in docs])
          answer = self.reader.extract_answer(
              question = query["question"],
              documents = docs,
              num_chunks = self.num_chunks,
              overlap= self.overlap
          )
        #   print(query["answer"]["text"])
        #   print(answer["answer"])
          result = query
          result["pred"] = answer
          result["top_docs"] = docs
          results.append(result)

        self.results = results

        end_time = time.time()
        self.stats ={
            'run_time': end_time - start_time
        }
        return self.results

    def normalize_row(self, examples):
        examples["context"] = [unicodedata.normalize("NFKC", context) for context in examples["context"]]

        examples["article_body"] = [unicodedata.normalize("NFKC", body) for body in examples["article_body"]]

        examples["answer"] =  [unicodedata.normalize("NFKC", answer) for answer in examples["answer"]]

        examples["question"] = [unicodedata.normalize("NFKC", q) for q in examples["question"]]

        return examples

    def normalize_text(self, text):
        """Lowercase and remove punctuation, articles, and extra whitespace."""
        text = text.lower()
        text = re.sub(r'\W+', ' ', text)  # Remove punctuation and special characters
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
        return text

    def compute_similarity(self, text1, text2):
        """Compute cosine similarity between two texts using Sentence Transformers."""
        emb1 = self.sentence_transformer.encode(text1, convert_to_tensor=True)
        emb2 = self.sentence_transformer.encode(text2, convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(emb1, emb2).item()  # Convert tensor to float
        return similarity

    def evaluate_batch(self):
        pass

    def evaluate_retriever(self):
        wrong_doc = []
        for index, query in enumerate(self.queries):
            top_doc = self.top_docs[index]["top_docs"]
            if not any(doc["id"] == query["article_id"] for doc in top_doc):
                wrong_doc.append(query["article_id"])

        return wrong_doc

    def compute_retrieval_metrics(self):
        metrics = {
            "hits@1": 0,
            "hits@3": 0,
            "hits@5": 0,
            "hits@10": 0,
            "mrr": 0.0
        }

        total = len(self.queries)

        for index, query in enumerate(self.queries):
            correct_id = query["article_id"]
            docs = self.top_docs[index]["top_docs"]  # Ranked list of dicts with 'id'

            found = False

            for rank, doc in enumerate(docs):
                                    # rank is 0-based, so add 1
                print(rank, doc)

                r = rank + 1
                if doc["id"] == correct_id:
                    if r <= 1: metrics["hits@1"] += 1
                    if r <= 3: metrics["hits@3"] += 1
                    if r <= 5: metrics["hits@5"] += 1
                    if r <= 10: metrics["hits@10"] += 1
                    metrics["mrr"] += 1 / r
                    found = True
                    break  # stop checking once found


            if not found:
                metrics["mrr"] += 0.0  # optional, for clarity

        # Average over total queries
        for k in ["hits@1", "hits@3", "hits@5", "hits@10"]:
            metrics[k] = metrics[k] / total
        metrics["mrr"] = metrics["mrr"] / total

        return metrics

    def evaluate(self):
        print(f"QA evaluate for {len(self.results)} results on {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}")
        pred = [
          {
              'id': result['id'],  # Convert ID to string
              'prediction_text': self.normalize_text(result['pred']['answer'])
          }
          for result in self.results
        ]

        ref = [
            {
                'id': item['id'],  # Convert ID to string
                'answers': {
                    'text': [self.normalize_text(item['answer']['text'])],
                    'answer_start': [item['answer']['start']]
                }
            }
            for item in self.results
        ]

        # Load SQuAD metric
        metric = load("squad")

        # Compute metric
        res = metric.compute(predictions=pred, references=ref)
        sentence_match_scores = [
            p['prediction_text'] in r['answers']['text'][0] for p, r in zip(pred, ref)
        ]

        # Compute average sentence match score
        avg_sentence_match = np.mean(sentence_match_scores)

        # Combine results
        res["sentence_match"] = float(avg_sentence_match ) * 100
        print(res)

        self.config = {
            'model_path': self.model_path,
            'tokenizer_path': self.tokenizer_path,
            'k': self.k,
            'sample': self.sample,
            'isRandom': self.isRandom,
            'overlap': self.overlap,
            'num_chunks': self.num_chunks,
            'indexer_type': self.indexer_type
        }
        self.eval_res = res

        return self.eval_res, self.config, self.stats

    def filter_incomplete_examples(self, example):
        # Ensure both "question" and "context" exist and are non-empty
        return "question" in example and example["question"] and \
            "article_body" in example and example["answer"]

    def filter_by_token_length(self, example):
        # Tokenize the concatenated question + article_body
        tokens = self.tokenizer(example["question"], example["article_body"], truncation=False)
        return len(tokens["input_ids"]) <= 512

    def decode_error(self, example):
        input_ids = example["input_ids"]
        start_positions = example["start_positions"]
        end_positions = example["end_positions"]
        predict_answer_tokens = input_ids[start_positions : end_positions+1]
        return self.tokenizer.decode(predict_answer_tokens) == example["answer"]

    def tokenize_train_function(self, examples):
        article_text = [article for article in examples.get("article_body", [""])]
        context_text = [context for context in examples.get("context", ["{}"])]
        answer_text = examples.get("answer", [""])
        answer_start = examples.get("answer_start", [0])
        context_start_list = examples.get("context_start", [0])
        question_text = [q for q in examples.get("question", [""])]
        start_positions = []
        end_positions = []

        inputs = self.tokenizer(
            question_text,
            article_text,
            truncation="only_second",  # Truncate only the context
            max_length=512,            # Limit input length
            stride=128,                # Add a sliding window
            return_overflowing_tokens=False,  # Handle long contexts
            return_offsets_mapping=True,
            padding="max_length"
        )

        offset_mapping = inputs.pop("offset_mapping")
        # sample_map = inputs.pop("overflow_to_sample_mapping")

        for i, offset in enumerate(offset_mapping):
            answer = answer_text[i]
            context = context_text[i]
            article = article_text[i]
            start_char = int(context_start_list[i]) + int(answer_start[i])
            end_char = start_char + len(answer)


            sequence_ids = inputs.sequence_ids(i)

            # Find the start and end of the context
            idx = 0
            while sequence_ids[idx] != 1:
                idx += 1
            context_start = idx
            while sequence_ids[idx] == 1:
                idx += 1
            context_end = idx - 1

            # If the answer is not fully inside the context, label is (0, 0)
            if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
                start_positions.append(0)
                end_positions.append(0)
            else:
                # Otherwise it's the start and end token positions
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start_positions.append(idx - 1)

                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end_positions.append(idx + 1)

        inputs["start_positions"] = start_positions
        inputs["end_positions"] = end_positions


        return inputs


# QA - BM25

In [12]:
print(np.__version__)  # Check if NumPy is available
print(torch.randn(1).numpy())


2.0.2
[-0.42630348]


In [13]:
qa_bm25 = QA(
    model_path=CURRENT_MODEL,
    k = 100, overlap=0.0, num_chunks=1)
wrong = qa_bm25.evaluate_retriever()
metrics = qa_bm25.compute_retrieval_metrics()
print(len(wrong))
print(metrics)

Initiating reader with model: /content/drive/My Drive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa_roberta/new-split/xlmr_body-filtered/2025-04-04_05-13/model


KeyboardInterrupt: 

In [None]:
qa_bm25.run()
qa_bm25.evaluate()


# QA - FAISS

In [14]:
qa_faiss = QA(
    sample=100,
    model_path=CURRENT_MODEL,
    k = 10, overlap=0.0, num_chunks=1,
    indexer_type=FAISS,
    use_fine_tuned=True)
wrong_faiss = qa_faiss.evaluate_retriever()
metrics = qa_faiss.compute_retrieval_metrics()
print(len(wrong_faiss))
print(metrics)

Initiating reader with model: /content/drive/My Drive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa_roberta/new-split/xlmr_body-filtered/2025-04-04_05-13/model


Device set to use cpu
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/533 [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

validation.csv:   0%|          | 0.00/6.10M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/11.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19300 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5597 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5597 [00:00<?, ? examples/s]

Map:   0%|          | 0/5596 [00:00<?, ? examples/s]

Map:   0%|          | 0/5596 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5596 [00:00<?, ? examples/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Initiating QA Pipeline.
QA model /content/drive/My Drive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa_roberta/new-split/xlmr_body-filtered/2025-04-04_05-13/model
QA tokenizer /content/drive/My Drive/UP Files/IV - 2nd sem/CMSC 198.1/cebqa_roberta/new-split/xlmr_body-filtered/2025-04-04_05-13/tokenizer
QA reader <__main__.Reader object at 0x7e2211114d90>
QA dataset 100
QA k 10
QA overlap 0.0
QA num_chunks 1
QA sample 100
QA isRandom False
QA index_name superbalita
QA indexer superbalita
Initializing FAISS Retriever
Loading DPR multilingual context encoder...


tokenizer_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


pytorch_model.bin:   0%|          | 0.00/712M [00:00<?, ?B/s]

Some weights of the model checkpoint at voidful/dpr-ctx_encoder-bert-base-multilingual were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


README.md:   0%|          | 0.00/364 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/2.39M [00:00<?, ?B/s]

validation.csv:   0%|          | 0.00/346k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/663k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1190 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/169 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/341 [00:00<?, ? examples/s]

Created FAISS index with dimension 768.
Indexed 1700 documents into FAISS.
FAISS index saved to faiss_index.idx.
FAISS Indexer initialized.
[[1.4406872 1.4552516 1.4680524 1.477099  1.48803   1.4894395 1.498502
  1.5018615 1.507806  1.5148   ]] [[ 399  772 1380 1493 1581 1361  745  636 1194 1528]]
[[1.509584  1.551236  1.563184  1.5669134 1.5987114 1.6072872 1.6077955
  1.6080596 1.6154044 1.6154988]] [[ 399 1528 1380  412  368 1538  219 1361  827  977]]
[[1.3842777 1.4149517 1.4279404 1.4341114 1.434721  1.4352542 1.4376818
  1.4416026 1.4440646 1.4456697]] [[1026 1576 1584 1608   35  537 1226 1306  219  280]]
[[1.7475703 1.7692494 1.7850958 1.7897801 1.7965312 1.8002459 1.8013752
  1.8054103 1.807054  1.820726 ]] [[1361  535  276 1129 1310  373 1640  347  566  413]]
[[1.5113903 1.5146412 1.5157156 1.5191927 1.5270267 1.531431  1.5449325
  1.5620288 1.5621692 1.5654838]] [[1576  834  536 1361 1584 1026  445  581 1496   35]]
[[1.7087901 1.7642617 1.7706661 1.7772131 1.7901918 1.7972755

In [None]:
qa_faiss.run()
qa_faiss.evaluate()