<a href="https://colab.research.google.com/github/jtlagumbay/cebqa/blob/main/retriever/bm_roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **QA Pipeline**

1. ElasticSeach Indexer
2. BM25 Retriever
3. Fine-tuned XLMR Reader


# Dependencies

In [1]:
!pip install elasticsearch transformers torch sentence-transformers


Collecting elasticsearch
  Downloading elasticsearch-8.17.2-py3-none-any.whl.metadata (8.8 kB)
Collecting elastic-transport<9,>=8.15.1 (from elasticsearch)
  Downloading elastic_transport-8.17.0-py3-none-any.whl.metadata (3.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collectin

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import json
import os
import subprocess
import time


# Setting up ElasticSearch

In [6]:
!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.10.1-linux-x86_64.tar.gz
!tar -xzf elasticsearch-7.10.1-linux-x86_64.tar.gz
!chown -R daemon:daemon elasticsearch-7.10.1

--2025-03-11 00:10:53--  https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.10.1-linux-x86_64.tar.gz
Resolving artifacts.elastic.co (artifacts.elastic.co)... 34.120.127.130, 2600:1901:0:1d7::
Connecting to artifacts.elastic.co (artifacts.elastic.co)|34.120.127.130|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 318801277 (304M) [application/x-gzip]
Saving to: ‘elasticsearch-7.10.1-linux-x86_64.tar.gz’


2025-03-11 00:11:11 (16.9 MB/s) - ‘elasticsearch-7.10.1-linux-x86_64.tar.gz’ saved [318801277/318801277]



In [8]:
es_process = subprocess.Popen(
    ['./elasticsearch-7.10.1/bin/elasticsearch'],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    preexec_fn=lambda: os.setuid(1)  # Run as user 'daemon'
)
time.sleep(30)
print("Elasticsearch started!")
es = Elasticsearch("http://localhost:9200")

# Test connection
if es.ping():
    print("Connected to Elasticsearch!")
else:
    print("Could not connect to Elasticsearch.")

Elasticsearch started!
Could not connect to Elasticsearch.


# Indexer

In [4]:
class ElasticSearchIndexer:
    def __init__(self, index_name="superbalita"):
        self.index_name = index_name
        self.es = Elasticsearch("http://localhost:9200")  # Ensure ES is running

    def create_index(self):
        """ Create an index with a text field for BM25 """
        if not self.es.indices.exists(index=self.index_name):
            self.es.indices.create(index=self.index_name, body={
                "settings": {
                    "number_of_shards": 1,
                    "number_of_replicas": 0
                },
                "mappings": {
                    "properties": {
                        "id": {"type": "keyword"},
                        "title": {"type": "text"},
                        "body": {"type": "text"}
                    }
                }
            })
            print(f"Index '{self.index_name}' created.")

    def index_documents(self, documents):
        """ Bulk index documents into ElasticSearch """
        actions = [
            {
                "_index": self.index_name,
                "_id": doc["id"],  # Use document ID for uniqueness
                "_source": {
                    "id": doc["id"],
                    "title": doc["title"],
                    "body": doc["body"]
                }
            }
            for doc in documents
        ]
        bulk(self.es, actions)
        print(f"Indexed {len(documents)} documents.")



In [5]:
# Sample usage
indexer = ElasticSearchIndexer()
indexer.create_index()
documents = [
    {"id": "1", "title": "Eiffel Tower", "body": "The Eiffel Tower is in Paris."},
    {"id": "2", "title": "Python Language", "body": "Python is a programming language."},
    {"id": "3", "title": "Tokyo", "body": "The capital of Japan is Tokyo."}
]
indexer.index_documents(documents)

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connection.py", line 198, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
  File "/usr/local/lib/python3.11/dist-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/elastic_transport/_node/_http_urllib3.py", line 167, in perform_request
    response = self.pool.urlopen(
               ^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connectionpool.py", line 841, in urlopen
    retries = retries.increment(
              ^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/pyth

ConnectionError: Connection error caused by: ConnectionError(Connection error caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x7f00463394d0>: Failed to establish a new connection: [Errno 111] Connection refused))