# Build Stage

## Setup environment

In [None]:
# Download the three dataset files with gdown. Per the recorded output they are
# saved, in this order, as /content/corpus.csv, /content/public_test.csv and
# /content/train.csv.
!gdown --fuzzy https://drive.google.com/file/d/1r-n0w3yIAahy-MIiT4klvtJ8WAl4ONwU/view?usp=drive_link
!gdown --fuzzy https://drive.google.com/file/d/1fPh2CT6eKCc6PirkIuqESmp6jDIXXbht/view?usp=drive_link
!gdown --fuzzy https://drive.google.com/file/d/17x7am_JGRXfiwNO68mv3yOCpbLK-7IDA/view?usp=drive_link

Downloading...
From (original): https://drive.google.com/uc?id=1r-n0w3yIAahy-MIiT4klvtJ8WAl4ONwU
From (redirected): https://drive.google.com/uc?id=1r-n0w3yIAahy-MIiT4klvtJ8WAl4ONwU&confirm=t&uuid=da37929d-3cea-42be-ae5d-4b092ba0f2fd
To: /content/corpus.csv
100% 381M/381M [00:07<00:00, 48.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1fPh2CT6eKCc6PirkIuqESmp6jDIXXbht
To: /content/public_test.csv
100% 1.30M/1.30M [00:00<00:00, 111MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=17x7am_JGRXfiwNO68mv3yOCpbLK-7IDA
From (redirected): https://drive.google.com/uc?id=17x7am_JGRXfiwNO68mv3yOCpbLK-7IDA&confirm=t&uuid=7bb8d5a7-9e93-4eb5-8889-9243f4923b71
To: /content/train.csv
100% 185M/185M [00:03<00:00, 59.4MB/s]


In [None]:
# Install the third-party packages used below (quiet mode).
# NOTE(review): versions are not pinned, so a fresh runtime may resolve different
# releases than the ones this notebook was written against — consider pinning.
!pip install -q pymilvus rerankers openai

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/201.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.1/201.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.1/41.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.5/49.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import ast
import random
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any

import numpy as np
import pandas as pd

import torch
import pymilvus
from pymilvus import MilvusClient

from transformers import AutoTokenizer, AutoModel
import rerankers

In [None]:
# ===== CORE MODEL CONFIG ======
@dataclass
class Config:
    """Base configuration shared by the model configs.

    Attributes:
        device: Torch device string. The default is computed once, when the
            class body is executed: "cuda" if CUDA is available, else "cpu".
    """

    device: str = "cpu" if not torch.cuda.is_available() else "cuda"


@dataclass
class EmbeddingConfig(Config):
    """Configuration for the sentence-embedding model (inherits ``device`` from Config)."""

    # Model identifier that EmbeddingModel passes to
    # AutoModel.from_pretrained / AutoTokenizer.from_pretrained.
    model_name: str = "keepitreal/vietnamese-sbert"


# @dataclass
# class LanguageConfig(Config):
#     model_name: str
#     temperature: float = 0.8
#     top_k: int = 10


# @dataclass
# class RerankerConfig(Config):
#     model_name: str


@dataclass
class DatasetConfig:
    data_root: str = "/content"
    corpus_dir: str = f"{data_root}/corpus.csv"
    train_dir: str = f"{data_root}/train.csv"
    vector_src_dir: str = f"{data_root}/vector_db_src.csv"
    public_test_dir: str = f"{data_root}/public_test.csv"


# ===== DATA SETUP CONFIG =====
@dataclass
class MilvusDBConfig:
    data_root: str = "/content"

    db_name: str = (
        f"{data_root}/bkai_milvus.db"  # Change this to place the db to where you want
    )
    collection_name: str = "bkai_vectordb"
    limit: int = 30  # This is top_k results
    output_fields: list = field(default_factory=lambda: ["question", "context", "cid"])
    metric_type: str = (
        "COSINE"  # Possible values are IP, L2, COSINE, JACCARD, and HAMMING
    )

    # More details at: https://milvus.io/api-reference/pymilvus/v2.4.x/MilvusClient/Vector/search.md
    params: dict = field(default_factory=lambda: {})

    # Dataset config: this part requires dataset's EDA.
    dimension: int = 768  # Length of the embedding vector (1, embed_len)
    primary_field_name: str = "id"
    id_type: str = "int"
    vector_field_name: str = "embeddings"
    auto_id: bool = False

In [None]:
# Read API keys/tokens from Colab's Secrets store via google.colab.userdata.
# NOTE(review): this import only resolves on Colab, and none of the four names
# below is referenced elsewhere in the visible part of this notebook.
from google.colab import userdata

HF_TOKEN = userdata.get("HF_TOKEN")
GEMINI_API_KEY = userdata.get("GEMINI_API_KEY")
GROQ_API_KEY = userdata.get("GROQ_API_KEY")
SAMBANOVA_API_KEY = userdata.get("SAMBANOVA_API_KEY")

## Base Model

In [None]:
class BaseAgent(ABC):
    """Abstract base class for the agents/models in this notebook.

    Subclasses must implement ``invoke``; ``prettify`` is an optional hook.
    """

    @abstractmethod
    def invoke(self, payload: Any) -> Any:
        """Run the agent on ``payload`` and return its result.

        Raises:
            NotImplementedError: if a subclass delegates to this base body.
        """
        raise NotImplementedError

    def prettify(self, payload: Any) -> None:
        """Optional formatting hook; the base implementation does nothing."""
        return None

# Runtime Stage

## Setup Milvus Database

In [None]:
"Connet to MilvusDB"


class MilvusDBConnection:
    """Thin wrapper around a ``MilvusClient`` bound to one configured collection.

    ``config`` must expose ``db_name``, ``collection_name``, ``dimension``,
    ``primary_field_name``, ``id_type``, ``vector_field_name``, ``auto_id`` and
    ``metric_type``. The connection is opened in the constructor.
    """

    def __init__(self, config):
        self.config = config
        self.connect()

    def connect(self):
        """Create ``self.client`` for ``config.db_name``.

        Raises:
            pymilvus.exceptions.ConnectError: if the client cannot be created.
                The original error text is carried in the message (it used to
                be raised with an empty message) and chained as ``__cause__``.
        """
        try:
            self.client = MilvusClient(self.config.db_name)
        except Exception as e:
            raise pymilvus.exceptions.ConnectError(
                message=f"Failed to connect to Milvus at {self.config.db_name!r}: {e}"
            ) from e

    def check_collection(self) -> bool:
        """Return whether the configured collection exists."""
        return self.client.has_collection(collection_name=self.config.collection_name)

    def create_collection(self):
        """Create the configured collection from the config's schema settings.

        Any error raised by the client propagates unchanged.
        """
        self.client.create_collection(
            collection_name=self.config.collection_name,
            dimension=self.config.dimension,
            primary_field_name=self.config.primary_field_name,
            id_type=self.config.id_type,
            vector_field_name=self.config.vector_field_name,
            auto_id=self.config.auto_id,
            metric_type=self.config.metric_type,
        )

    def drop_collection(self):
        """Drop the configured collection if it exists; otherwise do nothing.

        Raises:
            pymilvus.exceptions.CollectionNotExistException: if the existence
                check or the drop itself fails (exception type kept for
                existing callers; the underlying error is now in the message).
        """
        name = self.config.collection_name
        try:
            if self.check_collection():
                self.client.drop_collection(collection_name=name)
        except Exception as e:
            raise pymilvus.exceptions.CollectionNotExistException(
                message=f"Failed to drop collection {name!r}: {e}"
            ) from e

## Setup Retriever

In [None]:
def setup_env_var(env_name: str):
    """Ensure the environment variable ``env_name`` is set.

    When the variable exists it is written back to ``os.environ`` unchanged.

    Args:
        env_name: Name of the environment variable to check.

    Raises:
        ImportError: if the variable is not set. (The exception type is kept
            for existing callers; the check is now explicit instead of relying
            on a bare ``except:`` around a failing assignment.)
    """
    value = os.getenv(env_name)
    if value is None:
        raise ImportError(f"Can not find {env_name} in .env file.")
    os.environ[env_name] = value


# This helper is highly recommended for anyone with limited computational resources.
def process_data_in_batches(df, batch_size=10000):
    """Yield consecutive slices of ``df`` of at most ``batch_size`` rows each.

    Args:
      df: The Pandas DataFrame (or any sliceable, sized object) to split.
      batch_size: Maximum number of rows per yielded slice.

    Yields:
      ``df[start : start + batch_size]`` for ``start`` = 0, ``batch_size``,
      ``2 * batch_size``, ... until the end of ``df`` is reached.
    """
    total_rows = len(df)
    for start in range(0, total_rows, batch_size):
        stop = start + batch_size
        yield df[start:stop]


def get_env(env_name: str, default_name: str) -> str:
    """Return the value of ``env_name``, or ``default_name`` when it is unset or empty."""
    value = os.getenv(env_name)
    if not value:
        return default_name
    return value

## Embedding Model

In [None]:
class EmbeddingModel(BaseAgent):
    """Sentence-embedding agent built on a Hugging Face encoder.

    Loads the model and tokenizer named in the config and turns text into
    attention-mask-aware mean-pooled embeddings returned as a NumPy array.
    """

    def __init__(self, config: EmbeddingConfig):
        super().__init__()
        self.config = config
        loaded = self._init_model(model_name=config.model_name, device=config.device)
        self.model, self.tokenizer = loaded

    def _init_model(self, model_name: str, device: str) -> tuple:
        """Load the encoder onto ``device`` plus its tokenizer; return ``(model, tokenizer)``."""
        encoder = AutoModel.from_pretrained(model_name).to(device)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return (encoder, tokenizer)

    def _mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings, counting only positions where the mask is set."""
        # The first element of the model output holds the per-token embeddings.
        token_embeddings = model_output[0]
        # Broadcast the mask to the embeddings' shape so masked-out tokens contribute zero.
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * mask, 1)
        # Clamp the divisor so an all-zero mask cannot cause a division by zero.
        counts = torch.clamp(mask.sum(1), min=1e-9)
        return summed / counts

    # TODO: make hyperparameters read from a declarative file.
    def invoke(self, payload: str | list[str]):
        """Embed ``payload`` (one string or a list of strings).

        The input is tokenized with padding and truncation, run through the
        model without gradient tracking, mean-pooled, and returned as a NumPy
        array on the CPU.
        """
        encoded = self.tokenizer(
            payload, padding=True, truncation=True, return_tensors="pt"
        )
        encoded = encoded.to(self.config.device)
        with torch.no_grad():
            outputs = self.model(**encoded)

        pooled = self._mean_pooling(outputs, encoded["attention_mask"])
        return pooled.detach().cpu().numpy()

### Language Model

#### Prompts

#### Model Serving

In [None]:
"Setup HuggingFace"

'Setup HuggingFace'

### Re-rankers

In [None]:
# ----- PROMPT ENGINEERING -----
def chain_of_thoughts(df: pd.DataFrame, n_samples: int = 30) -> list[str]:
    """Build a list of randomly chosen exemplars from the training dataset.

    Args:
        df: DataFrame with string columns ``question`` and ``context``.
        n_samples: Number of rows to sample without replacement. Capped at
            ``len(df)`` so a small frame no longer raises ``ValueError``.

    Returns:
        One ``"<question>: <context>"`` string per sampled row, where the
        context has its first and last two characters removed.
    """
    sample_size = min(n_samples, len(df))
    rand_idx = random.sample(range(0, len(df)), sample_size)
    picked = df.iloc[rand_idx][["question", "context"]]

    return [
        # Remove open/close brackets and quote ([] and '')
        question + ": " + context[2:-2]
        for question, context in zip(picked.question.to_list(), picked.context.to_list())
    ]


# Placeholder for query refinement: the body is a bare `...`, so calling this
# currently returns None despite the `-> str` annotation.
def query_refining(query: str) -> str: ...


# ----- RE-RANKING RESULTS -----
def reranking(query: str, contexts: list, cids: list, reranker: rerankers.Reranker):
    """Re-rank retrieved contexts against ``query``.

    Args:
        query: The question to rank against.
        contexts: Candidate document texts.
        cids: Document ids, one per entry in ``contexts`` (same order).
        reranker: Object exposing ``rank(query, docs, doc_ids=...)``.

    Returns:
        Whatever ``reranker.rank`` returns.
    """
    # Pass the id list itself (it used to be wrapped in another list, giving a
    # single "id" for all documents). The documents go positionally so the call
    # does not depend on the keyword name of the ranker's second parameter.
    return reranker.rank(query, contexts, doc_ids=cids)

## Inference

In [None]:
# Dataset file locations, using the DatasetConfig defaults.
ds_conf = DatasetConfig()

In [None]:
# Load the corpus CSV. Per the recorded preview it has `text` and `cid` columns
# plus an unnamed leading column.
corpus_df = pd.read_csv(ds_conf.corpus_dir)
corpus_df.head()

Unnamed: 0,text,cid
0,"Thông tư này hướng dẫn tuần tra, canh gác bảo ...",0
1,"1. Hàng năm trước mùa mưa, lũ, Ủy ban nhân dân...",1
2,Tiêu chuẩn của các thành viên thuộc lực lượng ...,2
3,"Nhiệm vụ của lực lượng tuần tra, canh gác đê\n...",3
4,"Phù hiệu của lực lượng tuần tra, canh gác đê\n...",4


In [None]:
# Load the embedding model and tokenizer with the default EmbeddingConfig.
embedding_model = EmbeddingModel(config=EmbeddingConfig())

In [None]:
# Smoke-test the embedding model on (at most) the first 1000 corpus texts.
# Column access replaces the row-by-row `.iloc` lookup and no longer raises
# IndexError when the corpus has fewer than 1000 rows.
data = corpus_df["text"].head(1000).tolist()
embedding_model.invoke(data)

array([[ 0.25151125, -0.2394855 ,  0.50856113, ...,  0.5162235 ,
         0.07901092,  0.1941617 ],
       [ 0.07124101, -0.1602871 ,  0.27494153, ...,  0.41351348,
        -0.16980839, -0.00733354],
       [-0.11018166, -0.10737949,  0.12514003, ...,  0.37712625,
        -0.3184072 , -0.11765953],
       ...,
       [-0.1347532 , -0.14668043, -0.04667186, ...,  0.47414014,
        -0.30602235, -0.07017761],
       [ 0.27715978, -0.10888157, -0.23547548, ..., -0.00310109,
        -0.23062989, -0.17432532],
       [ 0.20590976, -0.17906967, -0.29023546, ...,  0.18324456,
        -0.24655035, -0.00187758]], dtype=float32)

In [None]:
# Embed the whole corpus in batches of 800 rows to bound memory use, then
# attach one embedding vector per row as a new `embeddings` column.
print("Embedding context")
vector_embedding = []
for batch_no, batch in enumerate(process_data_in_batches(corpus_df, batch_size=800)):
    print(f"[INFO] Embedding batch {batch_no}")

    # Read the text column directly instead of materialising each row with .iloc.
    embedding_batches = embedding_model.invoke(batch["text"].tolist())
    # Iterating the 2-D result yields one 1-D vector per input text.
    vector_embedding.extend(embedding_batches)

corpus_df["embeddings"] = vector_embedding
corpus_df.head()

Embedding context


Unnamed: 0,text,cid,embeddings
0,"Thông tư này hướng dẫn tuần tra, canh gác bảo ...",0,"[0.25151125, -0.2394855, 0.50856113, -0.154237..."
1,"1. Hàng năm trước mùa mưa, lũ, Ủy ban nhân dân...",1,"[0.071241006, -0.1602871, 0.27494153, 0.189196..."
2,Tiêu chuẩn của các thành viên thuộc lực lượng ...,2,"[-0.11018166, -0.10737949, 0.12514003, 0.24666..."
3,"Nhiệm vụ của lực lượng tuần tra, canh gác đê\n...",3,"[-0.08416447, -0.0010932062, 0.28799042, 0.047..."
4,"Phù hiệu của lực lượng tuần tra, canh gác đê\n...",4,"[0.285765, 0.0005648818, 0.37183034, 0.1194471..."


In [None]:
# Persist the corpus with its embeddings to the configured vector-source path
# (the same ds_conf.vector_src_dir that DoculensRetreiver.setup_db reads).
# Note: CSV stores each embedding array as its string representation.
corpus_df.to_csv(ds_conf.vector_src_dir, index=False)

In [None]:
# Reload the saved vector source from the configured path to inspect how the
# embeddings were serialised.
vt_df = pd.read_csv(ds_conf.vector_src_dir)
vt_df.head()

Unnamed: 0,text,cid,embeddings
0,"Thông tư này hướng dẫn tuần tra, canh gác bảo ...",0,[ 2.51511246e-01 -2.39485502e-01 5.08561134e-...
1,"1. Hàng năm trước mùa mưa, lũ, Ủy ban nhân dân...",1,[ 0.07124101 -0.1602871 0.27494153 0.189196...
2,Tiêu chuẩn của các thành viên thuộc lực lượng ...,2,[-1.10181659e-01 -1.07379489e-01 1.25140026e-...
3,"Nhiệm vụ của lực lượng tuần tra, canh gác đê\n...",3,[-8.41644704e-02 -1.09320623e-03 2.87990421e-...
4,"Phù hiệu của lực lượng tuần tra, canh gác đê\n...",4,[ 2.85764992e-01 5.64881775e-04 3.71830344e-...


In [None]:
# Exploration: look at how one stored embedding string splits on commas.
# The recorded output is a single `str` element, i.e. the serialised vector
# contains no commas (the preview above shows whitespace-separated values).
sample = vt_df.iloc[0]["embeddings"]
string = sample[1:-1]
float_strings = string.replace("\n ", " ").split(",")
# print(type(float_strings))
float_list = [type(s) for s in float_strings]
float_list

[str]

In [None]:
# Parse the whitespace-separated embedding string back into floats.
# Strip exactly one bracket from each end: slicing two characters also cut the
# last digit of the final value and the sign of a leading negative value
# (e.g. a string starting with "[-1.10...").
string_ = sample[1:-1]
# str.split() with no argument handles runs of spaces and embedded newlines.
float_strings = string_.split()
float_list = [float(s) for s in float_strings]
float_list

[0.251511246,
 -0.239485502,
 0.508561134,
 -0.154237479,
 0.419388384,
 0.318670094,
 -0.226887465,
 0.101015024,
 0.0590886623,
 0.103152506,
 0.25899899,
 -0.238516539,
 0.337258637,
 -0.332455784,
 -0.58778578,
 0.350172162,
 0.268106431,
 0.494628876,
 -0.00973232929,
 -0.108364746,
 -0.0811625943,
 0.0128706628,
 0.218784645,
 -0.0396557115,
 -0.532194793,
 -0.223774716,
 0.171768889,
 0.0661188066,
 -0.170780644,
 0.0363783985,
 -0.0241080765,
 -0.0231820736,
 -0.00365994708,
 -0.0307426117,
 0.345928282,
 0.315429866,
 -0.546843529,
 0.248901516,
 0.317107916,
 -0.106531844,
 0.103877865,
 0.107534297,
 -0.0386499688,
 -0.0623764619,
 -0.036817424,
 -0.228677616,
 -0.178997397,
 -0.191190317,
 0.0510617904,
 0.0370283015,
 0.134944588,
 0.403852671,
 0.198575824,
 -0.27245611,
 0.281724125,
 0.0441567972,
 -0.0400243476,
 0.19505772,
 0.348422647,
 -0.171591893,
 -0.137190551,
 0.176442459,
 0.0214074366,
 -0.207791746,
 0.299978316,
 0.47545588,
 0.0205516312,
 0.0471596867,
 

In [None]:
# Sanity check: the parsed vector length (768 in the recorded output) should
# match MilvusDBConfig.dimension.
len(float_list)

768

In [None]:
class DoculensRetreiver:
    "Actions on Database"

    def __init__(
        self,
        embedding_conf=EmbeddingConfig(),
        ds_conf=DatasetConfig(),
        mlv_conf=MilvusDBConfig(),
    ):
        self.ds_conf = ds_conf
        self.mlv_conf = mlv_conf

        # Setup Embedding model
        self.embedding_model = EmbeddingModel(config=embedding_conf)

        # Setup MilvusDB Connection
        self.connection = MilvusDBConnection(config=mlv_conf)
        self.connection.create_collection()

        self.client = self.connection.client

        self.setup_db()

    def setup_db(self):
        "Create an instance to database"

        if self.connection.check_collection():
            # 1. Convert embedding value from string to float
            print("Convert embedding value from string to float")
            vector_df = pd.read_csv(self.ds_conf.vector_src_dir, index_col=0)

            print("Insert data by batch")
            for batch in process_data_in_batches(vector_df, batch_size=1000):
                data = [batch.iloc[idx].to_dict() for idx in range(len(batch))]

                # Insert records
                res = self.client.insert(
                    collection_name=self.mlv_conf.collection_name, data=data
                )

                print(res)
        else:
            print("Collection is not created")

    def retrieve(self, query: str | list[str]) -> dict:
        "Retrieve an instance"
        ...
        sentence_embedding = self.embedding_model.invoke(query)
        search_params = {
            "metric_type": self.mlv_conf.metric_type,
            "params": self.mlv_conf.params,
        }

        print("Semantic search")
        result = self.client.search(
            collection_name=self.mlv_conf.collection_name,
            data=sentence_embedding,
            limit=self.mlv_conf.limit,
            output_fields=self.mlv_conf.output_fields,
            search_params=search_params,
        )
        return result

    def _convert_string_to_float_df(self, sample):
        # Remove the open/close brackets
        string = sample[1:-1]

        # Split the string into a list of strings
        float_strings = string.split(", ")

        # Convert each string to a float
        float_list = [float(s) for s in float_strings]

        return float_list

In [None]:
# Setup Config
ds_conf = DatasetConfig()

# Setup Retriever
# The retriever builds its own EmbeddingModel, so its model is reused here
# instead of loading a second copy that the retriever would never use.
print("Setting Doculens Retriever")
retriever = DoculensRetreiver(ds_conf=ds_conf)
embedding_model = retriever.embedding_model

Setting Doculens Retriever


ERROR:pymilvus.milvus_client.milvus_client:Failed to create new connection using: 7ed0b0d740dc492ab7ed1620d28a8ea8


ConnectError: <ConnectError: (code=1, message=)>

In [None]:
# Setup public test
print("Reading Dataframe")
corpus_df = pd.read_csv(ds_conf.corpus_dir)
test_df = pd.read_csv(ds_conf.public_test_dir)


# Start getting results from public test
print("Start Infering: Answering legal questions")
result_dict = {"question": [], "qid": [], "context": [], "cid": []}

# The batch counter has its own name; it used to share `idx` with the inner row
# loop, and the log line lacked the f-prefix so it printed "{idx}" literally.
for batch_no, batch in enumerate(process_data_in_batches(test_df, batch_size=1000)):
    print(f"[INFO] Inference on batch {batch_no}")
    # Iterate via each instance in batch
    for question, qid in zip(batch["question"], batch["qid"]):
        # Retrieve relevant context
        contexts = []
        cids = []

        try:
            result = retriever.retrieve(question)
            # result[0] holds the hits for the single query that was sent.
            for res in result[0]:
                res_entity = res["entity"]
                # NOTE(review): assumes each hit's entity carries "context" and
                # "cid" — confirm against the fields stored in the collection.
                contexts.append(res_entity["context"])
                cids.append(res_entity["cid"])
        except Exception as exc:
            # Still best-effort (placeholders are recorded), but the failure is
            # reported instead of being silently swallowed by a bare except.
            print(f"[WARN] Retrieval failed for qid={qid}: {exc!r}")
            contexts = [None]
            cids = [-1]

        result_dict["question"].append(question)
        result_dict["qid"].append(qid)
        result_dict["context"].append(contexts)
        result_dict["cid"].append(cids)