# Set up

In [None]:
import os

from huggingface_hub import login

# Authenticate with the Hugging Face Hub. Prefer a token supplied through the
# HF_TOKEN environment variable so the real key never has to be pasted into
# (and committed with) the notebook; the placeholder is only a fallback.
login(os.environ.get("HF_TOKEN", "YOUR KEY HERE"))

In [None]:
import json
import os
import chromadb
from langchain_chroma import Chroma

from langchain.text_splitter import RecursiveCharacterTextSplitter
from llm.llm import BedRockLLMs,CoreLLMs
from llm.llm_utils import *
import numpy as np
from pyvi import ViTokenizer
import boto3
from dotenv import load_dotenv
# Read a local .env file (python-dotenv) into the process environment; this
# runs here, ahead of the imports that follow.
load_dotenv()
from langchain.embeddings.openai import OpenAIEmbeddings
from prompts import *
from tqdm import tqdm

In [None]:
# Instantiate the project's CoreLLMs wrapper with quantization='int4'.
# This `llm` object is what the evaluation loops below pass to RAG_QA.
llm = CoreLLMs(quantization='int4')

# Load Database

In [None]:
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

In [None]:
# Name of the sentence-transformers checkpoint used for embeddings.
model = 'bkai-foundation-models/vietnamese-bi-encoder'
# Embedding function shared by both Chroma vector stores opened below.
embd = SentenceTransformerEmbeddings(model_name=model)

In [None]:
# Open the two persisted Chroma collections used by the experiments below.
# Both use the collection name "wiki10k" and the same embedding function; they
# differ in their persist directory. DATA_DIR is assigned for each store but is
# not read anywhere in this section.

# Store 1 -- queried by RAG().
data_dir, persist_dir, col_name = (
    "data_raw10k",
    "/content/drive/MyDrive/speech/RAG/ChromaDB/chromadb",
    "wiki10k",
)
DATA_DIR, CHROMA_PATH, COLLECTION_NAME = data_dir, persist_dir, col_name
vectorstore1 = Chroma(
    embedding_function=embd,
    persist_directory=CHROMA_PATH,
    collection_name=COLLECTION_NAME,
)

# Store 2 -- queried by RAG_rerank(). The shared names are rebound, so after
# this cell they describe the second store only.
data_dir, persist_dir, col_name = (
    "gendata",
    "/content/drive/MyDrive/speech/RAG/ChromaDB/genchromadb",
    "wiki10k",
)
DATA_DIR, CHROMA_PATH, COLLECTION_NAME = data_dir, persist_dir, col_name
vectorstore2 = Chroma(
    embedding_function=embd,
    persist_directory=CHROMA_PATH,
    collection_name=COLLECTION_NAME,
)

# Set up

In [1]:
# Context function
def RAG(questions):
    """Build the context string for a question from the first vector store.

    Args:
        questions: Query passed to ``vectorstore1.similarity_search``.

    Returns:
        The ``page_content`` of the 5 retrieved documents, joined by blank
        lines, in the order the search returned them.
    """
    hits = vectorstore1.similarity_search(questions, 5)
    return '\n\n'.join(hit.page_content for hit in hits)

In [None]:
# Load questions
# Each entry is read by the evaluation loops below through its 'question',
# 'choice', 'answer' and 'id' keys. The context manager closes the file
# promptly, and the encoding is pinned so the text is decoded the same way on
# every platform.
with open('/content/drive/MyDrive/speech/RAG/hard_questions_4000.json', encoding='utf-8') as f:
    questions = json.load(f)

# RAG

In [None]:
# Baseline run: answer the first 400 questions using plain top-5 retrieval.
answers = []   # one {'answer': ..., 'id': ...} dict per answered question
labels = []    # ground-truth answers, index-aligned with `predicts`
predicts = []  # model answers, index-aligned with `labels`
for test in tqdm(questions[:400]):
    try:
        question = test['question']
        context = RAG(question)
        response = RAG_QA(llm, question, context, test['choice'])
        raw_answer = response[0]
        # The returned dict is presumably not guaranteed to use the key
        # 'answer'. Take the real 'answer' entry when it exists, so an extra
        # key cannot override it; otherwise fall back to the last value.
        if 'answer' in raw_answer:
            predicted = raw_answer['answer']
        else:
            predicted = list(raw_answer.values())[-1]
        answer = {'answer': predicted, 'id': test['id']}
        label = test['answer']

        # Everything that can fail has run by now; append to the three lists
        # together so they stay aligned when a question is skipped.
        answers.append(answer)
        labels.append(label)
        predicts.append(predicted)

        # Checkpoint after every question so partial results survive a crash.
        with open('answers_hard_4000_llama3_base.json', 'w') as f:
            f.write(json.dumps(answers))
    except Exception as e:
        # Best-effort batch run: report the failure and continue with the
        # next question instead of dropping it silently.
        print('Error: ', e)

In [None]:
# Score the baseline run: fraction of answered questions predicted correctly.

# Integer view of the labels (0 where a label is not int-like). The accuracy
# below does not read it; it is still built so the name stays available.
use_label = []
for label in labels:
    try:
        use_label.append(int(label))
    except (TypeError, ValueError):
        use_label.append(0)

# A prediction is correct when it equals the label directly or as text, so a
# label "2" and a prediction 2 still count as a match.
matches = [
    label == predict or str(label).strip() == str(predict).strip()
    for label, predict in zip(labels, predicts)
]

labels = np.array(labels)
predicts = np.array(predicts)

# With no answered questions there is nothing to average: report nan directly
# rather than letting np.mean warn about an empty input.
accuracy = np.mean(matches) if matches else float('nan')
print('Accuracy baseline: ', accuracy)

# RAG + Rerank

In [None]:
from FlagEmbedding import FlagReranker

In [None]:
# Model identifier handed to FlagReranker.
reranker_model = 'BAAI/bge-reranker-v2-m3'
# Module-level reranker; rerank() calls its compute_score() on
# [query, document] pairs.
reranker = FlagReranker(reranker_model)

In [None]:
# Utility functions for reranking
def create_query(query, docs):
    """Pair one query with every candidate document.

    Args:
        query: The query text. When a list is given, only its first element
            is used.
        docs: Iterable of documents, or a single string that is treated as
            one document.

    Returns:
        A list of ``[query, doc]`` two-element lists, one per document, in
        input order.
    """
    question = query[0] if isinstance(query, list) else query
    candidates = [docs] if isinstance(docs, str) else docs
    return [[question, candidate] for candidate in candidates]


def rerank(query, docs, k):
    """Return the ``k`` documents that score highest against ``query``.

    Every document is paired with the query via ``create_query`` and scored
    by the module-level ``reranker``; documents are returned in descending
    score order.

    Args:
        query: Question text (or a list whose first element is the question).
        docs: Candidate documents; a single string is treated as one
            candidate.
        k: Maximum number of documents to return.

    Returns:
        A list of at most ``k`` documents, best first. Empty when there are
        no candidates or ``k`` is not positive.
    """
    if isinstance(docs, str):
        docs = [docs]
    docs = list(docs)
    # Nothing to rank: return early instead of failing inside numpy.
    if not docs or k <= 0:
        return []

    pairs = create_query(query, docs)
    # Some FlagEmbedding versions return a bare float for a single pair;
    # atleast_1d keeps the indexing below uniform.
    scores = np.atleast_1d(np.asarray(reranker.compute_score(pairs), dtype=float))

    # Asking for more documents than exist just returns all of them, ranked.
    k = min(k, len(docs))
    order = np.argsort(-scores, kind="stable")[:k]
    return [docs[i] for i in order]

def RAG_rerank(questions):
    """Build a reranked context string for a question from the second store.

    Retrieves 25 candidates from ``vectorstore2``, keeps the 5 that
    ``rerank`` places first, and joins them with blank lines.

    Args:
        questions: Query used both for the similarity search and for
            reranking.

    Returns:
        The selected passages joined by ``'\\n\\n'``, in reranked order.
    """
    candidates = vectorstore2.similarity_search(questions, 25)
    passages = [hit.page_content for hit in candidates]
    best = rerank(questions, passages, 5)
    return '\n\n'.join(best)

In [None]:
# Rerank run: answer every question using retrieval followed by reranking.
answers = []   # one {'answer': ..., 'id': ...} dict per answered question
labels = []    # ground-truth answers, index-aligned with `predicts`
predicts = []  # model answers, index-aligned with `labels`
for test in tqdm(questions):
    # Reset per question so the error report below never shows a stale (or
    # undefined) answer from an earlier iteration.
    answer = None
    try:
        question = test['question']
        context = RAG_rerank(question)
        response = RAG_QA(llm, question, context, test['choice'])
        answer = response[0]
        # The returned dict is presumably not guaranteed to use the key
        # 'answer'. Take the real 'answer' entry when it exists, so an extra
        # key cannot override it; otherwise fall back to the last value.
        if 'answer' in answer:
            predicted = answer['answer']
        else:
            predicted = list(answer.values())[-1]
        answer = {'answer': predicted, 'id': test['id']}
        label = test['answer']

        # Record the answer; without this append the checkpoint file below
        # would only ever contain an empty list. The three lists are extended
        # together so they stay aligned when a question is skipped.
        answers.append(answer)
        labels.append(label)
        predicts.append(predicted)

        # Checkpoint after every question so partial results survive a crash.
        with open('answers_hard_4000_llama3_reranker.json', 'w') as f:
            f.write(json.dumps(answers))
    except Exception as e:
        # Best-effort batch run: report the failure and keep going.
        print('Error: ', e)
        print(answer)

In [None]:
# Score the rerank run: fraction of answered questions predicted correctly.

# Integer view of the labels (0 where a label is not int-like). The accuracy
# below does not read it; it is still built so the name stays available.
use_label = []
for label in labels:
    try:
        use_label.append(int(label))
    except (TypeError, ValueError):
        use_label.append(0)

# A prediction is correct when it equals the label directly or as text, so a
# label "2" and a prediction 2 still count as a match.
matches = [
    label == predict or str(label).strip() == str(predict).strip()
    for label, predict in zip(labels, predicts)
]

labels = np.array(labels)
predicts = np.array(predicts)

# With no answered questions there is nothing to average: report nan directly
# rather than letting np.mean warn about an empty input.
accuracy = np.mean(matches) if matches else float('nan')
print('Accuracy rerank: ', accuracy)