In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_mongodb import MongoDBAtlasVectorSearch
from pymongo import MongoClient

from concurrent.futures import ThreadPoolExecutor

import os, json, time

# Show text columns in full instead of truncating long cell values.
pd.options.display.max_colwidth = None

In [None]:
# Load settings from the project's .env file into the process environment.
load_dotenv()

# Index os.environ rather than calling os.getenv: a missing RANDOM_SEED then
# raises a KeyError naming the variable, instead of the opaque
# "int() argument must be ... not 'NoneType'" produced by int(None).
RANDOM_SEED = int(os.environ["RANDOM_SEED"])

# Seed NumPy's legacy global random generator.
np.random.seed(RANDOM_SEED)

## Data Preparation

In [None]:
# Required settings: indexing os.environ fails fast with a KeyError that names
# the missing variable. With os.getenv, an unset GENERATED_DATA_DIR becomes
# None, and os.listdir(None) silently lists the current working directory;
# an unset proportion would fail with an opaque float(None) TypeError.
GENERATED_DATA_DIR = os.environ["GENERATED_DATA_DIR"]
TRAINING_PROPORTION = float(os.environ["TRAINING_PROPORTION"])
VALIDATION_PROPORTION = float(os.environ["VALIDATION_PROPORTION"])
# NOTE: compile_dataset gives the test split whatever rows remain after the
# training and validation cuts, so this value is informational only.
TESTING_PROPORTION = float(os.environ["TESTING_PROPORTION"])

print(f"""Generated Data Directory = {GENERATED_DATA_DIR}
      
Training Proportion = {TRAINING_PROPORTION}
Validation Proportion = {VALIDATION_PROPORTION}
Testing Proportion = {TESTING_PROPORTION}""")

In [None]:
def _concat_and_shuffle(parts):
    """Concatenate ``parts`` once, shuffle the rows with RANDOM_SEED and renumber them."""
    combined = pd.concat(parts) if parts else pd.DataFrame()
    return combined.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)


def compile_dataset(source_dir,
                    training_proportion=TRAINING_PROPORTION,
                    validation_proportion=VALIDATION_PROPORTION):
    """Split every CSV found under ``source_dir`` into train/validation/test sets.

    ``source_dir`` is scanned for sub-directories, and every file inside each
    sub-directory is read with ``pd.read_csv``. Each CSV is shuffled with
    ``RANDOM_SEED`` and cut positionally: the first ``training_proportion`` of
    its rows go to training, the next ``validation_proportion`` to validation,
    and all remaining rows to testing.

    Args:
        source_dir: Directory whose sub-directories hold the CSV files. The
            CSVs must provide ``input`` and ``output`` columns.
        training_proportion: Fraction of each CSV assigned to training.
        validation_proportion: Fraction of each CSV assigned to validation.

    Returns:
        ``(training_df, validation_df, testing_df, chunked_data)``. The three
        DataFrames are shuffled with ``RANDOM_SEED`` and re-indexed from 0.
        ``chunked_data`` holds one string per CSV: that CSV's training rows
        rendered as ``"<input>\\n<output>"`` and joined by blank lines.
    """
    training_parts, validation_parts, testing_parts = [], [], []
    chunked_data = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore the seeded shuffles and chunk order) reproducible.
    for source_csv_dir in sorted(os.listdir(source_dir)):
        csv_dir_path = os.path.join(source_dir, source_csv_dir)
        if not os.path.isdir(csv_dir_path):
            # A stray file next to the per-source folders would make the
            # nested os.listdir call raise NotADirectoryError.
            continue

        for csv_name in sorted(os.listdir(csv_dir_path)):
            frame = pd.read_csv(os.path.join(csv_dir_path, csv_name))
            frame = frame.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

            total_qty = len(frame)
            validation_start = int(total_qty * training_proportion)
            validation_end = validation_start + int(total_qty * validation_proportion)

            training_part = frame.iloc[:validation_start]
            training_parts.append(training_part)
            validation_parts.append(frame.iloc[validation_start:validation_end])
            testing_parts.append(frame.iloc[validation_end:])

            # Only training rows feed the retrieval chunks, so validation and
            # test answers never reach the vector store.
            chunked_data.append("\n\n".join(
                f"{question}\n{answer}"
                for question, answer in zip(training_part["input"], training_part["output"])
            ))

    # Concatenate each split once at the end; calling pd.concat inside the
    # loop re-copies every accumulated row on each iteration.
    training_df = _concat_and_shuffle(training_parts)
    validation_df = _concat_and_shuffle(validation_parts)
    testing_df = _concat_and_shuffle(testing_parts)

    return training_df, validation_df, testing_df, chunked_data

In [None]:
# Build the three splits plus the per-CSV text chunks from the generated data.
(
    training_data,
    validation_data,
    testing_data,
    chunked_data,
) = compile_dataset(source_dir=GENERATED_DATA_DIR)

In [None]:
# Preview only: display.max_colwidth is unlimited in this notebook, so echoing
# the whole frame would flood the saved output.
testing_data.head()

In [None]:
# Preview only: each chunk concatenates every training pair of one CSV, so
# echoing the whole list would flood the saved output.
print(f"{len(chunked_data)} chunks")
[chunk[:300] for chunk in chunked_data[:3]]

## Set Up Model Evaluation

In [None]:
# GEMINI_TOKENS must be a JSON array of API keys, e.g. '["key-1", "key-2"]'.
# Validate it up front: json.loads(None) raises an opaque TypeError when the
# variable is unset, and an empty list would otherwise only surface later as a
# ZeroDivisionError while distributing the chunks.
GEMINI_TOKENS = json.loads(os.getenv("GEMINI_TOKENS") or "[]")
if not isinstance(GEMINI_TOKENS, list) or not GEMINI_TOKENS:
    raise ValueError("GEMINI_TOKENS must be a non-empty JSON array of API keys")

# One embedding client and one chat client per API key.
EMBEDDING_MODELS = [GoogleGenerativeAIEmbeddings(model="gemini-embedding-001", api_key=key) for key in GEMINI_TOKENS]
CHAT_MODELS = [ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite", api_key=key) for key in GEMINI_TOKENS]

# Deal the chunks out round-robin: chunk i goes to embedding model
# i % len(EMBEDDING_MODELS), wrapped in a Document for the vector store.
CHUNKS = [
    [Document(page_content=chunk) for chunk in chunked_data[offset::len(EMBEDDING_MODELS)]]
    for offset in range(len(EMBEDDING_MODELS))
]

In [None]:
# MongoDB connection settings, read from the environment (None when unset).
MONGODB_VECTOR, MONGODB_COLLECTION, MONGODB_DB, MONGODB_URI = (
    os.environ.get(variable)
    for variable in ("MONGODB_VECTOR", "MONGODB_COLLECTION", "MONGODB_DB", "MONGODB_URI")
)

In [None]:
# Open the MongoDB connection, then resolve the database and the collection
# that every vector store below reads from and writes to.
MONGO_CLIENT = MongoClient(MONGODB_URI)
MONGO_DB = MONGO_CLIENT.get_database(MONGODB_DB)
MONGO_COLLECTION = MONGO_DB.get_collection(MONGODB_COLLECTION)

In [None]:
# Vector-search index definition for the field that stores the embeddings.
VECTOR_STORE_CONFIG = dict(
    name=MONGODB_VECTOR,
    definition=dict(
        fields=[
            dict(type="vector", path="embedding", numDimensions=3072, similarity="euclidean"),
        ],
    ),
    type="vectorSearch",
)

# One store per embedding client; all of them share the same collection and
# index name, and differ only in which client computes the embeddings.
_VECTOR_FIELD = VECTOR_STORE_CONFIG["definition"]["fields"][0]
VECTOR_STORES = [
    MongoDBAtlasVectorSearch(
        collection=MONGO_COLLECTION,
        embedding=embedding_model,
        index_name=MONGODB_VECTOR,
        text_key="text",
        embedding_key=_VECTOR_FIELD["path"],
        relevance_score_fn="euclidean",
    )
    for embedding_model in EMBEDDING_MODELS
]

In [None]:
def add_to_store(args, max_attempts=2, cooldown_seconds=65, batch_size=8):
    """Insert chunks into one vector store one at a time, retrying failures.

    Each chunk gets up to ``max_attempts`` tries, with a pause of
    ``cooldown_seconds`` after every failed try. A pause of the same length
    follows every ``batch_size``-th chunk (presumably to stay under a
    per-minute API quota; confirm against the provider's limits).

    Args:
        args: ``(idx, vector_store, chunks)`` tuple. ``idx`` only labels the
            progress output, ``vector_store`` must expose ``add_documents``,
            and ``chunks`` is the sequence of documents to insert.
        max_attempts: Number of tries per chunk before giving up on it.
        cooldown_seconds: Seconds to sleep after a failure or a full batch.
        batch_size: Progress is printed, and a cooldown taken, every this many
            chunks.

    Returns:
        Positions within ``chunks`` that could not be stored after
        ``max_attempts`` tries; an empty list when everything was inserted.
    """
    idx, vector_store, chunks = args
    failed = []
    for i, chunk in enumerate(chunks):
        for attempt in range(1, max_attempts + 1):
            try:
                vector_store.add_documents([chunk])
                break
            except Exception as error:
                # Report the cause instead of swallowing it; a bare except
                # would also hide programming errors behind silent retries.
                print(f"MODEL {idx} CHUNK {i} ATTEMPT {attempt}/{max_attempts} FAILED: {error!r}")
                time.sleep(cooldown_seconds)
        else:
            # Every attempt raised: remember the chunk so the caller can tell
            # that the store is incomplete.
            failed.append(i)
        if i % batch_size == 0:
            print(f"MODEL {idx} = {i}")
            if i != 0:
                time.sleep(cooldown_seconds)
    print(f"MODEL {idx} COMPLETED")
    if failed:
        print(f"MODEL {idx} FAILED CHUNKS = {failed}")
    return failed

# One worker thread per vector store, each inserting its own share of chunks.
with ThreadPoolExecutor(max_workers=len(VECTOR_STORES)) as executor:
    ingest_futures = [
        executor.submit(add_to_store, (i, vector_store, CHUNKS[i]))
        for i, vector_store in enumerate(VECTOR_STORES)
    ]

# A discarded Future hides any exception raised inside its worker; calling
# result() re-raises it here so a broken ingestion run cannot pass unnoticed.
for ingest_future in ingest_futures:
    ingest_future.result()

In [None]:
# Every store targets the same collection and index name, so the index is
# created once, through the first store.
index_dimensions = VECTOR_STORE_CONFIG["definition"]["fields"][0]["numDimensions"]
VECTOR_STORES[0].create_vector_search_index(dimensions=index_dimensions)

## Model Evaluation

In [None]:
def retrieve(prompt, top_k=5, index=0):
    """Run a similarity search for ``prompt`` and return ``top_k`` results.

    ``index`` selects which entry of ``VECTOR_STORES`` performs the search.
    """
    return VECTOR_STORES[index].similarity_search(prompt, k=top_k)

def generate(prompt, augment=False, index=0):
    """Answer ``prompt`` with the chat model at ``index``, optionally with retrieval.

    Args:
        prompt: The user question, sent as the human message.
        augment: When True, documents retrieved for ``prompt`` are placed in
            the system instruction and the model is told to answer only from
            them. When False, the model is told to rely on its own knowledge.
        index: Position in ``CHAT_MODELS`` (and, when augmenting, the vector
            store passed to ``retrieve``) to use.

    Returns:
        The ``content`` of the model's response.
    """
    model = CHAT_MODELS[index]
    pre_instruction = "You are a helpful assistant."
    if augment:
        # Pass only the retrieved text. Interpolating the list of Document
        # objects directly would embed their Python repr (class name, escaped
        # newlines) in the prompt instead of readable context.
        documents = retrieve(prompt, index=index)
        context = "\n\n".join(document.page_content for document in documents)
        full_instruction = f"{pre_instruction} Strictly only use the provided context to answer the user's questions: {context}"
    else:
        full_instruction = f"{pre_instruction} Do not use any context that is provided to you. Strictly only answer the user's questions based on what you know."
    response = model.invoke([
        ("system", full_instruction),
        ("human", prompt),
    ])
    return response.content

In [None]:
# Baseline: the model answers from its own knowledge, without retrieval.
prompt = "What is Adware and what is its primary function?"
response = generate(prompt, augment=False)
response

In [None]:
# Same question, this time answered from the retrieved context.
prompt = "What is Adware and what is its primary function?"
response = generate(prompt, augment=True)
response

In [None]:
# Baseline: the model answers from its own knowledge, without retrieval.
prompt = "Identify the key components involved in the SingHealth user authentication process."
response = generate(prompt, augment=False)
response

In [None]:
# Same question, this time answered from the retrieved context.
prompt = "Identify the key components involved in the SingHealth user authentication process."
response = generate(prompt, augment=True)
response

In [None]:
# Deal the test cases round-robin across the chat models. Rows are numbered by
# position (enumerate) rather than by index label, so the split stays correct
# even if testing_data's index is not 0..n-1, and each row is read once
# instead of through two separate .iloc lookups.
tests = [[] for _ in range(len(CHAT_MODELS))]

for position, (question, expected) in enumerate(zip(testing_data["input"], testing_data["output"])):
    tests[position % len(CHAT_MODELS)].append([question, expected])

In [None]:
# results[m][t] is filled by get_responses with
# [input, raw_response, aug_response, output] for test t of model m;
# it stays None until that test has been answered.
results = [[None] * len(test_set) for test_set in tests]

def get_responses(args, max_attempts=2, cooldown_seconds=65):
    """Answer every test case of one model, with and without retrieval.

    For each ``(question, expected)`` pair the model at ``index`` is queried
    twice through ``generate`` (plain, then retrieval-augmented) and
    ``[question, raw_response, aug_response, expected]`` is written to
    ``results[index][i]``. A test whose attempts all fail leaves its slot as
    it was.

    Args:
        args: ``(index, test_set)`` tuple. ``index`` selects the model and the
            row of ``results``; ``test_set`` is a sequence of
            ``[question, expected]`` pairs.
        max_attempts: Number of tries per test case before giving up on it.
        cooldown_seconds: Seconds to sleep after each failed try.

    Returns:
        Positions within ``test_set`` that could not be answered.
    """
    index, test_set = args
    failed = []

    for i, (question, expected) in enumerate(test_set):
        print(f"MODEL {index} TEST {i} STARTED")
        raw_response = None
        for attempt in range(1, max_attempts + 1):
            try:
                # Keep a plain answer that already succeeded, so a retry
                # caused by the augmented call does not pay for it again.
                if raw_response is None:
                    raw_response = generate(question, False, index)
                aug_response = generate(question, True, index)
                results[index][i] = [question, raw_response, aug_response, expected]
                break
            except Exception as error:
                # Report the cause instead of swallowing it; a bare except
                # would also hide programming errors behind silent retries.
                print(f"MODEL {index} TEST {i} ATTEMPT {attempt}/{max_attempts} FAILED: {error!r}")
                time.sleep(cooldown_seconds)
        else:
            # Do not report a test as completed when no attempt succeeded.
            failed.append(i)
            print(f"MODEL {index} TEST {i} FAILED")
            continue
        print(f"MODEL {index} TEST {i} COMPLETED")

    return failed

# One worker thread per chat model, each answering its own share of the tests.
with ThreadPoolExecutor(max_workers=len(CHAT_MODELS)) as executor:
    evaluation_futures = [
        executor.submit(get_responses, (i, test_set))
        for i, test_set in enumerate(tests)
    ]

# A discarded Future hides any exception raised inside its worker; calling
# result() re-raises it here so a broken evaluation run cannot pass unnoticed.
for evaluation_future in evaluation_futures:
    evaluation_future.result()

In [None]:
# Per-model lists of [input, raw_response, aug_response, output];
# a slot is still None when no response was recorded for that test.
results