In [1]:
import os
import optuna
from dotenv import load_dotenv
import pandas as pd
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    Settings,
)
from llama_index.core.node_parser import (
    SimpleNodeParser,
    SentenceSplitter,
    TokenTextSplitter,
    SemanticSplitterNodeParser,
    MarkdownNodeParser,
)
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.embeddings.cohere import CohereEmbedding
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail fast, with a readable message, when a provider key is absent.
# Re-assigning os.environ[key] = os.getenv(key) is a no-op when the key is set
# and raises an opaque "TypeError: str expected, not NoneType" when it is not,
# so the keys are only validated here.
_REQUIRED_API_KEYS = ("OPENAI_API_KEY", "COHERE_API_KEY", "GOOGLE_API_KEY")
_missing_api_keys = [key for key in _REQUIRED_API_KEYS if os.getenv(key) is None]
if _missing_api_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_api_keys)
    )

In [3]:
def parse_qa_csv(file_path: str) -> list[tuple]:
    """Read a question/answer CSV and return its rows as (question, answer) tuples.

    Args:
        file_path: Path to a CSV file that has ``question`` and ``answer`` columns.

    Returns:
        A list with one ``(question, answer)`` tuple per CSV row, in file order.
    """
    faq_frame = pd.read_csv(file_path)
    questions = faq_frame["question"]
    answers = faq_frame["answer"]
    return [(question, answer) for question, answer in zip(questions, answers)]

In [4]:
def compute_relevance(response_text, ground_truth_text, evaluation_embed_model):
    """Return the cosine similarity between a response and its ground truth.

    Both texts are embedded with ``evaluation_embed_model.get_text_embedding``
    and the resulting vectors are compared with ``cosine_similarity``.

    Args:
        response_text: Text produced by the system under evaluation.
        ground_truth_text: Reference answer to compare against.
        evaluation_embed_model: Object exposing ``get_text_embedding(text)``.

    Returns:
        The first (and only) entry of the similarity matrix for the two vectors.
    """
    embed = evaluation_embed_model.get_text_embedding
    response_vector = embed(response_text)
    truth_vector = embed(ground_truth_text)

    # Each vector is wrapped in a list so it is passed as a one-row matrix.
    score_matrix = cosine_similarity([response_vector], [truth_vector])
    return score_matrix[0][0]


In [5]:
def evaluate_rag_app(index, qa_pairs, evaluation_embed_model, top_k):
    """Score an index by answering every question and comparing to the reference.

    A ``VectorIndexRetriever`` (returning ``top_k`` nodes) is wrapped in a
    ``RetrieverQueryEngine``; each question is queried and the answer text is
    scored against its ground truth with ``compute_relevance``.

    Args:
        index: Index handed to ``VectorIndexRetriever``.
        qa_pairs: Iterable of ``(question, ground_truth)`` pairs.
        evaluation_embed_model: Embedding model forwarded to ``compute_relevance``.
        top_k: Number of nodes the retriever returns per query.

    Returns:
        The arithmetic mean of the per-question relevance scores.

    Raises:
        ValueError: If ``qa_pairs`` is empty (previously a ZeroDivisionError
            surfaced only after the engine had been built).
    """
    # Materialise once so one-shot iterables work and emptiness can be checked
    # before any retriever or query engine is constructed.
    qa_pairs = list(qa_pairs)
    if not qa_pairs:
        raise ValueError("qa_pairs must contain at least one (question, answer) pair.")

    retriever = VectorIndexRetriever(
        index=index,
        similarity_top_k=top_k,
    )
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
    )

    relevance_scores = []
    for question, ground_truth in qa_pairs:
        response = query_engine.query(question)
        response_text = response.response  # text content of the answer object
        relevance_scores.append(
            compute_relevance(response_text, ground_truth, evaluation_embed_model)
        )

    return sum(relevance_scores) / len(relevance_scores)

In [6]:

# Corpus to index: everything under ../data/articles, subdirectories included.
article_reader = SimpleDirectoryReader(recursive=True, input_dir="../data/articles")
documents = article_reader.load_data()

# (question, answer) pairs used as the evaluation set.
qa_pairs = parse_qa_csv("../data/faq/faq_cleaned.csv")

# Embedding model used only for scoring answers; it is created once here and
# shared by every trial, independent of the embedding model a trial selects.
evaluation_embed_model = OpenAIEmbedding()

In [7]:
def _build_embed_model(embedding_choice):
    """Instantiate the embedding model named by ``embedding_choice``."""
    if embedding_choice == "openai":
        return OpenAIEmbedding(model="text-embedding-ada-002")
    if embedding_choice == "cohere":
        return CohereEmbedding(
            api_key=os.getenv("COHERE_API_KEY"),
            model_name="embed-english-v3.0",
            input_type="search_document",
        )
    if embedding_choice == "gemini":
        return GeminiEmbedding(
            api_key=os.getenv("GOOGLE_API_KEY"), model_name="models/embedding-001"
        )
    raise ValueError("Invalid embedding model selected.")


def _build_node_parser(trial, node_parser_choice, chunk_size, chunk_overlap, embed_model):
    """Instantiate the node parser named by ``node_parser_choice``.

    The "semantic" and "markdown" parsers draw their own extra hyperparameters
    from ``trial`` and do not use ``chunk_size`` / ``chunk_overlap``.
    """
    if node_parser_choice == "simple":
        # NOTE(review): in recent llama_index releases SimpleNodeParser appears
        # to be an alias of SentenceSplitter, which would make "simple" and
        # "sentence" the same configuration -- confirm against the installed version.
        return SimpleNodeParser(chunk_overlap=chunk_overlap, chunk_size=chunk_size)
    if node_parser_choice == "sentence":
        return SentenceSplitter(chunk_overlap=chunk_overlap, chunk_size=chunk_size)
    if node_parser_choice == "token":
        return TokenTextSplitter(chunk_overlap=chunk_overlap, chunk_size=chunk_size)
    if node_parser_choice == "semantic":
        buffer_size = trial.suggest_int("buffer_size", 1, 3)
        breakpoint_percentile_threshold = trial.suggest_int(
            "breakpoint_percentile_threshold", 60, 95
        )
        return SemanticSplitterNodeParser(
            buffer_size=buffer_size,
            breakpoint_percentile_threshold=breakpoint_percentile_threshold,
            embed_model=embed_model,
        )
    if node_parser_choice == "markdown":
        include_prev_next_rel = trial.suggest_categorical(
            "include_prev_next_rel", [True, False]
        )
        return MarkdownNodeParser(include_prev_next_rel=include_prev_next_rel)
    raise ValueError("Invalid node parser selected.")


def objective(trial):
    """Optuna objective: build a RAG index for one configuration and score it.

    Samples an embedding model, chunking parameters, retriever ``top_k`` and a
    node parser from ``trial``, installs them on the global llama_index
    ``Settings``, indexes the module-level ``documents`` and evaluates the
    result on the module-level ``qa_pairs``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean relevance score returned by ``evaluate_rag_app``.

    Raises:
        ValueError: If a sampled categorical value has no matching branch.
    """
    # Hyperparameters to optimize. The order of suggest calls is kept stable so
    # the sampled search space matches trials already stored for this study.
    embedding_choice = trial.suggest_categorical(
        "embedding_model", ["openai", "cohere", "gemini"]
    )
    chunk_size = trial.suggest_int('chunk_size', 256, 1536, step=256)
    chunk_overlap = trial.suggest_int("chunk_overlap", 0, 200, step=50)
    top_k = trial.suggest_int("top_k", 1, 10)
    node_parser_choice = trial.suggest_categorical(
        "node_parser", ["simple", "sentence", "token", "semantic", "markdown"]
    )
    index_type = trial.suggest_categorical("index_type", ["simple", "chromadb"])

    embed_model = _build_embed_model(embedding_choice)
    node_parser = _build_node_parser(
        trial, node_parser_choice, chunk_size, chunk_overlap, embed_model
    )

    # Install the trial's components on the global llama_index Settings; the
    # query engine built in evaluate_rag_app is given no LLM of its own.
    openai_llm = OpenAI(temperature=0.0, model="gpt-4o-mini")
    Settings.embed_model = embed_model
    Settings.llm = openai_llm
    Settings.node_parser = node_parser

    # Build the index with the current hyperparameters.
    # NOTE(review): no vector store is configured anywhere in this notebook, and
    # `index_type` does not look like a documented from_documents argument -- it
    # is presumably swallowed by **kwargs, so "simple" and "chromadb" likely
    # build the same index. Kept as-is so stored trials stay comparable; confirm
    # and either wire up a Chroma vector store or drop the parameter.
    index = VectorStoreIndex.from_documents(documents, index_type=index_type)

    # Evaluate the index using the evaluation function.
    return evaluate_rag_app(index, qa_pairs, evaluation_embed_model, top_k)

In [8]:
# Name under which the study is stored.
study_name = "rag_lora_study"
# SQLite URL for the study database; the file name is derived from the study name.
storage_name = f"sqlite:///optuna_{study_name}.db"

In [9]:
# Create the study in the SQLite storage, or re-attach to the one already
# stored under this name (load_if_exists=True). Higher scores are better.
study = optuna.create_study(
    direction="maximize",
    storage=storage_name,
    study_name=study_name,
    load_if_exists=True,
)

[I 2024-10-07 22:28:46,065] A new study created in RDB with name: rag_lora_study


In [10]:
# Run 20 trials of the objective; results are persisted to the study storage.
study.optimize(func=objective, n_trials=20)

[I 2024-10-07 22:41:01,352] Trial 0 finished with value: 0.8735897514035502 and parameters: {'embedding_model': 'openai', 'chunk_size': 512, 'chunk_overlap': 50, 'top_k': 7, 'node_parser': 'sentence', 'index_type': 'simple'}. Best is trial 0 with value: 0.8735897514035502.
[I 2024-10-07 22:53:27,273] Trial 1 finished with value: 0.873839473832013 and parameters: {'embedding_model': 'openai', 'chunk_size': 768, 'chunk_overlap': 150, 'top_k': 6, 'node_parser': 'simple', 'index_type': 'simple'}. Best is trial 1 with value: 0.873839473832013.
[I 2024-10-07 23:04:43,541] Trial 2 finished with value: 0.8673895622025561 and parameters: {'embedding_model': 'gemini', 'chunk_size': 512, 'chunk_overlap': 150, 'top_k': 8, 'node_parser': 'markdown', 'index_type': 'chromadb', 'include_prev_next_rel': True}. Best is trial 1 with value: 0.873839473832013.
[I 2024-10-07 23:16:32,599] Trial 3 finished with value: 0.8727006220962842 and parameters: {'embedding_model': 'cohere', 'chunk_size': 1024, 'chunk

In [11]:
# Report the parameter set of the best-scoring trial so far.
best_params = study.best_params
print("Best hyperparameters: ", best_params)

Best hyperparameters:  {'embedding_model': 'openai', 'chunk_size': 1536, 'chunk_overlap': 200, 'top_k': 5, 'node_parser': 'simple', 'index_type': 'simple'}
