### Import modules

In [1]:
import os
import pandas as pd

# Move to the project root so that the `src.*` imports and the relative `data/`
# paths used in this notebook resolve (assumes the root is the directory that
# contains `src/`, two levels above the notebook).
# Guarded so that re-running this cell does not climb two more directories.
if not os.path.isdir("src"):
    os.chdir("../../")

from datasets import load_dataset
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import HuggingFaceDatasetLoader
from langchain.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv

In [2]:
from src.rag_pipeline import chunk_by_recursive_split, RAGSystem
from src.env_loader import load_api_keys
from src.ragas.ragas_pipeline import run_ragas_evaluation
from src import display_df

### Load API keys

In [3]:
# Look up the OpenAI key by name through the project's env loader
# (src.env_loader) rather than embedding the secret in the notebook.
openai_api_key = load_api_keys("OPENAI_API_KEY")

#### Initialize embeddings

In [4]:
# Embedding model shared by the RAG system built in this notebook.
# Alternative that was tried and left disabled:
#   embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    api_key=openai_api_key,
)

#### Initialize the RAG system with an ensemble retriever (including BM25)

In [10]:
# Labels for this experiment; they are combined into the results file name
# (bm_<optimization_no>_<optimization_name>.csv) when the evaluation is saved.
optimization_no = 2
optimization_name = "ensemble_retriever_with_bm25"

In [5]:

# Configure the RAG system for this experiment: gpt-4o as the model and the
# ensemble retriever switched on.
# NOTE(review): existing_vectorstore=False presumably forces the vector store to
# be rebuilt rather than reused — confirm in src/rag_pipeline.
rag_system_ensemble = RAGSystem(
    embeddings=embeddings,
    model_name="gpt-4o",
    use_ensemble_retriever=True,
    existing_vectorstore=False,
)

In [6]:
# Prepare the RAG system for use. In the recorded run this step reported
# splitting 1000 documents into 5030 chunks.
# NOTE(review): presumably this also builds the retrievers and `rag_chain` used
# by the later cells — confirm in src/rag_pipeline.
rag_system_ensemble.initialize()

--Split 1000 documents into 5030 chunks.--


### Check the RAG system
**TODO:** Write a test that checks the RAG system is working properly (e.g., assertions on the output).

In [8]:
# Smoke-test the chain end to end with a single question before running the
# full evaluation.
question = "What event is Rory McIlroy preparing for after the WGC-Cadillac Championship?"
result = rag_system_ensemble.rag_chain.invoke(question)

# Fail fast if the chain's output loses one of the fields seen in the recorded
# run (question / answer / contexts) or comes back empty.
missing_keys = {"question", "answer", "contexts"} - set(result)
assert not missing_keys, f"rag_chain output is missing keys: {sorted(missing_keys)}"
assert str(result["answer"]).strip(), "rag_chain returned an empty answer"
assert len(result["contexts"]) > 0, "rag_chain returned no retrieved contexts"

result

{'question': 'What event is Rory McIlroy preparing for after the WGC-Cadillac Championship?',
 'answer': 'Rory McIlroy is preparing for the U.S. Masters at Augusta after the WGC-Cadillac Championship.',
 'contexts': ['(CNN)Jordan Spieth has Rory McIlroy and the world No.1 spot firmly in his sights after winning the Valspar Championship on Sunday. Spieth won a three-way play-off with a 28-foot birdie on the third extra hole to become only the fourth player since 1940 to win twice on the PGA Tour before turning 22. It is a feat that not even McIlroy mastered with Tiger Woods, Sergio Garcia and Robert Gamez the only players to have achieved that particular accolade in the past 75 years. But it is the Northern Irishman that is within Spieth\'s focus heading towards Augusta. "I like studying the game, being a historian of the game," Spieth told the PGA Tour website. "It\'s really cool to have my name go alongside those. "But right now currently what I\'m really focused on is Rory McIlroy an

## RAGAS pipeline: evaluating the `rag_chain`

In [9]:
# Run the RAGAS evaluation against the ensemble-retriever chain. Per the logged
# output, it loads the evaluation data, collects contexts and answers, and then
# evaluates locally.
# NOTE(review): the return value is assumed to be a pandas DataFrame, since
# `.to_csv` is called on it later in the notebook — confirm.
rag_results = run_ragas_evaluation(rag_system_ensemble.rag_chain)

--LOADING EVALUATION DATA--
--GETTING CONTEXT AND ANSWERS--
--EVALUATING LOCALLY--


Evaluating:   0%|          | 0/76 [00:00<?, ?it/s]

--EVALUATION COMPLETE--


In [11]:
# Save results to csv, named after the experiment number and label.
results_path = f"data/evaluation_results/bm_{optimization_no}_{optimization_name}.csv"
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(results_path), exist_ok=True)
rag_results.to_csv(results_path)