### Import modules

In [1]:
import os
import pandas as pd

# Run from the project root so the `src.*` imports below and relative paths such
# as "data/..." resolve (assumes this notebook lives two levels below the root).
# Guarded on the presence of `src/` so that re-running this cell in the same
# kernel does not climb two more directory levels.
if not os.path.isdir("src"):
    os.chdir("../../")

from datasets import load_dataset
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import HuggingFaceDatasetLoader
from langchain.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv

In [2]:
from src.rag_pipeline import chunk_by_recursive_split, RAGSystem
from src.env_loader import load_api_keys
from src.ragas.ragas_pipeline import run_ragas_evaluation
from src import display_df

### Load API keys

In [3]:
# Fetch the OpenAI key through the project helper instead of hard-coding it.
# NOTE(review): presumably read from the environment / a .env file — confirm in src.env_loader.
openai_api_key = load_api_keys("OPENAI_API_KEY")

#### Initialize embeddings and RAG system

In [4]:
# Alternative (disabled): HuggingFace sentence-transformers embeddings.
# embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
# Active choice: OpenAI embeddings, authenticated with the key loaded above.
embeddings = OpenAIEmbeddings(api_key=openai_api_key, model='text-embedding-ada-002')

In [5]:
# Baseline RAG system: gpt-4o for generation and the OpenAI embeddings defined above.
# NOTE(review): existing_vectorstore=False / clear_store=True presumably force the
# vector store to be rebuilt from scratch — confirm in src.rag_pipeline.
rag_system = RAGSystem(
    model_name="gpt-4o",
    existing_vectorstore=False,
    embeddings=embeddings,
    clear_store=True,
)

In [6]:
# Build the baseline pipeline; the run recorded below logged 1000 documents split into 5030 chunks.
rag_system.initialize()

--Split 1000 documents into 5030 chunks.--


#### Test the RAG Chain

In [7]:
# Smoke-test the baseline chain with a single question before running the full evaluation.
question = "What event is Rory McIlroy preparing for after the WGC-Cadillac Championship?"
result = rag_system.rag_chain.invoke(question)

In [8]:
# Inspect the chain output: a dict with 'question', 'answer' and 'contexts' keys.
result

{'question': 'What event is Rory McIlroy preparing for after the WGC-Cadillac Championship?',
 'answer': "Rory McIlroy is preparing for Arnold Palmer's tournament, which takes place from March 19-22.",
 'contexts': ['(CNN)Jordan Spieth has Rory McIlroy and the world No.1 spot firmly in his sights after winning the Valspar Championship on Sunday. Spieth won a three-way play-off with a 28-foot birdie on the third extra hole to become only the fourth player since 1940 to win twice on the PGA Tour before turning 22. It is a feat that not even McIlroy mastered with Tiger Woods, Sergio Garcia and Robert Gamez the only players to have achieved that particular accolade in the past 75 years. But it is the Northern Irishman that is within Spieth\'s focus heading towards Augusta. "I like studying the game, being a historian of the game," Spieth told the PGA Tour website. "It\'s really cool to have my name go alongside those. "But right now currently what I\'m really focused on is Rory McIlroy and

#### Initialize the RAG system with an ensemble retriever that includes a BM25 retriever

In [12]:
# Variant under test: same model and embeddings as the baseline system, with the
# ensemble retriever switched on.
# NOTE(review): unlike the baseline, clear_store is not passed here — confirm that
# relying on RAGSystem's default is intended.
rag_system_ensemble = RAGSystem(
    model_name="gpt-4o",
    existing_vectorstore=False,
    use_ensemble_retriever=True,
    embeddings=embeddings,
)

In [13]:
# Build the ensemble pipeline; the run recorded below logged the same 1000 documents / 5030 chunks.
rag_system_ensemble.initialize()

--Split 1000 documents into 5030 chunks.--


In [14]:
# Ask the same question as the baseline test, this time through the ensemble chain.
# Note: this rebinds `question` and `result`, so the baseline answer is no longer
# available under `result` after this cell runs.
question = "What event is Rory McIlroy preparing for after the WGC-Cadillac Championship?"
result = rag_system_ensemble.rag_chain.invoke(question)

In [15]:
# Inspect the ensemble chain output (same 'question' / 'answer' / 'contexts' keys).
result

{'question': 'What event is Rory McIlroy preparing for after the WGC-Cadillac Championship?',
 'answer': 'Rory McIlroy is preparing for the U.S. Masters at Augusta after the WGC-Cadillac Championship.',
 'contexts': ['(CNN)Jordan Spieth has Rory McIlroy and the world No.1 spot firmly in his sights after winning the Valspar Championship on Sunday. Spieth won a three-way play-off with a 28-foot birdie on the third extra hole to become only the fourth player since 1940 to win twice on the PGA Tour before turning 22. It is a feat that not even McIlroy mastered with Tiger Woods, Sergio Garcia and Robert Gamez the only players to have achieved that particular accolade in the past 75 years. But it is the Northern Irishman that is within Spieth\'s focus heading towards Augusta. "I like studying the game, being a historian of the game," Spieth told the PGA Tour website. "It\'s really cool to have my name go alongside those. "But right now currently what I\'m really focused on is Rory McIlroy an

## RAGAS pipeline: evaluating the RAG chains

In [8]:
# Run the RAGAS evaluation pipeline against the baseline chain.
basic_rag_results = run_ragas_evaluation(rag_system.rag_chain)

--LOADING EVALUATION DATA--
--GETTING CONTEXT AND ANSWERS--
--EVALUATING LOCALLY--


Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

--EVALUATION COMPLETE--


In [9]:
# Show the per-question scores for the baseline chain (answer_correctness,
# faithfulness, answer_relevancy, context_precision).
basic_rag_results

Unnamed: 0,question,contexts,answer,ground_truth,answer_correctness,faithfulness,answer_relevancy,context_precision
0,What upcoming animated project will feature Ad...,[(The Hollywood Reporter)The skies over Gotham...,The upcoming animated project that will featur...,Adam West and Burt Ward will be reprising thei...,0.745731,0.666667,0.0,0.916667
1,What animated project did Adam West and Burt W...,[(The Hollywood Reporter)The skies over Gotham...,Adam West and Burt Ward announced an upcoming ...,Adam West and Burt Ward announced a new animat...,0.882117,0.666667,0.0,1.0
2,What event is Rory McIlroy preparing for after...,[(CNN)Jordan Spieth has Rory McIlroy and the w...,Rory McIlroy is preparing for Arnold Palmer's ...,Rory McIlroy is preparing for the U.S. Masters...,0.233292,0.0,0.867013,0.916667
3,How did Donald Trump help Rory McIlroy retriev...,[(CNN)With a little bit of help from Donald Tr...,Donald Trump got a scuba diver to retrieve Ror...,Donald Trump helped Rory McIlroy retrieve his ...,0.453221,1.0,0.907409,1.0
4,What caused the collapse of the Iraqi army dur...,"[militia into army units. At the same time, th...",The collapse of the Iraqi army during the ISIS...,The collapse of the Iraqi army during the ISIS...,0.577018,1.0,1.0,1.0


In [16]:
# Run the same RAGAS evaluation against the ensemble-retriever chain for comparison.
rag_ensemble_results = run_ragas_evaluation(rag_system_ensemble.rag_chain)

--LOADING EVALUATION DATA--
--GETTING CONTEXT AND ANSWERS--
--EVALUATING LOCALLY--


Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

--EVALUATION COMPLETE--


In [17]:
# Persist the ensemble evaluation so it can be compared later without re-running RAGAS.
# Create the output directory first: to_csv fails if the parent directory does not
# exist, which would lose the evaluation just computed.
# The path is relative to the working directory set in the import cell.
os.makedirs("data/results", exist_ok=True)
rag_ensemble_results.to_csv("data/results/rag_ensemble_results.csv")

In [20]:
# Show the per-question scores for the ensemble-retriever chain.
rag_ensemble_results

Unnamed: 0,question,contexts,answer,ground_truth,answer_correctness,faithfulness,answer_relevancy,context_precision
0,What upcoming animated project will feature Ad...,[(The Hollywood Reporter)The skies over Gotham...,An upcoming animated project in 2016 will feat...,Adam West and Burt Ward will be reprising thei...,0.669585,1.0,0.0,1.0
1,What animated project did Adam West and Burt W...,[(The Hollywood Reporter)The skies over Gotham...,Adam West and Burt Ward announced an upcoming ...,Adam West and Burt Ward announced a new animat...,0.98926,0.666667,0.0,0.833333
2,What event is Rory McIlroy preparing for after...,[(CNN)Jordan Spieth has Rory McIlroy and the w...,Rory McIlroy is preparing for the U.S. Masters...,Rory McIlroy is preparing for the U.S. Masters...,0.743055,0.5,0.98415,0.8875
3,How did Donald Trump help Rory McIlroy retriev...,[(CNN)With a little bit of help from Donald Tr...,Donald Trump got a scuba diver to retrieve Ror...,Donald Trump helped Rory McIlroy retrieve his ...,0.488935,1.0,0.888655,1.0
4,What caused the collapse of the Iraqi army dur...,"[militia into army units. At the same time, th...",The collapse of the Iraqi army during the ISIS...,The collapse of the Iraqi army during the ISIS...,0.528901,1.0,0.983234,1.0


In [9]:
# Re-run the RAGAS evaluation on the baseline chain.
# NOTE(review): this statement also appears earlier in the notebook; running it again
# rebinds `basic_rag_results`, so the earlier result is no longer reachable under this
# name. Consider a distinct name or removing the duplicate cell.
basic_rag_results = run_ragas_evaluation(rag_system.rag_chain)

--LOADING EVALUATION DATA--
--GETTING CONTEXT AND ANSWERS--
--EVALUATING LOCALLY--


Evaluating:   0%|          | 0/76 [00:00<?, ?it/s]

--EVALUATION COMPLETE--


In [10]:
# Show the per-question scores from the baseline re-run.
basic_rag_results

Unnamed: 0,question,contexts,answer,ground_truth,answer_correctness,faithfulness,answer_relevancy,context_precision
0,What is the significance of cherry trees in Wa...,[Washington (CNN)There should be plenty of pen...,"The cherry trees in Washington, D.C., are sign...",Cherry trees hold great significance in Washin...,0.243916,0.833333,0.983714,1.0
1,What is one of the events that Hillary Clinton...,"[(CNN)After a handful events in two months, Hi...",Hillary Clinton will be participating in a pai...,Hillary Clinton will be participating in the A...,0.998523,1.0,0.944543,0.95
2,What role did the United States-led coalition ...,[(CNN)In his first interview since Islamic Sta...,The United States-led coalition conducted airs...,The United States-led coalition played a role ...,0.866016,0.8,0.887886,1.0
3,What can viewers expect from the season finale...,[(CNN)Have Rick and his fellow survivors final...,Viewers can expect a 90-minute season finale f...,"The season finale of ""The Walking Dead"" is exp...",0.223431,1.0,0.973545,0.916667
4,What is the significance of the Tunisian jihad...,"[Museum. Protesters held banners that said ""We...",The Tunisian jihadist's significance lies in h...,The significance of the Tunisian jihadist in r...,0.673266,1.0,0.901839,1.0
5,How did the undercover FBI informant play a ro...,[(CNN)An Army National Guard member and his co...,The undercover FBI informant played a crucial ...,The undercover FBI informant played a role in ...,0.829241,0.714286,0.852989,1.0
6,How does the new Red brand aim to leverage Rad...,"[Carlson Rezidor, which is hueing (sic) toward...",The new Red brand aims to leverage Radisson's ...,The new Red brand aims to leverage Radisson's ...,0.848405,1.0,0.915066,0.916667
7,When are the peak blooms expected during the N...,[Washington (CNN)There should be plenty of pen...,The peak blooms are expected between April 11 ...,The peak blooms are expected between April 11 ...,0.25,1.0,0.92351,1.0
8,What is Zhanna Nemtsova's opinion on the Russi...,"[of the case except what is in the media. ""I d...",Zhanna Nemtsova has no faith in the Russian in...,Zhanna Nemtsova said she has no faith in the R...,0.73315,1.0,0.935731,1.0
9,Who was one of Putin's harshest critics?,[be heading an opposition party and do what I'...,One of Putin's harshest critics was Boris Nemt...,Boris Nemtsov,0.229624,1.0,1.0,1.0


In [12]:
# Persist the baseline evaluation as the benchmark-1 reference file.
# Create the output directory first: to_csv fails if the parent directory does not
# exist, which would lose the evaluation just computed.
# The path is relative to the working directory set in the import cell.
os.makedirs("data/evaluation_results", exist_ok=True)
basic_rag_results.to_csv("data/evaluation_results/bm_1_baseline.csv")