### Import modules

In [1]:
import os
import pandas as pd

# Move the working directory two levels up from where the kernel started
# (presumably the project root, so the `src` imports in the next cell
# resolve -- confirm against the repository layout).
# The starting directory is recorded the first time this cell runs so that
# re-executing the cell lands in the same place instead of climbing a
# further two levels on every run.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, "..", ".."))

from datasets import load_dataset
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import HuggingFaceDatasetLoader
from langchain.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv

In [2]:
from src.rag_pipeline import chunk_by_recursive_split, RAGSystem
from src.env_loader import load_api_keys

### Load API keys

In [3]:
# Look up the entry named "OPENAI_API_KEY" through the project's
# `load_api_keys` helper (src.env_loader); the value is passed to
# OpenAIEmbeddings in the next section.
openai_api_key = load_api_keys("OPENAI_API_KEY")

#### Initialize embeddings and RAG system

In [4]:
# Embedding model used for the vector store: OpenAI's text-embedding-ada-002,
# authenticated with the key loaded above.
# Alternative backend kept for reference:
#   embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    api_key=openai_api_key,
)

In [5]:
# Baseline RAG system backed by gpt-3.5-turbo.
# The two flags presumably request a freshly built (and cleared) vector store
# rather than a reused one -- confirm against src.rag_pipeline.RAGSystem.
# NOTE(review): the `embeddings` object created above is not passed here,
# whereas the ensemble-retriever configuration later in this notebook passes
# `embeddings=embeddings` -- confirm that relying on the default is intended.
rag_system = RAGSystem(
    model_name="gpt-3.5-turbo",
    existing_vectorstore=False,
    clear_store=True,
)

In [6]:
# Run the system's setup step. In the recorded run this reported splitting
# 1000 documents into 5030 chunks.
rag_system.initialize()

--Split 1000 documents into 5030 chunks.--


#### Test the RAG Chain

In [7]:
# Smoke-test query sent straight through the chain of the current system.
question = (
    "What event is Rory McIlroy preparing for "
    "after the WGC-Cadillac Championship?"
)
result = rag_system.rag_chain.invoke(question)

In [8]:
# Display the raw chain output; in the recorded run it is a dict with
# 'question', 'answer' and 'contexts' keys.
result

{'question': 'What event is Rory McIlroy preparing for after the WGC-Cadillac Championship?',
 'answer': 'Rory McIlroy is preparing for the Masters Tournament after the WGC-Cadillac Championship.',
 'contexts': ['gift was data processing of genetic profiles from donor-recipient pairs. It works on a simple swapping principle but takes it to a much higher level, according to California Pacific Medical Center in San Francisco. So high, that it is taking five surgeons, a covey of physician assistants, nurses and anesthesiologists, and more than 40 support staff to perform surgeries on 12 people. They are extracting six kidneys from donors and implanting them into six recipients. "The ages of the donors and recipients range from 26 to 70 and include three parent and child pairs, one sibling pair and one brother and sister-in-law pair," the medical center said in a statement. The chain of surgeries is to be wrapped up Friday. In late March, the medical center is planning to hold a reception 

#### Initialize the RAG system with an ensemble retriever that includes a BM25 retriever

In [9]:
# Second configuration: same chat model, but with the ensemble retriever
# switched on and the OpenAI embeddings passed in explicitly.
# This rebinds `rag_system`, so the earlier instance is no longer reachable
# under that name.
rag_system = RAGSystem(
    model_name="gpt-3.5-turbo",
    existing_vectorstore=False,
    use_ensemble_retriever=True,
    embeddings=embeddings,
)

In [10]:
# Run the setup step for the ensemble-retriever configuration. The recorded
# run again reported 1000 documents split into 5030 chunks.
rag_system.initialize()

--Split 1000 documents into 5030 chunks.--


In [7]:
# Same question as the earlier test cell, now answered by whichever system
# `rag_system` currently refers to (the ensemble-retriever one when the
# notebook is run top to bottom).
# NOTE(review): the execution counter on this cell repeats an earlier number,
# so the saved outputs may not reflect a clean top-to-bottom run -- re-run
# with Restart & Run All to confirm.
question = "What event is Rory McIlroy preparing for after the WGC-Cadillac Championship?"
result = rag_system.rag_chain.invoke(question)

In [8]:
# Display the raw chain output for comparison with the earlier run; the
# recorded dict has 'question', 'answer' and 'contexts' keys.
result

{'question': 'What event is Rory McIlroy preparing for after the WGC-Cadillac Championship?',
 'answer': 'Rory McIlroy is preparing for the U.S. Masters at Augusta.',
 'contexts': ['(CNN)Jordan Spieth has Rory McIlroy and the world No.1 spot firmly in his sights after winning the Valspar Championship on Sunday. Spieth won a three-way play-off with a 28-foot birdie on the third extra hole to become only the fourth player since 1940 to win twice on the PGA Tour before turning 22. It is a feat that not even McIlroy mastered with Tiger Woods, Sergio Garcia and Robert Gamez the only players to have achieved that particular accolade in the past 75 years. But it is the Northern Irishman that is within Spieth\'s focus heading towards Augusta. "I like studying the game, being a historian of the game," Spieth told the PGA Tour website. "It\'s really cool to have my name go alongside those. "But right now currently what I\'m really focused on is Rory McIlroy and the No.1 in the world. That\'s who

## RAGAS pipeline: evaluating the `rag_chain`

In [7]:
from src.ragas.ragas_pipeline import run_ragas_evaluation
from src import display_df

In [8]:
# Run the project's RAGAS evaluation helper against the current chain.
# NOTE(review): when the notebook is run top to bottom, `rag_system` is the
# ensemble-retriever instance at this point, yet the result is stored as
# `basic_rag_results`. The execution counters suggest this cell was originally
# run before the ensemble system was created -- confirm which chain is meant
# to be evaluated and name the result accordingly.
basic_rag_results = run_ragas_evaluation(rag_system.rag_chain)

--LOADING EVALUATION DATA--
--GETTING CONTEXT AND ANSWERS--
--EVALUATING LOCALLY--
Dataset({
    features: ['question', 'contexts', 'answer', 'ground_truth'],
    num_rows: 1
})


Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

No statements were generated from the answer.
  value = np.nanmean(self.scores[cn])


--EVALUATION COMPLETE--
