### Import modules

In [1]:
import os

from datasets import load_dataset
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import HuggingFaceDatasetLoader
from langchain.embeddings import HuggingFaceEmbeddings

from dotenv import load_dotenv

import pandas as pd

In [2]:
# Move the working directory up two levels before importing the project-local
# `src` package (presumably this is the repository root -- confirm against the
# repo layout). Relative paths used later in the notebook, such as the
# "chromadb" persist directory, are resolved against this directory.
#
# os.chdir() with a relative path is not idempotent: re-running the cell would
# climb two more levels each time. The sentinel makes the move happen only once
# per kernel session, and is set only after the chdir has succeeded.
if not globals().get("_CWD_MOVED_TO_ROOT", False):
    os.chdir("../../")
    _CWD_MOVED_TO_ROOT = True
from src.utils import load_docs_from_csv

### Load env & OpenAI API Key

In [4]:
# Pull settings from the .env file into the process environment (python-dotenv),
# then look up the OpenAI key; the value is None when the variable is not set.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

### Load the documents

In [3]:
# Load the corpus through the project helper imported from src.utils.
# NOTE(review): the helper's return type is not visible in this notebook. The value
# is later passed to RecursiveCharacterTextSplitter.create_documents, so it is
# presumably a list of article strings -- confirm against src/utils.py.
documents = load_docs_from_csv()

#### Save the dataset to a CSV file for faster loading

In [7]:
# Convert to a pandas DataFrame
# df = pd.DataFrame(dataset)

# Save to a CSV file
# df.to_csv("data/cnn_dailymail_validation_subset.csv", index=False)  # index=False to avoid saving row numbers

### Load the dataset using LangChain ([API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.hugging_face_dataset.HuggingFaceDatasetLoader.html))
- Loads the data into document objects
- load_and_split defaults to RecursiveCharacterTextSplitter
- Challenge: unable to specify which dataset split to load (i.e., train or validation)

In [8]:
# from langchain_community.document_loaders import HuggingFaceDatasetLoader
# dataset_name = "cnn_dailymail"
# page_content_column = "article"
# configuration = "3.0.0"

# loader = HuggingFaceDatasetLoader(dataset_name, page_content_column, configuration)
# data = loader.load()
# data = data[:1000]

### Split the documents

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the loaded texts into chunks of size 1000 with an overlap of 200 between
# consecutive chunks (sizes are measured by the splitter's default length function).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# create_documents wraps each resulting chunk in a Document object.
split_docs = text_splitter.create_documents(documents)

In [10]:
# Sanity check: show the text of the first chunk produced by the splitter.
print(split_docs[0].page_content)

(CNN)Share, and your gift will be multiplied. That may sound like an esoteric adage, but when Zully Broussard selflessly decided to give one of her kidneys to a stranger, her generosity paired up with big data. It resulted in six patients receiving transplants. That surprised and wowed her. "I thought I was going to help this one person who I don't know, but the fact that so many people can have a life extension, that's pretty big," Broussard told CNN affiliate KGO. She may feel guided in her generosity by a higher power. "Thanks for all the support and prayers," a comment on a Facebook page


#### Initialize embeddings and vectorstore

In [None]:
# Alternative embedding backend: a Hugging Face sentence-transformers model.
# HuggingFaceEmbeddings is already imported in the notebook's first cell, so it
# is not imported again here.
#
# The right embedding model depends on the specific NLP task; change
# `model_name` if a different model suits the data better.
#
# NOTE: the following cell also assigns `embeddings` (OpenAI). When the notebook
# is run top to bottom, that later assignment is the one the vector store uses;
# run only one of the two cells to choose a backend.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [11]:
# OpenAI embedding model, authenticated with the key read from the environment.
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    api_key=openai_api_key,
)

In [12]:
# Build the Chroma index once and reuse it on later runs.
#
# Calling Chroma.from_documents() with a persist directory on every run
# re-embeds the whole corpus and adds the same chunks again to the collection
# already stored on disk, so retrieval starts returning duplicates. When the
# persist directory already holds data, the existing index is reopened instead.
#
# NOTE: delete the "chromadb" directory to force a rebuild after changing the
# documents, the chunking parameters, or the embedding model -- an index built
# with a different embedding model is not compatible with the current one.
CHROMA_DIR = "chromadb"

if os.path.isdir(CHROMA_DIR) and os.listdir(CHROMA_DIR):
    # Reopen the persisted collection; queries are embedded with `embeddings`.
    vectorstore = Chroma(
        persist_directory=CHROMA_DIR,
        embedding_function=embeddings,
    )
else:
    # First run: embed every chunk and persist the index to disk.
    vectorstore = Chroma.from_documents(
        split_docs,
        embeddings,
        persist_directory=CHROMA_DIR,
    )

#### Create retriever

In [13]:
# Expose the vector store as a retriever that returns 5 documents per query.
retriever = vectorstore.as_retriever(
    search_kwargs=dict(k=5),
)

In [14]:
# Chat model that writes the final answer; temperature 0 requests the least
# random sampling.
llm = ChatOpenAI(
    temperature=0,
    model_name="gpt-4o",
)

#### Create a RAG chain

In [15]:
# Retrieval-augmented QA chain wiring the retriever to the chat model.
# return_source_documents=True makes the chain output include the retrieved
# documents under "source_documents", which the evaluation cells read later.
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)

#### Test the RAG chain

In [16]:
# Ask a single test question through the RAG chain.
# `result` is a mapping; later cells read its "result" (answer text) and
# "source_documents" (retrieved documents) entries.
question = "Who bit Jon Huntsman in 2011"
result = rag_chain.invoke(question)

In [1]:
# Dump the full chain output. `result` is created by the rag_chain.invoke cell;
# running this cell before that one (e.g. in a fresh kernel) raises NameError.
print(result)

NameError: name 'result' is not defined

#### Evaluate the RAG system using RAGAS

In [18]:
# Collect the raw text of every document the retriever supplied for the answer.
contexts = [
    source.page_content
    for source in result["source_documents"]
]
# formatted_context = pretty_print_docs(contexts)

In [21]:
# Show only the generated answer text from the chain output.
print(result["result"])

A goat named Izak bit Jon Huntsman in 2011.
