### Import modules

In [45]:
import os

from datasets import load_dataset
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter

from dotenv import load_dotenv

import pandas as pd

In [2]:
# Work from the parent directory so that relative paths such as "data/..." resolve there.
# The launch directory is captured only the first time this cell runs, so re-executing
# the cell lands in the same place instead of climbing one more level on every run.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

### Load environment variables & OpenAI API key

In [3]:
# Load variables from a .env file into the process environment.
load_dotenv()

# Read the key once and fail fast with a clear message: a missing key would otherwise
# only surface later as a less obvious error when the OpenAI clients are created.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it before running this notebook."
    )

#### Load the dataset from Hugging Face

In [4]:
# Load only the first 1,000 rows of the validation split of CNN/DailyMail (config "3.0.0").
dataset = load_dataset(
    path="cnn_dailymail",
    name="3.0.0",
    split="validation[:1000]",
)

In [5]:
# Show the Dataset summary (feature names and row count) as the cell output.
dataset

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 1000
})

#### Save the dataset to a CSV file for faster loading

In [6]:
# Make sure the target folder exists; to_csv does not create missing directories.
os.makedirs("data", exist_ok=True)

# Convert the loaded dataset to a pandas DataFrame (one row per article).
df = pd.DataFrame(dataset)

# Save to a CSV file; index=False keeps the DataFrame row numbers out of the file.
df.to_csv("data/cnn_dailymail_validation_subset.csv", index=False)

### Prepare documents

In [7]:
# Keep only the article text of every record; these raw strings are what gets split below.
documents = [
    record["article"]
    for record in dataset
]

##### Split the documents

In [8]:
# Chunking configuration: chunks of up to 1000 units with a 200-unit overlap between
# neighbouring chunks (units are characters under the splitter's default length
# function — TODO confirm if a custom length_function is ever introduced).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

# Split every article and wrap each chunk in a Document (text lives in .page_content).
split_docs = text_splitter.create_documents(documents)

In [28]:
# Sanity check: print the text of the first chunk produced by the splitter.
print(split_docs[0].page_content)

(CNN)Share, and your gift will be multiplied. That may sound like an esoteric adage, but when Zully Broussard selflessly decided to give one of her kidneys to a stranger, her generosity paired up with big data. It resulted in six patients receiving transplants. That surprised and wowed her. "I thought I was going to help this one person who I don't know, but the fact that so many people can have a life extension, that's pretty big," Broussard told CNN affiliate KGO. She may feel guided in her generosity by a higher power. "Thanks for all the support and prayers," a comment on a Facebook page in her name read. "I know this entire journey is much bigger than all of us. I also know I'm just the messenger." CNN cannot verify the authenticity of the page. But the power that multiplied Broussard's gift was data processing of genetic profiles from donor-recipient pairs. It works on a simple swapping principle but takes it to a much higher level, according to California Pacific Medical Center


#### Initialize embeddings and vectorstore

In [9]:
# Embedding client for the chunks; the API key read from the environment is passed explicitly.
embeddings = OpenAIEmbeddings(
    api_key=openai_api_key,
)

In [10]:
# Embed every chunk and index it in a Chroma vector store.
# No persist directory is given here, so nothing in this cell asks Chroma to save to disk.
vectorstore = Chroma.from_documents(
    documents=split_docs,
    embedding=embeddings,
)

#### Create retriever

In [11]:
# Expose the vector store as a retriever; k=3 asks for three chunks per query.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 3},
)

In [48]:
# Initialize the chat model that writes the final answer.
# temperature=0 requests the least random sampling. The API key is passed explicitly,
# matching how the embeddings client is configured, instead of relying on an implicit
# environment lookup inside the client.
llm = ChatOpenAI(model_name="gpt-4o", temperature=0, api_key=openai_api_key)

#### Create a RAG chain

In [49]:
# Build the retrieval-augmented QA chain from the model and the retriever.
# return_source_documents=True makes the result include the retrieved chunks
# alongside the answer, which is useful for checking what the answer was based on.
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)

#### Test the RAG chain

In [50]:
# Ask about a fact that appears in the first indexed chunk.
question = "What act of generosity did Zully Broussard perform?"

# The chain takes its question under the "query" key and returns a dict.
result = rag_chain.invoke({"query": question})

In [51]:
# Dump the raw result dict: the query, the generated answer under 'result', and the
# retrieved chunks under 'source_documents'.
# NOTE(review): in the recorded output the model says the context has no information
# about Zully Broussard, and the first source document shown comes from an unrelated
# article, even though the first chunk printed earlier contains the answer. Retrieval
# appears to be missing the relevant chunk — investigate (possibly stale kernel or
# vector-store state given the out-of-order execution counts; cause not confirmed).
print(result)

{'query': 'What act of generosity did Zully Broussard perform?', 'result': "I'm sorry, but the provided context does not contain any information about an act of generosity performed by Zully Broussard.", 'source_documents': [Document(page_content='wrote a blank check for Cruz and pulled out his wallet, only to throw it on the ground at Cruz\'s feet. "For those who believe in miracles, this gentleman just threw his wallet at a politician," Cruz said, getting a big laugh from the crowd. "And he actually got it back." Cruz could not accept the money because he\'s not yet a candidate. One man, Bill Higgins, stood outside in the snow for four hours hoping Cruz would meet his goat named Izak, an apparent fixture on the campaign trail which was wearing a hat that said "I voted" and who happened to bite Jon Huntsman in 2011. Cruz waved hello at Higgins as the senator exited the building Sunday afternoon but didn\'t stop to greet the goat. "He\'ll be back again," Higgins said, trying to appear 