### Import modules

In [19]:
import os

from datasets import load_dataset
from langchain_openai import OpenAIEmbeddings, OpenAI
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA

from dotenv import load_dotenv

import pandas as pd

In [2]:
# Step up one directory so the relative paths used below (e.g. "data/...")
# resolve against the parent folder.
# NOTE(review): this relative chdir is not idempotent; re-running the cell
# moves up one more level each time.
os.chdir("../")

### Load env & OpenAI API Key

In [3]:
# Load variables from a local .env file (if present) into the process
# environment, then read the OpenAI key from there.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Fail fast with a clear message instead of letting a missing key surface
# later as a less obvious error from the OpenAI clients.
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "before running this notebook."
    )

#### Load the dataset from Hugging Face

In [4]:
# Load a 1000-row slice of the CNN/DailyMail validation split
# (dataset configuration "3.0.0").
dataset = load_dataset(
    "cnn_dailymail",
    "3.0.0",
    split="validation[:1000]",
)

In [5]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 1000
})

#### Save the dataset to a CSV file for faster loading

In [6]:
# Convert to a pandas DataFrame
df = pd.DataFrame(dataset)

# Make sure the target folder exists first: to_csv does not create missing
# directories and would raise if "data/" is absent.
os.makedirs("data", exist_ok=True)

# Save to a CSV file (index=False keeps the row numbers out of the file)
df.to_csv("data/cnn_dailymail_validation_subset.csv", index=False)

### Prepare documents

In [7]:
# Keep only the raw article text of each record; the 'highlights' and 'id'
# fields are not used for retrieval.
documents = [record["article"] for record in dataset]

##### Split the documents

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break each article into chunks of at most 1000 units (as measured by the
# splitter's length function), with a 200-unit overlap between neighbouring
# chunks so content cut at a boundary still appears whole in one of them.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Wrap every chunk in a Document object, ready for embedding.
split_docs = text_splitter.create_documents(documents)

#### Initialize embeddings and vectorstore

In [9]:
# Embedding client, authenticated with the key read from the environment above.
embeddings = OpenAIEmbeddings(api_key=openai_api_key)

In [10]:
# Embed every chunk and index it in a Chroma vector store.
# No persist directory is passed here.
vectorstore = Chroma.from_documents(
    documents=split_docs,
    embedding=embeddings,
)

#### Create retriever

In [11]:
# Expose the vector store as a retriever; "k": 3 asks for the three
# best-matching chunks per query.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 3},
)

In [18]:
# Initialize the language model with temperature=0.
# The API key is passed explicitly, the same way it is for OpenAIEmbeddings,
# so both clients are configured from the single `openai_api_key` value
# rather than one of them relying on the ambient environment.
llm = OpenAI(temperature=0, api_key=openai_api_key)

#### Create a RAG chain

In [22]:
# Assemble the question-answering chain from the LLM and the retriever.
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    # Also return the retrieved source documents alongside the answer.
    return_source_documents=True,
)