# Integrating document loaders

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import HNLoader

from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredHTMLLoader

import os

from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

from langchain_openai import OpenAI
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain

# Read the OpenAI API key from the environment (None when the variable is unset)
openai_api_key = os.environ.get('OPENAI_API_KEY')

## PDF document loaders

In [None]:
# Path of the PDF to ingest (attention-is-all-you-need.pdf)
pdf_path = "documents/attention-is-all-you-need.pdf"

# Build the PDF loader and read the file into `data`
loader = PyPDFLoader(pdf_path)
data = loader.load()

# Show the first loaded document
first_document = data[0]
print(first_document)

## CSV document loaders

In [None]:
# Path of the CSV file to ingest (fifa_countries_audience.csv)
csv_path = 'documents/fifa_countries_audience.csv'

# Build the CSV loader and read the file into `data`
loader = CSVLoader(file_path=csv_path)
data = loader.load()

# Show the first loaded document
first_document = data[0]
print(first_document)

## Third-party document loaders

In [None]:
# Hacker News front page used as the loader's source
hn_url = "https://news.ycombinator.com"

# Build the Hacker News loader and fetch the documents into `data`
loader = HNLoader(hn_url)
data = loader.load()

# Display the first document, followed by its metadata on its own
first_document = data[0]
print(first_document)
print(first_document.metadata)

# Splitting external data for retrieval

## Splitting by character

In [None]:
quote = 'One machine can do the work of fifty ordinary humans. No machine can do the work of one extraordinary human.'
chunk_size = 24
chunk_overlap = 3

# Create an instance of the splitter class.
# CharacterTextSplitter only cuts the text where `separator` occurs. With the
# library's default separator ("\n\n") this single-line quote contains no split
# point and would come back as one unsplit chunk, so split on the
# sentence-ending period instead.
# NOTE: each sentence is longer than chunk_size, so the resulting chunks still
# exceed 24 characters; this splitter does not cut inside a piece that
# contains no separator.
splitter = CharacterTextSplitter(
    separator='.',
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap)

# Split the quote and print the chunks
docs = splitter.split_text(quote)
print(docs)

## Recursively splitting by character

In [None]:
# Sample text: three lines of lyrics separated by newline characters
quote = 'Words are flowing out like endless rain into a paper cup,\nthey slither while they pass,\nthey slip away across the universe.'

# Chunking parameters
chunk_size, chunk_overlap = 24, 10

# Configure the recursive splitter with the size and overlap chosen above
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

# Break the quote into chunks and display them
docs = splitter.split_text(quote)
print(docs)

## Splitting HTML

In [None]:
# Read the executive-order HTML file into memory
html_path = "documents/white_house_executive_order_nov_2023.html"
loader = UnstructuredHTMLLoader(html_path)
data = loader.load()

# Chunking parameters
chunk_size, chunk_overlap = 300, 100

# Build a recursive splitter whose only separator is '.'
splitter = RecursiveCharacterTextSplitter(
    separators=['.'],
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

# Split the loaded documents and print the resulting chunks
docs = splitter.split_documents(data)
print(docs)

# RAG storage and retrieval using vector databases

## Preparing the documents and vector database

In [None]:
# Load the PDF that will be chunked
loader = PyPDFLoader('documents/attention-is-all-you-need.pdf')
data = loader.load()

# Chunking parameters
chunk_size, chunk_overlap = 200, 50

# Split the loaded PDF documents with RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)
docs = splitter.split_documents(data)

# OpenAI embeddings model, configured with the API key read earlier
embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Chroma vector DB that uses the embedding model above and the 'chroma/'
# directory for persistence; persist() is then called on it.
# NOTE(review): `docs` is not added to `vectordb` in this cell -- confirm
# whether the store is meant to be populated here.
vectordb = Chroma(
    embedding_function=embedding_model,
    persist_directory='chroma/',
)
vectordb.persist()

## Storing and retrieving documents

In [None]:
# Embed the documents and store them in a Chroma DB
embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)
docstorage = Chroma.from_documents(docs, embedding_model)

# Define the Retrieval QA chain to integrate the database and LLM:
# the retriever fetches chunks from the Chroma store and the "stuff"
# chain type is used to hand them to the LLM.
qa = RetrievalQA.from_chain_type(
    OpenAI(
        model_name="gpt-3.5-turbo-instruct",
        temperature=0,
        openai_api_key=openai_api_key,
    ),
    chain_type="stuff",
    retriever=docstorage.as_retriever(),
)

# Run the chain on the query provided.
# Chain.run() is deprecated in LangChain 0.1+; invoke() takes the chain's
# "query" input and returns a dict whose "result" entry holds the answer
# text, so the printed output is still just the answer string.
query = "What is the primary architecture presented in the document?"
print(qa.invoke({"query": query})["result"])

## RAG with sources

In [None]:
# Define the retrieval QA chain that also reports the sources it used
qa = RetrievalQAWithSourcesChain.from_chain_type(
    OpenAI(
        model_name="gpt-3.5-turbo-instruct",
        temperature=0,
        openai_api_key=openai_api_key,
    ),
    chain_type="stuff",
    retriever=docstorage.as_retriever(),
)

# Run the query on the documents.
# Calling the chain object directly (Chain.__call__) is deprecated in
# LangChain 0.1+; invoke() accepts the same input dict, and
# return_only_outputs=True keeps the inputs out of the returned dict.
results = qa.invoke(
    {"question": "What is the primary architecture presented in the document?"},
    return_only_outputs=True,
)
print(results)