### RAG Fundamentals - Document Loaders

In [None]:
import os

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI


In [None]:
# Load environment variables through python-dotenv before reading the key.
load_dotenv()

openai_api_key = os.environ.get("OPENAI_API_KEY")
# Fail fast when the key is missing or empty, rather than at the first API call.
if openai_api_key is None or openai_api_key == "":
    raise ValueError("Please set the OPENAI_API_KEY environment variable.")

# Chat model settings, kept in named variables so later cells can reuse them.
model_name, temperature = "gpt-4o", 0.0

llm = ChatOpenAI(model=model_name, temperature=temperature, openai_api_key=openai_api_key)

In [None]:
# NOTE(review): newer LangChain releases appear to expose loaders from
# `langchain_community.document_loaders` — confirm against the installed version.
from langchain.document_loaders import UnstructuredFileLoader

# Path of the text file to ingest, relative to the notebook's working directory.
source_path = "./when.txt"
loader = UnstructuredFileLoader(source_path)

# `data` holds the result of the loader's `load()` call; later cells index it
# and read `.metadata` / `.page_content` from its elements.
data = loader.load()

In [None]:
# Show how many entries the loaded `data` sequence contains.
document_count = len(data)
print(document_count)

In [None]:
# Inspect the first loaded entry: its metadata, then a preview of its text.
first_document = data[0]
print(first_document.metadata)

# Limit the preview to the leading 1000 characters of the content.
content_preview = first_document.page_content[:1000]
print(content_preview)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded content into chunks of at most 1000 units with a 200-unit
# overlap; `length_function=len` means the units are characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

# Pass each document's own metadata alongside its text so that every chunk
# carries the metadata of the document it was cut from. Assigning
# `data[0].metadata` to all chunks would mislabel chunks from any other
# document when `data` holds more than one entry.
texts = text_splitter.create_documents(
    [doc.page_content for doc in data],
    metadatas=[doc.metadata for doc in data],
)
for idx, text in enumerate(texts, start=1):
    # Build a fresh dict per chunk so no metadata dict is shared between
    # chunks or with the source documents; chunk ids are 1-based.
    text.metadata = {**text.metadata, "chunk_id": idx}
    print(f"Chunk ID: {text.metadata['chunk_id']}, Length: {len(text.page_content)}")

print(texts)
print(len(texts))

# Only inspect the first chunk when splitting produced at least one;
# indexing an empty list would raise IndexError.
if texts:
    print(texts[0].metadata)

    # Print the first 1000 characters of the content
    print(texts[0].page_content[:1000])
