### Data Ingestion Project with LangChain 0.3

In [None]:
import os

import dotenv
from langchain_core.documents import Document

# Read key=value pairs from the local .env file into the process environment.
dotenv.load_dotenv(dotenv_path=".env")

import langchain

print("Langchain Version:", langchain.__version__)

# Fail fast when the key is missing *or* blank: an empty `OPENAI_API_KEY=` entry
# in .env yields "" (not None) and would otherwise only surface later as an
# authentication error on the first API call.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY or not OPENAI_API_KEY.strip():
    raise ValueError("Please set the OPENAI_API_KEY environment variable.")
print("OpenAI API Key found sucessfully")

### Using LangChain Document Loaders to load documents from various sources

In [None]:
# Build a Document by hand: the text payload plus a metadata dict describing it.
doc_metadata = {"source": "my_source", "author": "John Doe", "date": "2023-01-01"}
doc = Document(
    metadata=doc_metadata,
    page_content="This will be the text which will be stored in the document and retreived later using embeddings.",
)
# Echo the stored text to confirm the Document holds what was passed in.
print(doc.page_content)

### Import the text splitters used to split documents into smaller chunks

In [None]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter,TokenTextSplitter


### Adding sample text files

In [None]:
# Make sure the target folder exists before any file is written.
os.makedirs("data/text_files", exist_ok=True)

# Destination path -> text body for each sample file.
sample_text = {
    "data/text_files/langchainintro.txt": (
        "LangChain is a framework for developing applications powered by language models. \n"
        "It can be used for chatbots, Generative Question-Answering (GQA), summarization, and much more."
    ),
    "data/text_files/ai.txt": (
        "Artificial Intelligence (AI) refers to the simulation of human intelligence in machines "
        "that are programmed to think and learn like humans."
    ),
}

# Write every entry to disk as UTF-8, overwriting any previous copy.
for path in sample_text:
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(sample_text[path])
print("Created sample text files in data/text_files/ directory.")

### Reading the text file using TextLoader

In [None]:
from langchain.document_loaders import TextLoader

# Load one text file; the loader is told to decode it as UTF-8.
intro_path = "data/text_files/langchainintro.txt"
loader = TextLoader(intro_path, encoding="utf8")
# load() returns a list of Document objects (page_content + metadata).
data = loader.load()
print(data)


### Reading multiple files using DirectoryLoader
from langchain.document_loaders import DirectoryLoader

# DirectoryLoader itself has no `encoding` parameter, so passing it directly is
# rejected with a TypeError. The failure is caught and displayed here instead of
# aborting "Run All". Nothing is assigned to `data`, so it still holds the
# single document loaded with TextLoader, which the single-document splitting
# cell further down reads via `data[0]`.
try:
    DirectoryLoader("data/text_files", glob="*.txt", encoding="utf8")
except TypeError as err:
    print(f"DirectoryLoader rejected the 'encoding' argument: {err}")

In [None]:
from langchain.document_loaders import DirectoryLoader

# Options meant for the per-file loader (such as the text encoding) travel in
# loader_kwargs rather than as direct DirectoryLoader arguments.
text_dir = "data/text_files"
per_file_options = {"encoding": "utf8"}
loader = DirectoryLoader(text_dir, glob="*.txt", loader_kwargs=per_file_options)
documents = loader.load()

print(documents)
print(f"Number of documents loaded: {len(documents)}")
# Each Document records where it came from under metadata['source'].
first_source = documents[0].metadata['source']
print(f"Source of the first document: {first_source}")

### Now we will use the Text Splitter to split the documents into smaller chunks

In [None]:
# Split the text of the single loaded document with CharacterTextSplitter:
# break on spaces, at most 50 characters per chunk, 10 characters of overlap.
print(f'document created {data}')
textSplitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=50,
    chunk_overlap=10,
    length_function=len,
)
single_doc_text = data[0].page_content
texts = textSplitter.split_text(single_doc_text)
print(f"Number of chunks created: {len(texts)}")
print(type(texts))
# Number the chunks from 1 for display.
for chunk_number, chunk in enumerate(texts, start=1):
    print(f"Chunk {chunk_number}: {chunk}")
    print('--------------------------------')



### Now we will use CharacterTextSplitter to split the Directory documents into smaller chunks

In [None]:
print(f'documents created from directory {documents}')
# NOTE(review): with an empty separator the text appears to be cut purely by
# character count (50 per chunk, 10 overlapping) — confirm this is intended
# rather than separator=" " as used for the single-document example.
docSplitter = CharacterTextSplitter(separator="", chunk_size=50, chunk_overlap=10, length_function=len)

# split_documents takes the loaded Document list and returns one Document per
# chunk, whereas split_text works on a single string and returns plain strings.
splitdocs = docSplitter.split_documents(documents)
print(f"Number of split Document objects created: {len(splitdocs)}")
print(type(splitdocs))
# Iterate the chunks directly, numbering them from 1 for display.
for chunk_number, split_doc in enumerate(splitdocs, start=1):
    print(f"Document Chunk {chunk_number}: {split_doc.page_content}")
    print('--------------------------------')


In [None]:
# Split the directory documents with RecursiveCharacterTextSplitter: it is
# given the separators in order (paragraph break, line break, space, then
# single characters), with at most 50 characters per chunk and 10 of overlap.
recTextSplitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=50,
    chunk_overlap=10,
    length_function=len,
)
rec_texts = recTextSplitter.split_documents(documents)
print(f"Number of split Document objects created: {len(rec_texts)}")
print(type(rec_texts))
# Iterate the chunks directly, numbering them from 1 for display.
for chunk_number, split_doc in enumerate(rec_texts, start=1):
    print(f"Document Chunk {chunk_number}: {split_doc.page_content}")
    print('--------------------------------')

#### Once chunks are created, we will use embeddings to convert text into vectors

In [None]:
from langchain.embeddings import OpenAIEmbeddings

# Embedding client used to turn text chunks into vectors.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model="text-embedding-3-small")
# Show only the model name. Printing the whole object dumps its configuration
# fields, which may include the API key in plain text depending on the
# OpenAIEmbeddings implementation in use — keep that out of saved notebook output.
print(f"Embedding model: {embeddings.model}")