In [None]:
!pip install openai==1.54.5
!pip install langchain-core==0.3.19
!pip install azure-ai-openai
!pip install langchain==0.3.7
!pip install langchain-community==0.3.7
!pip install langchain-openai==0.2.9
!pip install pypdf
!pip install python-docx
!pip install networkx
!pip install openpyxl
!pip install "unstructured[excel]"

In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredExcelLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

def load_document(file_path):
    """Load a file with the loader that matches its extension.

    The extension is compared case-insensitively. PDF files go through
    ``PyPDFLoader``; ``.xlsx``/``.xls`` through ``UnstructuredExcelLoader``
    (``mode="elements"``); ``.docx``/``.doc`` through
    ``UnstructuredWordDocumentLoader`` (``mode="elements"``, ``strategy="fast"``).

    Args:
        file_path: Path of the file to load.

    Returns:
        Whatever the selected loader's ``load()`` call returns.

    Raises:
        ValueError: If the extension is not one of the supported types. The
            message reports the extension exactly as it appears in the path.
    """
    suffix = os.path.splitext(file_path)[1]
    kind = suffix.lower()  # normalise once; `suffix` keeps the original case for the error

    if kind == ".pdf":
        return PyPDFLoader(file_path).load()
    if kind in (".xlsx", ".xls"):
        return UnstructuredExcelLoader(file_path, mode="elements").load()
    if kind in (".docx", ".doc"):
        return UnstructuredWordDocumentLoader(file_path, mode="elements", strategy="fast").load()

    raise ValueError(f"Unsupported file type: {suffix}")

def split_documents(docs, chunk_size=1000, chunk_overlap=200):
    """Split loaded documents into chunks.

    Args:
        docs: Documents to split, as returned by ``load_document``.
        chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` handed to ``RecursiveCharacterTextSplitter``.

    Returns:
        The result of the splitter's ``split_documents(docs)`` call.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(docs)


# Source files to ingest; the extension decides which loader is used.
file_paths = [
    "./DataSource1-D365_BatchProcess.docx",
    "./LangChain.pdf",
    "./Ind-Japan- 2010 Population Comparison.xlsx"
]

# Process and split documents.
# `splits` is the list the vector-store cell indexes, so it has to collect the
# chunks of every file. Rebinding it inside the loop would leave only the
# chunks of the last file processed.
splits = []
for file_path in file_paths:
    try:
        docs = load_document(file_path)
        file_splits = split_documents(docs)
    except Exception as e:
        # Best effort: report the failing file and carry on with the others.
        print(f"Error processing {file_path}: {e}")
        continue
    splits.extend(file_splits)
    print(f"Successfully processed {file_path}. Total splits: {len(file_splits)}")

Successfully processed ./DataSource1-D365_BatchProcess.docx. Total splits: 75
Successfully processed ./LangChain.pdf. Total splits: 19
Successfully processed ./Ind-Japan- 2010 Population Comparison.xlsx. Total splits: 4


In [None]:
# Set up Azure OpenAI configuration
# Each value is taken from an environment variable when it is set, so real
# credentials do not have to be saved in the notebook. When the variable is
# absent the blank placeholder is kept and must be replaced by hand.
azure_openai_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT", " ")  # Provide your endpoint
azure_openai_api_version = os.environ.get("OPENAI_API_VERSION", " ")  # Provide the api-version
azure_openai_key = os.environ.get("AZURE_OPENAI_API_KEY", " ")  # Provide the key

In [None]:
from langchain_openai import AzureOpenAIEmbeddings
from langchain.vectorstores import FAISS  
import openai

# Embedding client used to vectorise the document chunks. Connection details
# come from the Azure OpenAI configuration cell; the model name is still a
# placeholder that has to be filled in.
azure_embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=azure_openai_endpoint,
    openai_api_version=azure_openai_api_version,
    openai_api_key=azure_openai_key,
    model=" ",  # Provide the model name
)

In [38]:
# FAISS lives in langchain_community; importing it from `langchain.vectorstores`
# is a deprecated path.
from langchain_community.vectorstores import FAISS

# Embed every chunk in `splits` with the Azure embedding client and index the
# resulting vectors in a FAISS store.
vectorstore = FAISS.from_documents(documents=splits, embedding=azure_embeddings)

In [None]:
# Connection settings for the Azure OpenAI chat model.
# The endpoint and key are read from environment variables when present so the
# secrets need not be stored in the notebook; otherwise the blank placeholders
# remain and must be replaced by hand.
azure_openai_Type = "azure"
azure_openai_Endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT", " ")  # Your endpoint
azure_openai_Key = os.environ.get("AZURE_OPENAI_API_KEY", " ")  # Your key

In [None]:
from langchain_openai import AzureChatOpenAI

# Initialize Azure OpenAI LLM
# The API type and API version reuse the values already declared in the
# configuration cells (`azure_openai_Type`, `azure_openai_api_version`) instead
# of separate empty strings, so each setting is filled in once.
llm = AzureChatOpenAI(
    openai_api_key=azure_openai_Key,
    azure_endpoint=azure_openai_Endpoint,
    deployment_name="",  # Your model name
    api_version=azure_openai_api_version,
    openai_api_type=azure_openai_Type,
)

In [41]:
# Wrap the vector store in a retriever interface; the chains defined in the
# following cells use it to fetch context for a question.
retriever = vectorstore.as_retriever()

In [42]:
def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        One string with the contents joined by ``"\\n\\n"``; empty when ``docs``
        is empty.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

In [47]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Prompt template for the question-answering chain.
# The system message carries the instructions followed by the {context}
# placeholder; the human message carries the {input} placeholder.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n{context}"
)
prompt_sample = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [48]:
# Pipe-composed version of the RAG pipeline: the dict supplies the prompt's
# "context" (retriever output passed through format_docs) and "input"
# (RunnablePassthrough), then the prompt is piped to the LLM and the result
# through StrOutputParser.
# NOTE(review): `rag_chain` is rebound by the create_retrieval_chain cell
# further down, and this version is never invoked in the visible notebook.
rag_chain = (
    {"context": retriever | format_docs, "input": RunnablePassthrough()}
    | prompt_sample
    | llm
    | StrOutputParser()
)

In [100]:
# Build the retrieval chain: a "stuff documents" chain that passes the
# documents to the LLM with the prompt, combined with the retriever.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt_sample)
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

# Ask a sample question and print the "answer" entry of the response.
response = rag_chain.invoke(
    {"input": "How do document loaders contribute to building indexes in LangChain?"}
)
print(response["answer"])

Document loaders in LangChain are essential for ingesting and processing data from various sources, such as files, databases, or web pages. They help convert raw documents into a structured format that can be indexed, making it easier to retrieve and query information later. By standardizing the data input, document loaders enhance the efficiency and accuracy of the indexing process within LangChain.
