In [3]:
import os
from dotenv import load_dotenv
load_dotenv()

# Set up API keys
openai_api_key = os.getenv("OPENAI_API_KEY")
if openai_api_key:
    os.environ["OPENAI_API_KEY"] = openai_api_key
    print("✓ OpenAI API key loaded successfully")
else:
    print("⚠️ No OpenAI API key found")

# Set up LangSmith tracking (optional)
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
if langchain_api_key:
    os.environ["LANGCHAIN_API_KEY"] = langchain_api_key
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    langchain_project = os.getenv("LANGCHAIN_PROJECT")
    if langchain_project:
        os.environ["LANGCHAIN_PROJECT"] = langchain_project
    print("✓ LangSmith tracking configured")
else:
    print("ℹ No LangSmith tracking configured")

# Set USER_AGENT to avoid warnings when scraping websites
os.environ["USER_AGENT"] = "LangChain-WebScraper/1.0 (Educational Purpose)"
print("✓ User agent configured for web scraping")

✓ OpenAI API key loaded successfully
✓ LangSmith tracking configured
✓ User agent configured for web scraping


In [4]:
# DATA INGESTION -- Scrape data from the LangChain website
from langchain_community.document_loaders import WebBaseLoader

print("🌐 Loading data from LangChain website...")
try:
    loader = WebBaseLoader("https://langchain.com/")
    data = loader.load()
    print(f"✓ Successfully loaded {len(data)} document(s)")
    print(f"  Document length: {len(data[0].page_content)} characters")
    print(f"  Source: {data[0].metadata.get('source', 'Unknown')}")
except Exception as e:
    print(f"✗ Error loading data: {e}")
    data = None


🌐 Loading data from LangChain website...
✓ Successfully loaded 1 document(s)
  Document length: 5402 characters
  Source: https://langchain.com/


In [None]:
# TEXT SPLITTING -- Split the document into smaller chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter

if data:
    print("📄 Splitting document into chunks...")
    try:
        # Create text splitter
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,      # Size of each chunk
            chunk_overlap=200,    # Overlap between chunks
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        
        # Split the documents
        chunks = text_splitter.split_documents(data)
        print(f"✓ Successfully split into {len(chunks)} chunks")
        print(f"  Average chunk size: {sum(len(chunk.page_content) for chunk in chunks) // len(chunks)} characters")
        
        # Show first chunk as example
        print(f"\n First chunk preview:")
        print(f"  Length: {len(chunks[0].page_content)} characters")
        print(f"  Content: {chunks[0].page_content[:200]}...")
        
    except Exception as e:
        print(f" Error splitting documents: {e}")
        chunks = None
else:
    print(" No data available for splitting")
    chunks = None

📄 Splitting document into chunks...
✓ Successfully split into 8 chunks
  Average chunk size: 763 characters

📝 First chunk preview:
  Length: 750 characters
  Content: LangChain




















Products

FrameworksLangGraphLangChainPlatformsLangSmithLangGraph PlatformResources

GuidesBlogCustomer StoriesLangChain AcademyCommunityEventsChangelogDocs

PythonLangGr...


In [10]:
chunks

[Document(metadata={'source': 'https://langchain.com/', 'title': 'LangChain', 'description': 'LangChain’s suite of products supports developers along each step of their development journey.', 'language': 'en'}, page_content='LangChain\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nProducts\n\nFrameworksLangGraphLangChainPlatformsLangSmithLangGraph PlatformResources\n\nGuidesBlogCustomer StoriesLangChain AcademyCommunityEventsChangelogDocs\n\nPythonLangGraphLangSmithLangChainJavaScriptLangGraphLangSmithLangChainCompany\n\nAboutCareersPricingGet a demoSign up\n\n\n\n\n\n\n\n\n\n\n\n\nProducts\n\nFrameworksLangGraphLangChainPlatformsLangSmithLangGraph PlatformResources\n\nGuidesBlogCustomer StoriesLangChain AcademyCommunityEventsChangelogDocs\n\nPythonLangGraphLangSmithLangChainJavaScriptLangGraphLangSmithLangChainCompany\n\nAboutCareersPricingGet a demoSign upThe platform for reliable agents. Tools for every step of the agent development lifecycle -- built to unlock powerful AI\xa0in productio

In [12]:
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")


In [13]:
from  langchain_community.vectorstores import FAISS
vector_store_db = FAISS.from_documents(chunks, embeddings)


In [14]:
vector_store_db

<langchain_community.vectorstores.faiss.FAISS at 0x2717ce62630>

RETRIEVERS & CHAINS

In [15]:
query="Langsmith has 2 usage limits: total traces and extended"
result = vector_store_db.similarity_search(query, k=2)

In [16]:
result[0].page_content

"LangChain products are designed to be used independently or stack for multiplicative benefit. LangChainLangGraphFrameworksLangSmithLangGraph PlatformPlatformsFrameworksLangChainLangGraphPlatformsLangSmithLangGraph \u2028PlatformSTACK 1:\xa0LangGraph +\xa0LangChain +\xa0LangSmith +\xa0LangGraph\xa0PlatformA full product suite for reliable agents and LLM appsLangChain's products work seamlessly together to provide an integrated solution for every step of the application development journey. When you use all LangChain products, you'll build better, get to production quicker, and grow visibility -- all with less set up and friction. LangChain provides the smoothest path to high quality agents.Orchestration:Integrations:Evals + Observability:Deployment:STACK 2: No framework +\xa0LangSmithTrace\xa0and evaluate any LLM appLangSmith is framework-agnostic. Trace using the TypeScript or Python SDK\xa0to gain visibility into your agent interactions -- whether you use LangChain's frameworks or no

In [18]:
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

In [19]:
### retrieval chain, Documents chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
prompt = ChatPromptTemplate.from_template(
 """
 Answer the question based on the context provided.
 <context>
 {context}
 </context>
 """
)

document_chain = create_stuff_documents_chain(llm, prompt)
document_chain



RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\n Answer the question based on the context provided.\n <context>\n {context}\n </context>\n '), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x000002717FEB19D0>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x000002717D024350>, root_client=<openai.OpenAI object at 0x0000027169A4D7F0>, root_async_client=<openai.AsyncOpenAI object at 0x000002717FEB0320>, temperature=0.0, model_kwargs={}, openai_api_key=SecretStr('**********'))
| StrOutputParser(), kwargs={}, config={'run_name': 'st

In [21]:
from langchain_core.documents import Document
document_chain.invoke({
    "input": "langcmith has 2 usage limits: total traces and extended",
    "context": [Document(page_content="Langsmith has 2 usage limits: total traces and extended")]
})

'The question is not provided.'

In [22]:
## Input --> Retriever --> vectorstoredb
vector_store_db

<langchain_community.vectorstores.faiss.FAISS at 0x2717ce62630>

In [23]:
vector_store_db.as_retriever()

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000002717CE62630>, search_kwargs={})

In [25]:
from langchain.chains import create_retrieval_chain
retrieval_chain = create_retrieval_chain(
    retriever=vector_store_db.as_retriever(),
    combine_docs_chain=document_chain
)
retrieval_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000002717CE62630>, search_kwargs={}), kwargs={}, config={'run_name': 'retrieve_documents'}, config_factories=[])
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
            | ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\n Answer the question based on the context provided.\n <context>\n {context}\n </context>\n '), additional_kwargs={})])
            | ChatOpenAI(cli

In [27]:
# get the response from the LLM
response = retrieval_chain.invoke({"input": "langcmith has 2 usage limits: total traces and extended"})
response


{'input': 'langcmith has 2 usage limits: total traces and extended',
 'context': [Document(id='ba6df6ac-61a5-41f8-8a9a-55eb532382a0', metadata={'source': 'https://langchain.com/', 'title': 'LangChain', 'description': 'LangChain’s suite of products supports developers along each step of their development journey.', 'language': 'en'}, page_content="LangChain products are designed to be used independently or stack for multiplicative benefit. LangChainLangGraphFrameworksLangSmithLangGraph PlatformPlatformsFrameworksLangChainLangGraphPlatformsLangSmithLangGraph \u2028PlatformSTACK 1:\xa0LangGraph +\xa0LangChain +\xa0LangSmith +\xa0LangGraph\xa0PlatformA full product suite for reliable agents and LLM appsLangChain's products work seamlessly together to provide an integrated solution for every step of the application development journey. When you use all LangChain products, you'll build better, get to production quicker, and grow visibility -- all with less set up and friction. LangChain prov

In [28]:
response['answer']

'LangChain products are designed to be used independently or stacked for multiplicative benefit.'

In [None]:
# EMBEDDINGS -- Create embeddings for the document chunks
from langchain_openai import OpenAIEmbeddings

if chunks and openai_api_key:
    print("🔮 Creating embeddings for document chunks...")
    try:
        # Create embeddings
        embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
        
        # Generate embeddings for all chunks
        print("  Generating embeddings (this may take a moment)...")
        chunk_embeddings = embeddings.embed_documents([chunk.page_content for chunk in chunks])
        
        print(f"✓ Successfully created {len(chunk_embeddings)} embeddings")
        print(f"  Embedding dimensions: {len(chunk_embeddings[0])}")
        print(f"  First embedding (first 5 values): {chunk_embeddings[0][:5]}")
        
    except Exception as e:
        print(f"✗ Error creating embeddings: {e}")
        chunk_embeddings = None
        embeddings = None
else:
    if not chunks:
        print("⚠️ No chunks available for embedding")
    if not openai_api_key:
        print("⚠️ No OpenAI API key available for embeddings")
    chunk_embeddings = None
    embeddings = None


In [None]:
# VECTOR STORE -- Store embeddings in a vector database
from langchain_community.vectorstores import Chroma

if chunks and embeddings:
    print("🗄️ Creating vector store...")
    try:
        # Create Chroma vector store
        vectorstore = Chroma.from_documents(
            documents=chunks,
            embedding=embeddings,
            persist_directory="./chroma_db"  # Local storage
        )
        
        print(f"✓ Successfully created vector store with {len(chunks)} documents")
        print("  Vector store saved to: ./chroma_db")
        
        # Test similarity search
        print("\n🔍 Testing similarity search...")
        test_query = "What is LangChain?"
        similar_docs = vectorstore.similarity_search(test_query, k=2)
        
        print(f"Query: '{test_query}'")
        print(f"Found {len(similar_docs)} similar documents:")
        for i, doc in enumerate(similar_docs):
            print(f"  {i+1}. Length: {len(doc.page_content)} chars")
            print(f"     Preview: {doc.page_content[:100]}...")
        
    except Exception as e:
        print(f"✗ Error creating vector store: {e}")
        vectorstore = None
else:
    print("⚠️ Cannot create vector store - missing chunks or embeddings")
    vectorstore = None


In [None]:
# RETRIEVAL QA CHAIN -- Create a question-answering system
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

if vectorstore and openai_api_key:
    print("🤖 Creating Retrieval QA Chain...")
    try:
        # Create the LLM
        llm = ChatOpenAI(
            model="gpt-3.5-turbo",
            temperature=0.7,
            max_tokens=500
        )
        
        # Create the retrieval QA chain
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
            return_source_documents=True
        )
        
        print("✓ Retrieval QA Chain created successfully!")
        
        # Test the QA system
        print("\n❓ Testing Question Answering...")
        test_questions = [
            "What is LangChain?",
            "What are the main features of LangChain?",
            "How can I get started with LangChain?"
        ]
        
        for question in test_questions:
            print(f"\nQ: {question}")
            try:
                result = qa_chain.invoke({"query": question})
                print(f"A: {result['result']}")
                print(f"   Sources: {len(result['source_documents'])} documents used")
            except Exception as e:
                print(f"   Error: {e}")
        
    except Exception as e:
        print(f"✗ Error creating QA chain: {e}")
        qa_chain = None
else:
    print("⚠️ Cannot create QA chain - missing vector store or API key")
    qa_chain = None


In [None]:
# INTERACTIVE QA -- Create an interactive question-answering function
def ask_question(question):
    """Ask a question to the QA system."""
    if qa_chain:
        try:
            result = qa_chain.invoke({"query": question})
            return {
                "answer": result['result'],
                "sources": len(result['source_documents']),
                "source_docs": result['source_documents']
            }
        except Exception as e:
            return {"error": str(e)}
    else:
        return {"error": "QA chain not available"}

# Example usage
if qa_chain:
    print("🎯 Interactive Question Answering System Ready!")
    print("You can now ask questions about LangChain using the ask_question() function.")
    print("\nExample:")
    
    # Test with a sample question
    sample_result = ask_question("What is LangChain used for?")
    if "error" not in sample_result:
        print(f"Q: What is LangChain used for?")
        print(f"A: {sample_result['answer']}")
        print(f"   Sources: {sample_result['sources']} documents")
    else:
        print(f"Error: {sample_result['error']}")
else:
    print("⚠️ Interactive QA system not available")
