In [48]:
# Set Groq API Key
# The GROQ API key is required to authenticate with the Groq API and use their language models.
# It is handed to the client libraries through the GROQ_API_KEY environment variable, but the
# key itself must never be typed into the notebook: anything written in a cell is saved with
# the file and leaks as soon as the notebook is shared or committed.
import os
from getpass import getpass

# Reuse a key that is already present in the environment; otherwise ask for it
# interactively (getpass does not echo the value, so it stays out of the cell output).
# NOTE: any key that was ever hardcoded in a saved copy of this notebook should be
# treated as compromised and revoked.
if not os.environ.get("GROQ_API_KEY"):
    entered_key = getpass("Enter your Groq API key: ").strip()
    if not entered_key:
        # Fail here rather than later with an authentication error.
        raise ValueError("No Groq API key provided.")
    os.environ["GROQ_API_KEY"] = entered_key

print("GROQ API Key environment variable set.")

GROQ API Key environment variable set.


In [66]:
# Step 2: Point the loader at the target webpage and fetch its content
from langchain_community.document_loaders import WebBaseLoader

url = "https://www.cnbc.com/" # Replace with the URL you want to scrape

try:
    # Both the loader construction and the fetch sit inside the try, so a failure
    # in either one is reported by the handler below.
    loader = WebBaseLoader(url)
    web_documents = loader.load()
    loaded_count = len(web_documents)
    print(f"Successfully loaded {loaded_count} documents from {url}.")
except Exception as load_error:
    # Report the problem and fall back to an empty list so that the name
    # web_documents is always defined after this cell.
    print(f"Error loading content from {url}: {load_error}")
    web_documents = []
    print("web_documents list is empty.")


Successfully loaded 1 documents from https://www.cnbc.com/.


In [68]:
# Step 3: Read the Excel workbook and turn every row into a LangChain Document
import pandas as pd
from langchain.schema import Document

excel_file_path = "/content/AP_Reorganization_Site_Prices.xlsx" # Replace with the path to your Excel file

try:
    df = pd.read_excel(excel_file_path)
    print(f"Successfully loaded data from '{excel_file_path}' into a DataFrame.")
    # One Document per row: the text is "column: value" pairs joined by ", ",
    # and the metadata records the source file and the row's index label.
    documents_from_excel = [
        Document(
            page_content=", ".join(f"{col}: {value}" for col, value in row.items()),
            metadata={"source": excel_file_path, "row_index": index},
        )
        for index, row in df.iterrows()
    ]
    print(f"Converted {len(documents_from_excel)} rows into LangChain Document objects.")

except FileNotFoundError:
    # A missing file gets its own message; the list is still defined (empty).
    print(f"Error: The file '{excel_file_path}' was not found.")
    documents_from_excel = []
    print("documents_from_excel list is empty.")
except Exception as read_error:
    # Any other failure while reading or converting also leaves an empty list.
    print(f"An error occurred while reading the Excel file: {read_error}")
    documents_from_excel = []
    print("documents_from_excel list is empty.")



Successfully loaded data from '/content/AP_Reorganization_Site_Prices.xlsx' into a DataFrame.
Converted 4 rows into LangChain Document objects.
/content/AP_Reorganization_Site_Prices.xlsx


In [56]:
# Step 4: Split the documents into texts
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain.retrievers import EnsembleRetriever
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA


def _build_vector_db(chunks, embedding_model, persist_directory):
    """Build a fresh Chroma store for ``chunks`` under ``persist_directory``.

    ``Chroma.from_documents`` adds to whatever collection is already persisted in
    the directory, so re-running this cell would store every chunk again and
    retrieval would start returning duplicates. The previously persisted
    collection is therefore dropped before the new one is written.

    Args:
        chunks: Split documents to index.
        embedding_model: Embeddings object used to open and to rebuild the store.
        persist_directory: Directory in which Chroma persists the collection.

    Returns:
        The populated ``Chroma`` store, or ``None`` when ``chunks`` is empty
        (in which case any existing persisted data is left untouched).
    """
    if not chunks:
        return None
    # Open the store with the default collection name and delete that collection
    # to clear documents left behind by earlier executions of this cell.
    Chroma(
        embedding_function=embedding_model,
        persist_directory=persist_directory,
    ).delete_collection()
    return Chroma.from_documents(chunks, embedding_model, persist_directory=persist_directory)


# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 50)

web_texts = text_splitter.split_documents(web_documents)
print(f"Created {len(web_texts)} text chunks from web documents.")

excel_texts = text_splitter.split_documents(documents_from_excel)
print(f"Created {len(excel_texts)} text chunks from Excel documents.")

# Step 5: Initialize the embeddings model
# No model name is passed, so the library's default embedding model is used.
embeddings = HuggingFaceEmbeddings()
print("Initialized HuggingFaceEmbeddings model.")

# Step 6: Create separate vector databases (one per source, each rebuilt from scratch)
persist_directory_web = "vector_db_web"
vectordb_web = _build_vector_db(web_texts, embeddings, persist_directory_web)
if vectordb_web is not None:
    print(f"Created Chroma vector database for web texts in '{persist_directory_web}'.")
else:
    print("web_texts is empty. Cannot create web vector database.")

persist_directory_excel = "vector_db_excel"
vectordb_excel = _build_vector_db(excel_texts, embeddings, persist_directory_excel)
if vectordb_excel is not None:
    print(f"Created Chroma vector database for Excel texts in '{persist_directory_excel}'.")
else:
    print("excel_texts is empty. Cannot create Excel vector database.")

# Step 7: Set up a multi-source retriever
# Only sources that actually produced a store take part; the explicit None check
# avoids relying on the truthiness of a Chroma object.
retrievers = [
    store.as_retriever()
    for store in (vectordb_web, vectordb_excel)
    if store is not None
]

multi_source_retriever = None
if retrievers:
    # Adjust weights if needed based on source importance
    weights = [1.0/len(retrievers)] * len(retrievers) # Equal weights
    multi_source_retriever = EnsembleRetriever(retrievers=retrievers, weights=weights)
    print("Multi-source retriever configured using EnsembleRetriever.")
else:
    print("No vector databases created. Cannot configure multi-source retriever.")


# Step 8: Initialize the language model
# Ensure GROQ_API_KEY is set (assuming it's done in a previous cell)
llm = ChatGroq(model = "llama-3.3-70b-versatile",temperature=0)
print("Initialized ChatGroq language model.")

# Step 9: Configure RAG chain with the multi-source retriever
# multi_source_qa_chain stays None when there is no retriever; cells that query
# the chain must check for that.
multi_source_qa_chain = None
if multi_source_retriever is not None:
    multi_source_qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=multi_source_retriever,
        return_source_documents=True
    )
    print("RetrievalQA chain set up with multi-source retriever.")
else:
    print("Multi-source retriever not configured. Cannot set up RAG chain.")

# The multi_source_qa_chain is now ready to be used for querying

Created 26 text chunks from web documents.
Created 4 text chunks from Excel documents.


  embeddings = HuggingFaceEmbeddings()


Initialized HuggingFaceEmbeddings model.
Created Chroma vector database for web texts in 'vector_db_web'.
Created Chroma vector database for Excel texts in 'vector_db_excel'.
Multi-source retriever configured using EnsembleRetriever.
Initialized ChatGroq language model.
RetrievalQA chain set up with multi-source retriever.


In [62]:
# Step 1: Define a query
# Define a query that is relevant to the content of either or both sources.
query = "Market today from CNBC?"

# The setup cell leaves multi_source_qa_chain as None when no documents could be
# indexed. Stop here with an actionable message instead of failing on
# `None.invoke(...)` with an AttributeError.
if multi_source_qa_chain is None:
    raise RuntimeError(
        "multi_source_qa_chain is not configured: no vector database was created, "
        "so there is nothing to query. Re-run the loading and setup cells and "
        "check their output."
    )

# Step 2 & 3: Invoke the multi_source_qa_chain and store the response
# Invoke the RAG chain with the defined query using the multi-source retriever.
response = multi_source_qa_chain.invoke({"query": query})

# Step 4: Print the full combined_response
# Print the entire output from the RetrievalQA chain.
print("Full combined response:")
print(response)

# Step 5: Print only the generated answer from the combined_response
# Extract and print only the 'result' field from the response dictionary.
print("\nGenerated answer:")
print(response['result'])

# Optional: Step 6: Inspect and print the source documents
# A missing key and an empty list are treated the same way.
print("\nSource Documents:")
source_documents = response.get('source_documents') or []
if source_documents:
    for position, doc in enumerate(source_documents, start=1):
        print(f"--- Source Document {position} ---")
        print(f"Source: {doc.metadata.get('source', 'N/A')}")
        print(f"Content (first 200 chars): {doc.page_content[:200]}...")
else:
    print("No source documents found in the response.")

Full combined response:
{'query': 'Market today from CNBC?', 'result': 'The S&P 500 posted its fifth straight record close this week, powered by solid earnings.', 'source_documents': [Document(id='aa9121b8-fd6c-4dd9-b54e-059462ba1682', metadata={'row_index': 1, 'source': '/content/AP_Reorganization_Site_Prices.xlsx'}, page_content='Site Name: Visakhapatnam, Development Type: IT Hub, Item: Commercial Space (per sqft), Price (INR): 5500'), Document(id='c062af1f-a7cb-4ed4-83a2-d7073530aad5', metadata={'description': 'CNBC International is the world leader for news on business, technology, China, trade, oil prices, the Middle East and markets.', 'title': 'International Business, World News & Global Stock Market Analysis', 'language': 'en', 'source': 'https://www.cnbc.com/'}, page_content="that give stock to rank-and-file employees4 Hours AgoDover, Honeywell continue their post-earnings slides. Here's how we may respond4 Hours AgoInvestor Dan Niles' favorite picks for the rest of earnings s

In [70]:
# Inspect the web loader instance created by the web-loading cell
print("Loader object:")
print(loader)
print(f"Type of loader: {type(loader)}")

# Dump every Excel-derived document: its source, full metadata, and the first
# 500 characters of its text.
print("\nContent of excel documents:")
if not documents_from_excel:
    print("documents_from_excel list is empty.")
else:
    for doc_number, excel_doc in enumerate(documents_from_excel, start=1):
        doc_metadata = excel_doc.metadata
        print(f"--- Document {doc_number} ---")
        print(f"Source: {doc_metadata.get('source', 'N/A')}")
        print(f"Metadata: {doc_metadata}")
        print(f"Page Content (first 500 chars):\n{excel_doc.page_content[:500]}...")

Loader object:
<langchain_community.document_loaders.web_base.WebBaseLoader object at 0x78a12c752610>
Type of loader: <class 'langchain_community.document_loaders.web_base.WebBaseLoader'>

Content of excel documents:
--- Document 1 ---
Source: /content/AP_Reorganization_Site_Prices.xlsx
Metadata: {'source': '/content/AP_Reorganization_Site_Prices.xlsx', 'row_index': 0}
Page Content (first 500 chars):
Site Name: Amaravati, Development Type: Administrative, Item: Land (per acre), Price (INR): 35000000...
--- Document 2 ---
Source: /content/AP_Reorganization_Site_Prices.xlsx
Metadata: {'source': '/content/AP_Reorganization_Site_Prices.xlsx', 'row_index': 1}
Page Content (first 500 chars):
Site Name: Visakhapatnam, Development Type: IT Hub, Item: Commercial Space (per sqft), Price (INR): 5500...
--- Document 3 ---
Source: /content/AP_Reorganization_Site_Prices.xlsx
Metadata: {'source': '/content/AP_Reorganization_Site_Prices.xlsx', 'row_index': 2}
Page Content (first 500 chars):
Site Name: