# Installing Libraries

In [None]:
 #installing libraries

!pip install langchain-community
!pip install langchain-google-genai
!pip install fastembed
!pip install chromadb
!pip install langchain-google-genai
!pip install -U langchain-chroma
!pip install pypdf



In [None]:
# importing libraries
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_google_genai import ChatGoogleGenerativeAI # Keep ChatGoogleGenerativeAI import
from langchain_community.document_loaders import CSVLoader
# from langchain_community.chat_models.openrouter import ChatOpenRouter # Removed ChatOpenRouter import
import sys
import os

# Load the Documents

In [None]:
def ingest(csv_path="/content/Enhanced_Store_Data_10_Stores.csv",
           persist_directory="./sql_chroma_db"):
  """Load a CSV file, split it into chunks, embed them and persist them in Chroma.

  Args:
    csv_path: Path of the CSV file to index.
    persist_directory: Directory in which the Chroma collection is persisted.

  Prints how many documents were loaded and how many chunks they produced.

  Every chunk is stored under a deterministic id derived from the file path and
  the chunk position, so calling this function again for the same file
  overwrites the existing entries instead of appending duplicates.
  """
  # get the documents...
  pages = CSVLoader(csv_path).load()

  # split the pages by character
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size = 1000,
      chunk_overlap = 200,
      length_function = len,
      add_start_index = True,
  )
  chunks = text_splitter.split_documents(pages)
  print(f"Split {len(pages)} Documents into  {len(chunks)} Chunks")

  # Stable ids make ingestion idempotent: without them each run gets fresh
  # random ids and the same rows pile up in the store, so the retriever can
  # return several copies of one row.
  # NOTE(review): entries written by earlier runs (random ids), or left over
  # from a previously longer file, are not removed - delete persist_directory
  # once if the store already contains duplicates.
  chunk_ids = [f"{csv_path}:{position}" for position in range(len(chunks))]

  # create the vector storage and persist it to disk
  Chroma.from_documents(
      documents=chunks,
      embedding=FastEmbedEmbeddings(),
      ids=chunk_ids,
      persist_directory=persist_directory,
  )


In [None]:
# Build the Chroma store under ./sql_chroma_db from the CSV; the RAG chain
# defined further down loads its vectors from that same directory.
ingest()

Split 10 Documents into  10 Chunks


# Hugging Face Login


In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: a Hugging Face token used to be hard-coded in this cell. Treat that
# token as leaked and revoke it at https://huggingface.co/settings/tokens.
# Tokens are now read from the environment (HF_TOKEN / HF_WRITE_TOKEN), with an
# interactive prompt as a fallback, so they never end up in the saved notebook.
access_token_read = os.environ.get("HF_TOKEN") or getpass("Hugging Face read token: ")
# The write token falls back to the read token, matching the previous behaviour
# where both names held the same value.
access_token_write = os.environ.get("HF_WRITE_TOKEN") or access_token_read
login(token = access_token_read)

# Creating a RAG chain to retrieve chunks and prepare a response

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from google.colab import userdata

def rag_chain(search_type="similarity_score_threshold", k=3, score_threshold=0.2):
  """Build the retrieval chain that answers questions from the persisted store data.

  Args:
    search_type: Search type handed to the vector store retriever.
    k: Number of chunks to retrieve per question.
    score_threshold: Minimum relevance score; only passed to the retriever
      when search_type is "similarity_score_threshold".

  Returns:
    The chain built by create_retrieval_chain. Invoke it with
    {"input": question}.

  Raises:
    ValueError: If the GOOGLE_API_KEY secret is empty.

  NOTE: a later cell in this notebook defines rag_chain again; whichever
  definition was executed last is the one that is in effect.
  """
  # Retrieve the API key from Colab secrets
  google_api_key = userdata.get('GOOGLE_API_KEY')

  if not google_api_key:
      raise ValueError("Please set the GOOGLE_API_KEY environment variable in Colab secrets.")

  # Use ChatGoogleGenerativeAI with the specified model and API key
  model = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=google_api_key)

  # Prompt that asks the model to answer from the retrieved CSV rows only
  prompt =  PromptTemplate.from_template(
      """
           <s> [Instructions] You are a helpful assistant with expertise in analyzing store data.
           Based only on the following context from a CSV file, provide a detailed and structured answer to the user's question.
           Extract and present key information such as Store ID, Date, Total Sales, POS Transactions and Value, Online Transactions and Value, People Counting, Vehicles Parked, Footfall Peak Hour, Average Dwell Time, Employee Count, Customer Satisfaction Score, and any detected incidents (Gun Detection, Theft Detection, Face Recognition Alerts) relevant to the query.
           Present this information clearly, preferably using bullet points or a summary paragraph that highlights the main data points for the store(s) and time period(s) mentioned in the question.
           If the answer is not found in the provided context, please state that you cannot find the information in the provided data.
           [/Instructions]</s>
           [Instructions] Question: {input}
           context: {context}
           Answer: [/Instructions]</s>
  """)

  # load the vector storage written to ./sql_chroma_db
  embedding = FastEmbedEmbeddings()
  vectorstore = Chroma(persist_directory="./sql_chroma_db", embedding_function=embedding)

  # The threshold is only meaningful for the threshold search type, so it is
  # left out of the search kwargs for every other search type.
  search_kwargs = {"k": k}
  if search_type == "similarity_score_threshold":
      search_kwargs["score_threshold"] = score_threshold

  retriever = vectorstore.as_retriever(
      search_type = search_type,
      search_kwargs = search_kwargs,
  )

  # Stuff the retrieved chunks into the prompt, then wire retriever and LLM together
  document_chain = create_stuff_documents_chain(model, prompt)
  return create_retrieval_chain(retriever, document_chain)

# Experimenting

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from google.colab import userdata
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

def rag_chain(search_type="similarity", k=5, score_threshold=0.2):
  """Build the retrieval chain, with retriever settings exposed for experimentation.

  The defaults are this cell's experiment: plain similarity search returning
  the 5 closest chunks. Pass other values to try different settings without
  copying the function again, e.g.
  rag_chain(search_type="similarity_score_threshold", k=3, score_threshold=0.2).

  Args:
    search_type: Search type handed to the vector store retriever.
    k: Number of chunks to retrieve per question.
    score_threshold: Minimum relevance score; only passed to the retriever
      when search_type is "similarity_score_threshold".

  Returns:
    The chain built by create_retrieval_chain. Invoke it with
    {"input": question}.

  Raises:
    ValueError: If the GOOGLE_API_KEY secret is empty.
  """
  # Retrieve the API key from Colab secrets
  google_api_key = userdata.get('GOOGLE_API_KEY')

  if not google_api_key:
      raise ValueError("Please set the GOOGLE_API_KEY environment variable in Colab secrets.")

  # Use ChatGoogleGenerativeAI with the specified model and API key
  model = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=google_api_key)

  # Prompt that asks the model to answer from the retrieved CSV rows only
  prompt =  PromptTemplate.from_template(
      """
           <s> [Instructions] You are a helpful assistant with expertise in analyzing store data.
           Based only on the following context from a CSV file, provide a detailed and structured answer to the user's question.
           Extract and present key information such as Store ID, Date, Total Sales, POS Transactions and Value, Online Transactions and Value, People Counting, Vehicles Parked, Footfall Peak Hour, Average Dwell Time, Employee Count, Customer Satisfaction Score, and any detected incidents (Gun Detection, Theft Detection, Face Recognition Alerts) relevant to the query.
           Present this information clearly, preferably using bullet points or a summary paragraph that highlights the main data points for the store(s) and time period(s) mentioned in the question.
           If the answer is not found in the provided context, please state that you cannot find the information in the provided data.
           [/Instructions]</s>
           [Instructions] Question: {input}
           context: {context}
           Answer: [/Instructions]</s>
  """)

  # load the vector storage written to ./sql_chroma_db
  embedding = FastEmbedEmbeddings()
  vectorstore = Chroma(persist_directory="./sql_chroma_db", embedding_function=embedding)

  # The threshold is only meaningful for the threshold search type, so it is
  # left out of the search kwargs for every other search type.
  search_kwargs = {"k": k}
  if search_type == "similarity_score_threshold":
      search_kwargs["score_threshold"] = score_threshold

  retriever = vectorstore.as_retriever(
      search_type = search_type,
      search_kwargs = search_kwargs,
  )

  # Stuff the retrieved chunks into the prompt, then wire retriever and LLM together
  document_chain = create_stuff_documents_chain(model, prompt)
  return create_retrieval_chain(retriever, document_chain)

# Ask a Query

In [None]:
import os

def ask(query: str):
  """Answer `query` with the RAG chain and print the answer and its sources.

  Args:
    query: Natural-language question about the store data.

  Prints the chain's 'answer', then one line per retrieved context document
  showing the 'source' recorded in that document's metadata.
  """
  chain = rag_chain()

  # invoke the chain; the prompt expects the question under the "input" key
  results = chain.invoke({"input": query})

  print(results['answer'])
  for doc in results["context"]:
    # .get() so that a chunk without source metadata does not abort the
    # listing with a KeyError after the answer has already been printed.
    print("source : ", doc.metadata.get('source', 'unknown'))

# Ask Queries

In [None]:
# Sales and satisfaction lookup for a single store.
query_1 = "What are the total sales and customer satisfaction score for Store 2?"
print(f"Query1:- {query_1}")
ask(query_1)

Query1:- What are the total sales and customer satisfaction score for Store 2?
Here's the information regarding Store 2's total sales and customer satisfaction score, based on the provided data:

*   **Total Sales:** 30831$
*   **Customer Satisfaction Score (out of 10):** 6.9
souce :  /content/Enhanced_Store_Data_10_Stores.csv
souce :  /content/Enhanced_Store_Data_10_Stores.csv
souce :  /content/Enhanced_Store_Data_10_Stores.csv


In [None]:
# Footfall and dwell-time lookup, not restricted to a particular store.
query_2 = "Provide details about the footfall peak hour and average dwell time for any store mentioned in the data."
print(f"Query 2: {query_2}")
ask(query_2)

Query 2: Provide details about the footfall peak hour and average dwell time for any store mentioned in the data.
Here are the details regarding footfall peak hour and average dwell time for Store 10, as provided in the data:

*   **Store ID:** Store 10
*   **Footfall Peak Hour:** 5 PM - 6 PM
*   **Average Dwell Time:** 13 minutes
souce :  /content/Enhanced_Store_Data_10_Stores.csv
souce :  /content/Enhanced_Store_Data_10_Stores.csv
souce :  /content/Enhanced_Store_Data_10_Stores.csv


In [None]:
# Security-incident lookup (gun / theft detection flags).
query_3 = "Were there any gun detection or theft detection incidents reported?"
print(f"Query 3: {query_3}")
ask(query_3)

Query 3: Were there any gun detection or theft detection incidents reported?
Yes, there were incidents reported:

*   **Gun Detection:** YES (reported in all provided entries for Store 9)
*   **Theft Detection:** NO (reported in all provided entries for Store 9)
souce :  /content/Enhanced_Store_Data_10_Stores.csv
souce :  /content/Enhanced_Store_Data_10_Stores.csv
souce :  /content/Enhanced_Store_Data_10_Stores.csv
