In [None]:
!pip install langchain faiss-cpu openai PyMuPDF



Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m61.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m87.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF, faiss-cpu
Successfully installed PyMuPDF-1.26.1 faiss-cpu-1.11.0


In [None]:
!pip install -U langchain langchain-openai langchain-community langchain-google-genai



In [24]:
import os
import fitz  # PyMuPDF
from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.chains import RetrievalQA

In [25]:
# Load the Gemini API key from a local .env file when one exists.
import getpass

load_dotenv()

# Keep a key that .env or the shell already provides. Only when none is set,
# prompt for it interactively so the secret is never written into the notebook
# (an unconditional assignment here would clobber the value loaded from .env).
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Gemini API key: ")

# Step 1: Read PDF
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The text of all pages joined with no separator between pages.
    """
    # Using the document as a context manager closes it (and releases the
    # underlying file handle) even if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

text = extract_text_from_pdf("/content/UP.pdf")

# Step 2: Chunk the text
# Overlapping chunks keep sentences that straddle a boundary retrievable.
splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=100)
documents = splitter.create_documents([text])

# Step 3: Create embeddings using Gemini and index the chunks in FAISS
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vector_db = FAISS.from_documents(documents, embeddings)

# Step 4: Setup retriever + Gemini LLM
# The retriever hands the 4 most similar chunks to the LLM as context.
retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={"k": 4})
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.2)

qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# Step 5: Ask a question
query = "what kind of industry i can setup in up ?"
# Chain.run() is deprecated; invoke() takes the input under the chain's
# "query" key and returns a dict whose "result" entry is the answer text.
answer = qa_chain.invoke({"query": query})["result"]
print("Answer:\n", answer)

Answer:
 Based on the Uttar Pradesh Industrial Investment & Employment Promotion Policy 2022, you can set up industries in a wide range of sectors. The policy categorizes them into "Focus sectors," "Sunrise & potential sectors," and "Champion sectors in Service Industry."

**Focus Sectors:**
*   Agro & Food Processing
*   Handloom & Textiles
*   Tourism
*   MSME
*   Electronics Manufacturing
*   Data Centre
*   Defence & Aerospace
*   Warehousing & logistics
*   Dairy and Poultry
*   IT/ ITeS
*   Start-up
*   Electric Vehicle
*   Film
*   Renewable Energy (Solar)
*   Pharmaceuticals
*   Civil Aviation
*   Biofuel
*   Semiconductor
*   Animation, Visual effects, Gaming and Comics (AVGC)
*   Private sector hospital
*   Private universities/ colleges including medical & paramedical colleges
*   Mega multisector focused skill parks/ hubs

**Sunrise & Potential Sectors:**
*   Green hydrogen production
*   Capital goods (including heavy electrical and power equipment, earthmoving and mining 

In [31]:
import os
import fitz  # PyMuPDF
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.chains import RetrievalQA

# Load environment variables (including GOOGLE_API_KEY) from a local .env file.
import getpass

load_dotenv()

# Keep a key that .env or the shell already provides. Only when none is set,
# prompt for it interactively so the secret is never written into the notebook
# (an unconditional assignment here would clobber the value loaded from .env).
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Gemini API key: ")

# Function to extract text from a single PDF
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of a single PDF as one string.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The text of all pages, in page order, joined with no separator.
    """
    # The `with` block guarantees the document is closed once the text has
    # been collected, instead of leaving the file handle open.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# Step 1: PDFs to index; append further paths to grow the corpus
pdf_files = ["/content/UP.pdf"]

# Step 2: Extract each PDF's text and cut it into overlapping chunks.
# Separators are tried in the listed order, from paragraph breaks down to
# single characters.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    chunk_overlap=200,
    chunk_size=1000,
)

documents = []
for file in pdf_files:
    text = extract_text_from_pdf(file)
    doc_chunks = splitter.create_documents([text])
    # Tag every chunk with the file name it came from so answers can cite it
    for chunk in doc_chunks:
        chunk.metadata.update(source=os.path.basename(file))
    documents += doc_chunks

# Step 3: Create embeddings and vector store
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vector_db = FAISS.from_documents(documents, embeddings)

# Step 4: Setup retriever and Gemini LLM
# The retriever hands the 4 most similar chunks to the LLM as context.
retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={"k": 4})
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.2)

# return_source_documents=True makes the chain output include the retrieved
# chunks under "source_documents" next to the answer under "result".
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)

# Step 5: Ask a question
query = "What kind of industry can I set up in UP?"
# Calling the chain object directly is deprecated; invoke() is the supported
# entry point and returns the same output dict.
result = qa_chain.invoke({"query": query})

# Display the result
print("Answer:\n", result["result"])
print("\nSources:")
for doc in result["source_documents"]:
    print(f"- {doc.metadata.get('source', 'Unknown Source')}")

Answer:
 Based on the provided context, you can set up industries in the following sectors in Uttar Pradesh:

**8.1 Focus Sectors (where the State has a competitive advantage):**
*   Agro & Food Processing
*   Handloom & Textiles
*   Tourism
*   MSME (Micro, Small, and Medium Enterprises)
*   Electronics Manufacturing
*   Data Centre
*   Defence & Aerospace
*   Warehousing & logistics
*   Dairy and Poultry
*   IT/ ITeS (Information Technology/IT enabled Services)
*   Start-up
*   Electric Vehicle
*   Film
*   Renewable Energy (Solar)
*   Pharmaceuticals
*   Civil Aviation
*   Biofuel
*   Semiconductor
*   Animation, Visual effects, Gaming and Comics (AVGC)
*   Private sector hospital
*   Private universities/ colleges including medical & paramedical colleges
*   Mega multisector focused skill parks/ hubs

**8.2 Sunrise & Potential Sectors (new sectors with high potential for growth, diversification, and investment):**
*   Green hydrogen production
*   Capital goods (including heavy ele