In [None]:
%pip install --quiet langchain-text-splitters langchain-community langgraph 
%pip install -q "langchain[google-genai]"
%pip install -q langchain langchain-google-genai google-generativeai
%pip install -q langchain-chroma
%pip install -q beautifulsoup4
%pip install -q pypdf
%pip install -q langchain_community pypdf
%pip install -qU langchain-qdrant

In [6]:
from dotenv import load_dotenv
import os

load_dotenv()

# LangSmith tracing is optional. Enable it only when a key is actually
# available: assigning None into os.environ raises TypeError, which is what
# happened before whenever LANGSMITH_API was missing.
langsmith_api_key = os.getenv("LANGSMITH_API") or os.environ.get("LANGSMITH_API_KEY")
if langsmith_api_key:
    os.environ["LANGSMITH_TRACING"] = "true"
    os.environ["LANGSMITH_API_KEY"] = langsmith_api_key
else:
    print("No LangSmith API key found (LANGSMITH_API); tracing was not enabled by this cell.")

In [7]:
import os
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage

# Reuse GOOGLE_API_KEY when it is already set; otherwise fall back to the
# GEMINI_API variable. Fail with a clear message when neither is present,
# because assigning None into os.environ raises an opaque TypeError.
if not os.environ.get("GOOGLE_API_KEY"):
    gemini_api_key = os.getenv("GEMINI_API")
    if not gemini_api_key:
        raise RuntimeError(
            "Set GOOGLE_API_KEY or GEMINI_API (for example in your .env file) before running this cell."
        )
    os.environ["GOOGLE_API_KEY"] = gemini_api_key

# Initialize Gemini 2.0 Flash model
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    google_api_key=os.environ["GOOGLE_API_KEY"]
)

# Example call
response = llm.invoke([HumanMessage(content="Hello Gemini 2.0!")])
print(response.content)


Hello! It's great to be able to interact with you. How can I help you today?


## Efficient handling of the response data

In [8]:
import re
import html

def clean_llm_response(text):
    """Strip common Markdown decoration from an LLM reply and return plain text.

    Steps, in order: remove leading ``#`` header markers, remove list markers
    (``-``, ``*``, ``•`` or ``1.``) together with their indentation, drop any
    remaining asterisks, collapse runs of blank lines to a single blank line,
    decode HTML entities, remove code-fence markers, and replace curly quotes
    with straight ones.

    Args:
        text: The raw reply. Anything that is not a ``str`` is treated as
            "no text".

    Returns:
        The cleaned string with surrounding whitespace stripped, or ``""``
        when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Only horizontal whitespace is consumed by the line-anchored patterns so
    # that they can never swallow a newline (and with it a paragraph break).
    text = re.sub(r"^#+[ \t]*", "", text, flags=re.MULTILINE)  # Remove markdown headers

    # List markers are removed before the blanket asterisk removal so that
    # "* item" is still recognisable as a bullet instead of leaving " item".
    text = re.sub(r"^[ \t]*(?:[-*•]|\d+\.)[ \t]+", "", text, flags=re.MULTILINE)

    text = text.replace("*", "")  # Remove remaining asterisks (bold / italics)
    text = re.sub(r"\n{2,}", "\n\n", text)  # Normalize blank lines
    text = html.unescape(text)  # Decode &amp;, &lt;, &gt;, etc.
    text = text.replace("```", "")  # Remove code block markers
    text = text.replace("“", '"').replace("”", '"')  # Normalize double quotes
    text = text.replace("‘", "'").replace("’", "'")  # Normalize single quotes

    return text.strip()

# === Main invocation ===
# Read one prompt from the user, send it to the model as a single human
# message, and print the reply after Markdown clean-up.
user_input = input("You: ")
user_message = HumanMessage(content=user_input)
response = llm.invoke([user_message])
cleaned_output = clean_llm_response(response.content)
print("\nLLM Response:\n", cleaned_output)



LLM Response:
 AI, while holding immense potential for good, also presents several potential disaster scenarios. These can be broadly categorized as follows:
Job Displacement and Economic Instability:

   Mass Unemployment: AI-powered automation could displace workers across various industries, leading to mass unemployment, increased inequality, and social unrest.
   Deskilling:  AI might take over complex tasks, leading to a decline in human skills and making workers more vulnerable to job losses.
   Concentration of Wealth: The benefits of AI could be concentrated in the hands of a few companies and individuals, further exacerbating wealth inequality.
Bias and Discrimination:

   Reinforcement of Existing Biases: AI systems are trained on data, and if that data reflects existing societal biases (e.g., racial, gender), the AI will perpetuate and even amplify those biases in its decision-making. This can lead to unfair outcomes in areas like hiring, loan applications, criminal justice

In [None]:
# Embedding model and setup

import os

from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Reuse GOOGLE_API_KEY when it is already set; otherwise fall back to the
# GEMINI_API variable. Fail with a clear message when neither is present,
# because assigning None into os.environ raises an opaque TypeError.
if not os.environ.get("GOOGLE_API_KEY"):
    gemini_api_key = os.getenv("GEMINI_API")
    if not gemini_api_key:
        raise RuntimeError(
            "Set GOOGLE_API_KEY or GEMINI_API (for example in your .env file) before running this cell."
        )
    os.environ["GOOGLE_API_KEY"] = gemini_api_key

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")


In [10]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

# Collection that holds the web-page chunks.
WEB_COLLECTION = "test"

client = QdrantClient(":memory:")

# Create the collection before using it
if not client.collection_exists(collection_name=WEB_COLLECTION):
    # Embed a throwaway string once to learn the vector size of the model.
    embedding_dim = len(embeddings.embed_query("test"))
    client.create_collection(
        collection_name=WEB_COLLECTION,
        vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
    )

vector_store = QdrantVectorStore(
    client=client,
    collection_name=WEB_COLLECTION,
    embedding=embeddings,
)

## Web Link

In [12]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict

In [13]:


# url = input("Enter the URL to load: ").strip()  # alternative: ask for the URL at run time
loader = WebBaseLoader(web_paths=("https://en.wikipedia.org/wiki/Formula_One_car",))
docs = loader.load()

# Split the page into overlapping chunks (a smaller chunk_size gives more, narrower chunks).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
all_splits = text_splitter.split_documents(docs)

# Only index when the split actually produced chunks.
if not all_splits:
    print("Warning: No content was loaded or split from the web page. Skipping indexing.")
else:
    _ = vector_store.add_documents(documents=all_splits)

# Prompt template pulled from the hub; `generate` calls `prompt.invoke(...)` on it.
prompt = hub.pull("rlm/rag-prompt")


# Define state for application
class State(TypedDict):
    """State dictionary passed between the graph steps.

    `retrieve` reads `question` and returns `context`; `generate` reads
    `question` and `context` and returns `answer`.
    """
    # The user's question.
    question: str
    # Documents returned by the similarity search in `retrieve`.
    context: List[Document]
    # Text of the model reply produced in `generate`.
    answer: str


# Define application steps
def retrieve(state: State):
    """Run a similarity search for the question and return the hits as ``context``.

    Uses the notebook-level ``vector_store`` that is bound at call time.
    """
    user_question = state["question"]
    matches = vector_store.similarity_search(user_question)
    return {"context": matches}


def generate(state: State):
    """Fill the prompt with the question and retrieved text, and return the reply as ``answer``.

    Uses the notebook-level ``prompt`` and ``llm`` that are bound at call time.
    """
    page_texts = [doc.page_content for doc in state["context"]]
    prompt_inputs = {
        "question": state["question"],
        "context": "\n\n".join(page_texts),
    }
    messages = prompt.invoke(prompt_inputs)
    reply = llm.invoke(messages)
    return {"answer": reply.content}

# Wire the steps as START -> retrieve -> generate, then compile the graph
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [14]:
# Ask a question and run it through the compiled graph.
question = input("Enter your question: ")
graph_input = {"question": question}
response = graph.invoke(graph_input)
print(response["answer"])

An F1 car, or Formula One car, is a single-seat, open-cockpit, open-wheel racing car used in Formula One racing events. It features front and rear wings, large wheels, and a turbocharged engine behind the driver. These cars are constructed with carbon fiber and composite materials to withstand high impact forces.


## PDF

In [15]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


async def process_pdf_async(file_path, collection_name="test1", chunk_size=1000, chunk_overlap=200):
    """Load a PDF, split it into chunks, and index the chunks in Qdrant.

    Relies on the notebook-level ``client`` and ``embeddings`` objects.

    Args:
        file_path: Path of the PDF handed to ``PyPDFLoader``.
        collection_name: Qdrant collection to create (if missing) and write to.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        A ``QdrantVectorStore`` bound to ``collection_name``. The store is
        returned even when no chunks could be extracted from the PDF.
    """
    # Imported locally so the function does not depend on a later cell's imports.
    from qdrant_client.http.models import Distance, VectorParams

    loader = PyPDFLoader(file_path)

    # Pull pages lazily and asynchronously, which suits larger PDFs.
    pages = []
    async for page in loader.alazy_load():
        # Round-trip through UTF-8 with errors ignored so characters that
        # cannot be encoded are dropped instead of raising later on.
        page.page_content = page.page_content.encode("utf-8", "ignore").decode("utf-8")
        pages.append(page)

    # Split text into chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    splits = splitter.split_documents(pages)

    # Create the collection before using it
    if not client.collection_exists(collection_name=collection_name):
        # Embed a throwaway string once to learn the vector size of the model.
        embedding_dim = len(embeddings.embed_query("test1"))
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
        )

    pdf_store = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=embeddings,
    )

    if splits:
        pdf_store.add_documents(splits)
        print("Document processed and indexed for RAG.")
    else:
        print(" No content extracted from PDF.")

    return pdf_store

file_path = "f1car.pdf"
vector_store = await process_pdf_async(file_path) 


Document processed and indexed for RAG.


In [16]:
from langchain_core.messages import HumanMessage

question = "comment on the steering and transmission of f1 cars"

# Fetch the three chunks most similar to the question and join them into one context string.
results = vector_store.similarity_search(question, k=3)
context = "\n\n".join(doc.page_content for doc in results)

# Stored as `pdf_prompt`, not `prompt`: the notebook-level `prompt` is the hub
# template that `generate` calls `.invoke()` on, and overwriting it with a
# plain string would break the graph the next time it is run.
pdf_prompt = f"""Answer the question based on the following context:

{context}

Question: {question}
"""

response = llm.invoke([HumanMessage(content=pdf_prompt)])
cleaned_output = clean_llm_response(response.content)
print("\nLLM Response:\n", cleaned_output)


LLM Response:
 Here's a summary of the steering and transmission of Formula One cars, based on the provided text:

   Gearboxes: Modern F1 cars use semi-automatic sequential gearboxes with rear-wheel drive. These have eight forward gears and a reverse gear, operated by paddle shifters.
   Materials: The gearbox is constructed from carbon-reinforced titanium and is bolted to the back of the engine.
   Automation Ban: Fully automatic gearboxes, launch control, and traction control were banned in the 2000s to emphasize driver skill and reduce costs.
   Gear Shifts: Gear shifts are initiated by the driver using paddles on the steering wheel. A system of solenoids, hydraulic actuators, and sensors then performs the shift.


## RAG for academic papers using GROBID

In [42]:
# To learn more about Grobid, see the README file

# Check that the local Grobid server is alive
import requests

# The timeout keeps this cell from hanging indefinitely when the server does not respond.
requests.get("http://localhost:8070/api/isalive", timeout=5).text


'true'

In [None]:
# Loader that uses the Grobid server running on this machine
# See the README file for how to set up Grobid

from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import GrobidParser

# Load PDFs from a folder using GROBID
loader = GenericLoader.from_filesystem(
    "research_paper/",
    glob="*",
    suffixes=[".pdf"],
    parser=GrobidParser(segment_sentences=False)
)

docs = loader.load()
print(f"✅ Loaded {len(docs)} structured docs")

# Fail with an actionable message instead of a bare IndexError on docs[0]
# when the folder yielded no documents.
if not docs:
    raise RuntimeError(
        "No documents were loaded from 'research_paper/'. "
        "Check that the folder contains .pdf files and that the Grobid server is running."
    )

# Preview the first parsed segment.
print(docs[0].metadata)
print(docs[0].page_content[:500])


✅ Loaded 46 structured docs
{'text': 'The promise of deep learning is to discover rich, hierarchical models [2] that represent probability distributions over the kinds of data encountered in artificial intelligence applications, such as natural images, audio waveforms containing speech, and symbols in natural language corpora.So far, the most striking successes in deep learning have involved discriminative models, usually those that map a high-dimensional, rich sensory input to a class label [14,22].These striking successes have primarily been based on the backpropagation and dropout algorithms, using piecewise linear units [19,9,10] which have a particularly well-behaved gradient .Deep generative models have had less of an impact, due to the difficulty of approximating many intractable probabilistic computations that arise in maximum likelihood estimation and related strategies, and due to difficulty of leveraging the benefits of piecewise linear units in the generative context.We pro

In [None]:
# Splitter

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunk the parsed paper segments with the same sizes used for the other sources.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = splitter.split_documents(docs)

chunk_count = len(splits)
print(f"✅ Split into {chunk_count} chunks")


✅ Split into 50 chunks


In [None]:
# Create a new QdrantClient with the ":memory:" location for the research-paper
# section. This rebinds the notebook-level `client` name used by earlier cells.
client = QdrantClient(":memory:")

In [None]:
# Embedder

from qdrant_client.http.models import VectorParams, Distance
from langchain.vectorstores import Qdrant

# Create the collection before using it
if not client.collection_exists(collection_name="test3"):
    dim = len(embeddings.embed_query("test3"))  # Get embedding vector size

    client.create_collection(
        collection_name="test3",
        vectors_config=VectorParams(
            size=dim,
            distance=Distance.COSINE
        )
    )

# Initialize Qdrant vector store
vector_store = QdrantVectorStore(
    client=client,
    collection_name="test3",
    embedding=embeddings,
)

# Actually embed and upload the chunks. Without this step the collection
# stays empty and the similarity search below has nothing to return.
if splits:
    vector_store.add_documents(splits)
    print("✅ Chunks embedded and stored in Qdrant DB")
else:
    print("Warning: No chunks were produced from the papers. Skipping indexing.")


✅ Chunks embedded and stored in Qdrant DB


## Semantic search for papers

In [None]:

query = "What is GAN?"

# Ask the store for the three chunks closest to the query
results = vector_store.similarity_search(query, k=3)

# Show each hit's metadata and the first 300 characters of its text
for rank, hit in enumerate(results, start=1):
    print(f"\nResult {rank}:")
    print(hit.metadata)
    preview = hit.page_content[:300]
    print(preview, "...")



Result 1:
{'text': 'We estimate probability of the test set data under p g by fitting a Gaussian Parzen window to the samples generated with G and reporting the log-likelihood under this distribution.The σ parameter Model MNIST TFD DBN [3] 138 ± 2 1909 ± 66 Stacked CAE [3] 121 ± 1.6 2110 ± 50 Deep GSN [6] 214 ± 1.1 1890 ± 29 Adversarial nets 225 ± 2 2057 ± 26', 'para': '1', 'bboxes': "[[{'page': '5', 'x': '108.00', 'y': '712.10', 'h': '396.00', 'w': '9.65'}, {'page': '5', 'x': '108.00', 'y': '723.06', 'h': '326.39', 'w': '8.96'}], [{'page': '5', 'x': '437.57', 'y': '723.06', 'h': '66.43', 'w': '8.96'}, {'page': '6', 'x': '237.44', 'y': '83.05', 'h': '26.01', 'w': '8.64'}, {'page': '6', 'x': '301.21', 'y': '83.05', 'h': '30.99', 'w': '8.64'}, {'page': '6', 'x': '362.85', 'y': '83.05', 'h': '18.82', 'w': '8.64'}, {'page': '6', 'x': '232.88', 'y': '94.40', 'h': '35.14', 'w': '8.64'}, {'page': '6', 'x': '300.66', 'y': '94.09', 'h': '32.10', 'w': '8.74'}, {'page': '6', 'x': '351.23', 'y': 

## Use Gemini to answer the question

In [41]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage

llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")

# Combine the context
context = "\n\n".join(doc.page_content for doc in results)

# Stored as `paper_prompt`, not `prompt`: the notebook-level `prompt` is the
# hub template that `generate` calls `.invoke()` on, and overwriting it with a
# plain string would break the graph the next time it is run.
paper_prompt = f"""Use the following research paper content to answer the question.

Context:
{context}

Question: {query}
"""

response = llm.invoke([HumanMessage(content=paper_prompt)])
print("\n🧠 Gemini Answer:\n", response.content)



🧠 Gemini Answer:
 Based on the provided text, GAN stands for "adversarial nets framework." It's a technique that trains a generative machine to draw samples from a desired distribution without explicitly defining a probability distribution. Unlike some other methods like Generative Stochastic Networks (GSNs), GANs don't require a Markov chain for sampling, allowing them to better utilize piecewise linear units.
