In [31]:
import requests
import os

# --- Configuration ---
remote_pdf_url = "https://arxiv.org/pdf/1709.00666.pdf"
pdf_filename = "ch02-downloaded.pdf"

print(f"Attempting to download PDF from: {remote_pdf_url}")

try:
    # Use a timeout to prevent the script from hanging indefinitely
    response = requests.get(remote_pdf_url, timeout=15)
    
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Write the content of the response to a file in binary mode ('wb')
        with open(pdf_filename, "wb") as pdf_file:
            pdf_file.write(response.content)
        
        # Confirmation message
        print(f"✅ Success! PDF saved as: {os.path.abspath(pdf_filename)}")
    else:
        # Handle non-200 status codes (e.g., 404 Not Found, 500 Server Error)
        print(f"❌ Failed to download the PDF. HTTP Status Code: {response.status_code}")
        
except requests.exceptions.RequestException as e:
    # Handle connection errors, DNS errors, timeouts, etc.
    print(f"❌ An error occurred during the request: {e}")
    
except IOError as e:
    # Handle file writing errors
    print(f"❌ An error occurred while writing the file: {e}")

Attempting to download PDF from: https://arxiv.org/pdf/1709.00666.pdf
✅ Success! PDF saved as: C:\Users\jay_s\kg-rag\kg-rag\notebooks\ch02-downloaded.pdf


In [32]:
import pdfplumber
import os

# --- Configuration (Must match the name used in the download script) ---
pdf_filename = "ch02-downloaded.pdf"
text = ""

# Check if the PDF file exists before trying to open it
if not os.path.exists(pdf_filename):
    print(f"❌ Error: File not found. Please ensure '{pdf_filename}' was downloaded successfully in the previous step.")
else:
    try:
        print(f"Reading text from: {pdf_filename}...")

        with pdfplumber.open(pdf_filename) as pdf:
            for i, page in enumerate(pdf.pages):
                # Using += might be memory-intensive for huge files,
                # but is fine for this example.
                extracted_page_text = page.extract_text()
                if extracted_page_text:
                    text += extracted_page_text + "\n"

        # Check if any text was extracted
        if text:
            print(f"✅ Success! Extracted {len(text)} characters.")
            print("\n--- First 20 characters of the extracted text ---")
            print(text[0:20])
        else:
            print("⚠️ Warning: Successfully opened the PDF, but no text could be extracted.")

    except Exception as e:
        print(f"❌ An unexpected error occurred while processing the PDF: {e}")

Reading text from: ch02-downloaded.pdf...
✅ Success! Extracted 44774 characters.

--- First 20 characters of the extracted text ---
Einstein’s Patents a


In [45]:
# If you run into ModuleNotFoundError, uncomment the line below and run it first
# !pip install google-genai neo4j python-dotenv

# Use IPython/Jupyter magic commands to load environment variables from .env
# This requires a file named .env in the same directory as the notebook
%load_ext dotenv
%dotenv

import os
from google import genai
from neo4j import GraphDatabase

# --- 1. Set up the Gemini (GenAI) client ---

# Retrieve API key from environment variables (loaded by %dotenv)
gemini_api_key = os.getenv("GEMINI_API_KEY")

if not gemini_api_key:
    # Print a clear error if the key is missing
    print("❌ ERROR: GEMINI_API_KEY not found in environment variables.")
    print("Please ensure you have a .env file with GEMINI_API_KEY='YOUR_API_KEY'.")
    genai_client = None
else:
    print("✅ GEMINI_API_KEY loaded successfully.")
    # Initialize the client
    genai_client = genai.Client(api_key=gemini_api_key)
    print("✅ Gemini Client initialized.")

# NOTE: You would typically add the Neo4j setup here as well, 
# but for now, we focus on fixing the Gemini client part.

# Example of how you would initialize Neo4j (if needed later)
# NEO4J_URI = os.getenv("NEO4J_URI")
# NEO4J_USER = os.getenv("NEO4J_USER")
# NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
# neo4j_driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv
✅ GEMINI_API_KEY loaded successfully.
✅ Gemini Client initialized.


In [46]:
from utils import chunk_text

chunks = chunk_text(text, 500, 40)
print(len(chunks))
print(chunks[0])

89
Einstein’s Patents and Inventions
Asis Kumar Chaudhuri
Variable Energy Cyclotron Centre
1‐AF Bidhan Nagar, Kolkata‐700 064
Abstract: Times magazine selected Albert Einstein, the German born Jewish Scientist as the person of the 20th
century. Undoubtedly, 20th century was the age of science and Einstein’s contributions in unravelling mysteries
of nature was unparalleled. However, few are aware that Einstein was also a great inventor. He and his
collaborators had patented a wide variety of inventions


In [36]:
import os
print("URI:", os.environ.get("NEO4J_URI"))
print("USER:", os.environ.get("NEO4J_USERNAME"))
print("PASS:", os.environ.get("NEO4J_PASSWORD"))

URI: bolt://localhost:7687
USER: neo4j
PASS: password


In [37]:
def embed_with_gemini(texts, model="text-embedding-004"):
    client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

    vectors = []

    for t in texts:
        response = client.models.embed_content(
            model=model,
            contents=t
        )

        # response.embeddings is a list
        emb = response.embeddings[0]     # take the first embedding object
        values = emb.values              # tuple of floats

        vectors.append([float(v) for v in values])

    return vectors

In [38]:
embeddings = embed_with_gemini(chunks)
print(len(embeddings), len(embeddings[0]))

89 768


In [None]:
# embeddings

In [16]:
print(len(chunks))

89


In [39]:
driver.execute_query("""
CREATE VECTOR INDEX pdf IF NOT EXISTS
FOR (c:Chunk)
ON c.embedding
""")


EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x000002239DFBE650>, keys=[])

In [40]:
# Neo4j does NOT allow referencing $chunks inside size() or range() before it’s declared in the same WITH clause.

cypher_query = """
WITH $chunks AS chunks, $embeddings AS embeddings
WITH chunks, embeddings, range(0, size(chunks)) AS idx
UNWIND idx AS i
WITH i, chunks[i] AS chunk, embeddings[i] AS embedding
MERGE (c:Chunk {index: i})
SET c.text = chunk,
    c.embedding = embedding
"""

driver.execute_query(cypher_query, chunks=chunks, embeddings=embeddings)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x000002239DC33BB0>, keys=[])

In [41]:
print(embeddings[0][0:3])
print(len(embeddings))
print(len(embeddings[0]))

[0.00421591, -0.042048614, 0.019263804]
89
768


In [20]:
type(embeddings)

list

In [42]:
from neo4j import GraphDatabase
# driver = GraphDatabase.driver("neo4j://localhost:7687", auth=("neo4j", "password"))
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))

In [43]:
result = driver.verify_connectivity()
print(result)

None


In [44]:
question = "At what time was Einstein really interested in experimental works?"
question_embedding = embed_with_gemini([question])[0]

query = '''
CALL db.index.vector.queryNodes('pdf', $k, $question_embedding) YIELD node AS hits, score
RETURN hits.text AS text, score, hits.index AS index
'''
similar_records, _, _ = driver.execute_query(query, question_embedding=question_embedding, k=4)

for record in similar_records:
    print(record["text"])
    print(record["score"], record["index"])
    print("======")

Einstein
left his job at the Patent office and joined the University of Zurich on October 15, 1909. Thereafter, he
continued to rise in ladder. In 1911, he moved to Prague University as a full professor, a year later, he
was appointed as full professor at ETH, Zurich, his alma‐mater. In 1914, he was appointed Director of
the Kaiser Wilhelm Institute for Physics (1914–1932) and a professor at the Humboldt University of
Berlin, with a special clause in his contract that freed him from teaching obligations. In the meantime,
he was working for
0.8374502658843994 31
Einstein’s life was rather featureless. He diligently worked at the patent office,
played violin, discussed physics with his friends, write few not so interesting papers. Then in 1905, he
took the academic world by surprise. In the annals of physics, the year 1905 is known as “annus
mirabilis” or the year of miracle. Indeed, a miracle happened. Albert Einstein, barely 26 years old,
sitting in an obscure Swiss patent office, wrot

In [25]:
from google import genai

# Initialize the client
# genai.configure(api_key="YOUR_API_KEY")

# Prepare system + user messages
system_message = "You're an Einstein expert, but can only use the provided documents to respond to the questions."
user_message = f"""
Use the following documents to answer the question that will follow:
{[doc['text'] for doc in similar_records]}

---

The question to answer using information only from the above documents: {question}
"""

# Create a chat completion
response = genai.client.responses.create(
    model="gemini-2.2-chat",
    input=[
        {"author": "system", "content": system_message},
        {"author": "user", "content": user_message},
    ],
    temperature=0.0,
    max_output_tokens=512
)
print(response.output_text)


AttributeError: module 'google.genai.client' has no attribute 'responses'

In [22]:
try :
    driver.execute_query(f"CREATE FULLTEXT INDEX ftPdfChunk FOR (c:Chunk) ON EACH [c.text]")
except:
    print("Fulltext Index already exists")

In [30]:
from google import genai
import os

# Initialize the Gemini client
gemini_client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

# System and user messages
system_message = "You're an Einstein expert, but can only use the provided documents to respond to the questions."
user_message = f"""
Use the following documents to answer the question that will follow:
{[doc['text'] for doc in similar_records]}

---

The question to answer using information only from the above documents: {question}
"""

# ✅ Correct v1.52.0 call
response = gemini_client.chat.create(
    model="gemini-2.2-chat",
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message}
    ],
    temperature=0.0,
    max_output_tokens=512
)

# Print the result
print(response.output_text)


AttributeError: 'Client' object has no attribute 'chat'

In [None]:
hybrid_query = '''
CALL {
    // vector index
    CALL db.index.vector.queryNodes('pdf', $k, $question_embedding) YIELD node, score
    WITH collect({node:node, score:score}) AS nodes, max(score) AS max
    UNWIND nodes AS n
    // We use 0 as min
    RETURN n.node AS node, (n.score / max) AS score
    UNION
    // keyword index
    CALL db.index.fulltext.queryNodes('ftPdfChunk', $question, {limit: $k})
    YIELD node, score
    WITH collect({node:node, score:score}) AS nodes, max(score) AS max
    UNWIND nodes AS n
    // We use 0 as min
    RETURN n.node AS node, (n.score / max) AS score
}
// dedup
WITH node, max(score) AS score ORDER BY score DESC LIMIT $k
RETURN node, score
'''
similar_hybrid_records, _, _ = driver.execute_query(hybrid_query, question_embedding=question_embedding, question=question, k=4)

for record in similar_hybrid_records:
    print(record["node"]["text"])
    print(record["score"], record["node"]["index"])
    print("======")

CH‐Switzerland
Considering Einstein’s upbringing, his interest in inventions and patents was not unusual.
Being a manufacturer’s son, Einstein grew upon in an environment of machines and instruments.
When his father’s company obtained the contract to illuminate Munich city during beer festival, he
was actively engaged in execution of the contract. In his ETH days Einstein was genuinely interested
in experimental works. He wrote to his friend, “most of the time I worked in the physical laboratory,
fascinated by the direct contact with observation.” Einstein's
1.0 42
Einstein
left his job at the Patent office and joined the University of Zurich on October 15, 1909. Thereafter, he
continued to rise in ladder. In 1911, he moved to Prague University as a full professor, a year later, he
was appointed as full professor at ETH, Zurich, his alma‐mater. In 1914, he was appointed Director of
the Kaiser Wilhelm Institute for Physics (1914–1932) and a professor at the Humboldt University of
Berlin

In [None]:
user_message = f"""
Use the following documents to answer the question that will follow:
{[doc["node"]["text"] for doc in similar_hybrid_records]}

---

The question to answer using information only from the above documents: {question}
"""

print("Question:", question)

stream = open_ai_client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message}
    ],
    stream=True,
)
for chunk in stream:
    print(chunk.choices[0].delta.content or "", end="")

Question: At what time was Einstein really interested in experimental works?
Einstein was genuinely interested in experimental works during his days at ETH, as indicated in the provided documents.

In [1]:
from google import genai
print(genai.__version__)

1.52.0
