In [1]:
import requests

remote_pdf_url = "https://arxiv.org/pdf/1709.00666.pdf"
pdf_filename = "ch02-downloaded.pdf"

response = requests.get(remote_pdf_url)

if response.status_code == 200:
    with open(pdf_filename, "wb") as pdf_file:
        pdf_file.write(response.content)
else:
    print("Failed to download the PDF. Status code:", response.status_code)

In [2]:
import pdfplumber

text = ""

with pdfplumber.open(pdf_filename) as pdf:
    for page in pdf.pages:
        text += page.extract_text()

print(text[0:20])

Einstein’s Patents a


In [3]:
# pip install if needed:
# pip install google-genai neo4j python-dotenv
%load_ext dotenv
%dotenv

import os
from google import genai
from neo4j import GraphDatabase
from dotenv import load_dotenv

# load_dotenv()

# 1. Set up the Gemini (GenAI) client
genai_client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

In [4]:
from utils import chunk_text

chunks = chunk_text(text, 500, 40)
print(len(chunks))
print(chunks[0])

89
Einstein’s Patents and Inventions
Asis Kumar Chaudhuri
Variable Energy Cyclotron Centre
1‐AF Bidhan Nagar, Kolkata‐700 064
Abstract: Times magazine selected Albert Einstein, the German born Jewish Scientist as the person of the 20th
century. Undoubtedly, 20th century was the age of science and Einstein’s contributions in unravelling mysteries
of nature was unparalleled. However, few are aware that Einstein was also a great inventor. He and his
collaborators had patented a wide variety of inventions


In [5]:
type(chunks)

list

In [6]:
len(chunks[1])

552

In [7]:
import os
print("URI:", os.environ.get("NEO4J_URI"))
print("USER:", os.environ.get("NEO4J_USERNAME"))
print("PASS:", os.environ.get("NEO4J_PASSWORD"))

URI: bolt://localhost:7687
USER: neo4j
PASS: password


In [8]:
def embed_with_gemini(texts, model="text-embedding-004"):
    # genai_client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
    
    vectors = []

    for t in texts:
        response = genai_client.models.embed_content(
            model=model,
            contents=t
        )

        # response.embeddings is a list
        # emb = response.embeddings[0]: Safely extracts the actual embedding object from the list returned by the API. 
        # Since the loop sends texts one by one, there should only be one embedding per response.
        emb = response.embeddings[0]     # take the first embedding object
        
        #values = emb.values: Extracts the raw numerical vector, which is often returned as a Python tuple of floats.
        values = emb.values              # tuple of floats

        vectors.append([float(v) for v in values])

    return vectors

In [9]:
embeddings = embed_with_gemini(chunks)
print(len(embeddings), len(embeddings[0]))

89 768


In [10]:
# embeddings

In [11]:
# 2. Connect to Neo4j (adjust URI/credentials as needed)
driver = GraphDatabase.driver(
    "bolt://localhost:7687",
    auth=(os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD"))
)

In [12]:
print(len(chunks))

89


In [13]:
driver.execute_query("""
CREATE VECTOR INDEX pdf IF NOT EXISTS
FOR (c:Chunk)
ON c.embedding
""")


EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x0000018B869C35B0>, keys=[])

In [14]:
# Neo4j does NOT allow referencing $chunks inside size() or range() before it’s declared in the same WITH clause.

cypher_query = """
WITH $chunks AS chunks, $embeddings AS embeddings
WITH chunks, embeddings, range(0, size(chunks)) AS idx
UNWIND idx AS i
WITH i, chunks[i] AS chunk, embeddings[i] AS embedding
MERGE (c:Chunk {index: i})
SET c.text = chunk,
    c.embedding = embedding
"""

driver.execute_query(cypher_query, chunks=chunks, embeddings=embeddings)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x0000018B869C39D0>, keys=[])

In [15]:
print(embeddings[0][0:3])
print(len(embeddings))
print(len(embeddings[0]))

[0.00421591, -0.042048614, 0.019263804]
89
768


In [16]:
type(embeddings)

list

In [17]:
from neo4j import GraphDatabase
# driver = GraphDatabase.driver("neo4j://localhost:7687", auth=("neo4j", "password"))
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))

In [18]:
result = driver.verify_connectivity()
print(result)

None


In [19]:
"""
# Your embeddings will always be 768-dimensional with text-embedding-004.
# THIS CODE NEED TO FIX DUE TO ABOVE ERROR

# RUN THIS ON THE DATABASE

CREATE VECTOR INDEX pdf IF NOT EXISTS
FOR (c:Chunk)
ON c.embedding
OPTIONS {
  indexConfig: {
    `vector.dimensions`: 1536,
    `vector.similarity_function`: 'cosine'
  }

"""
"""
ABOVE CODE GIVE ERROR:
# Index query vector has 768 dimensions, but indexed vectors have 1536

This means:

✅ Your stored embeddings = 1536-dimensional
❌ Your question embedding = 768-dimensional

Neo4j vector search requires the query vector dimension = index dimension.

Earlier you inserted embeddings using Gemini with output_dim=1536:
Since you did not provide output_dim=1536, Gemini produced its default dimension = 768.

"""

'\nABOVE CODE GIVE ERROR:\n# Index query vector has 768 dimensions, but indexed vectors have 1536\n\nThis means:\n\n✅ Your stored embeddings = 1536-dimensional\n❌ Your question embedding = 768-dimensional\n\nNeo4j vector search requires the query vector dimension = index dimension.\n\nEarlier you inserted embeddings using Gemini with output_dim=1536:\nSince you did not provide output_dim=1536, Gemini produced its default dimension = 768.\n\n'

In [20]:
question = "At what time was Einstein really interested in experimental works?"
question_embedding = embed_with_gemini([question])[0]

query = '''
CALL db.index.vector.queryNodes('pdf', $k, $question_embedding) YIELD node AS hits, score
RETURN hits.text AS text, score, hits.index AS index
'''
similar_records, _, _ = driver.execute_query(query, question_embedding=question_embedding, k=4)

for record in similar_records:
    print(record["text"])
    print(record["score"], record["index"])
    print("======")

Einstein
left his job at the Patent office and joined the University of Zurich on October 15, 1909. Thereafter, he
continued to rise in ladder. In 1911, he moved to Prague University as a full professor, a year later, he
was appointed as full professor at ETH, Zurich, his alma‐mater. In 1914, he was appointed Director of
the Kaiser Wilhelm Institute for Physics (1914–1932) and a professor at the Humboldt University of
Berlin, with a special clause in his contract that freed him from teaching obligations. In the meantime,
he was working for
0.8374502658843994 31
Einstein’s life was rather featureless. He diligently worked at the patent office,
played violin, discussed physics with his friends, write few not so interesting papers. Then in 1905, he
took the academic world by surprise. In the annals of physics, the year 1905 is known as “annus
mirabilis” or the year of miracle. Indeed, a miracle happened. Albert Einstein, barely 26 years old,
sitting in an obscure Swiss patent office, wrot

In [21]:
question = "At what time was Einstein really interested in experimental works?"

# Consolidate the system instruction into the user message
# This is a common workaround when the 'system_instruction' parameter is unavailable.
user_message = f"You are a helpful and precise assistant. Answer the user's question directly based on historical facts. Question: {question}"

print("Question:", question)

# --- FINAL CORRECTED CALL ---
try:
    response = genai_client.models.generate_content(
        model="gemini-2.5-flash",
        contents=[
            # Use ONLY the 'user' role
            {"role": "user", "parts": [{"text": user_message}]}, 
        ]
        # Removed: temperature=0.0,
        # Removed: max_output_tokens=512
    )

    print("\n--- Gemini Response ---")
    print(response.text)

except Exception as e:
    print(f"\nAn unexpected error occurred: {e}")

Question: At what time was Einstein really interested in experimental works?

--- Gemini Response ---
Albert Einstein was primarily a **theoretical physicist** throughout his entire career. His major contributions were in developing theoretical frameworks, not in conducting or designing experimental works.

While he deeply understood the importance of experimental verification for his theories and often drew inspiration from experimental results (like the Michelson-Morley experiment for special relativity), he did not actively pursue or engage in experimental physics himself. His genius lay in thought experiments and abstract mathematical reasoning. Therefore, there was no specific "time" when he was "really interested in experimental works" in the sense of actively pursuing them; his interest was consistently theoretical.


In [22]:
try :
    driver.execute_query(f"CREATE FULLTEXT INDEX ftPdfChunk FOR (c:Chunk) ON EACH [c.text]")
except:
    print("Fulltext Index already exists")

Fulltext Index already exists


In [23]:
hybrid_query = '''
CALL () {
    // vector index
    CALL db.index.vector.queryNodes('pdf', $k, $question_embedding) YIELD node, score
    WITH collect({node:node, score:score}) AS nodes, max(score) AS max
    UNWIND nodes AS n
    // We use 0 as min
    RETURN n.node AS node, (n.score / max) AS score
    UNION
    // keyword index
    CALL db.index.fulltext.queryNodes('ftPdfChunk', $question, {limit: $k})
    YIELD node, score
    WITH collect({node:node, score:score}) AS nodes, max(score) AS max
    UNWIND nodes AS n
    // We use 0 as min
    RETURN n.node AS node, (n.score / max) AS score
}
// dedup
WITH node, max(score) AS score ORDER BY score DESC LIMIT $k
RETURN node, score
'''

similar_hybrid_records, _, _ = driver.execute_query(hybrid_query, question_embedding=question_embedding, question=question, k=4)

for record in similar_hybrid_records:
    print(record["node"]["text"])
    print(record["score"], record["node"]["index"])
    print("======")

CH‐Switzerland
Considering Einstein’s upbringing, his interest in inventions and patents was not unusual.
Being a manufacturer’s son, Einstein grew upon in an environment of machines and instruments.
When his father’s company obtained the contract to illuminate Munich city during beer festival, he
was actively engaged in execution of the contract. In his ETH days Einstein was genuinely interested
in experimental works. He wrote to his friend, “most of the time I worked in the physical laboratory,
fascinated by the direct contact with observation.” Einstein's
1.0 42
Einstein
left his job at the Patent office and joined the University of Zurich on October 15, 1909. Thereafter, he
continued to rise in ladder. In 1911, he moved to Prague University as a full professor, a year later, he
was appointed as full professor at ETH, Zurich, his alma‐mater. In 1914, he was appointed Director of
the Kaiser Wilhelm Institute for Physics (1914–1932) and a professor at the Humboldt University of
Berlin

In [28]:
import os
from google import genai
# Note: The variables 'gemini_client', 'question', and 'similar_hybrid_records' 
# are assumed to be defined and initialized in your environment.

# 1. CONSOLIDATE PROMPT
system_instruction_text = "You are a helpful and precise assistant. Answer the user's question directly and strictly based ONLY on the documents provided below. Do not use outside knowledge."

user_message = f"""
{system_instruction_text}

Use the following documents to answer the question that will follow:
{[doc["node"]["text"] for doc in similar_hybrid_records]}

---

The question to answer using information only from the above documents: {question}
"""

print("Question:", question)

# --- FINAL CORRECTED CALL: Removed 'stream', 'system_instruction', etc. ---
try:
    # Assuming gemini_client is already initialized
    response = genai_client.models.generate_content(
        model="gemini-2.5-flash", 
        contents=[
            {"role": "user", "parts": [{"text": user_message}]},
        ],
        # REMOVED: stream=True
        # REMOVED: system_instruction=..., temperature=..., max_output_tokens=...
    )

    print("\n--- Gemini Response (Full) ---")
    # Access the full text response via the .text attribute
    print(response.text) 

except NameError as e:
    print(f"\nError: {e}. Ensure all variables are defined.")
except Exception as e:
    print(f"\nAn API or unexpected error occurred: {e}")

Question: At what time was Einstein really interested in experimental works?

--- Gemini Response (Full) ---
During his ETH days.
