In [1]:
%load_ext dotenv
%dotenv

In [2]:
import re
from typing import List

import pdfplumber
import requests

from utils import chat, chunk_text, embed, neo4j_driver, num_tokens_from_string

In [3]:
# System prompt for the step-back rewriting step. It tells the model to turn a
# specific question into a more generic one and supplies two few-shot
# input/output examples. The text is sent to the model verbatim.
stepback_system_message = """
You are an expert at world knowledge. Your task is to step back
and paraphrase a question to a more generic step-back question, which
is easier to answer. Here are a few examples

"input": "Could the members of The Police perform lawful arrests?"
"output": "what can the members of The Police do?"

"input": "Jan Sindel’s was born in what country?"
"output": "what is Jan Sindel’s personal history?"
"""


def generate_stepback(question: str):
    """Ask the chat model for a more generic "step-back" version of a question.

    The module-level ``stepback_system_message`` is sent as the system prompt
    and ``question`` as the user message. Whatever ``chat`` returns is handed
    back to the caller unchanged.
    """
    conversation = [
        {"role": "system", "content": stepback_system_message},
        {"role": "user", "content": f"{question}"},
    ]
    return chat(messages=conversation)

In [4]:
# Try the step-back rewriting on a sample question and show the generic
# question the model produced.
question = "Which team did Thierry Audel play for from 2007 to 2008?"
step_back_question = generate_stepback(question)
print(f"Stepback results: {step_back_question}")

Stepback results: What is Thierry Audel's career history?


In [5]:
remote_pdf_url = "https://arxiv.org/pdf/1709.00666.pdf"
pdf_filename = "ch03-downloaded.pdf"

# Download the paper used as the knowledge source for this notebook.
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection. raise_for_status() stops the notebook right here on an HTTP
# error, instead of printing a message and letting the next cell fail on a
# missing (or stale) local file.
response = requests.get(remote_pdf_url, timeout=30)
response.raise_for_status()

with open(pdf_filename, "wb") as pdf_file:
    pdf_file.write(response.content)

In [6]:
# Concatenate the extracted text of every page into one string.
with pdfplumber.open(pdf_filename) as pdf:
    # `or ""` guards against pages for which extract_text() yields no text
    # (some pdfplumber versions return None for such pages), which would make
    # plain string concatenation raise a TypeError. Pages are joined without a
    # separator, exactly as before.
    text = "".join(page.extract_text() or "" for page in pdf.pages)

In [7]:
def split_text_by_titles(text: str) -> List[str]:
    """Split ``text`` into sections at numbered heading lines.

    A heading is a line of its own that starts with one or more digits, an
    optional uppercase letter, a dot, one to three spaces and then at most
    60 further characters (for example ``1. Introduction``).

    Args:
        text: The full document text.

    Returns:
        A list whose first element is everything before the first heading
        (returned unchanged, possibly empty). It is followed by one string
        per heading, made of the stripped heading, a newline and the stripped
        section body.
    """
    # No re.DOTALL here: "." must not match newlines. With DOTALL the greedy
    # ".{0,60}" could run across line breaks, so a heading followed by short
    # lines would swallow them, including a following heading.
    title_pattern = re.compile(r"(\n\d+[A-Z]?\. {1,3}.{0,60}\n)")

    # The pattern has exactly one capturing group, so split() returns
    # [preamble, title_1, body_1, title_2, body_2, ...].
    parts = title_pattern.split(text)

    sections_with_titles = [parts[0]]
    for title, body in zip(parts[1::2], parts[2::2]):
        sections_with_titles.append(title.strip() + "\n" + body.strip())

    return sections_with_titles


# Split the extracted PDF text into sections and report how many were found.
sections = split_text_by_titles(text)
print(f"Number of sections: {len(sections)}")

Number of sections: 9


In [8]:
# Print the token count of each section, one number per line.
for section in sections:
    token_count = num_tokens_from_string(section)
    print(token_count)

154
254
4186
570
2703
804
637
194
600


In [9]:
# Show the second entry of `sections` (index 1) as a sample.
print(sections[1])

1. Introduction
Towards the end of the last century, Times Magazine asked some of the World’s leading
personalities to pick their choice for the person of the century. The magazine compiled a list 100 most
influential people of 20th century and the German born scientist Albert Einstein topped the list.
Einstein’s choice as the person of the century didn’t invoke any resentment, it was generally agreed
that 20th century is the age of Science and undoubtedly, Einstein’s contribution to Science, to the
understanding of the intricate laws of nature was unparalleled. He greatly influenced modern science;
altered our views on space‐time, matter and energy, gave new interpretation to gravity etc. The
enormous popularity he enjoyed during his lifetime and even now, is rare for any individual; religious
leader, politician, film star. Even a child knows his name, not to speak of adults.
However, while Einstein is known as a great theoretical physicist, few possibly knew that he
had more than 50 

In [10]:
# Break every section into parent chunks by calling chunk_text(section, 2000, 40)
# and collect all resulting chunks in one flat list, in section order.
parent_chunks = [
    parent_chunk
    for section in sections
    for parent_chunk in chunk_text(section, 2000, 40)
]

In [11]:
# Cypher statement that stores one parent chunk together with its children.
# Parameters: $pdf_id, $id, $parent, $children, $embeddings.
# It merges a PDF node and a Parent node (id "<pdf_id>-<id>", text = $parent),
# links them with HAS_PARENT, then for every index of $children merges a Child
# node (id "<pdf_id>-<id>-<index>"), sets its text and embedding from the
# same position of $children / $embeddings, and links it with HAS_CHILD.
cypher_import_query = """
MERGE (pdf:PDF {id:$pdf_id})
MERGE (p:Parent {id:$pdf_id + '-' + $id})
SET p.text = $parent
MERGE (pdf)-[:HAS_PARENT]->(p)
WITH p, $children AS children, $embeddings as embeddings
UNWIND range(0, size(children) - 1) AS child_index
MERGE (c:Child {id: $pdf_id + '-' + $id + '-' + toString(child_index)})
SET c.text = children[child_index], c.embedding = embeddings[child_index]
MERGE (p)-[:HAS_CHILD]->(c);
"""

In [12]:
import os

from neo4j import GraphDatabase

# Never keep the database password in the notebook: read it from the
# environment instead (for example from the .env file loaded by %dotenv).
# NOTE(review): the variable name NEO4J_PASSWORD is an assumption; make sure
# it matches your .env. The password that used to be hardcoded here should be
# considered leaked and rotated.
neo4j_password = os.getenv("NEO4J_PASSWORD")
if not neo4j_password:
    raise RuntimeError(
        "NEO4J_PASSWORD is not set; add it to your .env file or environment."
    )

driver = GraphDatabase.driver(
    "neo4j://localhost:7688",
    auth=("neo4j", neo4j_password),
)

# Creating the driver object alone does not prove the server is reachable.
# verify_connectivity() raises if the server cannot be reached or rejects the
# credentials, so the success message below is only printed after a real
# round trip.
driver.verify_connectivity()

print("✓ Connected to new Neo4j container 'graphrag-book'")

✓ Connected to new Neo4j container 'graphrag-book'


In [13]:
# Store every parent chunk with its child chunks and their embeddings.
for parent_index, parent_text in enumerate(parent_chunks):
    # Children come from chunk_text(parent_text, 500, 20); each gets an embedding.
    children = chunk_text(parent_text, 500, 20)
    child_embeddings = embed(children)
    # Write the parent, its children and the relationships to Neo4j.
    neo4j_driver.execute_query(
        cypher_import_query,
        pdf_id="1709.00666",
        id=str(parent_index),
        parent=parent_text,
        children=children,
        embeddings=child_embeddings,
    )

In [14]:
index_name = "pdf"

# Create the vector index over Child.embedding through the SAME driver that
# the import loop and parent_retrieval() use (neo4j_driver). An index created
# through a different driver handle may end up in a different database than
# the one holding the Child nodes and receiving the queries.
# The name is taken from index_name so the index that gets created and the
# index that gets queried cannot drift apart.
# NOTE(review): IF NOT EXISTS makes this a no-op when an index with this name
# already exists, even if it was defined for a different label; check
# SHOW INDEXES if retrieval unexpectedly returns nothing.
neo4j_driver.execute_query(f"""CREATE VECTOR INDEX {index_name} IF NOT EXISTS
FOR (c:Child)
ON c.embedding""")

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x117693510>, keys=[])

In [15]:
# Cypher statement for parent-document retrieval.
# Parameters: $index_name, $k, $question_embedding.
# It asks the vector index for $k * 4 nearest nodes, follows HAS_CHILD
# backwards to each node's parent, keeps the highest child score per parent,
# and returns the parent text and score for the top $k parents.
retrieval_query = """
CALL db.index.vector.queryNodes($index_name, $k * 4, $question_embedding)
YIELD node, score
MATCH (node)<-[:HAS_CHILD]-(parent)
WITH parent, max(score) AS score
RETURN parent.text AS text, score
ORDER BY score DESC
LIMIT toInteger($k)
"""

In [16]:
def parent_retrieval(question: str, k: int = 5) -> List[str]:
    """Return the texts of the parent chunks retrieved for ``question``.

    The question is embedded with ``embed`` and ``retrieval_query`` is run
    through ``neo4j_driver`` against the index named by the module-level
    ``index_name``. A short progress report is printed before the query runs.

    Args:
        question: The text to embed and search with.
        k: Passed to the query as ``$k``.

    Returns:
        The ``text`` field of every returned record, in query order.
    """
    question_embedding = embed([question])[0]

    # Progress report for the notebook reader.
    print(f"Searching with question: {question}")
    print(f"Using index: {index_name}")
    print(f"Looking for k={k} results")

    query_parameters = {
        "question_embedding": question_embedding,
        "k": k,
        "index_name": index_name,
    }
    records, _summary, _keys = neo4j_driver.execute_query(
        retrieval_query, **query_parameters
    )

    texts = []
    for record in records:
        texts.append(record["text"])
    return texts

In [17]:
# Retrieve parent documents for a sample question and print what came back.
documents = parent_retrieval(
    "Who was the Einstein's collaborator on sound reproduction system?"
)

document_count = len(documents)
print(f"Number of documents found: {document_count}")
print(f"Documents: {documents}")

# Print each document followed by a separator line.
for document in documents:
    print(document)
    print("=" * 20)

Searching with question: Who was the Einstein's collaborator on sound reproduction system?
Using index: pdf
Looking for k=5 results
Number of documents found: 0
Documents: []


In [18]:
# Inspect the step-back rewrite of a retrieval question on its own.
# NOTE(review): "Einsten's" looks like a typo for "Einstein's"; confirm
# whether it is intentional before changing the input string.
generate_stepback("Who was the Einsten's collaborator on sound reproduction system?")

'Who worked with Einstein on technological projects?'

In [19]:
# System prompt for the answer-generation step; it tells the model to answer
# only from the provided documents.
# NOTE(review): "en" looks like a typo for "an"; the string is sent to the
# model as-is, so it is left untouched here.
answer_system_message = "You're en Einstein expert, but can only use the provided documents to respond to the questions."


def generate_answer(question: str, documents: List[str]) -> str:
    """Answer ``question`` with the chat model, using only ``documents``.

    The documents and the question are placed into one user message, which is
    sent together with ``answer_system_message`` as the system prompt.

    Args:
        question: The question to answer.
        documents: Retrieved texts; the list is interpolated into the prompt
            as-is.

    Returns:
        The value returned by ``chat``. The response is also printed, as
        before, so existing notebook output is unchanged.
    """
    user_message = f"""
    Use the following documents to answer the question that will follow:
    {documents}

    ---

    The question to answer using information only from the above documents: {question}
    """
    result = chat(
        messages=[
            {"role": "system", "content": answer_system_message},
            {"role": "user", "content": user_message},
        ]
    )
    print("Response:", result)
    # Previously the function fell off the end and returned None despite the
    # declared return type, so callers such as rag_pipeline() got no answer.
    return result

In [20]:
def rag_pipeline(question: str) -> str:
    """Run step-back rewriting, retrieval and answer generation for a question.

    The question is rewritten with ``generate_stepback``; the rewritten
    question (which is also printed) is used for ``parent_retrieval``; the
    original question and the retrieved documents are then passed to
    ``generate_answer``, whose result is returned.
    """
    generic_question = generate_stepback(question)
    print(f"Stepback prompt: {generic_question}")
    retrieved_documents = parent_retrieval(generic_question)
    return generate_answer(question, retrieved_documents)

In [21]:
# Run the whole pipeline end to end on a sample question.
rag_pipeline("When was Einstein granted the patent for his blouse design?")

Stepback prompt: What are some notable achievements and inventions associated with Einstein?
Searching with question: What are some notable achievements and inventions associated with Einstein?
Using index: pdf
Looking for k=5 results
Response: I'm sorry, but there are no documents provided to answer the question about when Einstein was granted the patent for his blouse design.
