In [None]:
# Load environment variables from .env file (Neo4j credentials, OpenAI API key)

%load_ext dotenv
%dotenv

In [None]:
# Import required libraries and utilities for text processing, Neo4j, and OpenAI

from utils import neo4j_driver, num_tokens_from_string, chunk_text, chat, embed

# Drop any cached copy of ch07_tools so the import below re-executes the module
import sys
if 'ch07_tools' in sys.modules:
    del sys.modules['ch07_tools']

# Import it fresh
import ch07_tools

import json
import requests

from tqdm import tqdm
from typing import List, Dict

In [None]:
# Download the Odyssey text from Project Gutenberg

url = "https://www.gutenberg.org/cache/epub/1727/pg1727.txt"
# Bound the wait so a stalled connection cannot hang the notebook.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of later, when the book splitter would
# be handed an error page rather than the text.
response.raise_for_status()

In [None]:
# Split the full text into individual books

def chunk_into_books(text: str) -> List[str]:
    """Cut the raw e-text down to its body and split that body into books.

    The body is taken as everything after the second occurrence of
    "PREFACE TO FIRST EDITION" and before the first "FOOTNOTES". It is
    stripped and split on "\\nBOOK"; the piece in front of the first marker
    is discarded.

    Args:
        text: Full text of the downloaded file.

    Returns:
        One string per "\\nBOOK" marker, each holding what followed the marker.

    Raises:
        IndexError: If "PREFACE TO FIRST EDITION" occurs fewer than two times.
    """
    after_preface = text.split("PREFACE TO FIRST EDITION")[2]
    body = after_preface.partition("FOOTNOTES")[0].strip()
    # The first piece is whatever precedes the first book marker; drop it.
    _before_first_book, *book_texts = body.split("\nBOOK")
    return book_texts

# One string per book: each element is the text that followed a "\nBOOK" marker
books = chunk_into_books(response.text)

In [None]:
# Calculate and display token statistics for each book

# token_count[i] is the size of books[i] as reported by num_tokens_from_string
token_count = [num_tokens_from_string(el) for el in books]
# Report the number of books with the mean, smallest and largest token count
print(
    f"""There are {len(token_count)} books with token sizes:
- avg {sum(token_count) / len(token_count)}
- min {min(token_count)}
- max {max(token_count)}
"""
)

There are 24 books with token sizes:
- avg 6515.208333333333
- min 4459
- max 10760



In [None]:
# Chunk each book into smaller pieces (1000 tokens, 40 overlap)

# chunked_books[i] holds the chunks produced for books[i], so book boundaries are preserved
chunked_books = [chunk_text(book, 1000, 40) for book in books]

In [None]:
# Define entity types and extraction function using GPT-4o

# Entity categories handed to ch07_tools.create_extraction_prompt for every chunk
ENTITY_TYPES = [
    "PERSON",
    "ORGANIZATION",
    "LOCATION",
    "GOD",
    "EVENT",
    "CREATURE",
    "WEAPON_OR_TOOL",
]
def extract_entities(text: str) -> List[Dict]:
    """Run LLM-based entity/relationship extraction on one piece of text.

    Builds the extraction prompt for ENTITY_TYPES, sends it to "gpt-4o" as a
    single user message, and returns whatever
    ch07_tools.parse_extraction_output produces from the reply.

    NOTE(review): the caller in this notebook unpacks the result as
    ``nodes, relationships``, so the parsed value appears to be a pair rather
    than the flat List[Dict] the annotation suggests — confirm against
    ch07_tools.

    Args:
        text: The chunk of source text to analyse.
    """
    prompt = ch07_tools.create_extraction_prompt(ENTITY_TYPES, text)
    reply = chat([{"role": "user", "content": prompt}], model="gpt-4o")
    return ch07_tools.parse_extraction_output(reply)

In [None]:
# Extract entities and relationships from chunks and import to Neo4j

# Only the first `number_of_books` books are processed; raise this to ingest more.
# NOTE(review): the ORESTES description list printed further down contains
# repeated entries, which suggests re-running this cell appends to existing
# nodes rather than replacing them — confirm against ch07_tools.import_nodes_query.
number_of_books = 1
for book_i, book in enumerate(
    tqdm(chunked_books[:number_of_books], desc="Processing Books")
):
    for chunk_i, chunk in enumerate(tqdm(book, desc=f"Book {book_i}", leave=False)):
        # One extraction (and therefore one LLM call) per chunk
        nodes, relationships = extract_entities(chunk)
        # Nodes are sent together with the chunk text and its (book, chunk) position
        neo4j_driver.execute_query(
            ch07_tools.import_nodes_query,
            data=nodes,
            book_id=book_i,
            text=chunk,
            chunk_id=chunk_i,
        )
        # Relationships are imported after the nodes extracted from the same chunk
        neo4j_driver.execute_query(
            ch07_tools.import_relationships_query, data=relationships
        )

Processing Books: 100%|██████████| 1/1 [03:17<00:00, 197.39s/it]


In [None]:
# Verify the count of entities and relationships imported

# Two counts in one query: all __Entity__ nodes, and all RELATIONSHIP edges
# (matched with a direction, so each edge is counted once).
data, _, _ = neo4j_driver.execute_query(
    """MATCH (:`__Entity__`)
    RETURN 'entity' AS type, count(*) AS count
    UNION
    MATCH ()-[:RELATIONSHIP]->()
    RETURN 'relationship' AS type, count(*) AS count
    """
)
# Convert each record to a plain dict for readable printing
print([el.data() for el in data])

[{'type': 'entity', 'count': 65}, {'type': 'relationship', 'count': 286}]


In [None]:
# Retrieve all descriptions for ORESTES (raw description array)

# Returns the description property unchanged for every PERSON node named ORESTES
data, _, _ = neo4j_driver.execute_query(
    """MATCH (n:PERSON)
WHERE n.name = "ORESTES"
RETURN n.description AS description"""
)
print([el.data()['description'] for el in data])

[["Orestes is the son of Agamemnon, who avenged his father's death by killing Aegisthus", 'Orestes is a person who was expected to take revenge on Aegisthus', "Orestes is praised for avenging his father's murder by killing Aegisthus", 'Orestes is the son of Agamemnon who killed Aegisthus', 'Orestes is a person who was expected to take revenge on Aegisthus', "Orestes is praised for avenging his father's murder by killing Aegisthus", 'Orestes is the son of Agamemnon who killed Aegisthus', 'Orestes is the son of Agamemnon who killed Aegisthus', 'Orestes is a person who was expected to take revenge on Aegisthus', "Orestes is praised for avenging his father's murder by killing Aegisthus"]]


In [None]:
# Find the entity pair with the most relationships between them

# Step 1: count RELATIONSHIP edges between every pair of entities, ignoring
# direction, and keep the single pair with the highest count.
# Step 2: re-match that pair to collect the description of each of its edges.
data, _, _ = neo4j_driver.execute_query(
    """MATCH (n:__Entity__)-[:RELATIONSHIP]-(m:__Entity__)
WITH n,m, count(*) AS countOfRels
ORDER BY countOfRels DESC LIMIT 1
MATCH (n)-[r:RELATIONSHIP]-(m)
RETURN n.name AS source, m.name AS target, countOfRels, collect(r.description) AS descriptions
"""
)
print([el.data() for el in data])

[{'source': 'TELEMACHUS', 'target': 'MINERVA', 'countOfRels': 22, 'descriptions': ['Telemachus spoke quietly to Minerva during the banquet to avoid being overheard', 'Minerva, a goddess, advises and encourages Telemachus, giving him courage and making him think about his father', 'Minerva is guiding Telemachus in his plans for a voyage', 'Telemachus spoke quietly to Minerva during the banquet', 'Minerva, in disguise, advises and encourages Telemachus, giving him courage and making him think of his father', 'Minerva gave counsel to Telemachus regarding his intended voyage', 'Telemachus spoke quietly to Minerva during the banquet', 'Minerva, in disguise, advises and encourages Telemachus, giving him courage and making him think of his father', "Minerva brings sleep to Telemachus's mother, showing divine intervention in his household", 'Minerva provides guidance and counsel to Telemachus for his intended voyage', 'Minerva visits Ithaca, where Telemachus challenges the suitors', 'Minerva p

In [None]:
# Summarize multiple descriptions for each entity using GPT-4o

# Only entities that carry more than one description are sent to the LLM
candidates_to_summarize, _, _ = neo4j_driver.execute_query(
    """MATCH (e:__Entity__) WHERE size(e.description) > 1 
    RETURN e.name AS entity_name, e.description AS description_list"""
)
summaries = []
for candidate in tqdm(candidates_to_summarize, desc="Summarizing entities"):
    prompt = ch07_tools.get_summarize_prompt(
        candidate["entity_name"], candidate["description_list"]
    )
    messages = [{"role": "user", "content": prompt}]
    summary = chat(messages, model="gpt-4o")
    summaries.append({"entity": candidate["entity_name"], "summary": summary})

# Write the collected summaries back to the graph
ch07_tools.import_entity_summary(neo4j_driver, summaries)

Summarizing entities: 100%|██████████| 62/62 [02:10<00:00,  2.10s/it]


In [None]:
# Check the summarized description for ORESTES

# Reads the summary property of the PERSON node named ORESTES.
# summary[0] assumes the node exists; an empty result raises IndexError.
summary, _, _ = neo4j_driver.execute_query(
    """MATCH (n:PERSON)
WHERE n.name = "ORESTES"
RETURN n.summary AS summary""")
print(summary[0]['summary'])

Orestes is the son of Agamemnon, who is renowned for avenging his father's death by killing Aegisthus. He was expected to take revenge on Aegisthus, and he fulfilled this expectation, earning praise for his actions.


In [None]:
# Summarize relationships between entity pairs (for pairs with multiple connections)

# Gather the descriptions of all RELATIONSHIP edges between each pair of
# entities. The match ignores direction, so every edge is seen from both
# ends; id(s) < id(t) keeps one orientation. Pairs linked by a single edge
# are left out.
# NOTE(review): id() is deprecated in Neo4j 5 in favour of elementId();
# left unchanged here.
rels_to_summarize, _, _ = neo4j_driver.execute_query(
    """MATCH (s:__Entity__)-[r:RELATIONSHIP]-(t:__Entity__)
    WHERE id(s) < id(t)
    WITH s.name AS source, t.name AS target, 
           collect(r.description) AS description_list,
           count(*) AS count
    WHERE count > 1
    RETURN source, target, description_list"""
)
rel_summaries = []
for candidate in tqdm(rels_to_summarize, desc="Summarizing relationships"):
    # The entity summarize prompt is reused; the pair stands in for the entity name
    entity_name = f"{candidate['source']} relationship to {candidate['target']}"
    messages = [
        {
            "role": "user",
            "content": ch07_tools.get_summarize_prompt(
                entity_name, candidate["description_list"]
            ),
        },
    ]
    summary = chat(messages, model="gpt-4o")
    rel_summaries.append(
        {
            "source": candidate["source"],
            "target": candidate["target"],
            "summary": summary,
        }
    )

# Import the relationship summaries built in this cell (not the entity-level
# `summaries` list from the earlier cell).
ch07_tools.import_rels_summary(neo4j_driver, rel_summaries)

Summarizing relationships: 100%|██████████| 71/71 [04:11<00:00,  3.54s/it]


In [None]:
# Check the summarized relationship between TELEMACHUS and MINERVA

# Looks up SUMMARIZED_RELATIONSHIP edges between the two named entities, in
# either direction, and prints the summary of the first match.
# data[0] assumes at least one such edge exists.
data, _, _ = neo4j_driver.execute_query(
    """MATCH (n:__Entity__)-[r:SUMMARIZED_RELATIONSHIP]-(m:__Entity__)
WHERE n.name = 'TELEMACHUS' AND m.name = 'MINERVA'
RETURN r.summary AS description
"""
)
print(data[0]["description"])

Minerva brings sleep to Telemachus's mother, showing her divine influence


In [None]:
# Detect communities using Louvain algorithm (requires Neo4j GDS plugin)

# calculate_communities returns a mapping; only its 'communityCount' and
# 'communityDistribution' entries are used here.
community_distribution = ch07_tools.calculate_communities(neo4j_driver)
print(f"There are {community_distribution['communityCount']} communities with distribution: {community_distribution['communityDistribution']}")

There are 9 communities with distribution: {'p1': 2, 'p5': 2, 'max': 15, 'p90': 15, 'p50': 5, 'p95': 15, 'p10': 2, 'p75': 11, 'p99': 15, 'p25': 4, 'min': 2, 'mean': 7.111111111111111, 'p999': 15}


In [None]:
# Generate summaries for each community using GPT-4o

community_info, _, _ = neo4j_driver.execute_query(ch07_tools.community_info_query)

communities = []
for community in tqdm(community_info, desc="Summarizing communities"):
    prompt = ch07_tools.get_summarize_community_prompt(
        community["nodes"], community["rels"]
    )
    messages = [{"role": "user", "content": prompt}]
    summary = chat(messages, model="gpt-4o")
    # Each record pairs the decoded JSON report with the community id and
    # the ids of its member nodes.
    communities.append(
        {
            "community": json.loads(ch07_tools.extract_json(summary)),
            "communityId": community["communityId"],
            "nodes": [el["id"] for el in community["nodes"]],
        }
    )

neo4j_driver.execute_query(ch07_tools.import_community_query, data=communities)

Summarizing communities: 100%|██████████| 9/9 [02:58<00:00, 19.81s/it]


EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x114b27a90>, keys=[])

In [None]:
# Retrieve the largest community's title and summary

# Community size = number of incoming IN_COMMUNITY relationships; only the
# community with the most members is returned.
# data[0] assumes at least one __Community__ node exists.
data, _, _ = neo4j_driver.execute_query(
    """MATCH (c:__Community__)
WITH c, count {(c)<-[:IN_COMMUNITY]-()} AS size
ORDER BY size DESC LIMIT 1
RETURN c.title AS title, c.summary AS summary
"""
)
print(data[0]["title"])
print(data[0]["summary"])

Minerva, Telemachus, and the Quest for Ulysses
The community centers around Minerva, the Roman goddess of wisdom, and Telemachus, the son of Ulysses, as they navigate the challenges posed by the suitors in Ithaca and seek news of Ulysses. Minerva plays a pivotal role in guiding and encouraging Telemachus, while other entities such as Jove, Phemius, and Antinous contribute to the dynamics of this mythological narrative.


In [None]:
# Define global retriever using Map-Reduce pattern over community summaries

def global_retriever(query: str, rating_threshold: float = 5) -> str:
    """Answer a broad question from community summaries (map, then reduce).

    Every __Community__ node whose rating is at least ``rating_threshold`` is
    fetched. The question is put to "gpt-4o" once per community summary (map
    step); the collected replies are then handed to one final "gpt-4o" call
    (reduce step).

    Args:
        query: The user's question, sent as the user message of every call.
        rating_threshold: Minimum ``c.rating`` a community needs to be used.

    Returns:
        The reply produced by the reduce-step ``chat`` call.

    Side effects:
        Prints the number of summaries retrieved and shows a tqdm progress bar.
    """

    def _with_system(system_prompt):
        # Fresh message list per call: one system turn followed by the question.
        return [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query},
        ]

    community_data, _, _ = neo4j_driver.execute_query(
        """
    MATCH (c:__Community__)
    WHERE c.rating >= $rating
    RETURN c.summary AS summary
    """,
        rating=rating_threshold,
    )
    print(f"Got {len(community_data)} community summaries")

    # Map: one independent answer per community summary.
    intermediate_results = []
    for community in tqdm(community_data, desc="Processing communities"):
        map_prompt = ch07_tools.get_map_system_prompt(community["summary"])
        intermediate_results.append(chat(_with_system(map_prompt), model="gpt-4o"))

    # Reduce: merge the partial answers into the final reply.
    reduce_prompt = ch07_tools.get_reduce_system_prompt(intermediate_results)
    return chat(_with_system(reduce_prompt), model="gpt-4o")

In [None]:
# Test global retriever with a high-level question

# Uses the default rating threshold; prints the answer from the reduce step
print(global_retriever("What is this story about?"))

Got 6 community summaries


Processing communities: 100%|██████████| 6/6 [00:48<00:00,  8.05s/it]


The story is a rich tapestry of mythological and historical elements centered around the legendary Greek hero Ulysses, also known as Odysseus, and his adventures following the Trojan War. A significant portion of the narrative unfolds in Ithaca, Ulysses' homeland, where his son Telemachus and the goddess Minerva (Athena) navigate the challenges posed by the suitors vying for Ulysses' wife, Penelope, in his prolonged absence [Data: Reports (1)].

Minerva plays a crucial role in guiding and encouraging Telemachus as he seeks news of his father. The dynamics of the story are further enriched by the involvement of other divine and mortal entities, such as Jove (Zeus), Phemius, and Antinous, highlighting the impact of divine intervention on human affairs [Data: Reports (1)].

The narrative also delves into the tragic events involving Aegisthus, Agamemnon, and Orestes. Aegisthus' affair with Clytemnestra and his role in Agamemnon's murder set off a chain of vengeance, culminating in Orestes 

In [None]:
# Create embeddings for entity summaries and build vector index

# Fetch the name and summary of every entity
entities, _, _ = neo4j_driver.execute_query(
    """
MATCH (e:__Entity__)
RETURN e.summary AS summary, e.name AS name
"""
)
# One embed call per entity; the first element of its result is stored as the vector
data = [{"name": el["name"], "embedding": embed(el["summary"])[0]} for el in entities]
# Attach each vector to the entity with the matching name
neo4j_driver.execute_query(
    """
UNWIND $data AS row
MATCH (e:__Entity__ {name: row.name})
CALL db.create.setNodeVectorProperty(e, 'embedding', row.embedding)
""",
    data=data,
)

# The index statement uses no parameters, so none are sent with it
neo4j_driver.execute_query(
    """
CREATE VECTOR INDEX entities IF NOT EXISTS
FOR (n:__Entity__)
ON (n.embedding)
"""
)


EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x114bfa410>, keys=[])

In [None]:
# Define local search query that uses vector similarity to find relevant context

# Cypher for local search. Parameters: $k, $embedding, $topChunks,
# $topCommunities, $topInsideRels.
#   1. A vector lookup on the 'entities' index yields the $k nearest entity
#      nodes, collected into one list.
#   2. Chunks: text of the chunks linked to those entities via HAS_ENTITY,
#      ordered by how many of the matched entities each chunk is linked to.
#   3. Reports: summaries of the communities the entities belong to.
#   4. Relationships: SUMMARIZED_RELATIONSHIP summaries whose two ends are
#      both among the matched entities.
#   5. Entities: the summary of every matched entity.
# The result is a single map, `text`, holding these four lists.
local_search_query = """
CALL db.index.vector.queryNodes('entities', $k, $embedding)
YIELD node, score
WITH collect(node) as nodes
WITH collect {
    UNWIND nodes as n
    MATCH (n)<-[:HAS_ENTITY]->(c:__Chunk__)
    WITH c, count(distinct n) as freq
    RETURN c.text AS chunkText
    ORDER BY freq DESC
    LIMIT $topChunks
} AS text_mapping,
collect {
    UNWIND nodes as n
    MATCH (n)-[:IN_COMMUNITY]->(c:__Community__)
    WITH c, c.rank as rank, c.weight AS weight
    RETURN c.summary 
    ORDER BY rank, weight DESC
    LIMIT $topCommunities
} AS report_mapping,
collect {
    UNWIND nodes as n
    MATCH (n)-[r:SUMMARIZED_RELATIONSHIP]-(m) 
    WHERE m IN nodes
    RETURN r.summary AS descriptionText
    ORDER BY r.rank, r.weight DESC 
    LIMIT $topInsideRels
} as insideRels,
collect {
    UNWIND nodes as n
    RETURN n.summary AS descriptionText
} as entities
RETURN {Chunks: text_mapping, Reports: report_mapping, 
       Relationships: insideRels, 
       Entities: entities} AS text
"""

In [None]:
# Define local search function using vector similarity and graph context

# Number of nearest entities requested from the vector index (sent as $k)
k_entities = 5

# Caps on chunks, community summaries and in-set relationship summaries
# (sent as $topChunks, $topCommunities and $topInsideRels)
topChunks = 3
topCommunities = 3
topInsideRels = 3


def local_search(query: str) -> str:
    """Answer a question from graph context gathered around similar entities.

    The question is embedded and passed to ``local_search_query`` together
    with the module-level limits (``k_entities``, ``topChunks``,
    ``topCommunities``, ``topInsideRels``). The ``text`` value of the first
    returned record is stringified, placed in the system prompt, and "gpt-4o"
    is asked the original question.

    Args:
        query: The user's question.

    Returns:
        The reply from ``chat``.
    """
    query_params = {
        "embedding": embed(query)[0],
        "topChunks": topChunks,
        "topCommunities": topCommunities,
        "topInsideRels": topInsideRels,
        "k": k_entities,
    }
    records, _, _ = neo4j_driver.execute_query(local_search_query, **query_params)
    context_str = str(records[0]["text"])

    system_turn = {
        "role": "system",
        "content": ch07_tools.get_local_system_prompt(context_str),
    }
    user_turn = {"role": "user", "content": query}
    return chat([system_turn, user_turn], model="gpt-4o")


In [27]:
# Test local search with a specific question about an entity

# Prints the model's answer built from the retrieved chunks, reports, relationships and entity summaries
print(local_search("Who is the most important character?"))

Determining the most important character in the narrative surrounding Ithaca and the events related to Ulysses (Odysseus) involves considering several key figures, each playing a significant role in the unfolding story. The data provides insights into the importance of Ulysses, Telemachus, and Minerva, among others.

### Ulysses (Odysseus)

Ulysses is a central figure in the narrative, renowned for his intelligence, resourcefulness, and leadership during the Trojan War. His absence from Ithaca creates a leadership void and a chaotic situation at his household, which is overrun by suitors vying for his wife's hand [Data: Relationships (2); Entities (1, 6)]. Ulysses' journey and the uncertainty of his return are pivotal to the story, as his fate is intertwined with the will of the gods and the future of Ithaca [Data: Reports (1); Entities (6)].

### Telemachus

Telemachus, the son of Ulysses, emerges as a significant character due to his aspirations to assume leadership in Ithaca and his