In [None]:
%pip install --upgrade langchain langchain-experimental langchain-openai python-dotenv pyvis

In [18]:
import json, re

In [11]:
from dotenv import load_dotenv
import os

# Load variables from the .env file; override=True lets values from .env
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# Read the OpenAI key (it must be present in the .env file in the project
# directory) and fail fast with a clear message rather than hitting an opaque
# authentication error when the first model is created.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to the .env file in the project directory."
    )

In [15]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import WebBaseLoader
from typing import List, Dict, Tuple, Optional, Iterable
from pydantic import BaseModel, Field

from langchain_community.document_transformers import Html2TextTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter


### LLM Graph Transformer
All examples in this notebook use OpenAI's GPT-4o model.

In [None]:
# Chat model used by the default transformer; temperature=0 keeps sampling
# randomness to a minimum.
llm = ChatOpenAI(
    model_name="gpt-4o",
    temperature=0,
)

# Transformer built with only an LLM: no node or relationship types are passed.
graph_transformer = LLMGraphTransformer(
    llm=llm,
)

### Extract Data from Web Sources

In [40]:
URL = "https://zoe.com/learn/foods-that-ease-hot-flashes"

# 1) Fetch the web page and convert its HTML into plain text.
page_loader = WebBaseLoader(URL)
docs = page_loader.load()
plain = Html2TextTransformer().transform_documents(docs)

# 2) Break the plain-text documents into overlapping chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=3000,
)
chunks = splitter.split_documents(plain)


In [30]:
# Plain-text content of the first loaded document.
text = plain[0].page_content

# Keep only the text that precedes the first occurrence of "https".
# str.find returns -1 when the marker is absent, and slicing with -1 would
# silently drop the final character, so fall back to the full text instead.
index = text.find("https")
trimmed_text = text if index == -1 else text[:index]
trimmed_text

"Hot Flashes in Menopause: Foods To Eat and Foods To AvoidAccessibility\nStatementZOE App & Gut Health TestNewDaily30+Our\nscienceMenoScaleLibraryFAQsInvestMenuZOE App & Gut Health TestNewDaily30+Our\nscienceMenoScaleLibraryFAQsInvestNutritionGut HealthCOVIDHealthy LivingLife\nStagesHealth ConditionsPodcastsLife StagesMenopauseUpdated 14th October\n2024What foods help ease hot flashes in menopause?Written byCaroline Thomason,\nRD, CDCESReviewed byKate Bermingham, PhDShare this articleMenopauseFoods to\neatFoods to avoidSummaryHot flashes are a very common menopause symptom, and\nresearch suggests that eating certain foods and avoiding others may help.\nMenopause is the point in a woman’s life when she hasn’t had a period for 12\nmonths due to certain hormonal changes. The time leading up to it is called\nperimenopause and generally lasts for several years.The levels of hormones in\nthe body fluctuate throughout this time, which is why hot flashes are so\ncommon during perimenopause and

In [45]:
# System instructions: what to extract and how to normalise concept names.
SCHEMA_HINT = """
You are extracting a menopause knowledge graph from text.
You need to identify all concepts such as symptoms, food items, bioactive chemicals, and their relationships.

[RULES]
- Map synonyms to one concept, for example: "hot flashes"/"hot flushes" to the same Symptom.
- Keep names concise (e.g., 'hot flashes', 'soy foods', 'Mediterranean diet'). 

Return triples only; no summaries.
"""

# The model name can be overridden with OPENAI_MODEL_NAME; it defaults to GPT-4o.
llm = ChatOpenAI(model=os.getenv("OPENAI_MODEL_NAME", "gpt-4o"), temperature=0)

# LLMGraphTransformer invokes its chain with {"input": <document text>}, so the
# prompt needs an {input} placeholder. Without it the document text is never
# sent to the model and the graph is produced from the instructions alone.
graph_prompt = ChatPromptTemplate.from_messages([
    ("system", SCHEMA_HINT + "Return triples only; do not summarize."),
    (
        "human",
        "Extract menopause knowledge graph triples from given text. "
        "Return only graph objects.\n\nText:\n{input}",
    ),
])

# Transformer that uses the custom prompt above instead of the library default.
extractor = LLMGraphTransformer(llm=llm, prompt=graph_prompt)

In [47]:
# Wrap the page text in a single Document and run the custom extractor on it.
# NOTE(review): this passes `text`, not the `trimmed_text` computed earlier;
# confirm which one is intended.
page_document = Document(
    page_content=text,
    metadata={"source": "manual_string"},
)
documents = [page_document]
graph_documents = extractor.convert_to_graph_documents(documents)

In [48]:
# Show the nodes and relationships extracted from the first document.
first_graph = graph_documents[0]
print(f"Nodes:{first_graph.nodes}")
print(f"Relationships:{first_graph.relationships}")

Nodes:[Node(id='Hot Flashes', type='Symptom', properties={}), Node(id='Soy Foods', type='Food', properties={}), Node(id='Isoflavones', type='Bioactive chemical', properties={}), Node(id='Mediterranean Diet', type='Food', properties={}), Node(id='Fruits', type='Food', properties={}), Node(id='Vegetables', type='Food', properties={}), Node(id='Whole Grains', type='Food', properties={}), Node(id='Nuts', type='Food', properties={}), Node(id='Seeds', type='Food', properties={})]
Relationships:[Relationship(source=Node(id='Soy Foods', type='Food', properties={}), target=Node(id='Isoflavones', type='Bioactive chemical', properties={}), type='CONTAIN', properties={}), Relationship(source=Node(id='Isoflavones', type='Bioactive chemical', properties={}), target=Node(id='Hot Flashes', type='Symptom', properties={}), type='REDUCE', properties={}), Relationship(source=Node(id='Mediterranean Diet', type='Food', properties={}), target=Node(id='Fruits', type='Food', properties={}), type='INCLUDE', pro

### Extract graph data

In [6]:
# Sample input for the examples that follow: the opening paragraphs of an
# Albert Einstein biography, with citation markers such as [1] left in place.
# Note that this rebinds `text`, which an earlier cell used for the web page content.
text = """
Albert Einstein[a] (14 March 1879 – 18 April 1955) was a German-born theoretical physicist who is best known for developing the theory of relativity. Einstein also made important contributions to quantum mechanics.[1][5] His mass–energy equivalence formula E = mc2, which arises from special relativity, has been called "the world's most famous equation".[6] He received the 1921 Nobel Prize in Physics for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect.[7]

Born in the German Empire, Einstein moved to Switzerland in 1895, forsaking his German citizenship (as a subject of the Kingdom of Württemberg)[note 1] the following year. In 1897, at the age of seventeen, he enrolled in the mathematics and physics teaching diploma program at the Swiss federal polytechnic school in Zurich, graduating in 1900. He acquired Swiss citizenship a year later, which he kept for the rest of his life, and afterwards secured a permanent position at the Swiss Patent Office in Bern. In 1905, he submitted a successful PhD dissertation to the University of Zurich. In 1914, he moved to Berlin to join the Prussian Academy of Sciences and the Humboldt University of Berlin, becoming director of the Kaiser Wilhelm Institute for Physics in 1917; he also became a German citizen again, this time as a subject of the Kingdom of Prussia.[note 1] In 1933, while Einstein was visiting the United States, Adolf Hitler came to power in Germany. Horrified by the Nazi persecution of his fellow Jews,[8] he decided to remain in the US, and was granted American citizenship in 1940.[9] On the eve of World War II, he endorsed a letter to President Franklin D. Roosevelt alerting him to the potential German nuclear weapons program and recommending that the US begin similar research.

In 1905, sometimes described as his annus mirabilis (miracle year), he published four groundbreaking papers.[10] In them, he outlined a theory of the photoelectric effect, explained Brownian motion, introduced his special theory of relativity, and demonstrated that if the special theory is correct, mass and energy are equivalent to each other. In 1915, he proposed a general theory of relativity that extended his system of mechanics to incorporate gravitation. A cosmological paper that he published the following year laid out the implications of general relativity for the modeling of the structure and evolution of the universe as a whole.[11][12] In 1917, Einstein wrote a paper which introduced the concepts of spontaneous emission and stimulated emission, the latter of which is the core mechanism behind the laser and maser, and which contained a trove of information that would be beneficial to developments in physics later on, such as quantum electrodynamics and quantum optics.[13]

In the middle part of his career, Einstein made important contributions to statistical mechanics and quantum theory. Especially notable was his work on the quantum physics of radiation, in which light consists of particles, subsequently called photons. With physicist Satyendra Nath Bose, he laid the groundwork for Bose–Einstein statistics. For much of the last phase of his academic life, Einstein worked on two endeavors that ultimately proved unsuccessful. First, he advocated against quantum theory's introduction of fundamental randomness into science's picture of the world, objecting that God does not play dice.[14] Second, he attempted to devise a unified field theory by generalizing his geometric theory of gravitation to include electromagnetism. As a result, he became increasingly isolated from mainstream modern physics.
"""

In [7]:
documents = [Document(page_content=text)]
graph_documents = await graph_transformer.aconvert_to_graph_documents(documents)

#### Visualize graph

In [49]:
from pyvis.network import Network

def visualize_graph(graph_documents, output_file="knowledge_graph.html", open_browser=True):
    """Render the first graph document as an interactive pyvis HTML network.

    Only relationships whose source and target ids both appear in the
    document's node list are drawn, and only nodes that take part in at least
    one such relationship are added (isolated nodes are omitted).

    Args:
        graph_documents: Sequence of graph documents exposing ``nodes`` and
            ``relationships``. Only the first element is visualized; an empty
            sequence is reported and nothing is written.
        output_file: Path of the HTML file to write.
        open_browser: When true, try to open the saved file in the default
            web browser.

    Returns:
        None. The graph is written to ``output_file`` and the absolute path of
        that file is printed.
    """
    import webbrowser
    from pathlib import Path

    if not graph_documents:
        # Nothing to draw; say so instead of failing with an IndexError below.
        print("No graph documents to visualize")
        return

    # Create network
    net = Network(height="1200px", width="100%", directed=True,
                  notebook=False, bgcolor="#222222", font_color="white")

    nodes = graph_documents[0].nodes
    relationships = graph_documents[0].relationships

    # Build lookup for valid nodes
    node_dict = {node.id: node for node in nodes}

    # Keep only edges whose endpoints are both known nodes, and remember which
    # node ids those edges touch.
    valid_edges = [
        rel for rel in relationships
        if rel.source.id in node_dict and rel.target.id in node_dict
    ]
    valid_node_ids = set()
    for rel in valid_edges:
        valid_node_ids.update((rel.source.id, rel.target.id))

    # Add connected nodes in the order they were extracted so the generated
    # file does not depend on set iteration order.
    for node_id, node in node_dict.items():
        if node_id not in valid_node_ids:
            continue
        try:
            net.add_node(node.id, label=node.id, title=node.type, group=node.type)
        except Exception as exc:
            # Best effort: a node pyvis rejects is reported and skipped.
            print(f"Skipping node {node_id!r}: {exc}")

    # Add valid edges
    for rel in valid_edges:
        try:
            net.add_edge(rel.source.id, rel.target.id, label=rel.type.lower())
        except Exception as exc:
            # Covers edges to skipped nodes and relationships without a usable type.
            print(f"Skipping edge {rel.source.id!r} -> {rel.target.id!r}: {exc}")

    # Configure physics
    net.set_options("""
            {
                "physics": {
                    "forceAtlas2Based": {
                        "gravitationalConstant": -100,
                        "centralGravity": 0.01,
                        "springLength": 200,
                        "springConstant": 0.08
                    },
                    "minVelocity": 0.75,
                    "solver": "forceAtlas2Based"
                }
            }
            """)

    output_file = os.fspath(output_file)
    absolute_path = os.path.abspath(output_file)
    net.save_graph(output_file)
    print(f"Graph saved to {absolute_path}")

    if not open_browser:
        return

    # Build a proper file URI (percent-encoded, valid on Windows too) and treat
    # a False return from webbrowser.open as a failure, not only an exception.
    try:
        opened = webbrowser.open(Path(absolute_path).as_uri())
    except Exception:
        opened = False
    if not opened:
        print("Could not open browser automatically")
        
# Visualize the extracted graph documents
visualize_graph(graph_documents=graph_documents)

Graph saved to /Users/ml5128/Documents/BINFG4003_SymbolicAI/Project/menopause_knowledge_graph_by_LLM/knowledge_graph.html


### Extract specific types of nodes

In [40]:
allowed_nodes = ["Person", "Organization", "Location", "Award", "ResearchField"]
graph_transformer_nodes_defined = LLMGraphTransformer(llm=llm, allowed_nodes=allowed_nodes)
graph_documents_nodes_defined = await graph_transformer_nodes_defined.aconvert_to_graph_documents(documents)

In [41]:
# Show what was extracted from the first document under the node-type restriction.
first_graph_nodes_defined = graph_documents_nodes_defined[0]
print(f"Nodes:{first_graph_nodes_defined.nodes}")
print(f"Relationships:{first_graph_nodes_defined.relationships}")

Nodes:[Node(id='Albert Einstein', type='Person', properties={}), Node(id='German Empire', type='Location', properties={}), Node(id='Switzerland', type='Location', properties={}), Node(id='University Of Zurich', type='Organization', properties={}), Node(id='Prussian Academy Of Sciences', type='Organization', properties={}), Node(id='Humboldt University Of Berlin', type='Organization', properties={}), Node(id='Kaiser Wilhelm Institute For Physics', type='Organization', properties={}), Node(id='United States', type='Location', properties={}), Node(id='Franklin D. Roosevelt', type='Person', properties={}), Node(id='Nobel Prize In Physics', type='Award', properties={}), Node(id='Theory Of Relativity', type='Researchfield', properties={}), Node(id='Quantum Mechanics', type='Researchfield', properties={}), Node(id='Photoelectric Effect', type='Researchfield', properties={}), Node(id='Special Relativity', type='Researchfield', properties={}), Node(id='General Relativity', type='Researchfield',

### Extract specific types of relationships

In [42]:
allowed_nodes = ["Person", "Organization", "Location", "Award", "ResearchField"]
allowed_relationships = [
    ("Person", "WORKS_AT", "Organization"),
    ("Person", "SPOUSE", "Person"),
    ("Person", "AWARD", "Award"),
    ("Organization", "IN_LOCATION", "Location"),
    ("Person", "FIELD_OF_RESEARCH", "ResearchField")
]
graph_transformer_rel_defined = LLMGraphTransformer(
  llm=llm,
  allowed_nodes=allowed_nodes,
  allowed_relationships=allowed_relationships
)
graph_documents_rel_defined = await graph_transformer_rel_defined.aconvert_to_graph_documents(documents)

In [43]:
# Visualize the graph extracted with the relationship restrictions
visualize_graph(graph_documents=graph_documents_rel_defined)

Graph saved to /Users/thuvu/Documents/vlogging/Research/knowledge_graph_app/knowledge_graph.html
