In [1]:
import os
import re

import networkx as nx
from dotenv import load_dotenv
from neo4j import GraphDatabase, basic_auth
from neo4j.exceptions import CypherSyntaxError
from pyvis.network import Network

# Part 1. Summarise the graphs

In [2]:
# Load the graph
# Connection settings are read from the environment (or a local .env file) so
# that no credential is stored in the notebook.
# NOTE(review): the password that used to be hard-coded in this cell should be
# treated as compromised and rotated.
load_dotenv()
NEO4J_URI = os.getenv("NEO4J_URI", "neo4j+s://38951e74.databases.neo4j.io")
NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
if not NEO4J_PASSWORD:
    raise RuntimeError(
        "NEO4J_PASSWORD is not set; export it or add it to your .env file."
    )

# Initialize an empty directed graph
G = nx.DiGraph()

# Driver and session are used as context managers so the connection is
# released even when the query or the loop below raises.
with GraphDatabase.driver(
    NEO4J_URI, auth=basic_auth(NEO4J_USER, NEO4J_PASSWORD)
) as driver:
    with driver.session() as session:
        # Fetch every directed relationship together with both end nodes
        result = session.run("MATCH (n)-[r]->(m) RETURN n, r, m")

        for record in result:
            # Nodes are identified by their "name" property; the relationship
            # type becomes the edge's "name" attribute.
            n = record["n"]["name"]
            m = record["m"]["name"]
            r_type = record["r"].type
            # add_edge creates both end nodes when they are missing, so no
            # separate add_node calls are required.
            G.add_edge(n, m, name=r_type)

# Convert the networkx graph into a pyvis network
net = Network(notebook=True)
net.from_nx(G)

# Plot the graph
net.show("graph.html")

graph.html


In [3]:
# Sample the walks as Node2Vec behavior
from node2vec import Node2Vec

# Sampler settings gathered in one place so they are easy to tune.
walk_sampler_options = dict(dimensions=128, walk_length=40, num_walks=5, workers=2)

# Only the sampled walks are used here; no embedding model is fitted.
node2vec = Node2Vec(G, **walk_sampler_options)
walks = node2vec.walks


Computing transition probabilities:   0%|          | 0/91 [00:00<?, ?it/s]

Generating walks (CPU: 2):   0%|          | 0/2 [00:00<?, ?it/s]Generating walks (CPU: 2): 100%|██████████| 2/2 [00:00<00:00, 2728.89it/s]
Generating walks (CPU: 1):   0%|          | 0/3 [00:00<?, ?it/s]Generating walks (CPU: 1): 100%|██████████| 3/3 [00:00<00:00, 3416.48it/s]


In [4]:
# Format the walk to sentences
def complete_walk(G, walk):
    """Interleave the nodes of a walk with the names of the edges between them.

    Args:
        G: Mapping-like graph where ``G[a][b]['name']`` is the name of the
            edge from ``a`` to ``b``.
        walk: Sequence of nodes in which consecutive nodes are joined by an
            edge of ``G``.

    Returns:
        A list ``[node0, edge01, node1, edge12, ..., nodeN]``. A walk with
        fewer than two nodes yields an empty list.

    Raises:
        KeyError: If two consecutive nodes have no edge in ``G`` or the edge
            has no ``'name'`` attribute.
    """
    # Nothing to interleave unless the walk contains at least one edge
    if len(walk) < 2:
        return []

    path = []
    for source, target in zip(walk, walk[1:]):
        relation = G[source][target]['name']
        path.extend((source, relation))
    # The loop emits every node except the final one
    path.append(walk[-1])
    return path

def format_walk_to_text(walk):
    """Render an interleaved node/edge walk as a run of short sentences.

    Args:
        walk: Sequence laid out as ``[node, edge, node, edge, ..., node]``.

    Returns:
        One string in which every ``node edge node`` triple becomes
        ``"<node> <edge> <node>. "``. Consecutive triples share their middle
        node. Sequences with fewer than three items give an empty string.
    """
    # Triples start at every second position: 0, 2, 4, ...
    triple_starts = range(0, len(walk) - 2, 2)
    return "".join(
        f"{walk[start]} {walk[start+1]} {walk[start+2]}. "
        for start in triple_starts
    )

In [5]:
# Now process the walks with the new function
# Each walk is first interleaved with its edge names, then rendered as text.
walk_texts = (format_walk_to_text(complete_walk(G, walk)) for walk in walks)

# Walks that produce no sentence at all are dropped.
all_walks_sentences = [text for text in walk_texts if len(text) > 0]

# Show the first rendered walk as a sanity check
all_walks_sentences[0]

'Acquisitions comprised_of Brexit_Programme. Brexit_Programme comprised_of human_capital_management_project. human_capital_management_project improved_by Rescope_large_project_which_has_failed_to_deliver_in_2020. Rescope_large_project_which_has_failed_to_deliver_in_2020 enables Reshaping_the_Asset_and_Cost_Base_. Reshaping_the_Asset_and_Cost_Base_ enables Capacity_and_Capability_theme. '

In [6]:
# Transfer the list to document
# Every walk becomes its own paragraph, separated by a blank line.
paragraph_break = "\n\n"
input_text = paragraph_break.join(all_walks_sentences)

# Preview only the first 500 characters to keep the cell output short
print(input_text[:500])

Acquisitions comprised_of Brexit_Programme. Brexit_Programme comprised_of human_capital_management_project. human_capital_management_project improved_by Rescope_large_project_which_has_failed_to_deliver_in_2020. Rescope_large_project_which_has_failed_to_deliver_in_2020 enables Reshaping_the_Asset_and_Cost_Base_. Reshaping_the_Asset_and_Cost_Base_ enables Capacity_and_Capability_theme. 

4._Marketing depends_on 3._Logistics. 3._Logistics operated_by HomeCare_division. HomeCare_division holds Busi


In [7]:
# Use OpenAI to summarise the document
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
import os
from langchain.chains.question_answering import load_qa_chain

# Pull variables from a local .env file into the process environment
load_dotenv()

# OPENAI_KEY is this notebook's own variable name; fall back to an already
# exported OPENAI_API_KEY so either spelling works.
openai_key = os.getenv('OPENAI_KEY') or os.getenv('OPENAI_API_KEY')
if not openai_key:
    # Assigning None to os.environ would fail with an opaque TypeError, so
    # stop here with an actionable message instead.
    raise RuntimeError(
        "No OpenAI key found: set OPENAI_KEY (or OPENAI_API_KEY) in the "
        "environment or in your .env file."
    )
# Re-export under OPENAI_API_KEY, presumably the name ChatOpenAI looks up.
os.environ['OPENAI_API_KEY'] = openai_key

# temperature=0 requests the least random output the model offers
chat_model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k")


In [8]:
# Prompt asking the chat model for a ~200-word summary of the sampled walks.
# NOTE(review): the prompt text contains typos ("egdes", "knowlegde", "wirte");
# they are left as-is here because editing the prompt changes what is sent to
# the model.
query = f"""
        You are a professional project manager.
        Now I have the input text {input_text} where each paragraph is sampled 
        the nodes and egdes from a knowlegde graph from documents. 
        There are some typos in the input context. 
        Can you wirte a good summary of this knowlegde graph for around 200 words
        """

# Keep the reply in a variable so it can be inspected again without a new call
graph_summary = chat_model.predict(query)
print(graph_summary)

The knowledge graph provided consists of various interconnected nodes and edges that represent different projects and their dependencies within a business transformation portfolio. The graph includes projects related to acquisitions, divestments, digital transformation, logistics, marketing, and more.

One key theme that emerges from the graph is the importance of agility and innovation in response to market changes, particularly in the context of Brexit. The Brexit Programme is a central project that enables agility and innovation, which in turn drive the overall business transformation. The graph also highlights the significance of partnerships, quality, and service in achieving success.

Another important aspect is the focus on value and purpose. Projects related to reshaping the asset and cost base, as well as those aimed at improving tangible and intangible assets, contribute to the overall value theme. Additionally, the graph emphasizes the need for responsible and sustainable li

# Part 2. Text to graph

In [9]:
# Write a customized prompt templates to extract keywords from the document

# Load the pdf
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configuration: chunks of at most 1000 characters, no overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

# Read the guidance PDF, then cut the loaded documents into chunks
file_path = "IPA_Cost_Estimating_Guidance.pdf"
my_loader = PyPDFLoader(file_path)
documents = my_loader.load()
docs = text_splitter.split_documents(documents)

In [10]:
document_title = "Cost Estimating Guidance A best practice approach for infrastructure projects and programmes"

# Join the chunks with a newline: concatenating them directly can glue the
# last word of one chunk to the first word of the next.
chunk_texts = [doc.page_content for doc in docs]
all_docs_contents = "\n".join(chunk_texts)

# Target size, in characters, of the first half
half_length = len(all_docs_contents) // 2  # Use floor division to get the integer part

# Advance chunk by chunk until the first half reaches the target size, so the
# split always falls on a chunk boundary and never in the middle of a word.
split_index = 0
first_half_size = 0
while split_index < len(chunk_texts) and first_half_size < half_length:
    first_half_size += len(chunk_texts[split_index]) + 1  # +1 for the joining newline
    split_index += 1

first_half_contents = "\n".join(chunk_texts[:split_index])
rest_half_contents = "\n".join(chunk_texts[split_index:])

# Get the important context

In [11]:
# Ask the model to pull the key sentences out of the first half of the
# document and rewrite each one as a short "Noun verb Noun." statement.
query = f"""
        You are a professional project manager.
        Now I give you a document context as {first_half_contents} and the document title is {document_title}.
        Can you please fetch the important sentences are relating to important terminologies from the document according to the title 
        Then modify each of these sentence into format as Noun. verb Noun. (example format: Dog chases cat.)
        Please output these sentences WITHOUT making any other information. 
        """
first_half_sentences = chat_model.predict(query)

# Display the raw reply for inspection
first_half_sentences

'Cost estimating process guides project delivery.\nCost estimate is a key factor in project success.\nCost estimate evolves over time as project matures.\nCost estimate should be presented as a range.\nCost estimate is not a single figure.\nCost estimate is linked to project scope and schedule.\nCost estimate should be evidence-based.\nCost estimate should be transparent and robust.\nCost estimate should be reviewed and assured.\nCost estimate should be owned by project leadership.\nCost estimate supports decision-making.\nCost estimate is part of project discipline.\nCost estimate requires qualified and experienced personnel.\nCost estimate should be based on clear assumptions.\nCost estimate should be continuously improved.\nCost estimate should be risk-adjusted.\nCost estimate should be supported by data and evidence.\nCost estimate should be reviewed and validated.\nCost estimate should be produced using appropriate methodology.\nCost estimate should be based on accurate and reliab

In [12]:
# Same extraction request as for the first half, applied to the remaining
# part of the document.
query = f"""
        You are a professional project manager.
        Now I give you a document context as {rest_half_contents} and the document title is {document_title}.
        Can you please fetch the important sentences are relating to important terminologies from the document according to the title 
        Then modify each of these sentence into format as Noun. verb Noun. (example format: Dog chases cat.)
        Please output these sentences WITHOUT making any other information.
        """
rest_half_sentences = chat_model.predict(query)

In [13]:
# Stack the two replies into one text, the second starting on a new line
all_sentences = "\n".join([first_half_sentences, rest_half_sentences])
all_sentences

'Cost estimating process guides project delivery.\nCost estimate is a key factor in project success.\nCost estimate evolves over time as project matures.\nCost estimate should be presented as a range.\nCost estimate is not a single figure.\nCost estimate is linked to project scope and schedule.\nCost estimate should be evidence-based.\nCost estimate should be transparent and robust.\nCost estimate should be reviewed and assured.\nCost estimate should be owned by project leadership.\nCost estimate supports decision-making.\nCost estimate is part of project discipline.\nCost estimate requires qualified and experienced personnel.\nCost estimate should be based on clear assumptions.\nCost estimate should be continuously improved.\nCost estimate should be risk-adjusted.\nCost estimate should be supported by data and evidence.\nCost estimate should be reviewed and validated.\nCost estimate should be produced using appropriate methodology.\nCost estimate should be based on accurate and reliab

# Return knowledge triples

In [14]:
# Ask the model to turn every extracted sentence into a
# (Noun, verb, Noun) item and to reply with the list only.
query = f"""
        You are a professional project manager.
        Now I give you a bunch of sentences {all_sentences}.
        Each sentence should be roughly be Noun. verb Noun. format. 
        Can you output a list, where each item in the list in the format as (Noun, verb, Noun).
        For example, if there is sentence like: Dog chases cat, you only return me (Dog, chases, cat) as one item.
        Please output the list WITHOUT making any other information.
        """
triples_list = chat_model.predict(query)

# Display the raw reply; it is still a string at this point
triples_list

'[\n  ("Cost estimating process", "guides", "project delivery"),\n  ("Cost estimate", "is", "key factor"),\n  ("Cost estimate", "evolves", "over time"),\n  ("Cost estimate", "should be presented", "as range"),\n  ("Cost estimate", "is", "not single figure"),\n  ("Cost estimate", "is linked", "to project scope"),\n  ("Cost estimate", "should be", "evidence-based"),\n  ("Cost estimate", "should be", "transparent and robust"),\n  ("Cost estimate", "should be reviewed", "and assured"),\n  ("Cost estimate", "should be owned", "by project leadership"),\n  ("Cost estimate", "supports", "decision-making"),\n  ("Cost estimate", "is part", "of project discipline"),\n  ("Cost estimate", "requires", "qualified and experienced personnel"),\n  ("Cost estimate", "should be based", "on clear assumptions"),\n  ("Cost estimate", "should be continuously", "improved"),\n  ("Cost estimate", "should be", "risk-adjusted"),\n  ("Cost estimate", "should be supported", "by data and evidence"),\n  ("Cost estimat

In [15]:
import ast
import warnings


def _parse_triples(raw_text):
    """Turn the model's textual reply into a list of 3-item tuples.

    The reply is free text, so it may wrap the list in a code fence or add a
    sentence around it. Only the outermost ``[...]`` span is evaluated, and
    only with ``ast.literal_eval``, which accepts literals and never runs code.

    Args:
        raw_text: String returned by the chat model.

    Returns:
        List of ``(subject, relation, object)`` tuples of stripped strings.
        Items that are not sequences of exactly three elements are skipped,
        with a warning.

    Raises:
        ValueError: If no bracketed list is present or it is not a valid
            Python literal list.
    """
    start = raw_text.find("[")
    end = raw_text.rfind("]")
    if start == -1 or end < start:
        raise ValueError("No list found in the model output")

    try:
        parsed = ast.literal_eval(raw_text[start:end + 1])
    except SyntaxError as exc:
        # literal_eval reports malformed text as SyntaxError; re-raise it as
        # ValueError so callers have a single exception type to handle.
        raise ValueError(f"Model output is not a valid list literal: {exc}") from exc
    if not isinstance(parsed, (list, tuple)):
        raise ValueError("Model output did not evaluate to a list")

    triples = []
    skipped = 0
    for item in parsed:
        if isinstance(item, (list, tuple)) and len(item) == 3:
            triples.append(tuple(str(part).strip() for part in item))
        else:
            skipped += 1
    if skipped:
        warnings.warn(f"Skipped {skipped} item(s) that were not 3-element triples")
    return triples


triples_list_for_graph = _parse_triples(triples_list)

print(triples_list_for_graph)

[('Cost estimating process', 'guides', 'project delivery'), ('Cost estimate', 'is', 'key factor'), ('Cost estimate', 'evolves', 'over time'), ('Cost estimate', 'should be presented', 'as range'), ('Cost estimate', 'is', 'not single figure'), ('Cost estimate', 'is linked', 'to project scope'), ('Cost estimate', 'should be', 'evidence-based'), ('Cost estimate', 'should be', 'transparent and robust'), ('Cost estimate', 'should be reviewed', 'and assured'), ('Cost estimate', 'should be owned', 'by project leadership'), ('Cost estimate', 'supports', 'decision-making'), ('Cost estimate', 'is part', 'of project discipline'), ('Cost estimate', 'requires', 'qualified and experienced personnel'), ('Cost estimate', 'should be based', 'on clear assumptions'), ('Cost estimate', 'should be continuously', 'improved'), ('Cost estimate', 'should be', 'risk-adjusted'), ('Cost estimate', 'should be supported', 'by data and evidence'), ('Cost estimate', 'should be reviewed', 'and validated'), ('Cost estim

In [16]:
# Create a directed graph
# NOTE(review): this rebinds G, replacing the Neo4j graph built in Part 1.
G = nx.DiGraph()

# Add edges to the graph
for edge in triples_list_for_graph:
    subject, relation, target = edge[0], edge[1], edge[2]
    # Store the relation under `title` (as before; presumably what pyvis shows
    # on hover — confirm) and also under `name`, the key complete_walk reads,
    # so the Part 1 helpers keep working on this graph too.
    G.add_edge(subject, target, title=relation, name=relation)

# Create a pyvis network
net = Network(notebook=True)
net.from_nx(G)

# Show the graph
net.show("new_graph.html")




new_graph.html
