<a href="https://colab.research.google.com/github/joshuaalpuerto/ML-guide/blob/main/Create_Knowledge_Graph.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Create knowledge graph for DEU immigration law

This notebook uses the technique from these sources:
- https://github.com/rahulnyk/knowledge_graph
- https://towardsdatascience.com/how-to-convert-any-text-into-a-graph-of-concepts-110844f22a1a

Observation:
It produces some good concepts and relationships between those concepts. However, because of the content of the document, creating concepts and adding edges for each chunk doesn't seem to produce good relationships.

It would be better to build the graph as chapter -> sections -> paragraphs, as each paragraph is an isolated concept for a specific law.

In [None]:
!pip install -q pypdf --progress-bar off
!pip install -qU langchain --progress-bar off
!pip install -qU langchainhub --progress-bar off
!pip install -qU openai --progress-bar off
!pip install -qU langchain-fireworks --progress-bar off
!pip install -qU pyvis

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# @title Connect to google drive
from google.colab import drive
import os
import json

# Mount Google Drive so that files under /content/drive become readable.
drive.mount('/content/drive')

# Load the JSON settings file stored on Drive; the LLM cell reads its API key
# from the resulting `env` dict.
with open('/content/drive/MyDrive/env/env.json') as jsonfile:
    env = json.load(jsonfile)

Mounted at /content/drive


# Regenerate graph

In [None]:
# @title Load llm
from langchain.chat_models import ChatOpenAI

# Initialize a chat model through the OpenAI-compatible client, pointed at the
# OpenRouter endpoint (see openai_api_base) instead of the OpenAI API.
# Original author's note: for function calling we cannot use the ChatFireworks
# integration as it doesn't properly pass functions.
llm = ChatOpenAI(model="huggingfaceh4/zephyr-7b-beta:free",
        openai_api_key=env['openrouter.ai']['apiKey'],
        openai_api_base="https://openrouter.ai/api/v1",
        temperature= 0, max_tokens= 4096
      )

In [None]:
# @title Helper functions
import pandas as pd
import numpy as np
from textwrap import dedent
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import uuid
import re
import json



def graphPrompt(input: str, metadata=None):
    """Ask the LLM to extract (node_1, node_2, edge) triples from one text chunk.

    Args:
        input: Text chunk substituted for ``{context}`` in the prompt.
        metadata: Optional dict whose entries are merged into every extracted
            item (the caller passes ``{"chunk_id": ...}``).

    Returns:
        A list of dicts parsed from the model's JSON answer, each extended
        with ``metadata``; ``None`` when the call fails or the answer cannot
        be parsed, so callers can skip the chunk.
    """
    # Use None as the default instead of a mutable ``{}`` shared between calls.
    metadata = {} if metadata is None else metadata
    prompt = PromptTemplate.from_template(
        dedent("""<|system|>You are a network graph maker who extracts terms and their relations from a given context.
        You are provided with a context chunk (delimited by ```) Your task is to extract the ontology
        of terms mentioned in the given context. These terms should represent the key concepts as per the context.

        Thought 1: While traversing through each sentence, Think about the key terms mentioned in it.
            Terms may include object, entity, location, organization, person,
            condition, acronym, documents, service, concept, etc.
            Terms should be as atomistic as possible

        Thought 2: Think about how these terms can have one on one relation with other terms.
            Terms that are mentioned in the same sentence or the same paragraph are typically related to each other.
            Terms can be related to many other terms

        Thought 3: Find out the relation between each such related pair of terms.

        You must format your output as a list of json. Each element of the list contains a pair of terms
        and the relation between them, like the following:
        ```json
        [
           {{
              "node_1": "A concept from extracted ontology",
              "node_2": "A related concept from extracted ontology",
              "edge": "relationship between the two concepts, node_1 and node_2 in one or two sentences"
           }}
        ]
        ```

        If you do your BEST WORK, I'll give you a $10,000 tip!</s>
        <|user|>
        context: ```{context}```
        <|assistant|>
        """)
    )

    # Kept outside the try block so the error report can show the raw answer.
    text = None
    try:
        llm_chain = LLMChain(prompt=prompt, llm=llm)
        response = llm_chain.invoke({ "context": input})
        text = response['text']
        # Prefer the fenced ```json block; fall back to the whole answer when
        # the fence is missing or was never closed.
        fenced = re.search(r'```json(.*?)```', text, re.DOTALL)
        json_str = fenced.group(1).strip() if fenced else text

        result = json.loads(json_str)
        result = [dict(item, **metadata) for item in result]
    except Exception as exc:
        # Catch Exception rather than using a bare except, so that interrupting
        # the kernel (KeyboardInterrupt) still stops a long batch run.
        error_info =  {
            "context": input,
            "response": text,
            "error": repr(exc),
             **metadata
        }
        print("\n\nERROR ### Here is the buggy response: ", error_info, "\n\n")
        result = None

    return result


def documents2Dataframe(documents) -> pd.DataFrame:
    """Turn document chunks into a DataFrame with one row per chunk.

    Each row holds the chunk's ``page_content`` under ``text``, every entry of
    its ``metadata`` dict, and a newly generated hex ``chunk_id``.
    """
    def to_record(doc):
        # Same key precedence as a dict literal: text, then metadata, then id.
        record = {"text": doc.page_content}
        record.update(doc.metadata)
        record["chunk_id"] = uuid.uuid4().hex
        return record

    return pd.DataFrame([to_record(doc) for doc in documents])

def df2Graph(dataframe: pd.DataFrame) -> list:
    """Run ``graphPrompt`` on every chunk and return all extracted items as one flat list.

    Args:
        dataframe: Chunk table with ``text`` and ``chunk_id`` columns.

    Returns:
        A flat list of the dicts returned by ``graphPrompt`` across all rows.
        Chunks for which ``graphPrompt`` returned ``None`` are skipped; the
        list is empty when nothing could be extracted.
    """
    per_chunk = [
        graphPrompt(text, {"chunk_id": chunk_id})
        for text, chunk_id in zip(dataframe["text"], dataframe["chunk_id"])
    ]

    # Flatten in plain Python: np.concatenate raises ValueError when every
    # chunk of the batch failed and there is nothing left to concatenate.
    return [item for items in per_chunk if items is not None for item in items]

def graph2Df(nodes_list) -> pd.DataFrame:
    """Build the edge table from the extracted items and normalise node names.

    Args:
        nodes_list: List of dicts, normally with ``node_1``, ``node_2`` and
            ``edge`` keys plus any metadata; may be empty.

    Returns:
        A DataFrame without rows whose ``node_1`` or ``node_2`` is missing
        (or a single space), with both node columns lower-cased. An empty
        input gives an empty frame that still has the two node columns.
    """
    ## Remove all NaN entities
    graph_dataframe = pd.DataFrame(nodes_list).replace(" ", np.nan)

    # An empty extraction has no node columns at all; add them so the dropna
    # below yields an empty frame instead of raising KeyError.
    for column in ("node_1", "node_2"):
        if column not in graph_dataframe.columns:
            graph_dataframe[column] = np.nan

    graph_dataframe = graph_dataframe.dropna(subset=["node_1", "node_2"])

    # astype(str) first: the model occasionally emits a number as a node, and
    # calling .lower() on a non-string would raise AttributeError.
    return graph_dataframe.assign(
        node_1=graph_dataframe["node_1"].astype(str).str.lower(),
        node_2=graph_dataframe["node_2"].astype(str).str.lower(),
    )

In [None]:
# @title Load pdf
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the immigration-law PDF from Drive.
loader = PyPDFLoader("/content/drive/MyDrive/knowledge_graph/deu_englisch_aufenthg.pdf")
documents = loader.load()

# Split the loaded documents into overlapping chunks (sizes configured here).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200
)

# NOTE: despite its name, `pages` holds the split chunks, not the PDF pages.
pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))

# One row per chunk, each with a generated chunk_id.
df = documents2Dataframe(pages)
print(df.shape)
df.head()

Number of chunks =  354
(354, 4)


Unnamed: 0,text,source,page,chunk_id
0,Service provided by the Federal Ministry of Ju...,/content/drive/MyDrive/knowledge_graph/deu_eng...,0,d7fefa94042944198fc8acd3930535a7
1,"bearing the consequences thereof (OJ L 212, p...",/content/drive/MyDrive/knowledge_graph/deu_eng...,0,e1aba25951fc4b9fa7309b424aa6a9e3
2,Service provided by the Federal Ministry of Ju...,/content/drive/MyDrive/knowledge_graph/deu_eng...,1,eafc569b864b461089b8b6a6fc9a0c1b
3,"applies, insofar as this may constitute a prer...",/content/drive/MyDrive/knowledge_graph/deu_eng...,1,9ab8c1ea35004290abeb62d0bd7f1988
4,Service provided by the Federal Ministry of Ju...,/content/drive/MyDrive/knowledge_graph/deu_eng...,2,98e799b5482c4f928710103f81463c39


In [None]:
from pathlib import Path

# File-name prefix shared by the CSV/HTML artifacts written by this notebook.
prefix = "deu_englisch_aufenthg"
# Drive folder where the per-batch graph CSVs, the chunk CSV and the merged graph CSV are written.
outputdirectory = Path(f"/content/drive/MyDrive/knowledge_graph/")
# Number of chunks processed per batch; also used when merging the per-batch CSV files.
BATCH_SIZE = 5

In [None]:
# @title Main graph creator
from tqdm import tqdm
import time

def createAndSaveGraph(df, batch_number=0):
    """Extract the graph for one batch of chunks and save it as a pipe-separated CSV.

    Args:
        df: Chunk rows of this batch (passed to ``df2Graph``).
        batch_number: Index used in the output file name
            ``{prefix}_graph_{batch_number}.csv`` inside ``outputdirectory``.
    """
    # Create the target folder before the expensive LLM calls so a bad path
    # fails immediately; exist_ok avoids the check-then-create race of
    # os.path.exists() followed by os.makedirs().
    outputdirectory.mkdir(parents=True, exist_ok=True)

    concepts_list = df2Graph(df)
    batch_graph = graph2Df(concepts_list)

    batch_graph.to_csv(outputdirectory/f"{prefix}_graph_{batch_number}.csv", sep="|", index=False)


# Define a function to load data in batches
def batch_loader(df, batch_size=BATCH_SIZE, delay_seconds=20):
    """Process the chunk table in consecutive batches, saving one graph CSV per batch.

    Args:
        df: Full chunk table; sliced positionally with ``iloc``.
        batch_size: Number of rows per batch. The last batch may be smaller.
        delay_seconds: Pause between two batches, used to stay below the
            API rate limit.
    """
    # Ceiling division, so a trailing partial batch gets its own iteration.
    num_batches = -(-len(df) // batch_size)

    for i in tqdm(range(num_batches), desc='Batch Processing'):
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, len(df))

        print(f"Processing: {start_idx} - {end_idx} chunks")
        createAndSaveGraph(df.iloc[start_idx:end_idx], i)

        # we need to create delay to prevent rate limit; no request follows
        # the last batch, so there is nothing to wait for after it.
        if i < num_batches - 1:
            time.sleep(delay_seconds)



# Save the chunk table before the long LLM run: chunk_id values are generated
# randomly each time the chunks are built, so if a batch fails midway the graph
# CSVs already written would reference ids that were never stored.
outputdirectory.mkdir(parents=True, exist_ok=True)
df.to_csv(outputdirectory/f"{prefix}_chunks.csv", sep="|", index=False)
batch_loader(df)

In [None]:
# @title concat generated csv files
import pandas as pd

# sometimes LLM generate node_3, node_4 ... we will remove those.
def drop_extra_columns(df, node_number = 2):
    """Drop ``node_<k>`` columns whose number ``k`` exceeds ``node_number``.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame that may contain extra ``node_3``, ``node_4``, ... columns.
        node_number: Highest node index to keep.

    Returns:
        The same DataFrame object, without the extra node columns.
    """
    def is_extra(col):
        if not isinstance(col, str) or 'node' not in col:
            return False
        suffix = col.split('_')[-1]
        # A column such as "node_type" contains 'node' but has no numeric
        # suffix; int() on it would raise ValueError, so it is simply kept.
        return suffix.isdecimal() and int(suffix) > node_number

    columns_to_drop = [col for col in df.columns if is_extra(col)]
    df.drop(columns=columns_to_drop, inplace=True)

    return df

def merge_sub_graphs():
  """Concatenate all per-batch graph CSVs into one cleaned edge table.

  Reads ``{prefix}_chunks.csv`` to work out how many batch files exist, loads
  every ``{prefix}_graph_{i}.csv``, removes extra ``node_<k>`` columns, drops
  rows with an empty node or edge, and adds a constant ``count`` column.

  Returns:
      The merged DataFrame.
  """
  # read the stored df_chunks
  df_chunks = pd.read_csv(f"{outputdirectory}/{prefix}_chunks.csv", sep="|")

  # Ceiling division: when the chunk count is not a multiple of BATCH_SIZE the
  # batch run also writes a final, smaller batch file, which floor division
  # would silently leave out of the merge.
  num_batches = -(-len(df_chunks) // BATCH_SIZE)

  graph_dfs = [
      pd.read_csv(f"{outputdirectory}/{prefix}_graph_{i}.csv", sep="|")
      for i in range(num_batches)
  ]

  # Concatenate all DataFrames into a single DataFrame
  df_graph = drop_extra_columns(pd.concat(graph_dfs, ignore_index=True))
  df_graph = (
      df_graph.replace("", np.nan)
      .dropna(subset=["node_1", "node_2", 'edge'])
      # Every LLM-extracted relation starts with the same base count.
      .assign(count=4)
  )

  return df_graph


# Merge the per-batch CSV files into one edge table and store it alongside them.
df_graph = merge_sub_graphs()
df_graph.to_csv(outputdirectory/f"{prefix}_graph.csv", sep="|", index=False)

# Create contextual proximity

In [None]:
import numpy as np

def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Derive "contextual proximity" edges between nodes mentioned in the same chunk.

    Args:
        df: Edge table with ``chunk_id``, ``node_1`` and ``node_2`` columns.

    Returns:
        A DataFrame with columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-joined ids), ``count`` and ``edge`` (always
        ``"contextual proximity"``). Pairs that co-occur only once are dropped.
    """
    # One row per (chunk_id, node) mention: node_1 and node_2 are stacked into
    # a single "node" column, and the helper "variable" column melt adds is dropped.
    mentions = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])

    # Self join on chunk_id pairs up every two mentions from the same chunk.
    pairs = pd.merge(mentions, mentions, on="chunk_id", suffixes=("_1", "_2"))

    # drop self loops (where node_1 and node_2 is equal)
    pairs = pairs[pairs["node_1"] != pairs["node_2"]].reset_index(drop=True)

    ## Group and count edges.
    edges = (
        pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    edges.columns = ["node_1", "node_2", "chunk_id", "count"]
    edges = edges.replace("", np.nan).dropna(subset=["node_1", "node_2"])

    # Drop edges with 1 count. The explicit copy makes the filtered frame
    # independent, so adding the column below does not assign into a slice
    # of `edges` (which triggers SettingWithCopyWarning).
    edges = edges[edges["count"] != 1].copy()
    edges["edge"] = "contextual proximity"

    return edges


# Build the co-occurrence edges from the merged graph and preview up to the first 300 rows.
df_graph_contextual_proximity = contextual_proximity(df_graph)
df_graph_contextual_proximity.head(300)

Unnamed: 0,node_1,node_2,chunk_id,count,edge
0,(14),(15),"7c88a012fd5040a8b79cf0af3353ebb2,7c88a012fd504...",2,contextual proximity
1,(14),(16),"7c88a012fd5040a8b79cf0af3353ebb2,7c88a012fd504...",2,contextual proximity
14,(14),section 60c (1) sentence 1 no. 2,"7c88a012fd5040a8b79cf0af3353ebb2,7c88a012fd504...",2,contextual proximity
20,(15),(14),"7c88a012fd5040a8b79cf0af3353ebb2,7c88a012fd504...",2,contextual proximity
21,(15),(16),"7c88a012fd5040a8b79cf0af3353ebb2,7c88a012fd504...",4,contextual proximity
...,...,...,...,...,...
781,action,ban,"3b7163ac94bc42669f2f4317ae10cf2a,3b7163ac94bc4...",20,contextual proximity
782,action,calculation of residence periods,"3b7163ac94bc42669f2f4317ae10cf2a,3b7163ac94bc4...",5,contextual proximity
788,action,enforceably required,"b895dd5819524eda86f6e120a98356c8,b895dd5819524...",3,contextual proximity
789,action,entry,"3b7163ac94bc42669f2f4317ae10cf2a,3b7163ac94bc4...",15,contextual proximity


In [None]:
# @title Merge both semantic relationship and contextual proximity
# Stack the LLM-extracted edges and the co-occurrence edges, then collapse rows
# that share the same ordered (node_1, node_2) pair: chunk ids and edge labels
# are comma-joined and the counts are summed.
# NOTE: the pair is ordered, so (a, b) and (b, a) stay as separate rows here.
final_df_graph = pd.concat([df_graph, df_graph_contextual_proximity], axis=0)
final_df_graph = (
    final_df_graph.groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)

final_df_graph

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,(14),(15),"7c88a012fd5040a8b79cf0af3353ebb2,7c88a012fd504...","both refer to specific sections in the text,co...",6
1,(14),(16),"7c88a012fd5040a8b79cf0af3353ebb2,7c88a012fd504...",contextual proximity,2
2,(14),section 60c (1) sentence 1 no. 2,"7c88a012fd5040a8b79cf0af3353ebb2,7c88a012fd504...",contextual proximity,2
3,(15),(14),"7c88a012fd5040a8b79cf0af3353ebb2,7c88a012fd504...",contextual proximity,2
4,(15),(16),"7c88a012fd5040a8b79cf0af3353ebb2,7c88a012fd504...","both refer to specific sections in the text,co...",8
...,...,...,...,...,...
24931,youth welfare office,section 49 (5),"4b70d44d08984795959e2e534efe7b66,4b70d44d08984...",contextual proximity,2
24932,youth welfare office,section 49 (8),"4b70d44d08984795959e2e534efe7b66,4b70d44d08984...",contextual proximity,2
24933,youth welfare office,section 71a,"4b70d44d08984795959e2e534efe7b66,4b70d44d08984...",contextual proximity,2
24934,youth welfare office,section 98 (2a) no. 1,"4b70d44d08984795959e2e534efe7b66,4b70d44d08984...",contextual proximity,4


# Calculate the NetworkX Graph

In [None]:
# Unique node names appearing on either side of any edge.
nodes = pd.concat([final_df_graph['node_1'], final_df_graph['node_2']], axis=0).unique()
nodes.shape

(2321,)

In [None]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph
G.add_nodes_from(str(node) for node in nodes)

## Add edges to the graph
# G is undirected, so the rows (a, b) and (b, a) address the same edge, and
# calling add_edge on an existing edge overwrites its attributes. Without the
# merge below, a later reverse-direction row (often just "contextual proximity")
# would erase the title and weight stored by the earlier row.
for _, row in final_df_graph.iterrows():
    source, target = str(row["node_1"]), str(row["node_2"])
    title = row["edge"]
    weight = row['count']/4

    if G.has_edge(source, target):
        existing = G[source][target]
        # Keep the stronger weight; the co-occurrence rows are symmetric, so
        # summing both directions would count the same evidence twice.
        weight = max(weight, existing["weight"])
        # Combine the labels without repeating one that is already contained.
        if existing["title"] in title:
            pass
        elif title in existing["title"]:
            title = existing["title"]
        else:
            title = f'{existing["title"]},{title}'

    G.add_edge(source, target, title=title, weight=weight)

In [None]:
# @title Calculate communities for coloring the nodes
# Each next() call advances the generator by one level; the second result is
# the one used for coloring (top_level_communities is not used afterwards).
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
# Sort the nodes inside each community, then the communities themselves.
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  26


In [None]:
# @title Create a dataframe for community colors
import random
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Give every community one palette colour and a 1-based group number.

    Returns a DataFrame with one row per node and the columns ``node``,
    ``color`` (hex string) and ``group``.
    """
    ## One colour per community, in random order
    shuffled = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(shuffled)

    # Colours are handed out from the end of the shuffled list backwards.
    records = [
        {"node": member, "color": shade, "group": group_id}
        for group_id, (members, shade) in enumerate(
            zip(communities, reversed(shuffled)), start=1
        )
        for member in members
    ]
    return pd.DataFrame(records)


# One row per node with the colour and group number of its community.
colors = colors2Community(communities)
colors

In [None]:
# @title Add colors to the graph
# Copy the community group and colour onto each graph node, and set the node's
# size to its degree in G.
for index, row in colors.iterrows():
    G.nodes[row['node']]['group'] = row['group']
    G.nodes[row['node']]['color'] = row['color']
    G.nodes[row['node']]['size'] = G.degree[row['node']]

In [None]:
from pyvis.network import Network

# NOTE: despite its name, this is the path of the output HTML file, not a directory.
graph_output_directory = f"/content/drive/MyDrive/knowledge_graph/{prefix}_kg.html"

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

# Import the nodes and edges of G together with the attributes set on them.
net.from_nx(G)
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.show_buttons(filter_=["physics"])

# Write the interactive graph to the HTML file on Drive.
net.show(graph_output_directory, notebook=False)

/content/drive/MyDrive/knowledge_graph/deu_englisch_aufenthg_kg.html
