## Setup

In [44]:
import pandas as pd
import numpy as np
import os
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random


In [45]:
# packages used by helper functions
import uuid

In [46]:
# packages for prompting definitions
import sys
sys.path.append("..")
import json

In [47]:
from langchain_community.llms import Ollama

In [48]:
import ollama
from ollama import Client
# Ollama client pointed at a server on localhost:11434; graphPrompt() sends its
# generate requests through this module-level object.
client = Client(host='http://localhost:11434')

## Prompt definitions (function to extract triplets in JSON-format for knowledge graph from text chunks)

In [49]:
#################################
# Definition of used LLM
#################################
##########################################################################
def graphPrompt(input: str, metadata=None, model="mixtral:latest"):
    """Ask the LLM for concept triplets describing one text chunk.

    Builds a system prompt that asks for a JSON list of
    ``{chunk_id, node_1, node_2, edge}`` objects, sends it together with the
    chunk text to the module-level Ollama ``client`` and parses the answer.

    Args:
        input: Text of the chunk to analyse.
        metadata: Optional dict; its ``chunk_id`` entry is embedded in the
            prompt and stamped onto every returned triplet.
        model: Name of the Ollama model to query. ``None`` falls back to
            ``"mixtral:latest"``.

    Returns:
        A list of dicts (one per extracted relation), or ``None`` when the
        response contains no parseable JSON list.
    """
    if model is None:
        model = "mixtral:latest"
    if metadata is None:
        metadata = {}

    # Normalise the id to a string so a missing or non-string chunk_id cannot
    # make str.replace() raise a TypeError below.
    raw_chunk_id = metadata.get('chunk_id', None)
    chunk_id = "" if raw_chunk_id is None else str(raw_chunk_id)

    SYS_PROMPT = ("You are a network graph maker who extracts terms and their relations from a given context. "
        "You are provided with a context chunk (delimited by ```) Your task is to extract the ontology "
        "of terms mentioned in the given context. These terms should represent the key concepts as per the context. \n"
        "Thought 1: While traversing through each sentence, Think about the key terms mentioned in it.\n"
            "\tTerms may include person (agent), location, organization, date, duration, \n"
            "\tcondition, concept, object, entity  etc.\n"
            "\tTerms should be as atomistic as possible\n\n"
        "Thought 2: Think about how these terms can have one on one relation with other terms.\n"
            "\tTerms that are mentioned in the same sentence or the same paragraph are typically related to each other.\n"
            "\tTerms can be related to many other terms\n\n"
        "Thought 3: Find out the relation between each such related pair of terms. \n\n"
        # Trailing space added: the two adjacent literals used to fuse into "termsand".
        "Format your output as a list of json. Each element of the list contains a pair of terms "
        "and the relation between them like the follwing. NEVER change the value of the chunk_ID as defined in this prompt: \n"
        "[\n"
        "   {\n"
        '       "chunk_id": "CHUNK_ID_GOES_HERE",\n'
        '       "node_1": "A concept from extracted ontology",\n'
        '       "node_2": "A related concept from extracted ontology",\n'
        '       "edge": "relationship between the two concepts, node_1 and node_2 in one or two sentences"\n'
        "   }, {...}\n"
        "]"
    )
    SYS_PROMPT = SYS_PROMPT.replace('CHUNK_ID_GOES_HERE', chunk_id)

    USER_PROMPT = f"context: ```{input}``` \n\n output: "

    # Honour the requested model instead of always querying the default one.
    response = client.generate(model=model, system=SYS_PROMPT, prompt=USER_PROMPT)
    raw_text = response['response']

    # Keep everything from the first '[' onwards; an answer without any '['
    # cannot contain the requested JSON list.
    start_index = raw_text.find('[')
    json_string = raw_text[start_index:] if start_index != -1 else ""

    # The model sometimes emits underscores preceded by one or more
    # backslashes (presumably Markdown-style escaping), which is invalid JSON.
    # Strip the backslashes one layer at a time until none remain.
    while '\\_' in json_string:
        json_string = json_string.replace('\\_', '_')
#####################################################
    print("json-string:\n" + json_string)
#####################################################
    try:
        if not json_string:
            raise ValueError("no JSON list found in model response")
        # raw_decode parses the first JSON value and ignores any trailing
        # commentary the model may have appended after the closing bracket.
        parsed, _ = json.JSONDecoder().raw_decode(json_string)
        result = [dict(item) for item in parsed]
        if raw_chunk_id is not None:
            # Downstream grouping relies on chunk_id, so do not trust the
            # model to have echoed it back unchanged.
            for item in result:
                item["chunk_id"] = chunk_id
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError; dict(item) raises
        # TypeError/ValueError when an element is not a JSON object.
        print("\n\nERROR ### Here is the buggy response: ", response, "\n\n")
        result = None
    print("§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§")

    return result

## Functions

In [50]:
def documents2Dataframe(documents) -> pd.DataFrame:
    """Turn a sequence of document chunks into a DataFrame, one row per chunk.

    Each row holds the chunk's ``page_content`` as ``text``, every key of the
    chunk's ``metadata`` dict as its own column, and a freshly generated
    ``chunk_id`` (random UUID4 hex string). A ``chunk_id`` key already present
    in the metadata is overwritten by the generated one.

    Args:
        documents: Iterable of objects exposing ``page_content`` and
            ``metadata`` attributes.

    Returns:
        DataFrame with one row per chunk.
    """
    # Build all records in one pass; the previous `rows = rows + [row]`
    # copied the whole list on every iteration.
    rows = [
        {
            "text": chunk.page_content,
            **chunk.metadata,
            "chunk_id": uuid.uuid4().hex,
        }
        for chunk in documents
    ]
    return pd.DataFrame(rows)

In [51]:
def df2Graph(dataframe: pd.DataFrame, model=None) -> list:
    """Run graphPrompt on every chunk and flatten the results into one list.

    Args:
        dataframe: Chunk table with at least ``text`` and ``chunk_id`` columns.
        model: Model name forwarded to graphPrompt.

    Returns:
        A flat list with the triplet dicts of all chunks. Chunks for which
        graphPrompt returned ``None`` (unparseable answer) are skipped. An
        empty frame yields an empty list.
    """
    # Nothing to prompt for; also covers a frame that has no columns at all.
    if dataframe.empty:
        return []

    per_chunk_results = [
        graphPrompt(text, {"chunk_id": chunk_id}, model)
        for text, chunk_id in zip(dataframe["text"], dataframe["chunk_id"])
    ]

    ## Flatten the list of lists to one single list of entities.
    # A plain comprehension also works when no chunk produced valid JSON,
    # where np.concatenate raised "need at least one array to concatenate".
    concept_list = [
        concept
        for chunk_concepts in per_chunk_results
        if chunk_concepts is not None
        for concept in chunk_concepts
    ]
    return concept_list

In [52]:
def graph2Df(nodes_list) -> pd.DataFrame:
    """Convert the flat list of triplet dicts into a cleaned edge DataFrame.

    Cells consisting of a single space are treated as missing, rows lacking
    ``node_1`` or ``node_2`` are dropped, and both node columns are
    lower-cased.

    Args:
        nodes_list: List of dicts as produced by df2Graph.

    Returns:
        DataFrame of edges. The columns ``chunk_id``, ``node_1``, ``node_2``
        and ``edge`` are always present, even for empty input.
    """
    graph_dataframe = pd.DataFrame(nodes_list)

    # Guarantee the schema later cells rely on, so an empty or incomplete
    # extraction yields an empty frame instead of a KeyError.
    for column in ("chunk_id", "node_1", "node_2", "edge"):
        if column not in graph_dataframe.columns:
            graph_dataframe[column] = np.nan

    ## Remove all NaN entities
    graph_dataframe = graph_dataframe.replace(" ", np.nan)
    graph_dataframe = graph_dataframe.dropna(subset=["node_1", "node_2"])

    # The model occasionally returns non-string nodes (e.g. a bare number);
    # coerce to str first so lower-casing cannot raise AttributeError.
    for column in ("node_1", "node_2"):
        graph_dataframe[column] = graph_dataframe[column].astype(str).str.lower()

    return graph_dataframe

In [53]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Derive co-occurrence edges between nodes that appear in the same chunk.

    Every node occurrence (from ``node_1`` and ``node_2``) is paired with every
    other node occurrence of the same ``chunk_id``. Pairs are counted per
    ordered ``(node_1, node_2)`` combination, and pairs seen only once are
    discarded.

    Args:
        df: Edge table with ``chunk_id``, ``node_1`` and ``node_2`` columns.

    Returns:
        DataFrame with columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-joined ids of the contributing pairs), ``count`` and a constant
        ``edge`` label ``"contextual proximity"``. The index keeps the
        positions of the surviving rows in the grouped table.
    """
    ## Melt the dataframe into a list of nodes
    node_occurrences = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])

    # Self join with chunk id as the key will create a link between terms occurring in the same text chunk.
    node_pairs = pd.merge(
        node_occurrences, node_occurrences, on="chunk_id", suffixes=("_1", "_2")
    )

    # drop self loops
    node_pairs = node_pairs[node_pairs["node_1"] != node_pairs["node_2"]].reset_index(drop=True)

    ## Group and count edges.
    proximity = (
        node_pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    proximity.columns = ["node_1", "node_2", "chunk_id", "count"]
    proximity = proximity.replace("", np.nan).dropna(subset=["node_1", "node_2"])

    # Drop edges with 1 count. The explicit copy makes the result an
    # independent frame, so adding the label column below does not trigger
    # pandas' SettingWithCopyWarning.
    proximity = proximity[proximity["count"] != 1].copy()
    proximity["edge"] = "contextual proximity"
    return proximity

## Variables

In [54]:
## Input data
##########################################################
# Name of the text file to process. Alternative input used previously:
#   "Saxony_Eastern_Expansion_EP_96.txt"
input_file_name = "Harry_Potter_1_chapter_1.txt"
##########################################################
data_dir = "HotG_Data/"+input_file_name
# NOTE: despite its name, `inputdirectory` is the path of the single input file.
inputdirectory = Path(f"./{data_dir}")

## This is where the output csv files will be written
outputdirectory = Path("./data_output")

In [55]:
# Derive all output names from the input file's stem (file name without its
# extension) rather than slicing off a fixed four characters, which only works
# for three-letter extensions such as ".txt".
input_stem = Path(input_file_name).stem

output_graph_file_name = f"graph_{input_stem}.csv"
output_graph_file_with_path = outputdirectory/output_graph_file_name

output_chunks_file_name = f"chunks_{input_stem}.csv"
output_chunks_file_with_path = outputdirectory/output_chunks_file_name

output_context_prox_file_name = f"graph_contex_prox_{input_stem}.csv"
output_context_prox_file_with_path = outputdirectory/output_context_prox_file_name

print(output_graph_file_with_path)
print(output_chunks_file_with_path)
print(output_context_prox_file_with_path)

data_output/graph_Harry_Potter_1_chapter_1.csv
data_output/chunks_Harry_Potter_1_chapter_1.csv
data_output/graph_contex_prox_Harry_Potter_1_chapter_1.csv


## Load Documents

In [56]:
#loader = TextLoader("./HotG_Data/Hanse.txt")
# Load the configured input; `inputdirectory` holds the path of a single text file.
loader = TextLoader(inputdirectory)
Document = loader.load()
# Replace every line break in the first loaded document with a space.
Document[0].page_content = Document[0].page_content.replace("\n", " ")

In [57]:
# Split the loaded text into chunks of at most 1000 characters, with a
# 100-character overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(Document)
print("Number of chunks = ", len(pages))
# Preview one chunk (the sixth when available). Clamp the index so inputs that
# produce fewer than six chunks do not raise an IndexError.
if pages:
    print(pages[min(5, len(pages) - 1)].page_content)

Number of chunks =  29
owl-free morning. He yelled at five different people. He made several important telephone calls and shouted a bit more. He was in a very good mood until lunchtime, when he thought he'd stretch his legs and walk across the road to buy himself a bun from the bakery. He'd forgotten all about the people in cloaks until he passed a group of them next to the baker's. He eyed them angrily as he passed. He didn't know why, but they made him uneasy. This bunch were whispering excitedly, too, and he couldn't see a single collecting tin. It was on his way back past them, clutching a large doughnut in a bag, that he caught a few words of what they were saying. "The Potters, that's right, that's what I heard yes, their son, Harry" Mr. Dursley stopped dead. Fear flooded him. He looked back at the whisperers as if he wanted to say something to them, but thought better of it. He dashed back across the road, hurried up to his office, snapped at his secretary not to disturb him, s

## Create a dataframe of all the chunks

In [58]:
# One row per chunk: its text, its metadata fields and a generated chunk_id.
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(29, 3)


Unnamed: 0,text,source,chunk_id
0,"Mr. and Mrs. Dursley, of number four, Privet D...",HotG_Data/Harry_Potter_1_chapter_1.txt,55fba9390d9b40a9b2080b407539097c
1,found out about the Potters. Mrs. Potter was M...,HotG_Data/Harry_Potter_1_chapter_1.txt,48e4a5ce2c81495db5523f7dcbc077d6
2,wrestled a screaming Dudley into his high chai...,HotG_Data/Harry_Potter_1_chapter_1.txt,301738c0377846e784ed8b1b3c46a026
3,Mr. Dursley drove around the corner and up the...,HotG_Data/Harry_Potter_1_chapter_1.txt,5201307e46a34a59b226337712056aac
4,excitedly together. Mr. Dursley was enraged to...,HotG_Data/Harry_Potter_1_chapter_1.txt,4bf0d253006d4bb79aea3ded8b287a68


## Extract Concepts

In [61]:
## To regenerate the graph with LLM, set this to True
##################
# True  -> run the time-consuming LLM knowledge extraction and refresh the CSV files.
# False -> reuse the graph CSV written by an earlier run.
regenerate = True
##################
if regenerate:
#########################################################
    concepts_list = df2Graph(df, model='mixtral:latest')
#########################################################
    dfg1 = graph2Df(concepts_list)

    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(outputdirectory, exist_ok=True)

    dfg1.to_csv(output_graph_file_with_path, sep=";", index=False)
    df.to_csv(output_chunks_file_with_path, sep=";", index=False)
else:
    dfg1 = pd.read_csv(output_graph_file_with_path, sep=";")

# Treat empty strings as missing and keep only complete triplets.
dfg1 = dfg1.replace("", np.nan).dropna(subset=["node_1", "node_2", 'edge'])
dfg1['count'] = 4
## Increasing the weight of the relation to 4.
## We will assign the weight of 1 when later the contextual proximity will be calculated.
print(dfg1.shape)
dfg1.head()

json-string:
[
   {
       "chunk_id": "55fba9390d9b40a9b2080b407539097c",
       "node_1": "Mr. Dursley",
       "node_2": "Grunnings",
       "edge": "Mr. Dursley is the director of Grunnings, a firm that makes drills."
   },
   {
       "chunk_id": "55fba9390d9b40a9b2080b407539097c",
       "node_1": "Dudley",
       "node_2": "The Dursleys",
       "edge": "Dudley is the small son of the Dursleys."
   },
   {
       "chunk_id": "55fba9390d9b40a9b2080b407539097c",
       "node_1": "The Dursleys",
       "node_2": "secret",
       "edge": "The Dursleys have a secret, and their greatest fear is that somebody would discover it."
   },
   {
       "chunk_id": "55fba9390d9b40a9b2080b407539097c",
       "node_1": "The Dursleys",
       "node_2": "Mrs. Potter's family",
       "edge": "The Dursleys' greatest fear is that someone would find out about the Potters."
   }
]
§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§
json-string:
[
   {
       "chunk_id": "48e

Unnamed: 0,chunk_id,node_1,node_2,edge,count
0,55fba9390d9b40a9b2080b407539097c,mr. dursley,grunnings,"Mr. Dursley is the director of Grunnings, a fi...",4
1,55fba9390d9b40a9b2080b407539097c,dudley,the dursleys,Dudley is the small son of the Dursleys.,4
2,55fba9390d9b40a9b2080b407539097c,the dursleys,secret,"The Dursleys have a secret, and their greatest...",4
3,55fba9390d9b40a9b2080b407539097c,the dursleys,mrs. potter's family,The Dursleys' greatest fear is that someone wo...,4
4,48e4a5ce2c81495db5523f7dcbc077d6,mr. dursley,mrs. dursley's sister's husband,Mr. Dursley considers Mrs. Dursley's sister's ...,4


## Calculating contextual proximity

In [62]:
# Derive co-occurrence edges between nodes that share a chunk and write them
# to a ';'-separated CSV file.
dfg2 = contextual_proximity(dfg1)
dfg2.to_csv(output_context_prox_file_with_path, sep=";", index=False)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
2221,writing a letter to the dursleys,hagrid,"5f82479908824d8b840c6a2bedd670c1,5f82479908824...",2,contextual proximity
2231,you-know-who,dumbledore,"17c96105c1404e8fa8440d826854b0dd,17c96105c1404...",2,contextual proximity
2233,you-know-who,powers,"17c96105c1404e8fa8440d826854b0dd,17c96105c1404...",2,contextual proximity
2234,you-know-who,professor mcgonagall,"17c96105c1404e8fa8440d826854b0dd,17c96105c1404...",2,contextual proximity
2236,you-know-who,voldemort,"17c96105c1404e8fa8440d826854b0dd,17c96105c1404...",2,contextual proximity


### Merge both the dataframes

In [63]:
# Stack the LLM-extracted edges (dfg1) on top of the contextual-proximity
# edges (dfg2), then collapse rows with the same (node_1, node_2) pair:
# chunk ids and edge descriptions are comma-joined and the counts are summed.
dfg = pd.concat([dfg1, dfg2], axis=0)
dfg = (
    dfg.groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,'potter',mr. dursley,"5e6a39886ae44e48ade45448fc9051ca,5e6a39886ae44...",contextual proximity,2
1,'potter',mrs. dursley,"5e6a39886ae44e48ade45448fc9051ca,5e6a39886ae44...",contextual proximity,2
2,'potter',petunia,"5e6a39886ae44e48ade45448fc9051ca,5e6a39886ae44...",contextual proximity,2
3,'potter',sister of petunia,"5e6a39886ae44e48ade45448fc9051ca,5e6a39886ae44...",The name 'Potter' is related to Petunia's sist...,7
4,a piercing stare,are dead,"26c5b0d9c07d41e7baccd8e596c93446,26c5b0d9c07d4...",contextual proximity,2
...,...,...,...,...,...
789,you-know-who,dumbledore,"17c96105c1404e8fa8440d826854b0dd,17c96105c1404...",contextual proximity,2
790,you-know-who,powers,"17c96105c1404e8fa8440d826854b0dd,17c96105c1404...",contextual proximity,2
791,you-know-who,professor mcgonagall,"17c96105c1404e8fa8440d826854b0dd,17c96105c1404...",contextual proximity,2
792,you-know-who,voldemort,"17c96105c1404e8fa8440d826854b0dd,17c96105c1404...",You-Know-Who is a nickname or alias for the ac...,6


## Calculate the NetworkX Graph

In [64]:
# Distinct node names that occur on either side of an edge.
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(192,)

In [65]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph
# Node names are stored as strings.
G.add_nodes_from(str(node_name) for node_name in nodes)

## Add edges to the graph
# The edge description becomes the edge's `title`, and the summed count
# divided by 4 becomes its `weight`.
for src, dst, relation, occurrences in zip(
    dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
):
    G.add_edge(str(src), str(dst), title=relation, weight=occurrences/4)

### Calculate communities for coloring the nodes

In [66]:
# girvan_newman returns an iterator of community partitions. Take the first two
# and use the second one for coloring.
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
# Sort the members inside each community and then the communities themselves,
# so the listing is deterministic.
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  12
[["'potter'", 'bakery', 'car', 'cat', 'collecting tin', "daughter's problems", 'driving', 'dudley', "dudley's age", 'fear', 'five different people', 'frozen in armchair', 'grunnings', 'grunnings parking lot', 'half past eight', 'happy', 'harry', 'he', "heard the name 'potter'", 'howard/harry', 'learning new word', 'lily potter (implied)', 'looking at the sign', 'lunchtime', 'map', 'morning', 'mr. dursley', "mr. dursley's garden wall", 'mrs. dursley', "mrs. dursley's sister's husband", 'mrs. next door', "mrs. potter's family", 'muggle', 'nasty, common name', "nation's owls", 'night hunting', 'normal behavior', 'normal day', 'not young', 'noticing strange people', 'owl-free', 'owls', 'people down in the street', 'people in cloaks', 'petunia', 'potters', 'secret', 'several important telephone calls', 'sister of petunia', 'son, harry', 'strangely dressed people', 'stranger', 'tabby cat', 'the dursleys', 'the potters', "the potters' son", "the potters, that's rig

### Create a dataframe for community colors

In [67]:
import seaborn as sns
# Name of the seaborn palette that colors2Community() draws community colors from.
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Assign one color and one group number to every community.

    Colors are taken as hex strings from the seaborn palette named by the
    module-level ``palette`` and shuffled, so the mapping between community
    and color differs from run to run. Groups are numbered from 1 in the
    order the communities are given.

    Args:
        communities: Sequence of communities, each an iterable of node names.

    Returns:
        DataFrame with one row per node and the columns ``node``, ``color``
        and ``group``.
    """
    ## One hex color per community, in random order
    shuffled_colors = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(shuffled_colors)

    records = []
    for group_number, members in enumerate(communities, start=1):
        community_color = shuffled_colors.pop()
        records.extend(
            {"node": member, "color": community_color, "group": group_number}
            for member in members
        )
    return pd.DataFrame(records)


# One row per node with the color and group number of its community.
colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,'potter',#d3db57,1
1,bakery,#d3db57,1
2,car,#d3db57,1
3,cat,#d3db57,1
4,collecting tin,#d3db57,1
...,...,...,...
187,weatherman,#a157db,10
188,scars,#db5f57,11
189,usefulness,#db5f57,11
190,strange and mysterious things happening all ov...,#5791db,12


### Add colors to the graph

In [68]:
# Copy each node's community group and color onto the graph, and use the
# node's degree as its size.
for node_name, node_color, node_group in zip(
    colors['node'], colors['color'], colors['group']
):
    node_attrs = G.nodes[node_name]
    node_attrs['group'] = node_group
    node_attrs['color'] = node_color
    node_attrs['size'] = G.degree[node_name]

In [69]:
from pyvis.network import Network

#graph_output_directory = "./docs/index.html"

# Interactive pyvis view of the graph, configured for notebook display.
net = Network(
    notebook=True,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="800px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

# Import the networkx graph built above.
net.from_nx(G)
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)

# net.show(graph_output_directory)
# Request the 'physics' button group, then write the visualization to knowledge_graph.html.
net.show_buttons(filter_=['physics'])
net.show("knowledge_graph.html")

knowledge_graph.html


In [70]:
# DETAILED STEPS OF THE CONTEXTUAL PROXIMITY CALCULATION (same logic as contextual_proximity(), repeated step by step to make the process easier to follow)

In [71]:
## Melt the dataframe into a list of nodes
# Long format: one row per node occurrence. `variable` records whether the
# node came from the node_1 or the node_2 column.
dfg_long = pd.melt(
    dfg1, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
)

In [72]:
# Inspect the last rows of the melted frame (still including `variable`).
dfg_long.tail(5)

Unnamed: 0,chunk_id,variable,node
341,ddf9749327e34206a38ea0f8b56be1f5,node_2,special
342,ddf9749327e34206a38ea0f8b56be1f5,node_2,famous
343,ddf9749327e34206a38ea0f8b56be1f5,node_2,woken up
344,ddf9749327e34206a38ea0f8b56be1f5,node_2,prodded and pinched
345,ddf9749327e34206a38ea0f8b56be1f5,node_2,harry potter


In [73]:
# `variable` is not needed for the join. errors="ignore" keeps this cell
# re-runnable: the in-place drop raised KeyError on a second execution because
# the column was already gone.
dfg_long = dfg_long.drop(columns=["variable"], errors="ignore")
# The self join with chunk id as the key (next step) will create a link between terms occurring in the same text chunk.

In [74]:
# Same rows as before, now without the `variable` column.
dfg_long.tail(5)

Unnamed: 0,chunk_id,node
341,ddf9749327e34206a38ea0f8b56be1f5,special
342,ddf9749327e34206a38ea0f8b56be1f5,famous
343,ddf9749327e34206a38ea0f8b56be1f5,woken up
344,ddf9749327e34206a38ea0f8b56be1f5,prodded and pinched
345,ddf9749327e34206a38ea0f8b56be1f5,harry potter


In [75]:
# Self join on chunk_id: pairs every node occurrence with every node occurrence
# of the same chunk. The two `node` columns become node_1 and node_2.
dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))

In [76]:
# The join result still contains pairs of a node with itself.
dfg_wide.head()

Unnamed: 0,chunk_id,node_1,node_2
0,55fba9390d9b40a9b2080b407539097c,mr. dursley,mr. dursley
1,55fba9390d9b40a9b2080b407539097c,mr. dursley,dudley
2,55fba9390d9b40a9b2080b407539097c,mr. dursley,the dursleys
3,55fba9390d9b40a9b2080b407539097c,mr. dursley,the dursleys
4,55fba9390d9b40a9b2080b407539097c,mr. dursley,grunnings


In [77]:
# drop self loops
self_loops_drop = dfg_wide[dfg_wide["node_1"] == dfg_wide["node_2"]].index
dfgraph2 = dfg_wide.drop(index=self_loops_drop).reset_index(drop=True)

In [78]:
# Node pairs after removing the self loops.
dfgraph2.head()

Unnamed: 0,chunk_id,node_1,node_2
0,55fba9390d9b40a9b2080b407539097c,mr. dursley,dudley
1,55fba9390d9b40a9b2080b407539097c,mr. dursley,the dursleys
2,55fba9390d9b40a9b2080b407539097c,mr. dursley,the dursleys
3,55fba9390d9b40a9b2080b407539097c,mr. dursley,grunnings
4,55fba9390d9b40a9b2080b407539097c,mr. dursley,the dursleys


In [79]:
## Group and count edges.
dfgraph2 = (
    dfgraph2.groupby(["node_1", "node_2"])
    .agg({"chunk_id": [",".join, "count"]})
    .reset_index()
)

In [80]:
# Aggregated pairs; note the two-level column header (join / count).
dfgraph2.head()

Unnamed: 0_level_0,node_1,node_2,chunk_id,chunk_id
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,join,count
0,'potter',heard the name 'potter',5e6a39886ae44e48ade45448fc9051ca,1
1,'potter',lily potter (implied),5e6a39886ae44e48ade45448fc9051ca,1
2,'potter',mr. dursley,"5e6a39886ae44e48ade45448fc9051ca,5e6a39886ae44...",2
3,'potter',mrs. dursley,"5e6a39886ae44e48ade45448fc9051ca,5e6a39886ae44...",2
4,'potter',petunia,"5e6a39886ae44e48ade45448fc9051ca,5e6a39886ae44...",2


In [81]:
# Flatten the two-level column labels produced by the aggregation.
dfgraph2.columns = ["node_1", "node_2", "chunk_id", "count"]
dfgraph2 = dfgraph2.replace("", np.nan).dropna(subset=["node_1", "node_2"])
# Drop edges with 1 count.
# Filter the frame built in this walkthrough (dfgraph2). The original filtered
# `dfg2`, the output of contextual_proximity(), which silently discarded every
# step above. The copy avoids a SettingWithCopyWarning on the next line.
dfgraph2 = dfgraph2[dfgraph2["count"] != 1].copy()
dfgraph2["edge"] = "contextual proximity"

In [82]:
# Final proximity edges of the walkthrough.
dfgraph2.head()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
2,'potter',mr. dursley,"5e6a39886ae44e48ade45448fc9051ca,5e6a39886ae44...",2,contextual proximity
3,'potter',mrs. dursley,"5e6a39886ae44e48ade45448fc9051ca,5e6a39886ae44...",2,contextual proximity
4,'potter',petunia,"5e6a39886ae44e48ade45448fc9051ca,5e6a39886ae44...",2,contextual proximity
5,'potter',sister of petunia,"5e6a39886ae44e48ade45448fc9051ca,5e6a39886ae44...",3,contextual proximity
11,a piercing stare,are dead,"26c5b0d9c07d41e7baccd8e596c93446,26c5b0d9c07d4...",2,contextual proximity


In [83]:
# Column dtypes and non-null counts of the resulting edge table.
dfgraph2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 748 entries, 2 to 2236
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   node_1    748 non-null    object
 1   node_2    748 non-null    object
 2   chunk_id  748 non-null    object
 3   count     748 non-null    int64 
 4   edge      748 non-null    object
dtypes: int64(1), object(4)
memory usage: 35.1+ KB
