## Setup

In [1]:
import pandas as pd
import numpy as np
import os
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random


In [2]:
# packages used by helper functions
import uuid

In [3]:
# packages for prompting definitions
import sys
sys.path.append("..")
import json

In [4]:
from langchain_community.llms import Ollama

In [5]:
import ollama
from ollama import Client
client = Client(host='http://192.168.1.166:11434')

## Prompt definitions (function to extract triplets in JSON-format for knowledge graph from text chunks)

In [29]:
#################################
# Definition of used LLM
#################################
##########################################################################
def graphPrompt(input: str, metadata={}, model="mixtral:latest"):
    if model == None:
        model = "mixtral:latest"
    
    chunk_id = metadata.get('chunk_id', None)

    # model_info = client.show(model_name=model)
    # print( chalk.blue(model_info))

    SYS_PROMPT = ("You are a network graph maker who extracts terms and their relations from a given context. "
        "You are provided with a context chunk (delimited by ```) Your task is to extract the ontology "
        "of terms mentioned in the given context. These terms should represent the key concepts as per the context. \n"
        "Thought 1: While traversing through each sentence, Think about the key terms mentioned in it.\n"
            "\tTerms may include person (agent), location, organization, date, duration, \n"
            "\tcondition, concept, object, entity  etc.\n"
            "\tTerms should be as atomistic as possible\n\n"
        "Thought 2: Think about how these terms can have one on one relation with other terms.\n"
            "\tTerms that are mentioned in the same sentence or the same paragraph are typically related to each other.\n"
            "\tTerms can be related to many other terms\n\n"
        "Thought 3: Find out the relation between each such related pair of terms. \n\n"
        "Format your output as a list of json. Each element of the list contains a pair of terms"
        "and the relation between them like the follwing. NEVER change the value of the chunk_ID as defined in this prompt: \n"
        "[\n"
        "   {\n"
        '       "chunk_id": "CHUNK_ID_GOES_HERE",\n'
        '       "node_1": "A concept from extracted ontology",\n'
        '       "node_2": "A related concept from extracted ontology",\n'
        '       "edge": "relationship between the two concepts, node_1 and node_2 in one or two sentences"\n' 
        "   }, {...}\n"
        "]"
    )
    SYS_PROMPT = SYS_PROMPT.replace('CHUNK_ID_GOES_HERE', chunk_id)

    USER_PROMPT = f"context: ```{input}``` \n\n output: "

    response = client.generate(model="mixtral:latest", system=SYS_PROMPT, prompt=USER_PROMPT)

    aux1 = response['response']
    # Find the index of the first open bracket '['
    start_index = aux1.find('[')
    # Slice the string from start_index to extract the JSON part and fix an unexpected problem with insertes escapes (WHY ?)
    json_string = aux1[start_index:]
    json_string = json_string.replace('\\\\\_', '_')
    json_string = json_string.replace('\\\\_', '_')
    json_string = json_string.replace('\\\_', '_')
    json_string = json_string.replace('\\_', '_')
    json_string = json_string.replace('\_', '_')
    json_string.lstrip() # eliminate eventual leading blank spaces
#####################################################
    print("json-string:\n" + json_string)
#####################################################         
    try:
        result = json.loads(json_string)
        result = [dict(item) for item in result]
    except:
        print("\n\nERROR ### Here is the buggy response: ", response, "\n\n")
        result = None
    print("§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§")

    return result

## Functions

In [7]:
def documents2Dataframe(documents) -> pd.DataFrame:
    rows = []
    for chunk in documents:
        row = {
            "text": chunk.page_content,
            **chunk.metadata,
            "chunk_id": uuid.uuid4().hex,
        }
        rows = rows + [row]

    df = pd.DataFrame(rows)
    return df

In [8]:
def df2Graph(dataframe: pd.DataFrame, model=None) -> list:
    # dataframe.reset_index(inplace=True)
    results = dataframe.apply(
        lambda row: graphPrompt(row.text, {"chunk_id": row.chunk_id}, model), axis=1
    )
    # invalid json results in NaN
    results = results.dropna()
    results = results.reset_index(drop=True)

    ## Flatten the list of lists to one single list of entities.
    concept_list = np.concatenate(results).ravel().tolist()
    return concept_list

In [9]:
def graph2Df(nodes_list) -> pd.DataFrame:
    ## Remove all NaN entities
    graph_dataframe = pd.DataFrame(nodes_list).replace(" ", np.nan)
    graph_dataframe = graph_dataframe.dropna(subset=["node_1", "node_2"])
    graph_dataframe["node_1"] = graph_dataframe["node_1"].apply(lambda x: x.lower())
    graph_dataframe["node_2"] = graph_dataframe["node_2"].apply(lambda x: x.lower())

    return graph_dataframe

In [10]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    ## Melt the dataframe into a list of nodes
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    )
    dfg_long.drop(columns=["variable"], inplace=True)
    # Self join with chunk id as the key will create a link between terms occuring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    self_loops_drop = dfg_wide[dfg_wide["node_1"] == dfg_wide["node_2"]].index
    dfg2 = dfg_wide.drop(index=self_loops_drop).reset_index(drop=True)
    ## Group and count edges.
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
    dfg2.replace("", np.nan, inplace=True)
    dfg2.dropna(subset=["node_1", "node_2"], inplace=True)
    # Drop edges with 1 count
    dfg2 = dfg2[dfg2["count"] != 1]
    dfg2["edge"] = "contextual proximity"
    return dfg2

## Variables

In [11]:
## Input data directory
##########################################################
input_file_name = "Saxony_Eastern_Expansion_EP_96.txt"
##########################################################
data_dir = "HotG_Data/"+input_file_name
inputdirectory = Path(f"./{data_dir}")

## This is where the output csv files will be written
outputdirectory = Path(f"./data_output")

In [14]:
output_graph_file_name = f"graph_{input_file_name[:-4]}.csv"
output_graph_file_with_path = outputdirectory/output_graph_file_name

output_chunks_file_name = f"chunks_{input_file_name[:-4]}.csv"
output_chunks_file_with_path = outputdirectory/output_chunks_file_name

output_context_prox_file_name = f"graph_contex_prox_{input_file_name[:-4]}.csv"
output_context_prox_file_with_path = outputdirectory/output_context_prox_file_name

print(output_graph_file_with_path)
print(output_chunks_file_with_path)
print(output_context_prox_file_with_path)

data_output\graph_Saxony_Eastern_Expansion_EP_96.csv
data_output\chunks_Saxony_Eastern_Expansion_EP_96.csv
data_output\graph_contex_prox_Saxony_Eastern_Expansion_EP_96.csv


## Load Documents

In [16]:
#loader = TextLoader("./HotG_Data/Hanse.txt")
loader = TextLoader(inputdirectory)
Document = loader.load()
# clean unnecessary line breaks
Document[0].page_content = Document[0].page_content.replace("\n", " ")

In [17]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(Document)
print("Number of chunks = ", len(pages))
print(pages[5].page_content)

Number of chunks =  10
copy of a letter (86.11.370a,câ€“e) preserves an appeal from one of the last kings of Assyria, Sin-shar-ishkun (r. 622â€“612 B.C.), apparently pleading to retain his throne by making an alliance with the Babylonian king Nabopolassar (r. 625â€“605 B.C.). By this stage, however, the Assyrian state was doomed: Nabopolassar and his son Nebuchadnezzar II (r. 604â€“562 B.C.) came to rule most of its former empire, a swathe of territory reaching from the shores of the Persian Gulf to those of the Mediterranean Sea.  Nebuchadnezzar II took his name from the king who had recovered the statue of Marduk from Susa. The later king was ultimately to become far more famous than his predecessor, however: it is Nebuchadnezzar II who appears in the Bible. As king of Babylon, he rebuilt much of the city (86.11.60), constructing an imperial capital with vast palaces and well-appointed temples, colossal city walls, and a great northern entry point, the Ishtar Gate, approached via a l

## Create a dataframe of all the chunks

In [18]:
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(10, 3)


Unnamed: 0,text,source,chunk_id
0,The city of Babylon lay on the River Euphrates...,HotG_Data\Saxony_Eastern_Expansion_EP_96.txt,370351b430c64ba4bc9f3ca7abc8b47e
1,the period; some larger examples of sculpture ...,HotG_Data\Saxony_Eastern_Expansion_EP_96.txt,a13e13b40b174fc1a723ba7642575579
2,"religious imagery. The images, which included ...",HotG_Data\Saxony_Eastern_Expansion_EP_96.txt,46539e03eb6f4418ba9d2d8d8eb93966
3,second millennium B.C. saw power over Babylon ...,HotG_Data\Saxony_Eastern_Expansion_EP_96.txt,41dce135bf0245168fb7a943c7f78b95
4,"a rebellion, the act was considered so sacrile...",HotG_Data\Saxony_Eastern_Expansion_EP_96.txt,e4d02dedadf049cdaaf331df3573c99e


## Extract Concepts

In [19]:
## To regenerate the graph with LLM, set this to True
##################
regenerate = False  # toggle to True if the time-consuming (re-)generation of the knowlege extraction is required
##################
if regenerate:
#########################################################    
    concepts_list = df2Graph(df, model='mixtral:latest')
#########################################################
    dfg1 = graph2Df(concepts_list)
    
    if not os.path.exists(outputdirectory):
        os.makedirs(outputdirectory)
    
    dfg1.to_csv(output_graph_file_with_path, sep=";", index=False)
    df.to_csv(output_chunks_file_with_path, sep=";", index=False)
else:
    dfg1 = pd.read_csv(output_graph_file_with_path, sep=";")

dfg1.replace("", np.nan, inplace=True)
dfg1.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
dfg1['count'] = 4 
## Increasing the weight of the relation to 4. 
## We will assign the weight of 1 when later the contextual proximity will be calculated.  
print(dfg1.shape)
dfg1.head()

(310, 5)


Unnamed: 0,chunk_id,node_1,node_2,edge,count
0,3b1d78faf9064a97b1b667d4aa940745,episode 96,last week's exploration,refers to the content covered in the previous ...,4
1,3b1d78faf9064a97b1b667d4aa940745,saxons,charlemagne's invasion in 772,were invaded by Charlemagne in 772,4
2,3b1d78faf9064a97b1b667d4aa940745,otto i,death in 973,died in 973,4
3,3b1d78faf9064a97b1b667d4aa940745,duchy,well set up,was well established after Otto I's death,4
4,3b1d78faf9064a97b1b667d4aa940745,original territory,between the rhine and the elbe river,refers to the initial Saxon lands between the ...,4


## Calculating contextual proximity

In [20]:
dfg2 = contextual_proximity(dfg1)
dfg2.to_csv(output_context_prox_file_with_path, sep=";", index=False)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
7190,zone of influence,early polish rulers,"9e984b449e2c49338bb8f48c0ac557f3,9e984b449e2c4...",2,contextual proximity
7198,zone of influence,neighbors,"9e984b449e2c49338bb8f48c0ac557f3,9e984b449e2c4...",2,contextual proximity
7200,zone of influence,poland,"9e984b449e2c49338bb8f48c0ac557f3,9e984b449e2c4...",2,contextual proximity
7202,zone of influence,traces of smaller fortifications,"9e984b449e2c49338bb8f48c0ac557f3,9e984b449e2c4...",2,contextual proximity
7203,zone of influence,widukind,"9e984b449e2c49338bb8f48c0ac557f3,9e984b449e2c4...",2,contextual proximity


### Merge both the dataframes

In [21]:
dfg = pd.concat([dfg1, dfg2], axis=0)
dfg = (
    dfg.groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,960,"dobrawa, a daughter of duke boleslaus i of boh...","8ad949a3488a430cae0864089c8cef99,8ad949a3488a4...",contextual proximity,2
1,960,margrave gero,"8ad949a3488a430cae0864089c8cef99,8ad949a3488a4...",contextual proximity,3
2,960,mieszko,"8ad949a3488a430cae0864089c8cef99,8ad949a3488a4...",contextual proximity,8
3,960,otto the great,"8ad949a3488a430cae0864089c8cef99,8ad949a3488a4...",contextual proximity,2
4,960,poland,"8ad949a3488a430cae0864089c8cef99,8ad949a3488a4...",contextual proximity,2
...,...,...,...,...,...
2729,zone of influence,early polish rulers,"9e984b449e2c49338bb8f48c0ac557f3,9e984b449e2c4...",contextual proximity,2
2730,zone of influence,neighbors,"9e984b449e2c49338bb8f48c0ac557f3,9e984b449e2c4...",contextual proximity,2
2731,zone of influence,poland,"9e984b449e2c49338bb8f48c0ac557f3,9e984b449e2c4...",contextual proximity,2
2732,zone of influence,traces of smaller fortifications,"9e984b449e2c49338bb8f48c0ac557f3,9e984b449e2c4...",contextual proximity,2


## Calculate the NetworkX Graph

In [22]:
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(401,)

In [23]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph
for node in nodes:
    G.add_node(
        str(node)
    )

## Add edges to the graph
for index, row in dfg.iterrows():
    G.add_edge(
        str(row["node_1"]),
        str(row["node_2"]),
        title=row["edge"],
        weight=row['count']/4
    )

### Calculate communities for coloring the nodes

In [24]:
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  6
[['960', '970', 'apostles to the slavs', 'archaeologists', 'balkans', 'base in central bohemia', 'basilica de san clemente', 'basilica of san clemente', 'bavarian church', 'bavarian missionaries', 'being thrown overboard in the black sea with an anchor attached to his feet', 'bohemian warlords', 'boleslaus son, duke boleslaus ii', 'bořivoj', 'byzantian empire, slavonic literature and material heritage of the west-slavonic peoples', 'carolingian empire', 'carolingians', 'centers of power', 'charlemagne', 'christianization period, central europe', 'church liturgy', 'church slavonic', 'consolidated protofeudal state, ruling dynasty, castles, church, economy', 'constantinople', 'convert miesco to christianity', 'converting to christianity', 'crimea beach', 'cyrill', 'desire', 'developed later than bohemia', 'dobrawa, a daughter of duke boleslaus i of bohemia', 'early polish rulers', 'east francians', 'economic model', 'emperor in constantinople', 'erected between

### Create a dataframe for community colors

In [25]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    ## Define a color palette
    p = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(p)
    rows = []
    group = 0
    for community in communities:
        color = p.pop()
        group += 1
        for node in community:
            rows += [{"node": node, "color": color, "group": group}]
    df_colors = pd.DataFrame(rows)
    return df_colors


colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,960,#d3db57,1
1,970,#d3db57,1
2,apostles to the slavs,#d3db57,1
3,archaeologists,#d3db57,1
4,balkans,#d3db57,1
...,...,...,...
396,unharmed hand of poppo,#57d3db,4
397,normandy or parts of normandy,#57db5f,5
398,rollo,#57db5f,5
399,political and economic factors,#db57d3,6


### Add colors to the graph

In [26]:
for index, row in colors.iterrows():
    G.nodes[row['node']]['group'] = row['group']
    G.nodes[row['node']]['color'] = row['color']
    G.nodes[row['node']]['size'] = G.degree[row['node']]

In [28]:
from pyvis.network import Network

#graph_output_directory = "./docs/index.html"

net = Network(
    notebook=True,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="800px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

net.from_nx(G)
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)

# net.show(graph_output_directory)
net.show_buttons(filter_=['physics'])
net.show("knowledge_graph.html")

knowledge_graph.html


UnicodeEncodeError: 'charmap' codec can't encode character '\u0159' in position 29809: character maps to <undefined>

In [25]:
# DETAILED STEPS OF TERM PROXIMITY CALCULATION (same as function, only step by step to better understand the process)

In [26]:
## Melt the dataframe into a list of nodes
dfg_long = pd.melt(
    dfg1, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
)

In [27]:
dfg_long.tail(5)

Unnamed: 0,chunk_id,variable,node
615,9d455bc6ec4946bf81b106a8d103c837,node_2,christian powers
616,9d455bc6ec4946bf81b106a8d103c837,node_2,superior strength
617,9d455bc6ec4946bf81b106a8d103c837,node_2,surrender or be traded south as slaves
618,9d455bc6ec4946bf81b106a8d103c837,node_2,hatred
619,9d455bc6ec4946bf81b106a8d103c837,node_2,freedom


In [28]:
dfg_long.drop(columns=["variable"], inplace=True)
# Self join with chunk id as the key will create a link between terms occuring in the same text chunk.

In [29]:
dfg_long.tail(5)

Unnamed: 0,chunk_id,node
615,9d455bc6ec4946bf81b106a8d103c837,christian powers
616,9d455bc6ec4946bf81b106a8d103c837,superior strength
617,9d455bc6ec4946bf81b106a8d103c837,surrender or be traded south as slaves
618,9d455bc6ec4946bf81b106a8d103c837,hatred
619,9d455bc6ec4946bf81b106a8d103c837,freedom


In [30]:
dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))

In [31]:
dfg_wide.head()

Unnamed: 0,chunk_id,node_1,node_2
0,3b1d78faf9064a97b1b667d4aa940745,episode 96,episode 96
1,3b1d78faf9064a97b1b667d4aa940745,episode 96,saxons
2,3b1d78faf9064a97b1b667d4aa940745,episode 96,otto i
3,3b1d78faf9064a97b1b667d4aa940745,episode 96,duchy
4,3b1d78faf9064a97b1b667d4aa940745,episode 96,original territory


In [32]:
# drop self loops
self_loops_drop = dfg_wide[dfg_wide["node_1"] == dfg_wide["node_2"]].index
dfgraph2 = dfg_wide.drop(index=self_loops_drop).reset_index(drop=True)

In [33]:
dfgraph2.head()

Unnamed: 0,chunk_id,node_1,node_2
0,3b1d78faf9064a97b1b667d4aa940745,episode 96,saxons
1,3b1d78faf9064a97b1b667d4aa940745,episode 96,otto i
2,3b1d78faf9064a97b1b667d4aa940745,episode 96,duchy
3,3b1d78faf9064a97b1b667d4aa940745,episode 96,original territory
4,3b1d78faf9064a97b1b667d4aa940745,episode 96,cities


In [34]:
## Group and count edges.
dfgraph2 = (
    dfgraph2.groupby(["node_1", "node_2"])
    .agg({"chunk_id": [",".join, "count"]})
    .reset_index()
)

In [35]:
dfgraph2.head()

Unnamed: 0_level_0,node_1,node_2,chunk_id,chunk_id
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,join,count
0,960,convert miesco to christianity,8ad949a3488a430cae0864089c8cef99,1
1,960,"dobrawa, a daughter of duke boleslaus i of boh...","8ad949a3488a430cae0864089c8cef99,8ad949a3488a4...",2
2,960,first polish ruler,8ad949a3488a430cae0864089c8cef99,1
3,960,gniesno in the 10th century,8ad949a3488a430cae0864089c8cef99,1
4,960,margrave gero,"8ad949a3488a430cae0864089c8cef99,8ad949a3488a4...",3


In [36]:
dfgraph2.columns = ["node_1", "node_2", "chunk_id", "count"]
dfgraph2.replace("", np.nan, inplace=True)
dfgraph2.dropna(subset=["node_1", "node_2"], inplace=True)
# Drop edges with 1 count
dfgraph2 = dfg2[dfg2["count"] != 1]
dfgraph2["edge"] = "contextual proximity"

In [37]:
dfgraph2.head()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
1,960,"dobrawa, a daughter of duke boleslaus i of boh...","8ad949a3488a430cae0864089c8cef99,8ad949a3488a4...",2,contextual proximity
4,960,margrave gero,"8ad949a3488a430cae0864089c8cef99,8ad949a3488a4...",3,contextual proximity
5,960,mieszko,"8ad949a3488a430cae0864089c8cef99,8ad949a3488a4...",8,contextual proximity
6,960,otto the great,"8ad949a3488a430cae0864089c8cef99,8ad949a3488a4...",2,contextual proximity
8,960,poland,"8ad949a3488a430cae0864089c8cef99,8ad949a3488a4...",2,contextual proximity


In [38]:
dfgraph2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2640 entries, 1 to 7203
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   node_1    2640 non-null   object
 1   node_2    2640 non-null   object
 2   chunk_id  2640 non-null   object
 3   count     2640 non-null   int64 
 4   edge      2640 non-null   object
dtypes: int64(1), object(4)
memory usage: 123.8+ KB
