In [None]:
import os
from itertools import islice

import pandas as pd

# Location of the ' | '-separated dataset description, relative to this notebook.
dataset_path = os.path.join('..', '2M-PAST', 'info', 'Info_DS.txt')
with open(dataset_path, 'r', encoding='utf-8') as dataset_file:
    lines = dataset_file.readlines()

# Only these field positions are read from each line:
#   [1] -> 'middle' node, [2] -> 'head' node, [3] -> 'tail' node,
#   [5] -> flag; a row is kept only when the stripped flag equals "YES".
MIN_FIELDS = 6
TRIPLE_COLUMNS = ['head', 'relation1', 'middle', 'relation2', 'tail']

# Build one record per accepted line: head -[resilience by]-> middle -[in]-> tail
records = []
for line in lines:
    fields = line.split(' | ')  # split once instead of once per field access
    if len(fields) < MIN_FIELDS:
        # Blank or truncated lines have no flag field; skip them rather than
        # failing with IndexError.
        continue
    if fields[5].strip() != "YES":
        continue
    records.append((fields[2], "resilience by", fields[1], "in", fields[3]))

# Create a dataframe (passing `columns` keeps the schema even when no row matched)
df = pd.DataFrame(records, columns=TRIPLE_COLUMNS)
df

In [2]:
import networkx as nx
import matplotlib.pyplot as plt

# # Optional: restrict the graph to the 100 most frequent combinations (uncomment to enable)
# most_common = df.groupby(['head', 'relation1', 'middle', 'relation2', 'tail']).size()
# most_common = most_common.sort_values(ascending=False)
# df = most_common.head(100).reset_index(name='count')

# Build the undirected knowledge graph. Every dataframe row contributes two
# labelled edges: head -- middle (relation1) and middle -- tail (relation2).
G = nx.Graph()
triple_columns = zip(df['head'], df['relation1'], df['middle'], df['relation2'], df['tail'])
for head_node, head_relation, middle_node, tail_relation, tail_node in triple_columns:
    G.add_edge(head_node, middle_node, label=head_relation)
    G.add_edge(middle_node, tail_node, label=tail_relation)

In [None]:
# Render the whole knowledge graph, with the relation names printed on the edges.
# The layout is seeded so that the node positions are the same on every run.
pos = nx.spring_layout(G, k=0.9, seed=42)
labels = nx.get_edge_attributes(G, 'label')

node_style = dict(
    with_labels=True,
    font_size=10,
    node_size=700,
    node_color='lightblue',
    edge_color='gray',
    alpha=0.6,
)
edge_label_style = dict(
    edge_labels=labels,
    font_size=8,
    label_pos=0.3,
    verticalalignment='baseline',
)

plt.figure(figsize=(30, 25))
nx.draw(G, pos, **node_style)
nx.draw_networkx_edge_labels(G, pos, **edge_label_style)
plt.title('Knowledge Graph', fontsize=14)
plt.show()

In [None]:
# Basic size statistics of the knowledge graph.
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()
print(f'Number of nodes: {num_nodes}')
print(f'Number of edges: {num_edges}')
# A graph built from zero rows has no nodes; report the ratio as undefined
# instead of raising ZeroDivisionError.
if num_nodes:
    print(f'Ratio edges to nodes: {round(num_edges / num_nodes, 2)}')
else:
    print('Ratio edges to nodes: n/a (graph is empty)')

In [None]:
# Degree centrality of every node, stored as a dict ordered from the highest
# score to the lowest so that the printout below reads as a ranking.
degree_scores = nx.degree_centrality(G)
degree_ranking = sorted(degree_scores, key=degree_scores.get, reverse=True)
degree_centrality = {name: degree_scores[name] for name in degree_ranking}

print('Degree Centrality Scores:')
for node, centrality in degree_centrality.items():
    print(f'{node}: {centrality:.2f}')

In [None]:
# Betweenness centrality of every node, stored as a dict ordered from the
# highest score to the lowest so that the printout below reads as a ranking.
betweenness_scores = nx.betweenness_centrality(G)
betweenness_ranking = sorted(betweenness_scores, key=betweenness_scores.get, reverse=True)
betweenness_centrality = {name: betweenness_scores[name] for name in betweenness_ranking}

print('Betweenness Centrality Scores:')
for node, centrality in betweenness_centrality.items():
    print(f'{node}: {centrality:.2f}')

In [None]:
# Closeness centrality of every node, stored as a dict ordered from the
# highest score to the lowest so that the printout below reads as a ranking.
closeness_scores = nx.closeness_centrality(G)
closeness_ranking = sorted(closeness_scores, key=closeness_scores.get, reverse=True)
closeness_centrality = {name: closeness_scores[name] for name in closeness_ranking}

print('Closeness Centrality Scores:')
for node, centrality in closeness_centrality.items():
    print(f'{node}: {centrality:.2f}')

In [None]:
# Eigenvector centrality of every node (power iteration capped at 2000 rounds),
# stored as a dict ordered from the highest score to the lowest.
eigenvector_scores = nx.eigenvector_centrality(G, max_iter=2000)
eigenvector_ranking = sorted(eigenvector_scores, key=eigenvector_scores.get, reverse=True)
eigenvector_centrality = {name: eigenvector_scores[name] for name in eigenvector_ranking}

print('Eigenvector Centrality Scores:')
for node, centrality in eigenvector_centrality.items():
    print(f'{node}: {centrality:.2f}')

In [None]:
# Degree centrality: node size and colour intensity both scale with the score.
#
# nx.draw pairs the node_size / node_color sequences positionally with its
# nodelist, while `degree_centrality` is ordered by descending score. Passing
# `.values()` directly would therefore attach scores to the wrong nodes, so the
# scores are looked up node by node in an explicit node order instead.
degree_nodes = list(G.nodes())
degree_values = [degree_centrality[node] for node in degree_nodes]

plt.figure(figsize=(10, 7))
nx.draw(
    G, pos,
    nodelist=degree_nodes,
    with_labels=True,
    font_size=5,
    node_size=[value * 3000 for value in degree_values],
    node_color=degree_values,
    cmap=plt.cm.Blues,
    edge_color='gray',
    alpha=0.6
)
plt.title('Degree Centrality', fontsize=14)
plt.show()

In [None]:
# Betweenness centrality: node size and colour intensity both scale with the score.
#
# nx.draw pairs the node_size / node_color sequences positionally with its
# nodelist, while `betweenness_centrality` is ordered by descending score.
# Passing `.values()` directly would therefore attach scores to the wrong nodes,
# so the scores are looked up node by node in an explicit node order instead.
betweenness_nodes = list(G.nodes())
betweenness_values = [betweenness_centrality[node] for node in betweenness_nodes]

plt.figure(figsize=(10, 7))
nx.draw(
    G, pos,
    nodelist=betweenness_nodes,
    with_labels=True,
    font_size=5,
    node_size=[value * 3000 for value in betweenness_values],
    node_color=betweenness_values,
    cmap=plt.cm.Oranges,
    edge_color='gray',
    alpha=0.6
)
plt.title('Betweenness Centrality', fontsize=14)
plt.show()

In [None]:
# Closeness centrality: node size and colour intensity both scale with the score.
#
# nx.draw pairs the node_size / node_color sequences positionally with its
# nodelist, while `closeness_centrality` is ordered by descending score.
# Passing `.values()` directly would therefore attach scores to the wrong nodes,
# so the scores are looked up node by node in an explicit node order instead.
closeness_nodes = list(G.nodes())
closeness_values = [closeness_centrality[node] for node in closeness_nodes]

plt.figure(figsize=(10, 7))
nx.draw(
    G, pos,
    nodelist=closeness_nodes,
    with_labels=True,
    font_size=5,
    node_size=[value * 3000 for value in closeness_values],
    node_color=closeness_values,
    cmap=plt.cm.Greens,
    edge_color='gray',
    alpha=0.6
)
plt.title('Closeness Centrality', fontsize=14)
plt.show()

In [None]:
source_node = 'Vitis vinifera'
target_node = 'drought'
MAX_PATHS = 10  # cap on how many shortest paths are collected, drawn and printed

# Both names are looked up in str.capitalize() form (first character upper-case,
# the rest lower-case).
source_key = source_node.capitalize()
target_key = target_node.capitalize()

all_shortest_paths = []
# Check membership up front so a missing node is reported as such, instead of
# relying on a catch-all exception handler that also hides unrelated errors.
missing_nodes = [name for name in dict.fromkeys((source_key, target_key)) if name not in G]
if missing_nodes:
    for name in missing_nodes:
        print(f'{name} does not exist in dataset, please check again!')
else:
    try:
        # islice stops the path generator after MAX_PATHS results rather than
        # materialising every shortest path and discarding most of them.
        all_shortest_paths = list(
            islice(nx.all_shortest_paths(G, source=source_key, target=target_key), MAX_PATHS)
        )
    except nx.NetworkXNoPath:
        print(f'No path found between {source_key} and {target_key}.')

if all_shortest_paths:
    # Seeded layout under its own name: reproducible, and it does not overwrite
    # the `pos` layout that the other plotting cells share.
    path_pos = nx.spring_layout(G, seed=42)

    # Draw the full graph as a backdrop, then overlay each shortest path in red
    plt.figure(figsize=(24, 20))
    nx.draw(G, path_pos, with_labels=True, font_size=10, node_size=400, node_color='lightblue', edge_color='gray', alpha=0.6)
    for idx, path in enumerate(all_shortest_paths, 1):
        path_edges = list(zip(path, path[1:]))  # consecutive node pairs along the path
        nx.draw_networkx_edges(G, path_pos, edgelist=path_edges, edge_color='red', width=2, alpha=0.8, label=f'Path {idx}')

    plt.title(f'Shortest Paths from {source_node} to {target_node} (showing up to {MAX_PATHS})', fontsize=20)
    plt.legend()
    plt.show()

    # Print all collected shortest paths
    print(f"Shortest Paths from {source_node} to {target_node} (showing up to {MAX_PATHS}):")
    for idx, path in enumerate(all_shortest_paths, 1):
        print(f"Path {idx}: {path}")
