In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
import networkx as nx
import pandas as pd
import numpy as np
import yaml
import os
from collections import defaultdict
# Global matplotlib text settings, applied in one call: enable LaTeX text
# rendering and select the Computer Modern font family / math font set.
mpl.rcParams.update({
	'text.usetex': True,
	'font.family': 'computer modern',
	'mathtext.fontset': 'cm',
})

In [None]:
# Read the project configuration that lives one directory above the notebook.
config_path = os.path.join("..", "config.yaml")
with open(config_path, "rt") as config_file:
	config = yaml.safe_load(config_file)

# Union of every value found in the per-domain node CSV files.
nodes_dir = os.path.join("..", "OpenAlex_Knowledge_Graph", "nodes")
all_nodes = set()
for field in config["DOMAINS"]:
	field_nodes = pd.read_csv(os.path.join(nodes_dir, f"{field}.csv"))
	all_nodes.update(field_nodes.values.flatten())

# Yearly edge files are stored under a directory named after the joined domains.
benchmark_name = "_".join(config["DOMAINS"])
edges_dir = os.path.join("..", "FOS_Benchmark", benchmark_name, "edges")

# Inclusive range of years covered by the analysis.
first_year, last_year = config["START_YEAR"], config["END_YEAR"]
years = list(range(first_year, last_year + 1))

In [None]:
# Build one undirected snapshot graph per year and record, for every undirected
# edge, the list of years in which it occurs.
#
# snapshots:  year -> nx.Graph holding all known nodes plus that year's edges
# edge_years: frozenset({src, dst}) -> list of the 'year' column values of the
#             rows in which the edge appears (duplicates are kept)
snapshots = {}
edge_years = defaultdict(list)

for year in years:
	G = nx.Graph()
	# Every snapshot starts from the full node set, so nodes without edges in
	# a given year are present with degree 0.
	G.add_nodes_from(all_nodes)
	df = pd.read_csv(os.path.join(edges_dir, f"{year}.csv"), header=None, names=['src', 'dst', 'year'])
	# Iterate the three columns in lockstep instead of DataFrame.iterrows():
	# iterrows builds a Series for every row, which is far slower on large
	# edge lists while giving the same graph and the same edge_years.
	for src, dst, edge_year in zip(df['src'], df['dst'], df['year']):
		G.add_edge(src, dst)
		# frozenset makes the key independent of endpoint order.
		edge_years[frozenset((src, dst))].append(edge_year)
	snapshots[year] = G

# Node Statistics

In [None]:
# Per-node, per-year statistics derived from the yearly snapshots.
#
# node_degrees_over_time:       node -> {year -> degree in that year's snapshot}
# node_active_years:            node -> ascending list of years with degree > 0
# node_first_appearance:        node -> first active year (inf if never active)
# node_counts:                  year -> number of active nodes
# node_degrees:                 year -> list of degrees of the active nodes
# average_node_degrees:         year -> mean degree over the active nodes
# growth_rates:                 year -> mean year-over-year degree change
# mean_clustering_coefficients: year -> mean clustering coefficient of active nodes
node_degrees_over_time = {node: {} for node in all_nodes}
node_active_years = {node: [] for node in all_nodes}
for year in years:
	snapshot = snapshots[year]
	for node in all_nodes:
		deg = snapshot.degree(node)
		node_degrees_over_time[node][year] = deg
		if deg > 0:
			node_active_years[node].append(year)

node_first_appearance = {node: min(active, default=float('inf')) for node, active in node_active_years.items()}

# A node is active in a year exactly when its degree that year is positive, so
# the degree table answers the question without scanning the active-year lists.
node_counts = {
	year: sum(1 for node in all_nodes if node_degrees_over_time[node][year] > 0)
	for year in years
}

node_degrees = {
	year: [deg[year] for deg in node_degrees_over_time.values() if deg[year] > 0]
	for year in years
}
# A year without active nodes gets NaN explicitly; np.mean([]) yields the same
# value but also emits a RuntimeWarning.
average_node_degrees = {
	year: np.mean(degs) if degs else float('nan')
	for year, degs in node_degrees.items()
}

growth_rates = {}
for year in years[1:]:
	# Only nodes whose first active year is strictly earlier contribute, so a
	# node's first-year jump from degree 0 is not counted as growth.
	deltas = [
		node_degrees_over_time[node][year] - node_degrees_over_time[node][year - 1]
		for node in all_nodes
		if year > node_first_appearance[node]
	]
	growth_rates[year] = np.mean(deltas) if deltas else float('nan')

mean_clustering_coefficients = {}
for year in years:
	active = [node for node in all_nodes if node_degrees_over_time[node][year] > 0]
	# One batched call per snapshot returns {node: coefficient} for the given
	# nodes; this replaces one nx.clustering call per node.
	coefficients = nx.clustering(snapshots[year], nodes=active)
	mean_clustering_coefficients[year] = (
		np.mean(list(coefficients.values())) if coefficients else float('nan')
	)

In [None]:
# 2x2 overview of the yearly node statistics, one metric per panel.
fig, axs = plt.subplots(2, 2, figsize=(14, 10))
palette = sns.color_palette("Set2", 4)

# (axes, yearly series, y-axis label, title, draw a dashed zero line?)
panels = [
	(axs[0, 0], node_counts, 'Number of Active Nodes', 'Number of Active Nodes Over The Years', False),
	(axs[0, 1], average_node_degrees, 'Mean Node Degree', 'Mean Node Degree Over The Years', False),
	(axs[1, 0], growth_rates, 'Mean Node Growth Rate', 'Mean Node Growth Rate Over The Years', True),
	(axs[1, 1], mean_clustering_coefficients, 'Mean Clustering Coefficient', 'Mean Clustering Coefficient Over The Years', False),
]

for (ax, series, y_label, title, zero_line), panel_color in zip(panels, palette):
	ax.plot(list(series.keys()), list(series.values()), marker='o', color=panel_color)
	if zero_line:
		# Reference line separating growth from shrinkage.
		ax.axhline(0, color='gray', linestyle='dashed')
	ax.set_xlabel('Year')
	ax.set_ylabel(y_label)
	ax.set_title(title)
	ax.grid(True)
	# Years are integers; keep the tick locator from choosing fractional ticks.
	ax.xaxis.set_major_locator(MaxNLocator(integer=True))

fig.tight_layout()
plt.subplots_adjust(wspace=0.2, hspace=0.22)
plt.show()
fig.savefig("node_statistics_1.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Recency and persistence of nodes, restricted to nodes that were active in at
# least one year.
#
# node_last_appearance:   node -> last active year (-inf if never active)
# last_appearance_counts: last active year -> number of nodes
# node_persistence_spans: node -> (last active year - first active year)
# persistence_counts:     span in years -> number of nodes
node_last_appearance = {node: max(active, default=float('-inf')) for node, active in node_active_years.items()}

# Drop the -inf sentinel before building the Series, so the index holds the
# year values themselves instead of being upcast to float by the sentinel.
last_active_years = [last for last in node_last_appearance.values() if last != float('-inf')]
last_appearance_counts = pd.Series(last_active_years).value_counts().sort_index()

# Nodes that never appear have no span. Previously they were recorded as span 0
# and inflated the "0 years" bar, although the same nodes are excluded from
# last_appearance_counts above.
node_persistence_spans = {
	node: max(active) - min(active)
	for node, active in node_active_years.items()
	if active
}
persistence_counts = pd.Series(list(node_persistence_spans.values())).value_counts().sort_index()

In [None]:
# Two bar charts: distribution of node persistence spans and of last-active years.
fig, axs = plt.subplots(1, 2, figsize=(14, 5))
colors = sns.color_palette("Set2", 2)

# (axes, counts Series, x-axis label, title, bar colour)
bar_panels = [
	(axs[0], persistence_counts, 'Persistence Span (Years)', 'Node Persistence Span Distribution', colors[0]),
	(axs[1], last_appearance_counts, 'Year of Last Activity', 'Number of Nodes Last Active Each Year (Recency)', colors[1]),
]

for ax, counts, x_label, title, bar_color in bar_panels:
	ax.bar(counts.index, counts.values, color=bar_color)
	ax.set_xlabel(x_label)
	ax.set_ylabel('Number of Nodes')
	ax.set_title(title)
	ax.grid(axis='y')
	# Write each bar's count just above it.
	for x_pos, value in zip(counts.index, counts.values):
		ax.text(x_pos, value, str(value), ha='center', va='bottom')

fig.tight_layout()
plt.subplots_adjust(wspace=0.2)
plt.show()
fig.savefig("node_statistics_2.pdf", format="pdf", bbox_inches="tight")

In [None]:
# node_burstiness = {}
# for node in all_nodes:
# 	active_y = sorted(set(node_active_years[node]))
# 	if len(active_y) >= 2:
# 		inter = np.diff(active_y)
# 		mean_inter = np.mean(inter)
# 		var_inter = np.var(inter)
# 		node_burstiness[node] = var_inter / mean_inter if mean_inter > 0 else 0
# 	else:
# 		node_burstiness[node] = 0

# burstiness_values = list(node_burstiness.values())
# plt.hist(burstiness_values, bins=30, edgecolor='black')
# plt.xlabel('Burstiness')
# plt.ylabel('Number of Nodes')
# plt.title('Distribution of Node Burstiness')
# plt.grid(axis='y')
# plt.show()

# Edge Statistics

In [None]:
# Yearly edge statistics.
#
# edge_count:               year -> number of edges in the snapshot
# edge_density:             year -> edges / possible edges among the active nodes
# repetition_rate_per_year: year -> fraction of the year's edges that already
#                           occurred in any earlier year
edge_count = {year: G.number_of_edges() for year, G in snapshots.items()}

edge_density = {}
for year, G in snapshots.items():
	num_edges = edge_count[year]
	num_nodes = sum(1 for node in all_nodes if G.degree(node) > 0)
	# The snapshots are undirected, so N nodes admit N*(N-1)/2 distinct edges.
	# The previous denominator N*(N-1) is the directed count and halved the
	# reported density.
	possible_edges = num_nodes * (num_nodes - 1) / 2
	# Fewer than two active nodes leaves no possible edge; report 0 instead of
	# raising ZeroDivisionError.
	edge_density[year] = num_edges / possible_edges if possible_edges > 0 else 0.0

repetition_rate_per_year = {}
previous_edges = set()
for year in years:
	# Canonicalize each undirected edge as a frozenset (the same keying that
	# edge_years uses) so the comparison across years does not depend on the
	# endpoint order of the tuples returned by Graph.edges().
	current_edges = {frozenset(edge) for edge in snapshots[year].edges()}
	repeated = current_edges & previous_edges
	repetition_rate_per_year[year] = len(repeated) / len(current_edges) if current_edges else 0
	previous_edges |= current_edges

In [None]:
# Left panel: edge count and edge density on twin y-axes. Right panel: edge repetition rate.
fig, axs = plt.subplots(1, 2, figsize=(14, 5))
color1, color2 = sns.color_palette("dark", 2)

# Left panel, primary axis: edge count.
count_ax = axs[0]
count_ax.plot(list(edge_count.keys()), list(edge_count.values()), marker='o', color=color1, label='Edge Count')
count_ax.set_xlabel('Year')
count_ax.set_ylabel('Edge Count', color=color1)
count_ax.tick_params(axis='y', labelcolor=color1)
count_ax.grid(True, axis='x')
count_ax.xaxis.set_major_locator(MaxNLocator(integer=True))
count_ax.set_title('Edge Count and Edge Density Over The Years')

# Left panel, secondary axis sharing the x-axis: edge density.
ax2 = count_ax.twinx()
ax2.plot(list(edge_density.keys()), list(edge_density.values()), marker='s', color=color2, label='Edge Density')
ax2.set_ylabel('Edge Density', color=color2)
ax2.tick_params(axis='y', labelcolor=color2)

# Right panel: repetition rate.
repetition_ax = axs[1]
repetition_color = sns.color_palette("Set2")[4]
repetition_ax.plot(
	list(repetition_rate_per_year.keys()),
	list(repetition_rate_per_year.values()),
	marker='o',
	color=repetition_color,
)
repetition_ax.set_xlabel('Year')
repetition_ax.set_ylabel('Repetition Rate')
repetition_ax.set_title('Edge Repetition Rate Over The Years')
repetition_ax.grid(True)
repetition_ax.xaxis.set_major_locator(MaxNLocator(integer=True))

fig.tight_layout()
plt.subplots_adjust(wspace=0.25)
plt.show()
fig.savefig("edge_statistics_1.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Per-edge temporal statistics, computed in a single pass over edge_years.
#
# edge_frequencies:     edge -> number of distinct years in which it occurs
# edge_lifetimes:       edge -> last year - first year (0 for single-year edges)
# edge_last_appearance: edge -> last year in which it occurs
# edge_inter_mean/var:  edge -> mean / variance of the gaps between consecutive
#                       distinct years (only for edges seen in >= 2 years)
edge_frequencies = {}
edge_lifetimes = {}
edge_inter_mean = {}
edge_inter_var = {}
edge_last_appearance = {}
for key, years_list in edge_years.items():
	# Deduplicate and sort once per edge; every statistic below reuses it.
	distinct_years = sorted(set(years_list))
	edge_frequencies[key] = len(distinct_years)
	edge_lifetimes[key] = distinct_years[-1] - distinct_years[0]
	edge_last_appearance[key] = distinct_years[-1]
	if len(distinct_years) >= 2:
		inter = np.diff(distinct_years)
		edge_inter_mean[key] = np.mean(inter)
		edge_inter_var[key] = np.var(inter)

# x-axis values for the histograms. default=0 yields an empty range instead of
# a ValueError when there are no edges at all.
freq = list(range(1, max(edge_frequencies.values(), default=0) + 1))
# NOTE(review): this range starts at 1, so edges with lifetime 0 (seen in a
# single year) are left out of the lifetime histogram - confirm this is intended.
lifetime = list(range(1, max(edge_lifetimes.values(), default=0) + 1))

inter_mean_values = list(edge_inter_mean.values())
imv = sorted(set(inter_mean_values))

last_appearance_values = list(edge_last_appearance.values())
lav = sorted(set(last_appearance_values))

In [None]:
# 2x2 histograms of the per-edge temporal statistics.
fig, axs = plt.subplots(2, 2, figsize=(14, 10))
colors = sns.color_palette("Set2", 4)

# Tally each statistic once. The previous list(...).count(i) rebuilt and
# rescanned the full per-edge list for every single bar.
frequency_counts = pd.Series(list(edge_frequencies.values())).value_counts().to_dict()
lifetime_counts = pd.Series(list(edge_lifetimes.values())).value_counts().to_dict()
inter_mean_counts = pd.Series(inter_mean_values).value_counts().to_dict()
last_appearance_value_counts = pd.Series(last_appearance_values).value_counts().to_dict()

axs[0, 0].bar(freq, [frequency_counts.get(i, 0) for i in freq], color=colors[0])
axs[0, 0].set_xlabel('Edge Frequency (Number of Years Edge is Present)')
axs[0, 0].set_ylabel('Number of Edges')
axs[0, 0].set_title('Distribution of Edge Frequencies')
axs[0, 0].grid(axis='y')

axs[0, 1].bar(lifetime, [lifetime_counts.get(i, 0) for i in lifetime], color=colors[1])
axs[0, 1].set_xlabel('Edge Lifetime')
axs[0, 1].set_ylabel('Number of Edges')
axs[0, 1].set_title('Distribution of Edge Lifetimes')
axs[0, 1].grid(axis='y')
axs[0, 1].xaxis.set_major_locator(MaxNLocator(integer=True))

# The x values are passed as strings, so each distinct value becomes its own
# categorical bar, labelled with the value rounded to three decimals.
axs[1, 0].bar([str(round(i, 3)) for i in imv], [inter_mean_counts.get(i, 0) for i in imv], color=colors[2])
axs[1, 0].set_xlabel('Mean Inter-Event Time')
axs[1, 0].set_ylabel('Number of Edges')
axs[1, 0].set_title('Distribution of Edge Mean Inter-Event Times')
axs[1, 0].grid(axis='y')

axs[1, 1].bar([str(round(i, 3)) for i in lav], [last_appearance_value_counts.get(i, 0) for i in lav], color=colors[3])
axs[1, 1].set_xlabel('Last Appearance Year')
axs[1, 1].set_ylabel('Number of Edges')
axs[1, 1].set_title('Distribution of Edge Last Appearance Years')
axs[1, 1].grid(axis='y')

fig.tight_layout()
plt.subplots_adjust(wspace=0.2, hspace=0.22)
plt.show()
fig.savefig("edge_statistics_2.pdf", format="pdf", bbox_inches="tight")

In [None]:
# inter_var_values = list(edge_inter_var.values())
# x = sorted(set(inter_var_values))
# plt.bar([str(round(i, 3)) for i in x], [inter_var_values.count(i) for i in x])
# plt.xlabel('Variance of Inter-Event Time')
# plt.ylabel('Number of Edges')
# plt.title('Distribution of Edge Variance of Inter-Event Times')
# plt.grid(axis='y')
# plt.show()

# Graph Statistics

In [None]:
# edge_density = {}
# for year, G in snapshots.items():
# 	num_edges = G.number_of_edges()
# 	num_nodes = len([1 for node in all_nodes if G.degree(node) > 0])
# 	density = num_edges / (num_nodes * (num_nodes - 1))
# 	edge_density[year] = density

# plt.plot(list(edge_density.keys()), list(edge_density.values()), marker='o')
# plt.xlabel('Year')
# plt.ylabel('Edge Density')
# plt.title('Edge Density Over Years')
# plt.grid(True)
# plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
# plt.show()

In [None]:
# Connectivity statistics per year, computed on the subgraph induced by the
# nodes that are active (degree > 0) in that year.
#
# component_count_per_year: year -> number of connected components
# largest_cc_size_per_year: year -> node count of the largest component (0 if none)
# diameter_per_year:        year -> approximate diameter of the largest component

# nx.approximation.diameter is a randomized heuristic; a fixed seed makes the
# reported values reproducible across notebook runs.
DIAMETER_SEED = 0

component_count_per_year = {}
largest_cc_size_per_year = {}
diameter_per_year = {}
for year in years:
	active_nodes = [node for node in all_nodes if year in node_active_years[node]]
	subgraph = snapshots[year].subgraph(active_nodes)
	components = list(nx.connected_components(subgraph))
	component_count_per_year[year] = len(components)
	if components:
		largest_cc = max(components, key=len)
		largest_cc_size_per_year[year] = len(largest_cc)
		G_cc = subgraph.subgraph(largest_cc)
		# G_cc is a connected component, hence connected by construction; the
		# former nx.is_connected guard and its float('inf') fallback could
		# never trigger and were removed.
		# Exact but heavy alternative: nx.diameter(G_cc)
		diameter_per_year[year] = nx.approximation.diameter(G_cc, seed=DIAMETER_SEED)
	else:
		# No active nodes this year.
		largest_cc_size_per_year[year] = 0
		diameter_per_year[year] = 0
	print(f"Processed year {year}")

In [None]:
# Left panel: component count and largest-component size on twin y-axes. Right panel: diameter.
fig, axs = plt.subplots(1, 2, figsize=(14, 5))

color_palette = sns.color_palette("dark", 5)
color1, color2 = color_palette[4], color_palette[3]

# Left panel, primary axis: number of connected components.
components_ax = axs[0]
components_ax.plot(
	list(component_count_per_year.keys()),
	list(component_count_per_year.values()),
	marker='o',
	color=color1,
	label='Component Count',
)
components_ax.set_xlabel('Year')
components_ax.set_ylabel('Number of Connected Components', color=color1)
components_ax.tick_params(axis='y', labelcolor=color1)
components_ax.grid(True)
components_ax.xaxis.set_major_locator(MaxNLocator(integer=True))

# Left panel, secondary axis sharing the x-axis: size of the largest component.
ax2 = components_ax.twinx()
ax2.plot(
	list(largest_cc_size_per_year.keys()),
	list(largest_cc_size_per_year.values()),
	marker='s',
	color=color2,
	label='Largest CC Size',
)
ax2.set_ylabel('Largest Connected Component Size', color=color2)
ax2.tick_params(axis='y', labelcolor=color2)
# The ampersand is backslash-escaped because text is rendered through LaTeX.
components_ax.set_title('CC Count \\& Largest CC Size Over The Years')

# Right panel: diameter of the largest component.
diameter_ax = axs[1]
diameter_ax.plot(list(diameter_per_year.keys()), list(diameter_per_year.values()), marker='o', color=color_palette[2])
diameter_ax.set_xlabel('Year')
diameter_ax.set_ylabel('Diameter')
diameter_ax.set_title('Diameter Over The Years')
diameter_ax.grid(True)
diameter_ax.xaxis.set_major_locator(MaxNLocator(integer=True))

fig.tight_layout()
plt.subplots_adjust(wspace=0.25)
plt.show()
fig.savefig("graph_statistics.pdf", format="pdf", bbox_inches="tight")

In [None]:
# activity_distribution = {year: edge_count[year] / float(np.sum(list(edge_count.values()))) for year in years}
# plt.plot(list(activity_distribution.keys()), list(activity_distribution.values()), marker='o')
# plt.xlabel('Year')
# plt.ylabel('Activity Distribution')
# plt.title('Activity Distribution Over Years')
# plt.grid(True)
# plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
# plt.show()

# Temporal Statistics

In [None]:
# Temporal statistics per year.
#
# assortativity_per_year: year -> degree assortativity coefficient of the
#                         subgraph induced by that year's active nodes
# churn_ratio_per_year:   year -> (# edges new relative to the previous year) /
#                         (# edges of the previous year that are gone);
#                         inf when edges appear but none disappear,
#                         0 when neither happens
assortativity_per_year = {}
for year in years:
	active_nodes = [node for node in all_nodes if year in node_active_years[node]]
	subgraph = snapshots[year].subgraph(active_nodes)
	assortativity_per_year[year] = nx.degree_assortativity_coefficient(subgraph)
	print(f"Processed assortativity for year {year}")

churn_ratio_per_year = {}
prev_edges = set()
for year in years:
	# Canonicalize each undirected edge as a frozenset (the same keying that
	# edge_years uses) so an edge is recognised across years regardless of
	# the endpoint order of the tuples returned by Graph.edges().
	current_edges = {frozenset(edge) for edge in snapshots[year].edges()}
	appearing = len(current_edges - prev_edges)
	disappearing = len(prev_edges - current_edges)
	if disappearing > 0:
		churn_ratio_per_year[year] = appearing / disappearing
	else:
		# No edge disappeared (always the case for the first year, which has
		# no predecessor): the ratio is undefined, reported as inf or 0.
		churn_ratio_per_year[year] = float('inf') if appearing > 0 else 0
	prev_edges = current_edges

In [None]:
# Side-by-side line plots of the yearly assortativity coefficient and churn ratio.
fig, axs = plt.subplots(1, 2, figsize=(14, 5))
colors = sns.color_palette("Set2", 2)

# (yearly series, y-axis label, title) for the left and right panel.
line_panels = [
	(assortativity_per_year, 'Assortativity Coefficient', 'Assortativity Coefficient Over Years'),
	(churn_ratio_per_year, 'Churn Ratio', 'Churn Ratio Over Years'),
]

for ax, line_color, (series, y_label, title) in zip(axs, colors, line_panels):
	ax.plot(list(series.keys()), list(series.values()), marker='o', color=line_color)
	ax.set_xlabel('Year')
	ax.set_ylabel(y_label)
	ax.set_title(title)
	ax.grid(True)
	# Years are integers; keep the tick locator from choosing fractional ticks.
	ax.xaxis.set_major_locator(MaxNLocator(integer=True))

fig.tight_layout()
plt.subplots_adjust(wspace=0.2)
fig.savefig("temporal_statistics.pdf", format="pdf", bbox_inches="tight")
plt.show()