In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import yaml
import os

In [None]:
# Load the project configuration that sits one directory above this notebook.
config_path = os.path.join("..", "config.yaml")
with open(config_path, "rt") as config_file:
    config = yaml.safe_load(config_file)

# The benchmark folder name is the configured domain names joined by "_".
domains_tag = "_".join(config["DOMAINS"])
edges_dir = os.path.join("..", "FOS_Benchmark", domains_tag, "edges")

# The CSV has no header row; columns are read as source, destination, timestamp.
edges = pd.read_csv(
    os.path.join(edges_dir, "all_edges.csv"),
    header=None,
    names=["src", "dst", "ts"],
)

In [None]:
# For every timestamp, count how many edge occurrences are "new" (at the
# edge's first timestamp) versus "recurring" (strictly after it), then draw
# the counts as a stacked bar chart saved to TEA.pdf.

# Order rows chronologically and give them a fresh 0..n-1 index.
edges = edges.sort_values('ts').reset_index(drop=True)

# String key identifying each (src, dst) pair
edges['edge'] = edges['src'].astype(str) + '-' + edges['dst'].astype(str)

# First and last timestamp of each edge, broadcast back onto every row.
# transform() assigns the columns directly, so re-running this cell simply
# overwrites them; a merge() here would create suffixed duplicates
# (first_ts_x / first_ts_y) on the second run and break the lookup below.
# It also leaves the chronological row order untouched.
ts_by_edge = edges.groupby('edge')['ts']
edges['first_ts'] = ts_by_edge.transform('min')
edges['last_ts'] = ts_by_edge.transform('max')

# An occurrence is 'new' at the edge's first timestamp, 'recurring' afterwards
edges['type'] = 'new'
edges.loc[edges['ts'] > edges['first_ts'], 'type'] = 'recurring'

# Rows: timestamps; columns: edge type; values: number of edge occurrences
counts = edges.groupby(['ts', 'type']).size().unstack(fill_value=0)

# Stacked bar chart, one bar per timestamp
ax = counts.plot(kind='bar', stacked=True, figsize=(10, 6))
ax.set_xlabel('Timestamp')
ax.set_ylabel('Number of Edges')
ax.set_title('Edges per Timestamp: New vs Recurring')
ax.legend(title='Edge Type')
ax.figure.savefig("TEA.pdf", dpi=300)
plt.show()

In [None]:
# Novelty: the share of edge occurrences at each timestamp that are 'new',
# averaged over all timestamps.
# unstack() only creates a column for edge types that actually occur, so make
# sure both columns exist (filled with 0) before dividing; otherwise a dataset
# without any recurring edge raises KeyError on counts["recurring"].
type_counts = counts.reindex(columns=["new", "recurring"], fill_value=0)
novelty = type_counts["new"] / (type_counts["new"] + type_counts["recurring"])
print("novelty:", novelty.mean())

In [None]:
# Scatter plot of every edge occurrence: x is the edge's index (edges ordered
# by first, then last, appearance), y is the timestamp. Saved to TET.pdf.

# String key identifying each (src, dst) pair
edges['edge'] = edges['src'].astype(str) + '-' + edges['dst'].astype(str)

# First and last timestamp of each edge, one row per edge
edge_ts = edges.groupby('edge')['ts'].agg(first_ts='min', last_ts='max').reset_index()

# Rank edges by (first_ts, last_ts) and number them starting from 1
edge_ts = edge_ts.sort_values(['first_ts', 'last_ts']).reset_index(drop=True)
edge_ts['edge_index'] = edge_ts.index + 1  # 1-based indexing

# Lookup table edge key -> edge_index, applied to every occurrence row
edge_to_index = dict(zip(edge_ts['edge'], edge_ts['edge_index']))
edges['edge_index'] = edges['edge'].map(edge_to_index)

# Sanity check: rows must already be in non-decreasing timestamp order.
# Uses the pandas property instead of numpy, which this notebook never imports.
assert edges['ts'].is_monotonic_increasing, "Edges timestamps are not sorted!"

# Edge indices at or above 85% of the largest index are drawn in red, the
# rest in green. NOTE(review): this threshold is on the edge index, not on
# the timestamp — confirm that is the intended split.
q = int(edges['edge_index'].max() * 0.85)
below_q = edges['edge_index'] < q
at_or_above_q = edges['edge_index'] >= q

plt.figure(figsize=(10, 6))
# Small '|' markers keep rendering fast when there are many points
plt.scatter(edges.loc[below_q, 'edge_index'], edges.loc[below_q, 'ts'],
            alpha=0.5, s=10, marker="|", color='green')
plt.scatter(edges.loc[at_or_above_q, 'edge_index'], edges.loc[at_or_above_q, 'ts'],
            alpha=0.5, s=10, marker="|", color='red')
plt.xlabel('Edge Index')
plt.ylabel('Timestamp')
plt.title('Edge Appearances Across Timestamps')
plt.grid(True)
plt.savefig("TET.pdf", dpi=300)
plt.show()

In [None]:
# Chronological split and overlap statistics between the two parts:
#   reoccurrence = |E_trainval ∩ E_test| / |E_trainval|
#   surprise     = |E_test − E_trainval| / |E_test|
# where E_* are the sets of distinct (src, dst) pairs in each part.


def _edge_keys(frame):
    """Return the set of 'src-dst' string keys for the rows of ``frame``."""
    return set((frame['src'].astype(str) + '-' + frame['dst'].astype(str)).values)


# Timestamp cut-offs at the 70% and 85% quantiles of all edge timestamps.
# t_train is only reported here; the split below uses t_val alone.
t_train = edges["ts"].quantile(0.7)
t_val = edges["ts"].quantile(0.85)
print("t_train", t_train, "t_val", t_val)

# Everything up to and including t_val is train+validation; the rest is test
train_val_edges_df = edges[edges["ts"] <= t_val]
test_edges_df = edges[edges["ts"] > t_val]

train_edge_set = _edge_keys(train_val_edges_df)
test_edge_set = _edge_keys(test_edges_df)

# If the 0.85 quantile equals the largest timestamp, the test part is empty;
# report NaN rather than crashing with ZeroDivisionError.
if train_edge_set:
    reocurrence = len(train_edge_set & test_edge_set) / len(train_edge_set)
else:
    reocurrence = float("nan")

if test_edge_set:
    surprise = len(test_edge_set - train_edge_set) / len(test_edge_set)
else:
    surprise = float("nan")

print("reocurrence:", reocurrence, "surprise:", surprise)