In [20]:
import pandas as pd, numpy as np
import os, random

# Load and clean data
df = pd.read_csv('data/benzoin_ANGLE/edges2.csv')
df = df.drop(columns=['idx', 'ext_roll'], errors='ignore')
df = df.sort_values('time').drop_duplicates(['src', 'dst'], keep='last').reset_index(drop=True)

# Get unique nodes and map to 0-based index
unique_nodes = sorted(set(df['src'].tolist() + df['dst'].tolist()))
num_nodes = len(unique_nodes)

# Separate positive and negative samples
pos_df = df[df['label'] == 1].copy()
neg_df = df[df['label'] == 0].copy()

# Track used edges (undirected)
used_edges = set(tuple(edge) for edge in zip(df['src'], df['dst']))
used_edges |= set((v, u) for u, v in used_edges)

print("Sample edges:\n", list(used_edges)[:5])
print("Any null values?", pd.isnull(list(used_edges)).any())

# Generate synthetic negative edges
def generate_extra_negatives(count, used_edges, num_nodes):
    extra_src, extra_dst = [], []
    while len(extra_src) < count:
        u, v = random.randint(0, num_nodes - 1), random.randint(0, num_nodes - 1)
        if u != v and (u, v) not in used_edges:
            extra_src.append(u)
            extra_dst.append(v)
            used_edges.add((u, v))
            used_edges.add((v, u))  # treat as undirected
    return pd.DataFrame({'src': extra_src, 'dst': extra_dst, 'label': 0})

extra_neg_df = generate_extra_negatives(len(pos_df), used_edges, num_nodes)

# Combine and upsample positives
df_combined = pd.concat([pos_df, neg_df, extra_neg_df], ignore_index=True)
pos_df_final = df_combined[df_combined['label'] == 1]
neg_df_final = df_combined[df_combined['label'] == 0]
pos_df_upsampled = pos_df_final.sample(n=len(neg_df_final), replace=True, random_state=42)

df_balanced = pd.concat([pos_df_upsampled, neg_df_final], ignore_index=True)
df_balanced = df_balanced.sample(frac=1.0, random_state=42).reset_index(drop=True)

# Train/val/test split
n = len(df_balanced)
train_df = df_balanced.iloc[:int(0.7 * n)].reset_index(drop=True)
val_df   = df_balanced.iloc[int(0.7 * n):int(0.85 * n)].reset_index(drop=True)
test_df  = df_balanced.iloc[int(0.85 * n):].reset_index(drop=True)

# Save to disk
os.makedirs('data/benzoin_ANGLE/fold1', exist_ok=True)
train_df.to_csv('data/benzoin_ANGLE/fold1/train.csv', index=True)
val_df.to_csv('data/benzoin_ANGLE/fold1/val.csv', index=True)
test_df.to_csv('data/benzoin_ANGLE/fold1/test.csv', index=True)

# Save unique nodes
# Create a final list of all unique nodes after all processing
all_nodes = sorted(set(df_balanced['src'].tolist() + df_balanced['dst'].tolist()))
unique_node_df = pd.DataFrame(all_nodes, columns=['src'])
unique_node_df['src'] = unique_node_df['src'].astype(int)  # ensure int
unique_node_df.to_csv('data/benzoin_ANGLE/unique_nodes.csv', index=False)
# Print label distribution
print("Train pos:", (train_df['label'] == 1).sum(), "neg:", (train_df['label'] == 0).sum())
print("Val pos:", (val_df['label'] == 1).sum(), "neg:", (val_df['label'] == 0).sum())
print("Test pos:", (test_df['label'] == 1).sum(), "neg:", (test_df['label'] == 0).sum())
print("Label distribution:")
print(df_balanced['label'].value_counts())

Sample edges:
 [(476, 78), (541, 207), (183, 499), (58, 504), (562, 37)]
Any null values? False
Train pos: 6971 neg: 6904
Val pos: 1472 neg: 1501
Test pos: 1468 neg: 1506
Label distribution:
label
1    9911
0    9911
Name: count, dtype: int64
