In [None]:
import sys
# Extend the module search path with ../scripts before the project-local
# imports further down.
# NOTE(review): presumably models, tg_functions and bike_functions live in
# that directory — confirm.
sys.path.append("../scripts")

import os, torch
from sklearn.model_selection import train_test_split
import pickle
import torch_geometric.transforms as T
import numpy as np
from torch_geometric.nn import GCNConv
# Store a second reference to GCNConv.propagate on the class under the name
# _orig_propagate.
# NOTE(review): nothing in the visible notebook reads this attribute —
# presumably project code or the pickled graph expects it; confirm before
# removing.
GCNConv._orig_propagate = GCNConv.propagate
import matplotlib.pyplot as plt
# Star imports from project modules. Helpers used later in the notebook
# (e.g. stratified_split) are not defined here, so they presumably come from
# one of these three modules — TODO confirm which.
from models import *
from tg_functions import *
from bike_functions import *
import torch_geometric as tg
import pandas as pd
import seaborn as sns

graph_num = 28  # Replace with your graph number

# NOTE: pickle.load can execute arbitrary code during deserialisation; only
# point this at graph files that were produced by this project.
with open(f'../data/graphs/{graph_num}/linegraph_tg.pkl', 'rb') as f:
    data = pickle.load(f)

# Re-store the graph tensors in contiguous memory layout.
data.edge_index = data.edge_index.contiguous()
data.x = data.x.contiguous()
data.y = data.y.contiguous()

# Project helper; the later cells read data.train_mask / data.val_mask /
# data.test_mask, which are presumably attached here — TODO confirm.
data = stratified_split(data=data, random_seed=100)

# Undirected NetworkX view of the graph, used for the degree histogram.
H = tg.utils.to_networkx(data, to_undirected=True)

# Degree Distribution Plot
node_degrees = [degree for _, degree in H.degree()]
plt.figure(figsize=(10, 6))
sns.histplot(node_degrees, discrete=True)
plt.title('Degree Distribution')
plt.xlabel('Degree')
plt.ylabel('Frequency')
plt.grid()
plt.show()

# One-row summary table of graph statistics, built in a single constructor
# call instead of column-by-column assignment.
# NOTE(review): 'Average Degree' is num_edges / num_nodes of the PyG object;
# whether that equals the mean degree of the undirected graph H depends on
# whether edge_index stores both directions of every edge — confirm.
df = pd.DataFrame({
    'Graph Number': [graph_num],
    'Number of Nodes': [data.num_nodes],
    'Number of Edges': [data.num_edges],
    'Average Degree': [data.num_edges / data.num_nodes],
})

# Last expression of the cell so the table is actually rendered; previously
# it was computed but never shown.
df


In [None]:
# Per-feature histograms of the non-zero node-feature values, drawn on a grid
# with a log-scaled count axis.
cols = data.feat_names

# .cpu() keeps the conversion working when the feature tensor is not on the
# CPU, matching the .cpu().numpy() conversion used for the labels later on.
df2 = pd.DataFrame(data.x.cpu().numpy(), columns=cols)

# These columns are left out of the grid; 'bc' is plotted on its own later
# via drop_df.
excluded_cols = ['oneway', 'bc', 'reversed']
drop_df = df2[excluded_cols]
df2 = df2.drop(columns=excluded_cols)

# Set grid size
n_cols = 5
n_plots = len(df2.columns)
# Keep at least one row so plt.subplots does not fail when no columns remain.
n_rows = max(1, int(np.ceil(n_plots / n_cols)))

# squeeze=False always returns a 2-D array of Axes, so flatten() is safe for
# every grid shape.
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False
)
axes = axes.flatten()  # make it easier to index

for ax, col in zip(axes, df2.columns):
    # Zero entries are removed before plotting.
    temp = df2[col][df2[col] != 0]
    sns.histplot(temp, ax=ax, discrete=True)
    ax.set_title(f'{col}'.capitalize())
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_yscale('log')  # <-- log scale on count axis

# Turn off unused axes. Slicing by the number of plots avoids relying on the
# loop variable leaking out of the loop above (undefined for an empty frame).
for unused_ax in axes[n_plots:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


In [None]:
# Histogram of betweenness centrality with a log-scaled x-axis.
# Only strictly positive values are kept, since log(0) is undefined.
temp = drop_df.loc[drop_df['bc'] > 0, 'bc']

# seaborn draws on the current Axes and returns it; log_scale applies to the
# x-axis only (y stays linear).
bc_ax = sns.histplot(temp, log_scale=(True, False))
bc_ax.set_title('Betweenness Centrality Distribution')
bc_ax.set_xlabel('Betweenness Centrality')
bc_ax.set_ylabel('Frequency')
bc_ax.grid()
plt.show()

In [None]:
# Histogram of the non-zero target values.
# Convert to a NumPy array explicitly instead of handing seaborn a raw torch
# tensor: implicit coercion fails for tensors that are not on the CPU or that
# track gradients, and the later cells already convert with .cpu().numpy().
nonzero_targets = data.y[data.y != 0].detach().cpu().numpy()

sns.histplot(nonzero_targets)
plt.title('Target Distribution')
plt.xlabel('Target Value')
plt.ylabel('Frequency')
plt.grid()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import torch

# Stacked histogram of binned labels for the train / validation / test
# splits.

# Per-split labels restricted to y > 0, as NumPy arrays.
train_y, val_y, test_y = (
    split_y[split_y > 0].cpu().numpy()
    for split_y in (
        data.y[data.train_mask],
        data.y[data.val_mask],
        data.y[data.test_mask],
    )
)

# Define bins; edges adds the outer limits and is only used for labeling.
bins = [400, 800, 1300, 2100, 3000, 3700, 4700, 7020, 9660]
edges = [0] + bins + [float('inf')]  # for labeling
boundaries = torch.tensor(bins)

# Bucketize: each value is mapped to an index in 0..len(bins).
train_y_binned = torch.bucketize(torch.tensor(train_y), boundaries)
val_y_binned = torch.bucketize(torch.tensor(val_y), boundaries)
test_y_binned = torch.bucketize(torch.tensor(test_y), boundaries)

# One label string per bucket: len(bins) + 1 labels in total.
bin_labels = [f"{lower}–{upper}" for lower, upper in zip(edges[:-1], edges[1:])]

# Combine into one long DataFrame of (Split, bucket index).
df_binned = pd.concat(
    [
        pd.DataFrame({'Split': split_name, 'Label': binned.numpy()})
        for split_name, binned in (
            ('Train', train_y_binned),
            ('Validation', val_y_binned),
            ('Test', test_y_binned),
        )
    ],
    ignore_index=True,
)

# Turn bucket indices into an ordered categorical in one vectorised step.
# Every index produced by bucketize is below len(bin_labels), so the former
# per-row fallback label could never be reached and has been dropped.
df_binned['Label'] = pd.Categorical.from_codes(
    df_binned['Label'].astype(int).to_numpy(),
    categories=bin_labels,
    ordered=True,
)

# Plot
plt.figure(figsize=(12, 6))
sns.histplot(data=df_binned, x='Label', hue='Split', multiple='stack', shrink=0.8, discrete=True)
plt.xticks(rotation=45, ha='right')
plt.title('Distribution of Binned AADT Labels Across Splits')
plt.xlabel('AADT Bin Range')
plt.ylabel('Count')
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import torch

# Stacked histogram of labels split at a single threshold (3000) for the
# train / validation / test splits.

# Per-split labels restricted to y > 0, as NumPy arrays. The per-split
# histogram cell further down reads train_y / val_y / test_y.
train_y, val_y, test_y = (
    split_y[split_y > 0].cpu().numpy()
    for split_y in (
        data.y[data.train_mask],
        data.y[data.val_mask],
        data.y[data.test_mask],
    )
)

# Define bins; edges adds the outer limits and is only used for labeling.
bins = [3000]
edges = [0] + bins + [float('inf')]  # for labeling
boundaries = torch.tensor(bins)

# Bucketize: each value is mapped to an index in 0..len(bins).
train_y_binned = torch.bucketize(torch.tensor(train_y), boundaries)
val_y_binned = torch.bucketize(torch.tensor(val_y), boundaries)
test_y_binned = torch.bucketize(torch.tensor(test_y), boundaries)

# One label string per bucket: len(bins) + 1 labels in total.
bin_labels = [f"{lower}–{upper}" for lower, upper in zip(edges[:-1], edges[1:])]

# Combine into one long DataFrame of (Split, bucket index).
df_binned = pd.concat(
    [
        pd.DataFrame({'Split': split_name, 'Label': binned.numpy()})
        for split_name, binned in (
            ('Train', train_y_binned),
            ('Validation', val_y_binned),
            ('Test', test_y_binned),
        )
    ],
    ignore_index=True,
)

# Turn bucket indices into an ordered categorical in one vectorised step.
# Every index produced by bucketize is below len(bin_labels), so the former
# per-row fallback label could never be reached and has been dropped.
df_binned['Label'] = pd.Categorical.from_codes(
    df_binned['Label'].astype(int).to_numpy(),
    categories=bin_labels,
    ordered=True,
)

# Plot
plt.figure(figsize=(12, 6))
sns.histplot(data=df_binned, x='Label', hue='Split', multiple='stack', shrink=0.8, discrete=True)
plt.xticks(rotation=45, ha='right')
plt.title('Distribution of Binned AADT Labels Across Splits')
plt.xlabel('AADT Bin Range')
plt.ylabel('Count')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Side-by-side histograms of the y > 0 labels for each split.
# train_y / val_y / test_y are the NumPy arrays produced by the binning cells
# above.

# Create individual DataFrames for each split
df_train = pd.DataFrame({'Split': 'Train', 'Label': train_y})
df_val = pd.DataFrame({'Split': 'Validation', 'Label': val_y})
df_test = pd.DataFrame({'Split': 'Test', 'Label': test_y})

# One panel per split on a shared y-axis.
fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)

panels = (
    ('Train Set', df_train),
    ('Validation Set', df_val),
    ('Test Set', df_test),
)
for ax, (panel_title, split_df) in zip(axes, panels):
    sns.histplot(data=split_df, x='Label', bins=50, element='step', ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('AADT Value')
    ax.grid(True)

# histplot is called with its default statistic, which plots raw counts, so
# the axis is labelled 'Count' (it previously read 'Density'). Pass
# stat='density' to histplot instead if normalised panels are wanted.
axes[0].set_ylabel('Count')

plt.suptitle('Distribution of AADT Labels Across Splits (y > 0)', fontsize=16)
# Leave headroom at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()
