In [None]:
import sys
sys.path.append("../scripts")

import os, torch
from sklearn.model_selection import train_test_split
import pickle
import torch_geometric.transforms as T
import numpy as np
from torch_geometric.nn.models import Node2Vec
from torch_geometric.data import DataLoader
from torch_geometric.nn import MessagePassing
from torch_geometric.data import Data
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv
# Keep a reference to the current GCNConv.propagate on the class itself under
# the name `_orig_propagate`.
# NOTE(review): presumably this lets a later monkey-patch of `propagate` call
# or restore the original implementation — confirm against the code that reads
# `_orig_propagate`.
GCNConv._orig_propagate = GCNConv.propagate
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from torch_geometric.explain import GNNExplainer, Explainer
from models import *
from tg_functions import *
from bike_functions import *
import torch_geometric as tg
import pandas as pd
import seaborn as sns

graph_num = 17  # Replace with your graph number

# Load the pickled graph object for the selected graph number.
# NOTE: pickle can execute arbitrary code while loading; only open files that
# come from a trusted source.
with open(f'../data/graphs/{graph_num}/linegraph_tg.pkl', 'rb') as f:
    data = pickle.load(f)

# Re-store the edge index, node features and targets in contiguous memory.
data.edge_index, data.x, data.y = (
    data.edge_index.contiguous(),
    data.x.contiguous(),
    data.y.contiguous(),
)

# Stratified split is currently disabled:
# data = stratified_split(data = data , random_seed = random_seed)

# Undirected NetworkX copy of the graph; its node degrees feed the histogram.
H = tg.utils.to_networkx(data, to_undirected=True)

# Degree Distribution Plot
node_degrees = [degree for _, degree in H.degree()]
plt.figure(figsize=(10, 6))
sns.histplot(node_degrees, discrete=True)
plt.title('Degree Distribution')
plt.xlabel('Degree')
plt.ylabel('Frequency')
plt.grid()
plt.show()

# One-row summary table of basic graph statistics.
# NOTE(review): 'Average Degree' is num_edges / num_nodes of `data`, not the
# mean degree of the undirected graph H — the two agree only if `data` stores
# every undirected edge in both directions; confirm.
df = pd.DataFrame({
    'Graph Number': [graph_num],
    'Number of Nodes': [data.num_nodes],
    'Number of Edges': [data.num_edges],
    'Average Degree': [data.num_edges / data.num_nodes],
})
# Read feature table


In [None]:
# Load the per-node feature table and order its columns alphabetically so the
# subplot grid below always has the same layout.
df2 = pd.read_csv(f'../data/graphs/{graph_num}/node_features.csv')
df2 = df2.reindex(sorted(df2.columns), axis=1)

n_features = len(df2.columns)
print(n_features)

# Roughly square grid: floor(sqrt(n)) columns (never fewer than one) and as
# many rows as needed to fit every feature.
sqrt_num_features = int(np.sqrt(n_features))
n_cols = max(1, sqrt_num_features)
n_rows = int(np.ceil(n_features / n_cols))

# Nothing to draw for an empty table (plt.subplots rejects a 0-row grid).
if n_features:
    # squeeze=False keeps `ax` a 2-D array even when the grid has a single row
    # or column, so ax[row, col_idx] is valid for any number of features.
    fig, ax = plt.subplots(
        n_rows, n_cols, figsize=(15, 15), constrained_layout=True,
        squeeze=False)
    for i, col in enumerate(df2.columns):
        row, col_idx = divmod(i, n_cols)
        # Zeros are replaced by NaN (in df2 itself) so that they are left out
        # of the histogram for this feature.
        df2[col] = df2[col].replace(0, np.nan)
        sns.histplot(df2[col], ax=ax[row, col_idx], alpha=0.7, discrete=True)
        ax[row, col_idx].set_title(col)

    # The grid can have more cells than features; hide the empty trailing axes.
    for unused_ax in ax.ravel()[n_features:]:
        unused_ax.set_visible(False)

    # constrained_layout already reserves room for the suptitle. Calling
    # tight_layout() here would replace that layout engine (with a warning)
    # and does not account for the suptitle, so it is intentionally omitted.
    fig.suptitle('Node Feature Distributions', fontsize=16)
    plt.show()
