In [None]:
import sys
sys.path.append("../scripts")

import os, torch
from sklearn.model_selection import train_test_split
import pickle
import torch_geometric.transforms as T
import numpy as np
from torch_geometric.nn.models import Node2Vec
from torch_geometric.data import DataLoader
from torch_geometric.nn import MessagePassing
from torch_geometric.data import Data
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv
GCNConv._orig_propagate = GCNConv.propagate
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from torch_geometric.explain import GNNExplainer, Explainer
from models import *
from tg_functions import *
from bike_functions import *

# --- Run configuration -------------------------------------------------------
# Settings read by the rest of this cell and by later cells.
dropout_p = 0.5
use_gat = True

# Bin values are read from the BINS environment variable as whitespace-separated
# integers; when BINS is unset the default is the single value 3000.
# (A 9-value default, "400 800 1300 2100 3000 3700 4700 7020 9660", used to be
# assigned first and was immediately overwritten, so it never took effect.)
# split() without an argument tolerates repeated / leading / trailing whitespace,
# which split(' ') turned into int('') failures.
bins = [int(i) for i in os.getenv("BINS", "3000").split()]
if not bins:
    # Keep failing loudly (and with the same exception type) on an empty BINS value.
    raise ValueError("BINS must contain at least one integer")

bins = torch.tensor(bins, device='cuda' if torch.cuda.is_available() else 'cpu')
hidden_c = 100
num_layers = 0
random_seed = 100
nh = 1

graph_num = 29  # Replace with your graph number
model_name = 'upbeat-firebrand-246' # Replace with your model name
weight_prefix = 'best_accuracy'  # Replace with your weight prefix


# Pick the compute device once, then report which one is in use.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print(f"Using CUDA device: {torch.cuda.get_device_name(0)}", flush = True)
else:
    print("Using CPU", flush = True)

# device = 'cpu'

# Unpickle the object stored for the configured graph number.
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# point this at files you produced or otherwise trust.
with open(f'../data/graphs/{graph_num}/linegraph_tg.pkl', 'rb') as graph_file:
    data = pickle.load(graph_file)

# Replace each tensor with its contiguous version (edge_index, then x, then y).
for _field in ('edge_index', 'x', 'y'):
    setattr(data, _field, getattr(data, _field).contiguous())

# Split using the shared seed; stratified_split comes from one of the star-imported helper modules.
data = stratified_split(data=data, random_seed=random_seed)

# --- Model Instantiation ---
# Build an instance of the architecture selected by `use_gat`
# (truthy -> GAT, falsy -> GCN; the string 'MLP' additionally builds an MLP).
# NOTE(review): this instance is replaced by the torch.load() result below. It is
# kept only in case the constructors have side effects later cells rely on —
# confirm and drop if they do not.
if use_gat:
    model = GAT(hidden_c, num_layers, random_seed, bins, data, nh).to(device)
else:
    model = GCN(hidden_c, num_layers, random_seed, bins, data).to(device)

if use_gat == 'MLP':
    model = MLP(hidden_c, num_layers, random_seed, bins, data, nh).to(device)

# Load the full pickled model object (whatever class was saved under `model_name`),
# then overwrite its parameters with the checkpoint selected by `weight_prefix`.
# NOTE(review): torch.load unpickles arbitrary objects — only load trusted files.
# Newer PyTorch releases default to weights_only=True, which rejects full-model
# pickles; if this line fails there, pass weights_only=False explicitly.
model = torch.load(f'../data/graphs/{graph_num}/models/{model_name}.pt', map_location=device)
model = model.to(device)

model.load_state_dict(torch.load(f'../data/graphs/{graph_num}/models/{model_name}_{weight_prefix}.pt', map_location=device))

# NOTE(review): the contiguity pass and the split below repeat what was already done
# right after loading `data`. They are kept because stratified_split is defined
# elsewhere and may not be idempotent — confirm, then remove one of the two passes.
data.edge_index = data.edge_index.contiguous()
data.x = data.x.contiguous()
data.y = data.y.contiguous()
print(data.x.shape, data.edge_index.shape, data.y.shape, flush = True)
data = stratified_split(data, random_seed=random_seed)

# Single loss object (it used to be constructed twice in this cell).
criterion = torch.nn.CrossEntropyLoss()


In [None]:
import json
import torch
from torch_geometric.explain import Explanation

def load_explanation_from_json(json_path, device='cpu'):
    """
    Load an explanation from a JSON file, turning numeric lists into torch tensors.

    Every list found in the JSON document (at any nesting depth inside dicts) is
    converted with ``torch.tensor`` and placed on ``device``. Lists that cannot be
    represented as a tensor (e.g. lists of strings, ragged lists, lists of dicts)
    are kept as Python lists, with their elements converted recursively, instead of
    aborting the whole load. Scalars, strings and ``None`` are passed through.

    Parameters:
    - json_path: Path to the explanation JSON file.
    - device: The device to place tensors on ('cpu' or 'cuda').

    Returns:
    - explanation: An Explanation object built from the converted top-level keys.

    Raises:
    - OSError if the file cannot be opened, json.JSONDecodeError if it is not valid
      JSON, and whatever ``Tensor.to`` raises when ``device`` is unavailable.
    """
    with open(json_path, 'r') as f:
        explanation_dict = json.load(f)

    def convert(item):
        """Recursively convert JSON lists to tensors, leaving other values as-is."""
        if isinstance(item, dict):
            return {k: convert(v) for k, v in item.items()}
        if isinstance(item, list):
            # Build on CPU first so that only "this is not tensor-shaped data"
            # errors are caught here; device problems surface from .to() below.
            try:
                tensor = torch.tensor(item)
            except (TypeError, ValueError, RuntimeError):
                return [convert(v) for v in item]
            return tensor.to(device)
        return item

    explanation_converted = {k: convert(v) for k, v in explanation_dict.items()}
    explanation = Explanation(**explanation_converted)
    return explanation

# Location of the stored graph-level explanation for the configured graph and model.
json_path = (
    f'../data/graphs/{graph_num}/explanations/{model_name}/graph_level_explanation.json'
)
# Read it back, creating tensors on the selected device.
explanation = load_explanation_from_json(json_path=json_path, device=device)
print("✅ Explanation loaded and moved to device.")

In [None]:
# # get 20 most important features
# import pandas as pd
# import seaborn as sns

# scores = explanation.node_mask.sum(dim=0)
# scores = pd.DataFrame(scores.cpu().numpy(), index=data.feat_names, columns=['importance'])
# scores = scores.sort_values(by='importance', ascending=False).head(20)

# plt.figure(figsize=(8, 8))
# sns.barplot(y=scores.index, x=scores['importance'])
# plt.title('Top 20 Important Features')
# plt.xlabel('Importance Score')
# plt.ylabel('Features')
# plt.tight_layout()
# plt.show()
# import pandas as pd
# per_bin_gradients = pd.read_csv(f'../data/graphs/{graph_num}/explanations/{model_name}/per_bin_gradients_all_nodes.csv')
# plt.figure(figsize=(8, 8))
# sns.heatmap(per_bin_gradients.groupby('bin_index').mean().drop(columns=['node_index'])[scores.index].T, annot=True, cmap='coolwarm', cbar_kws={'label': 'Gradient'}, vmin=-0.1, vmax=0.1)
# plt.title('Average Gradient per Bin for Top 20 Features')
# plt.xlabel('Bin Index')
# plt.ylabel('Features')
# plt.tight_layout()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# === Top 20 Important Features (from explanation) ===
# Sum the node mask over dim 0, label the sums with the feature names and keep
# the 20 largest.
scores = (
    pd.DataFrame(
        explanation.node_mask.sum(dim=0).cpu().numpy(),
        index=data.feat_names,
        columns=['importance'],
    )
    .sort_values(by='importance', ascending=False)
    .head(20)
)

# === Per-Bin Gradients (load CSV) ===
# Average every column per bin_index, then keep only the top-20 feature columns.
per_bin_gradients = pd.read_csv(f'../data/graphs/{graph_num}/explanations/{model_name}/per_bin_gradients_all_nodes.csv')
bin_means = per_bin_gradients.groupby('bin_index').mean()
gradients_mean = bin_means.drop(columns=['node_index'])[scores.index]

# === Plot ===
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 8))
ax_bar, ax_heat = axes

# Left panel: bar plot of the top 20 importance scores.
sns.barplot(x=scores['importance'], y=scores.index, ax=ax_bar)
ax_bar.set(
    title='Top 20 Important Features',
    xlabel='Importance Score',
    ylabel='Features',
)

# Right panel: heatmap of the mean gradient per bin (features as rows).
heatmap_options = dict(
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    vmin=-0.1,
    vmax=0.1,
    cbar_kws={'label': 'Gradient'},
)
sns.heatmap(gradients_mean.T, ax=ax_heat, **heatmap_options)
ax_heat.set(
    title='Average Gradient per Bin for Top 20 Features',
    xlabel='Bin Index',
    ylabel='Features',
)

plt.tight_layout()
plt.show()


In [None]:
# calculate the spearman correlation between features and values

from scipy.stats import spearmanr

# Spearman rank correlation between each feature column and the target,
# restricted to the rows where the target is strictly positive.
mask = data.y > 0
# The target vector does not depend on the feature, so compute it once.
target_values = data.y[mask].cpu().numpy()

spearmans = []
# enumerate() gives the column position directly; looking it up with
# feat_names.index(feat) was quadratic and returned the first match for
# repeated feature names.
for feat_index, feat in enumerate(data.feat_names):
    feat_values = data.x[mask, feat_index].cpu().numpy()

    # A rank correlation needs at least two observations.
    if len(feat_values) > 1 and len(target_values) > 1:
        corr, _pvalue = spearmanr(feat_values, target_values)
        spearmans.append((feat, corr))

spearman_df = pd.DataFrame(spearmans, columns=['Feature', 'Spearman Correlation'])

# filter to top and bottom 10 features
top_bottom = pd.concat([
    spearman_df.nlargest(10, 'Spearman Correlation'),
    spearman_df.nsmallest(10, 'Spearman Correlation')
])
# With fewer than 20 features the two selections overlap; keep each row once
# so the same feature is not plotted twice.
top_bottom = top_bottom[~top_bottom.index.duplicated()]
top_bottom = top_bottom.sort_values(by='Spearman Correlation', ascending=False)

plt.figure(figsize=(6, 4))
sns.barplot(x='Spearman Correlation', y='Feature', data=top_bottom)
plt.title('Top and Bottom 10 Features by Spearman Correlation')
plt.xlabel('Spearman Correlation')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()