### USE PYG ENVIRONMENT!!!!

In [None]:
# !pip install torch_geometric

# # Optional dependencies:
# !pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.5.0+cu124.html

In [None]:

# # ### USE PYG ENVIRONMENT!!!!
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 --force-reinstall

# # %%
# # Install required packages
# import os, torch

# os.environ['TORCH'] = torch.__version__
# !pip install pyg-lib -f https://data.pyg.org/whl/nightly/torch-${TORCH}.html
# !pip install git+https://github.com/pyg-team/pytorch_geometric.git

# # !pip install sentence_transformers
# # !pip3 install fuzzywuzzy[speedup]
# # !pip install captum
# !pip install torch-sparse
# !pip install torch-scatter


In [None]:
# !conda activate pygenv 
# !pip3 install torch==2.5.0+cu124 --index-url https://download.pytorch.org/whl/cu124 --force-reinstall
# !pip install pyg-lib -f https://data.pyg.org/whl/nightly/torch-2.5.0+cu124.html --force-reinstall


In [None]:
import os, torch

# Pick the compute device once: the CUDA GPU when torch can see one, else the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
# The GPU name is only queried when CUDA is actually available.
print(f"Using CUDA device: {torch.cuda.get_device_name(0)}" if use_cuda else "Using CPU")


In [None]:
# Single seed shared by the notebook (also read by stratified_split below).
random_seed = 80085
# Apply the same seed through each of torch's seeding entry points, in the
# original order: global, current CUDA device, all CUDA devices.
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(random_seed)

In [None]:
import torch_geometric.transforms as T
import numpy as np
from torch_geometric.nn.models import Node2Vec


### load graph data 
import pickle

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load graph pickles that were produced by this project.
with open('../data/graphs/linegraph_tg.pkl', 'rb') as f:
    data = pickle.load(f)
# Keep a NumPy copy of the original node features; data.x is replaced by the
# Node2Vec embeddings in a later cell.
# NOTE(review): the name suggests these are betweenness-centrality values - confirm.
ebc = data.x.detach().cpu().numpy()

# Rebuild a Node2Vec module over the same edge_index so the saved weights can be
# loaded into it; load_state_dict requires the parameter shapes to match the
# checkpoint. `model` is later rebound to the GCN, so this object is only used
# to recover the embedding table.
model = Node2Vec(data.edge_index, embedding_dim=64, walk_length=20,
                    context_size=10, walks_per_node=10,
                    num_negative_samples=1, p=2.0, q=0.5, sparse=True)

# map_location='cpu' lets a checkpoint that was saved from a GPU be loaded on a
# machine without CUDA; the embeddings are converted to NumPy on the CPU anyway.
model.load_state_dict(
    torch.load('../data/node2vec_2.pt', weights_only=True, map_location='cpu')
)

# One embedding row per node of the line graph, as a NumPy array.
n2v_weights = model.embedding.weight.detach().cpu().numpy()


In [None]:
import torch
from sklearn.model_selection import train_test_split

def stratified_split(data, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    """Attach stratified train/val/test boolean node masks to ``data``.

    Nodes are divided into two strata, ``y > 0`` and everything else. Each
    stratum is split independently: ``train_ratio`` of it goes to training and
    the remainder is divided between validation and test in the proportion
    ``val_ratio : test_ratio``. Both splits use the module-level
    ``random_seed`` so the result is reproducible.

    Args:
        data: Graph object exposing ``y`` and ``num_nodes``. It is modified in
            place. Assumes one target value per node (``y`` of shape ``(N,)``
            or ``(N, 1)``) - TODO confirm for other target layouts.
        train_ratio: Fraction of each stratum assigned to training.
        val_ratio: Relative weight of the validation share of the remainder.
        test_ratio: Relative weight of the test share of the remainder.

    Returns:
        The same ``data`` object with ``train_mask``, ``val_mask`` and
        ``test_mask`` (bool tensors of length ``num_nodes``) set.

    Raises:
        ValueError: Propagated from ``train_test_split`` when a stratum is too
            small to be split with the requested ratios.
    """
    # Flatten the targets so an (N, 1) column behaves like an (N,) vector.
    # Without this, nonzero() returns (row, col) pairs and the all-zero column
    # entries would wrongly mark node 0 in every mask.
    labels = data.y.reshape(-1)
    positive_mask = labels > 0

    # as_tuple=True always yields 1-D index tensors, even for a single match
    # (squeeze() would collapse that case to a 0-d tensor).
    positive_indices = positive_mask.nonzero(as_tuple=True)[0]
    negative_indices = (~positive_mask).nonzero(as_tuple=True)[0]

    # Share of the non-training remainder that goes to the test set.
    holdout_test_share = test_ratio / (val_ratio + test_ratio)

    def _split_stratum(indices):
        """Split one stratum's node indices into (train, val, test) index tensors."""
        train_part, temp_part = train_test_split(
            indices, train_size=train_ratio, random_state=random_seed
        )
        val_part, test_part = train_test_split(
            temp_part, test_size=holdout_test_share, random_state=random_seed
        )
        # Normalise whatever container sklearn hands back to long index tensors.
        return tuple(
            torch.as_tensor(part, dtype=torch.long)
            for part in (train_part, val_part, test_part)
        )

    def _mask_from(*index_groups):
        """Build a bool mask over all nodes that is True at the given indices."""
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[torch.cat(index_groups)] = True
        return mask

    pos_train_idx, pos_val_idx, pos_test_idx = _split_stratum(positive_indices)
    neg_train_idx, neg_val_idx, neg_test_idx = _split_stratum(negative_indices)

    data.train_mask = _mask_from(pos_train_idx, neg_train_idx)
    data.val_mask = _mask_from(pos_val_idx, neg_val_idx)
    data.test_mask = _mask_from(pos_test_idx, neg_test_idx)

    return data



In [None]:
# Replace the node features with the Node2Vec embeddings (as float32) and make
# sure the tensors handed to the model are stored contiguously.
data.x = torch.tensor(n2v_weights, dtype=torch.float32).contiguous()
for field in ('edge_index', 'y'):
    setattr(data, field, getattr(data, field).contiguous())

import os, torch

from torch_geometric.data import DataLoader
import torch_geometric.transforms as T
import numpy as np

from torch_geometric.loader import NeighborLoader

# Attach train/val/test boolean masks to the graph, stratified on y > 0.
data = stratified_split(data)


# Set the input nodes for the loader
# loader = NeighborLoader(data, batch_size=batch_size,
#                         shuffle=True, 
#                         num_neighbors=[-1]*100,
#                         input_nodes=train_nodes)

import torch
from torch_geometric.nn import MessagePassing
from torch_geometric.data import Data
import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    """Two GCNConv layers followed by a linear head emitting one value per node.

    The input width is read from the module-level ``data.num_features``, so the
    graph must be loaded before the model is instantiated.
    """

    def __init__(self, hidden_channels):
        """Create the layers; ``hidden_channels`` is the width of both conv layers."""
        super().__init__()
        # Re-seed right before the layers are created so their initial weights
        # are the same on every instantiation.
        torch.manual_seed(80085)
        in_channels = data.num_features
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, 1)

    def forward(self, x, edge_index):
        """Return the linear head's output for node features ``x`` on ``edge_index``."""
        hidden = F.relu(self.conv1(x, edge_index))
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        hidden = self.conv2(hidden, edge_index)
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        return self.lin(hidden)

model = GCN(hidden_channels=64).to(device)  # Move model to device

# Move the tensors the model always needs.
data.x = data.x.to(device)
data.edge_index = data.edge_index.to(device)

# Move the targets and all three split masks when they are present. Keeping
# val_mask / test_mask on the CPU while y lives on the GPU breaks later cells
# that combine them (a CPU index tensor cannot be indexed by a CUDA mask).
for attr in ('y', 'train_mask', 'val_mask', 'test_mask'):
    value = getattr(data, attr, None)
    # Guard against None as well as a missing attribute.
    if value is not None:
        setattr(data, attr, value.to(device))

In [None]:
# torch.save(model.state_dict(), 'gcn.pt')

In [None]:
# model.load_state_dict(torch.load('gcn.pt', weights_only=True))

In [None]:
# Mean-squared-error objective, used by both train() and test().
criterion = torch.nn.MSELoss()
# Adam over all GCN parameters, with weight decay passed to the optimizer.
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=1e-5,
    weight_decay=5e-4,
)

In [None]:
def train():
    """Run one full-batch optimisation step and return the training loss.

    Uses the module-level ``model``, ``data``, ``optimizer`` and ``criterion``.
    The loss is computed on the nodes selected by ``data.train_mask`` only.

    Returns:
        A 0-d tensor holding the training loss for this step (detached from
        the autograd graph).
    """
    model.train()
    optimizer.zero_grad()  # Clear gradients from the previous step.
    out = model(data.x, data.edge_index)  # Single forward pass over the whole graph.

    # The model emits shape (n, 1); flatten prediction and target to (n,) so
    # MSELoss compares them element-wise instead of broadcasting an (n, 1)
    # input against an (n,) target into an (n, n) matrix.
    pred = out[data.train_mask].reshape(-1)
    # Match the prediction dtype so a float64 / integer target does not error.
    target = data.y[data.train_mask].reshape(-1).to(pred.dtype)

    loss = criterion(pred, target)
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss.detach()

In [None]:
# Full-batch training for epochs 1 .. 999,999, reporting the loss on every
# 1000th epoch.
for epoch in range(1, 1_000_000):
    loss = train()
    if epoch % 1000:
        continue
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

In [None]:
# torch.save(model.state_dict(), 'gcn.pt')

In [None]:
def test():
    """Evaluate the model on the validation nodes.

    Despite its name, this function scores the nodes selected by
    ``data.val_mask``, not the test split. Uses the module-level ``model``,
    ``data`` and ``criterion``.

    Returns:
        A ``(loss, out)`` tuple: the validation loss as a 0-d tensor and the
        model output for every node in the graph.
    """
    model.eval()
    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        out = model(data.x, data.edge_index)
        # Flatten both sides so MSELoss compares them element-wise rather than
        # broadcasting an (n, 1) prediction against an (n,) target.
        pred = out[data.val_mask].reshape(-1)
        target = data.y[data.val_mask].reshape(-1).to(pred.dtype)
        loss = criterion(pred, target)  # Loss on the validation nodes only.
    return loss, out

loss, out = test()

from sklearn.metrics import r2_score

## get indices of val nodes with y > 0
# Combine the two conditions as masks on the same device as y. Indexing a CPU
# index tensor with a mask that lives on the GPU raises a device-mismatch error.
val_positive = data.val_mask.to(data.y.device) & (data.y.reshape(-1) > 0)
val_idx = val_positive.nonzero(as_tuple=True)[0]

# R^2 of the predictions on validation nodes with a positive target; both
# arrays are flattened to 1-D before scoring.
y_true = data.y.reshape(-1)[val_idx].detach().cpu().numpy()
y_pred = out.reshape(-1)[val_idx].detach().cpu().numpy()
print(r2_score(y_true, y_pred))

In [None]:

# seaborn is imported here because this cell uses `sns` before the later cell
# that imports it has run.
import seaborn as sns

print(f'Loss: {loss:.4f}')

# Validation targets, restricted to the strictly positive ones.
y_over_0 = data.y[data.val_mask].detach().cpu().numpy()
y_over_0 = y_over_0[y_over_0 > 0]

# Validation predictions, restricted to those above the smallest positive
# target so the histogram covers the same value range as the targets.
out_over_0 = out[data.val_mask].detach().cpu().numpy()
out_over_0 = out_over_0[out_over_0 > y_over_0.min()]
sns.histplot(out_over_0)

In [None]:
# Histogram of the positive validation targets (y_over_0 comes from the previous cell).
sns.histplot(y_over_0)

In [None]:
import seaborn as sns
# Targets and predictions for every node with a positive target, across all splits.
positive_nodes = data.y > 0
non_zero = data.y[positive_nodes]
# Forward pass over the whole graph, then keep only the positive-target nodes.
non_zero_pred = model(data.x, data.edge_index)[positive_nodes].detach().cpu().numpy()
sns.histplot(non_zero_pred)

In [None]:
# Histogram of the positive targets over all nodes (non_zero comes from the previous cell).
sns.histplot(non_zero.detach().cpu().numpy())

In [None]:
from sklearn.metrics import r2_score

def test_r2():
    """Return the R^2 score of the model's predictions on the ``test_mask`` nodes.

    Uses the module-level ``model`` and ``data``; the model is switched to
    evaluation mode before the forward pass.
    """
    model.eval()
    held_out = data.test_mask
    predictions = model(data.x, data.edge_index)[held_out]
    targets = data.y[held_out]
    return r2_score(
        targets.detach().cpu().numpy(),
        predictions.detach().cpu().numpy(),
    )

# test() returns a (loss, out) tuple; format only the loss, since a tuple does
# not accept the ':.4f' format spec.
print(f'Loss: {test()[0]:.4f}')

In [None]:
### node indices selected by the train and test masks, as NumPy arrays
train_indices = data.train_mask.nonzero(as_tuple=False).squeeze().detach().cpu().numpy()
test_indices = data.test_mask.nonzero(as_tuple=False).squeeze().detach().cpu().numpy()

### original features (ebc) and targets restricted to each split
ebc_train, ebc_test = ebc[train_indices], ebc[test_indices]
y_train = data.y[train_indices].detach().cpu().numpy()
y_test = data.y[test_indices].detach().cpu().numpy()


In [None]:
import networkx as nx

# Load the pickled graph whose nodes carry 'aadt' and 'bc' attributes.
with open('../data/graphs/linegraph_nx.pkl', 'rb') as f:
    graph = pickle.load(f)

# Keep only nodes with a positive 'aadt', then collect 'bc' as the feature and
# 'aadt' as the target. This rebinds `ebc` from the earlier cell.
counted = [attrs for _, attrs in graph.nodes(data=True) if attrs['aadt'] > 0]

ebc = np.array([attrs['bc'] for attrs in counted])
y = np.array([attrs['aadt'] for attrs in counted])


In [None]:
# Display the shapes of the train and test target arrays.
y_train.shape, y_test.shape

In [None]:
# Display the shapes of the train and test feature arrays.
# NOTE(review): X_train / X_test are first assigned in the following cell, so
# this cell raises NameError unless that cell has already been run.
X_train.shape, X_test.shape

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Baseline: ordinary least squares predicting y from the single feature ebc.
# The split is seeded with the notebook-wide random_seed so the baseline score
# is reproducible, like the stratified split used for the GCN.
X_train, X_test, y_train, y_test = train_test_split(
    ebc, y, test_size=0.2, random_state=random_seed
)

# reshape(-1, 1): scikit-learn expects a 2-D feature matrix (n_samples, n_features).
reg = LinearRegression().fit(X_train.reshape(-1, 1), y_train)
# For a regressor, score() is the R^2 on the given data.
print(reg.score(X_test.reshape(-1, 1), y_test))

# Same R^2 computed explicitly from the predictions; shown as the cell output.
r2_score(y_test, reg.predict(X_test.reshape(-1, 1)))


In [None]:
# def train():
#     model.train()
#     optimizer.zero_grad()  # Clear gradients
#     losses = []
#     first = True
#     for batch in loader:
#         batch = batch.to(device) # Move batch to device
#         out = model(batch.x, batch.edge_index)
#         loss = criterion(out[batch.train_mask], batch.y[batch.train_mask])
#         loss.backward()  # Backward pass.
#         optimizer.step()
#         losses.append(loss.item())
#     return torch.tensor(losses).mean().item()


# for epoch in range(1, 10000):
#     loss = train()
#     if epoch % 10 == 0:
#         print(f'Epoch: {epoch:03d}, Loss: {loss}', 'improvement:', prev_loss-loss)
#     prev_loss = loss