# gnn

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Dataset, Data
from torch_geometric.nn import SAGEConv, BatchNorm
from torch_geometric.loader import DataLoader
from torch_geometric.nn.aggr import AttentionalAggregation
import pandas as pd
import numpy as np
from sklearn.metrics import root_mean_squared_error, r2_score
from tqdm import trange

# Saved results of the earlier GNN run; the evaluation loop below reads the
# per-fold index lists from meta['train_idxes'] and meta['test_idxes'].
# weights_only=False makes torch.load unpickle arbitrary Python objects, so
# this file must come from a trusted source.
meta = torch.load('../results/gnn_old/results.pt', weights_only=False)

# One row per sample: 'inh_<kinase>' feature columns plus a 'viability' target.
data = pd.read_parquet('../results/kis_viability.parquet')

# Header-less two-column edge list (source kinase, target kinase).
# NOTE(review): read_csv splits on ',' by default -- confirm the edgelist file
# is comma-delimited, otherwise 'target' will be all NaN and no edge survives.
edges_df = pd.read_csv('../results/kinome.edgelist', names=['source', 'target'])

# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

class KinomeDataset(Dataset):
    """Graph dataset in which every DataFrame row becomes one ``Data`` sample.

    All samples share the same ``edge_index``; only the node features (the
    row's values in ``feature_cols``) and the target (the row's ``viability``
    value) differ from sample to sample.

    Args:
        df: Table with one row per sample. Must contain every column listed
            in ``feature_cols`` and a ``'viability'`` column.
        edge_index: Edge tensor attached unchanged to every sample.
        feature_cols: Column names whose values form the node features, one
            node per column, in this order.
    """

    def __init__(self, df, edge_index, feature_cols):
        super().__init__()
        # NOTE(review): presumably resets the attribute the PyG base class
        # uses to track index subsets -- confirm it is still required.
        self._indices = None
        self.df = df
        self.edge_index = edge_index
        self.feature_cols = feature_cols

    def len(self):
        """Return the number of samples (rows of the DataFrame)."""
        return len(self.df)

    def get(self, idx):
        """Build the graph sample for the row at positional index ``idx``."""
        sample = self.df.iloc[idx]

        # One scalar feature per node: a column vector of shape [num_nodes, 1].
        node_values = sample[self.feature_cols].to_numpy(dtype=np.float32, copy=False)
        node_feats = torch.from_numpy(node_values).unsqueeze(1)

        # Scalar (0-dim) regression target.
        target = torch.tensor(sample['viability'], dtype=torch.float32)

        return Data(x=node_feats, edge_index=self.edge_index, y=target)

    # Indexing (ds[i]) calls get() directly instead of the inherited
    # __getitem__ of the base class.
    __getitem__ = get

class KinomeGNN(torch.nn.Module):
    """Two-layer GraphSAGE regressor with attention pooling.

    Each node carries a single input feature. Two SAGEConv stages (each
    followed by batch norm, ReLU and dropout) produce ``hidden``-dimensional
    node embeddings, which are pooled per graph by an attentional aggregation
    and mapped to one output value by a linear layer.

    Args:
        hidden: Width of the node embeddings and of the pooled graph vector.
        p_drop: Dropout probability applied after each convolution stage.
    """

    def __init__(self, hidden=32, p_drop=0.4):
        super().__init__()
        # Attribute names double as state_dict keys, so they must stay stable
        # for previously saved checkpoints to load.
        self.conv1 = SAGEConv(1, hidden)
        self.bn1 = BatchNorm(hidden)
        self.conv2 = SAGEConv(hidden, hidden)
        self.bn2 = BatchNorm(hidden)
        self.drop = torch.nn.Dropout(p_drop)
        # The gate network scores each node with a single value.
        self.pool = AttentionalAggregation(torch.nn.Linear(hidden, 1))
        self.lin = torch.nn.Linear(hidden, 1)

    def _stage(self, conv, norm, h, edge_index):
        """Apply one conv -> batch norm -> ReLU -> dropout stage."""
        h = conv(h, edge_index)
        h = norm(h)
        h = F.relu(h)
        return self.drop(h)

    def forward(self, x, edge_index, batch):
        """Return the prediction for every graph in the batch.

        Args:
            x: Node features (one feature per node).
            edge_index: Edge tensor of the batched graph.
            batch: Assignment of each node to its graph, passed to the pooling.

        Returns:
            Output of the final linear layer with dimension 1 squeezed out.
        """
        h = self._stage(self.conv1, self.bn1, x, edge_index)
        h = self._stage(self.conv2, self.bn2, h, edge_index)
        graph_vec = self.pool(h, batch)
        return self.lin(graph_vec).squeeze(1)

def load_and_process_data(edge_df, parquet_df):
    """Build the kinase graph that matches the feature columns of a table.

    Despite its name this function performs no I/O; it only derives the graph
    structure from the two DataFrames it is given.

    Every column of ``parquet_df`` named ``inh_<kinase>`` defines one node,
    numbered in column order. All such kinases are kept as nodes, including
    those that end up without any edge. An edge is kept only when both of its
    endpoints are among these kinases.

    Args:
        edge_df: DataFrame with ``'source'`` and ``'target'`` columns holding
            kinase names.
        parquet_df: DataFrame whose ``inh_``-prefixed columns name the kinases.

    Returns:
        A 4-tuple ``(data, edge_index, kinase_to_idx, kinases)``:
        ``data`` is ``parquet_df`` itself (not a copy), ``edge_index`` is a
        ``torch.long`` tensor with one row of source indices and one row of
        target indices, ``kinase_to_idx`` maps kinase name to node index, and
        ``kinases`` lists the kinase names in node order.
    """
    # Strip the 4-character 'inh_' prefix to recover the kinase names.
    kinases = [name[4:] for name in parquet_df.columns if name.startswith('inh_')]
    kinase_to_idx = {kinase: position for position, kinase in enumerate(kinases)}

    # Keep only the edges whose two endpoints are both measured kinases.
    source_known = edge_df['source'].isin(kinases)
    target_known = edge_df['target'].isin(kinases)
    kept_edges = edge_df[source_known & target_known]

    # Translate the kinase names of each kept edge into node indices.
    source_ids, target_ids = [], []
    for source, target in zip(kept_edges['source'], kept_edges['target']):
        source_ids.append(kinase_to_idx[source])
        target_ids.append(kinase_to_idx[target])
    edge_index = torch.tensor([source_ids, target_ids], dtype=torch.long)

    return parquet_df, edge_index, kinase_to_idx, kinases

# Per-fold test metrics of the saved GNN checkpoints, in fold order.
gnn_r2 = []
gnn_rmse = []

# The graph and the feature column list do not depend on the fold, so they are
# built once here instead of being recomputed on every iteration.
data, edge_index, kinase_to_idx, measured_kinases = load_and_process_data(edges_df, data)

# Feature columns for the kinases, in node order.
feature_cols = [f'inh_{k}' for k in measured_kinases]

for fold in trange(10):

    # model
    model = KinomeGNN(hidden=32, p_drop=0.4).to(device)
    # map_location moves the stored tensors onto the selected device, so a
    # checkpoint saved on a GPU still loads when only the CPU is available.
    model.load_state_dict(
        torch.load(f'../results/gnn_old/best_fold{fold+1}.pt', map_location=device)
    )

    # data: positional row indices of this fold's train/test split
    train_idx = meta['train_idxes'][fold]
    test_idx = meta['test_idxes'][fold]

    # split + standardize
    train_df = data.iloc[train_idx].reset_index(drop=True)
    test_df  = data.iloc[test_idx].reset_index(drop=True)

    # Standardization statistics come from the training split only.
    mu_feat  = train_df[feature_cols].mean()
    # A zero standard deviation is replaced by 1 to avoid dividing by zero.
    sd_feat  = train_df[feature_cols].std().replace(0,1)
    y_mean   = train_df['viability'].mean()

    def prep(df):
        """Return a copy of df with standardized features and a centered target."""
        df = df.copy()
        df[feature_cols] = (df[feature_cols] - mu_feat) / sd_feat
        df['viability']  = df['viability'] - y_mean
        return df

    test_df = prep(test_df)
    test_ds  = KinomeDataset(test_df,  edge_index, feature_cols)
    test_loader  = DataLoader(test_ds,  batch_size=32, shuffle=False)

    # eval
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in test_loader:
            batch = batch.to(device)
            out   = model(batch.x, batch.edge_index, batch.batch)
            y_pred.append(out.cpu().numpy())
            y_true.append(batch.y.cpu().numpy())

    # Undo the target centering so the metrics are on the original scale.
    y_true = np.concatenate(y_true) + y_mean
    y_pred = np.concatenate(y_pred) + y_mean
    gnn_rmse.append(root_mean_squared_error(y_true, y_pred))
    gnn_r2.append(r2_score(y_true, y_pred))

NameError: name 'drug2idx' is not defined

In [None]:
# Store the per-fold GNN metrics in meta as NumPy arrays.
for metric_name, fold_scores in (('gnn_rmse', gnn_rmse), ('gnn_r2', gnn_r2)):
    meta[metric_name] = np.array(fold_scores)

# Write meta back to the file it was loaded from (overwrites it in place).
torch.save(meta, '../results/gnn_old/results.pt')