In [1]:
import os
import torch
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import pandas as pd
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, SAGEConv
from torch_geometric.utils import from_scipy_sparse_matrix
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [2]:
df = pd.read_csv("../../dataset/dataset.csv") 

# 2. Define your feature columns and label
small_columns = ['CHF_F30',
                 'HICHOLRP',
                 'INCONT',
                 'BKBONMOM',
                 'PREG',
                 'AGE',
                 'ETHNICNIH',
                 'F45CALC',
                 'F60ALCWK',
                 'F60CALC',]

outcome_column = 'ANYFX'

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# ===========================
# FULL GNN SCRIPT WITH BOTH GCN AND GRAPHSAGE, EARLY STOPPING, STRATIFIED SPLITS, KNN GRAPH, SAVING MODEL + EDGE INDEX
# ===========================

# Balance dataset by undersampling majority class
minority_class = df[outcome_column].value_counts().idxmin()
majority_class = df[outcome_column].value_counts().idxmax()

minority_df = df[df[outcome_column] == minority_class]
majority_df = df[df[outcome_column] == majority_class].sample(n=len(minority_df), random_state=42)

df_balanced = pd.concat([minority_df, majority_df]).sample(frac=1, random_state=42)  # shuffle

X = df_balanced[small_columns].values
y = df_balanced[outcome_column].values

# Stratified 60/20/20 split
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
for train_val_idx, test_idx in sss.split(X, y):
    X_temp, X_test = X[train_val_idx], X[test_idx]
    y_temp, y_test = y[train_val_idx], y[test_idx]

sss_val = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
for train_idx, val_idx in sss_val.split(X_temp, y_temp):
    X_train, X_val = X_temp[train_idx], X_temp[val_idx]
    y_train, y_val = y_temp[train_idx], y_temp[val_idx]

# ====================
# 2. KNN Graph Construction
# ====================

k = 5
knn = NearestNeighbors(n_neighbors=k)
knn.fit(X_train)
knn_graph = knn.kneighbors_graph(X_train, mode='connectivity')

edge_index, _ = from_scipy_sparse_matrix(csr_matrix(knn_graph))

# Save edge index for future use
os.makedirs('saved_models', exist_ok=True)
torch.save(edge_index, 'saved_models/edge_index.pt')

# Create PyG Data object
data = Data(x=torch.tensor(X_train, dtype=torch.float),
            edge_index=edge_index,
            y=torch.tensor(y_train, dtype=torch.long))

# ====================
# 3. Model Definitions
# ====================

class GCN(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.lin = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.3, training=self.training)
        x = F.relu(self.conv2(x, edge_index))
        x = F.dropout(x, p=0.3, training=self.training)
        x = self.lin(x)
        return x

class GraphSAGE(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(input_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        self.lin = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.3, training=self.training)
        x = F.relu(self.conv2(x, edge_index))
        x = F.dropout(x, p=0.3, training=self.training)
        x = self.lin(x)
        return x

# ====================
# 4. Early Stopping Class
# ====================

class EarlyStopping:
    def __init__(self, patience=30, verbose=False, delta=0):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.delta = delta

    def __call__(self, val_auc, model, path):
        score = val_auc

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(model, path)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(model, path)
            self.counter = 0

    def save_checkpoint(self, model, path):
        torch.save(model.state_dict(), path)

# ====================
# 5. Training and Evaluation Functions
# ====================

def train(model, optimizer, data, criterion):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = criterion(out, data.y)
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def evaluate(model, split):
    model.eval()
    if split == 'train':
        X_split, y_split = X_train, y_train
    elif split == 'val':
        X_split, y_split = X_val, y_val
    else:
        X_split, y_split = X_test, y_test

    edge_idx, _ = from_scipy_sparse_matrix(csr_matrix(knn.kneighbors_graph(X_split, mode='connectivity')))
    data_split = Data(x=torch.tensor(X_split, dtype=torch.float).to(device),
                      edge_index=edge_idx.to(device))

    logits = model(data_split.x, data_split.edge_index)
    preds = logits.argmax(dim=1).cpu().numpy()

    acc = accuracy_score(y_split, preds)
    auc = roc_auc_score(y_split, preds)
    cm = confusion_matrix(y_split, preds)
    tn, fp, fn, tp = cm.ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    return acc, sensitivity, specificity, auc

# ====================
# 6. Main Training Loop (select model)
# ====================

def run_training(model_type='gcn'):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    if model_type == 'gcn':
        model = GCN(input_dim=X.shape[1], hidden_dim=64, num_classes=2).to(device)
    elif model_type == 'sage':
        model = GraphSAGE(input_dim=X.shape[1], hidden_dim=64, num_classes=2).to(device)
    else:
        raise ValueError("Invalid model_type. Choose 'gcn' or 'sage'")

    data_on_device = data.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=10, verbose=True)
    criterion = nn.CrossEntropyLoss()

    save_path = f'saved_models/best_{model_type}_model.pt'
    early_stopping = EarlyStopping(patience=30, verbose=True)

    for epoch in range(1, 501):
        loss = train(model, optimizer, data_on_device, criterion)
        if epoch % 5 == 0:
            train_acc, _, _, _ = evaluate(model, 'train')
            val_acc, val_sens, val_spec, val_auc = evaluate(model, 'val')
            print(f'{model_type.upper()} Epoch {epoch:03d} | Loss: {loss:.4f} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f} | Val AUC: {val_auc:.4f}')

            scheduler.step(val_auc)
            early_stopping(val_auc, model, save_path)

            if early_stopping.early_stop:
                print("Early stopping triggered.")
                break

    model.load_state_dict(torch.load(save_path))
    test_acc, test_sens, test_spec, test_auc = evaluate(model, 'test')
    print(f'\n{model_type.upper()} Test Metrics:')
    print(f'Accuracy: {test_acc:.4f}')
    print(f'Sensitivity: {test_sens:.4f}')
    print(f'Specificity: {test_spec:.4f}')
    print(f'AUC: {test_auc:.4f}')





In [5]:
run_training(model_type='gcn')

RuntimeError: index 3834 is out of bounds for dimension 0 with size 1737

In [6]:
run_training(model_type='sage')

ValueError: Encountered invalid 'dim_size' (got '1737' but expected >= '5208')

# Try 2

In [4]:
import pandas as pd
import torch
from torch_geometric.nn import GCNConv, SAGEConv
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix

In [5]:
df = pd.read_csv("../../dataset/dataset.csv") 

# 2. Define your feature columns and label
small_columns = ['CHF_F30',
                 'HICHOLRP',
                 'INCONT',
                 'BKBONMOM',
                 'PREG',
                 'AGE',
                 'ETHNICNIH',
                 'F45CALC',
                 'F60ALCWK',
                 'F60CALC',]

outcome_column = 'ANYFX'

X = df[small_columns].values
y = df[outcome_column].values

# Balance dataset by undersampling majority class
minority_class = df[outcome_column].value_counts().idxmin()
majority_class = df[outcome_column].value_counts().idxmax()

minority_df = df[df[outcome_column] == minority_class]
majority_df = df[df[outcome_column] == majority_class].sample(n=len(minority_df), random_state=42)
df_balanced = pd.concat([minority_df, majority_df]).sample(frac=1, random_state=42)  # shuffle

X = df_balanced[small_columns].values
y = df_balanced[outcome_column].values

# Assume X is your feature matrix, y is ANYFX, small_columns is your feature list
# 1. Split data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp)

# 2. Build KNN graph on training data
from sklearn.neighbors import NearestNeighbors
k = 5  # Hyperparameter to tune
nbrs = NearestNeighbors(n_neighbors=k).fit(X_train)
edges = nbrs.kneighbors_graph(X_train, mode='connectivity').tocoo()
edge_index = torch.tensor([edges.row, edges.col], dtype=torch.long)

# 3. Prepare PyTorch Geometric Data object
from torch_geometric.data import Data
data = Data(x=torch.tensor(X_train, dtype=torch.float), y=torch.tensor(y_train, dtype=torch.long), edge_index=edge_index)

# 4. Define models
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, 2)  # 2 classes

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, 2)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

# 5. Train and tune on test set (not shown for brevity), select best hyperparameters

# 6. Evaluate on validation set
def evaluate(model, X_val, y_val, edge_index):
    model.eval()
    with torch.no_grad():
        logits = model(torch.tensor(X_val, dtype=torch.float), edge_index)
        probs = torch.softmax(logits, dim=1)[:, 1].numpy()
        preds = (probs > 0.5).astype(int)
        auc = roc_auc_score(y_val, probs)
        tn, fp, fn, tp = confusion_matrix(y_val, preds).ravel()
        sensitivity = tp / (tp + fn)
        specificity = tn / (tn + fp)
        return sensitivity, specificity, auc

# Repeat for both GCN and GraphSAGE


  edge_index = torch.tensor([edges.row, edges.col], dtype=torch.long)


In [None]:
evaluate(model, X_val, y_val, edge_index)