In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.neighbors import kneighbors_graph
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("../../dataset/dataset.csv", index_col = [0]).reset_index(drop = True)
df

Unnamed: 0,ID,CTFLAG,ANYFX,FRAX_SCORE,PARKINS,RHEUMAT,OSTEOPOR,ARTHRIT,CANC_F30,CATARACT,...,F60VITA,TEXPWK,WALKSPD,BKBONE,BKHIP,BKBACK,BKLARM,SMOKING,YEARS_MENOPAUSE,DUR_MENA_MENO
0,131073,1,0,6.14,0.0,0.0,0.0,0.0,0.0,0.0,...,975.84083,2.50000,3.0,1.0,0.0,0.0,1.0,1.0,10.0,-45.0
1,262147,1,0,8.05,0.0,8.0,0.0,1.0,0.0,0.0,...,848.40762,26.83333,3.0,0.0,0.0,0.0,0.0,1.0,13.0,-44.0
2,131075,0,0,12.88,0.0,8.0,0.0,1.0,0.0,1.0,...,629.72861,21.00000,3.0,1.0,0.0,0.0,0.0,1.0,11.0,-45.0
3,262149,0,0,8.78,0.0,8.0,0.0,1.0,0.0,0.0,...,339.14853,32.83333,4.0,0.0,0.0,0.0,0.0,1.0,15.0,-45.0
4,262150,1,1,1.73,0.0,0.0,0.0,0.0,0.0,0.0,...,1574.51101,21.83333,3.0,0.0,0.0,0.0,0.0,0.0,19.0,-30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74199,262130,1,0,3.07,0.0,0.0,0.0,0.0,0.0,0.0,...,668.50414,0.00000,3.0,1.0,0.0,0.0,0.0,0.0,2.0,-46.0
74200,131066,1,0,3.94,0.0,0.0,0.0,0.0,0.0,0.0,...,334.67271,7.50000,3.0,0.0,0.0,0.0,0.0,0.0,7.0,-46.0
74201,262131,0,0,4.45,0.0,0.0,0.0,0.0,0.0,0.0,...,1195.77043,17.08333,3.0,0.0,0.0,0.0,0.0,1.0,1.0,-47.0
74202,131068,1,0,8.54,0.0,0.0,0.0,0.0,0.0,0.0,...,1169.27512,0.00000,9.0,1.0,0.0,0.0,0.0,0.0,13.0,-45.0


In [3]:
small_columns = ['CHF_F30',
                 'HICHOLRP',
                 'INCONT',
                 'BKBONMOM',
                 'PREG',
                 'AGE',
                 'ETHNICNIH',
                 'F45CALC',
                 'F60ALCWK',
                 'F60CALC',]

In [15]:
df[small_columns]['F60CALC'].value_counts()

F60CALC
1079.40693    2
441.53598     2
286.22396     2
782.73040     2
701.96434     2
             ..
1465.15550    1
264.12976     1
349.37363     1
863.05166     1
586.25866     1
Name: count, Length: 74180, dtype: int64

In [21]:
# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Hyperparameters
K_NEIGHBORS = 5  # Number of neighbors for KNN graph construction
HIDDEN_CHANNELS = 64
LEARNING_RATE = 0.005
EPOCHS = 100
BATCH_SIZE = 32

class GNNModel(torch.nn.Module):
    def __init__(self, num_features, hidden_channels):
        super(GNNModel, self).__init__()
        # Graph convolutional layers
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        
        # Output layers for binary classification
        self.lin1 = nn.Linear(hidden_channels, hidden_channels//2)
        self.lin2 = nn.Linear(hidden_channels//2, 1)
    
    def forward(self, x, edge_index, batch):
        # Node embedding
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.3, training=self.training)
        
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        
        # Global pooling (mean of all node features)
        x = global_mean_pool(x, batch)
        
        # Final classification layers
        x = self.lin1(x)
        x = F.relu(x)
        x = F.dropout(x, p=0.3, training=self.training)
        x = self.lin2(x)
        
        return x

def load_and_preprocess_data(filepath, predictor_columns, target_column="ANYFX"):
    """
    Load data from a CSV file and preprocess it.
    Uses specific column names for predictors and target.
    
    Args:
        filepath: Path to the CSV file
        predictor_columns: List of column names to use as predictors (small_columns)
        target_column: Name of the target column (ANYFX)
    """
    print(f"Loading data from {filepath}")
    data = pd.read_csv(filepath)

    # Drop rows that equal 9 in ETHNICNIH
    data = data[data['ETHNICNIH'] != 9]
    data = data.reset_index(drop=True)
    print(f"Data loaded. Shape: {data.shape}")
    
    # Ensure all specified columns exist in the dataset
    all_columns = predictor_columns + [target_column]
    missing_columns = [col for col in all_columns if col not in data.columns]
    if missing_columns:
        raise ValueError(f"Columns {missing_columns} not found in the dataset")
    
    # Split features and target using the specified column names
    X = data[predictor_columns]
    y = data[target_column]
    
    # Check for categorical columns (non-numeric)
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    
    print(f"Categorical columns: {categorical_cols}")
    print(f"Numeric columns: {numeric_cols}")
    
    # Preprocessing: Standardize numeric features and one-hot encode categorical features
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ],
        remainder='passthrough'  # Include columns that were not explicitly listed
    )
    
    X_processed = preprocessor.fit_transform(X)
    
    # Convert to dense array if sparse
    if hasattr(X_processed, "toarray"):
        X_processed = X_processed.toarray()
    
    # Convert target to numeric if it's not
    if y.dtype == 'object' or y.dtype == 'category':
        y = y.astype(int)
    
    return X_processed, y.values, preprocessor

def create_graph_dataset(X, y, k_neighbors=5):
    """
    Create a graph dataset from tabular data using K-nearest neighbors.
    """
    # Compute adjacency matrix using KNN
    A = kneighbors_graph(X, n_neighbors=k_neighbors, mode='connectivity', include_self=False)
    
    # Convert sparse adjacency matrix to edge indices
    adj_coo = A.tocoo()
    edge_index = torch.tensor(np.vstack((adj_coo.row, adj_coo.col)), dtype=torch.long)
    
    # Convert features and labels to torch tensors
    x = torch.tensor(X, dtype=torch.float)
    y = torch.tensor(y, dtype=torch.float).view(-1, 1)
    
    # Create a single PyTorch Geometric Data object
    data = Data(x=x, edge_index=edge_index, y=y)
    
    return data

def train_val_test_split(data, train_size=0.6, val_size=0.2, test_size=0.2):
    """
    Split dataset into training, validation, and test sets.
    Returns masks for each split.
    """
    assert train_size + val_size + test_size == 1.0, "Split proportions must sum to 1"
    
    num_samples = data.y.size(0)
    indices = torch.randperm(num_samples)
    
    train_end = int(num_samples * train_size)
    val_end = train_end + int(num_samples * val_size)
    
    train_idx = indices[:train_end]
    val_idx = indices[train_end:val_end]
    test_idx = indices[val_end:]
    
    return train_idx, val_idx, test_idx

def train_model(model, data, train_idx, val_idx, device, epochs=100, lr=0.01, batch_size=32):
    """
    Train the GNN model and validate on validation set.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.BCEWithLogitsLoss()
    
    best_val_loss = float('inf')
    best_model_state = None
    train_losses = []
    val_losses = []
    
    data = data.to(device)
    
    # Create mini-batch loader for training set
    train_loader = DataLoader([data], batch_size=batch_size)
    
    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0
        
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            
            # Forward pass on the entire graph
            out = model(batch.x, batch.edge_index, torch.zeros(batch.x.size(0), dtype=torch.long, device=device))
            
            # Compute loss only on training nodes
            loss = criterion(out[train_idx], batch.y[train_idx])
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        
        train_loss /= len(train_loader)
        train_losses.append(train_loss)
        
        # Validation
        model.eval()
        with torch.no_grad():
            out = model(data.x, data.edge_index, torch.zeros(data.x.size(0), dtype=torch.long, device=device))
            val_loss = criterion(out[val_idx], data.y[val_idx]).item()
        
        val_losses.append(val_loss)
        
        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_state = model.state_dict().copy()
        
        # Print progress
        if (epoch + 1) % 10 == 0:
            print(f'Epoch: {epoch+1:03d}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
    
    # Load best model
    model.load_state_dict(best_model_state)
    return model, train_losses, val_losses

def evaluate_model(model, data, test_idx, device):
    """
    Evaluate the trained model on the test set.
    """
    model.eval()
    data = data.to(device)
    
    with torch.no_grad():
        # Forward pass on the entire graph
        out = model(data.x, data.edge_index, torch.zeros(data.x.size(0), dtype=torch.long, device=device))
        
        # Extract predictions for test nodes
        y_prob = torch.sigmoid(out[test_idx]).cpu().numpy()
        y_pred = (y_prob > 0.5).astype(int)
        y_true = data.y[test_idx].cpu().numpy()
    
    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_true, y_prob)
    
    print("Test Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc
    }

def plot_training_curve(train_losses, val_losses):
    """
    Plot training and validation loss curves.
    """
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss Curves')
    plt.legend()
    plt.grid(True)
    plt.show()

def main(filepath, small_columns, target_column="ANYFX", k_neighbors=K_NEIGHBORS):
    # Determine device (GPU if available, otherwise CPU)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    # Load and preprocess data with specific column names
    X, y, preprocessor = load_and_preprocess_data(filepath, small_columns, target_column)
    print(f"Data loaded and preprocessed. Features shape: {X.shape}, Target shape: {y.shape}")
    
    # Create graph dataset
    data = create_graph_dataset(X, y, k_neighbors=k_neighbors)
    print(f"Graph created with {data.num_nodes} nodes and {data.num_edges} edges")
    
    # Split dataset into train/val/test indices
    train_idx, val_idx, test_idx = train_val_test_split(
        data, train_size=0.6, val_size=0.2, test_size=0.2
    )
    print(f"Dataset split: {len(train_idx)} train, {len(val_idx)} validation, {len(test_idx)} test")
    
    # Create and train model
    num_features = X.shape[1]
    model = GNNModel(num_features=num_features, hidden_channels=HIDDEN_CHANNELS).to(device)
    print(f"Model created with {num_features} input features")
    
    print("Training model...")
    model, train_losses, val_losses = train_model(
        model, data, train_idx, val_idx, device, 
        epochs=EPOCHS, lr=LEARNING_RATE, batch_size=BATCH_SIZE
    )
    
    # Evaluate on test set
    print("Evaluating model on test set...")
    metrics = evaluate_model(model, data, test_idx, device)
    
    # Plot learning curves
    plot_training_curve(train_losses, val_losses)
    
    return model, preprocessor, metrics

In [23]:
# Replace with your actual CSV file path
filepath = "../../dataset/dataset.csv"
    
# Define your predictor columns and target column
small_columns = ['CHF_F30',
                 'HICHOLRP',
                 'INCONT',
                 'BKBONMOM',
                 'PREG',
                 'AGE',
                 'ETHNICNIH',
                 'F45CALC',
                 'F60ALCWK',
                 'F60CALC',]
target_column = "ANYFX"
    
# Train and evaluate the model
model, preprocessor, metrics = main(filepath, small_columns, target_column)
    
# Example of saving the model for later use
torch.save({
        'model_state_dict': model.state_dict(),
        'metrics': metrics,
        'predictor_columns': small_columns,
        'target_column': target_column
    }, 'gnn_model.pth')
    
print("Model training and evaluation complete!")

Using device: cpu
Loading data from ../../dataset/dataset.csv
Data loaded. Shape: (73621, 68)
Categorical columns: []
Numeric columns: ['CHF_F30', 'HICHOLRP', 'INCONT', 'BKBONMOM', 'PREG', 'AGE', 'ETHNICNIH', 'F45CALC', 'F60ALCWK', 'F60CALC']
Data loaded and preprocessed. Features shape: (73621, 10), Target shape: (73621,)
Graph created with 73621 nodes and 368105 edges
Dataset split: 44172 train, 14724 validation, 14725 test
Model created with 10 input features
Training model...


IndexError: index 70630 is out of bounds for dimension 0 with size 1

In [22]:
x_processed, y, preprocessor = load_and_preprocess_data(filepath, small_columns, target_column)

Loading data from ../../dataset/dataset.csv
Data loaded. Shape: (73621, 68)
Categorical columns: []
Numeric columns: ['CHF_F30', 'HICHOLRP', 'INCONT', 'BKBONMOM', 'PREG', 'AGE', 'ETHNICNIH', 'F45CALC', 'F60ALCWK', 'F60CALC']


In [18]:
x_processed

array([[-7.64345852e-02, -3.45374021e-01,  7.23927924e-01, ...,
        -3.12731858e-01,  9.56573919e-03,  1.11718295e+00],
       [-7.64345852e-02, -3.45374021e-01,  7.23927924e-01, ...,
         8.76108503e+00, -4.86180888e-01,  3.09020507e-01],
       [-7.64345852e-02, -3.45374021e-01,  7.23927924e-01, ...,
         1.13894026e+00,  5.68263767e-01, -3.99529748e-01],
       ...,
       [-7.64345852e-02, -3.45374021e-01,  7.23927924e-01, ...,
        -5.93365370e-01, -1.63553170e-01,  1.53407720e+00],
       [-7.64345852e-02, -3.45374021e-01, -1.38135299e+00, ...,
         5.67741506e-04, -4.86180888e-01,  8.46472658e-01],
       [-7.64345852e-02, -3.45374021e-01, -1.38135299e+00, ...,
         1.13894026e+00, -4.86180888e-01, -4.96358135e-01]])

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


In [None]:
X, y, preprocessor = load_and_preprocess_data("../../dataset/dataset.csv", target_column='ANYFX', pred_columns=small_columns)
data = create_graph_dataset(X, y, k_neighbors=5)

Loading data from ../../dataset/dataset.csv
Categorical columns: []
Numeric columns: ['CHF_F30', 'HICHOLRP', 'INCONT', 'BKBONMOM', 'PREG', 'AGE', 'ETHNICNIH', 'F45CALC', 'F60ALCWK', 'F60CALC', 'ANYFX']


In [None]:
train_idx, val_idx, test_idx = train_val_test_split(
        data, train_size=0.6, val_size=0.2, test_size=0.2
    )

AttributeError: 'list' object has no attribute 'y'

In [None]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
num_features = X.shape[1]
model = GNNModel(num_features=num_features, hidden_channels=HIDDEN_CHANNELS).to(device)
print(f"Model created with {num_features} input features")

Model created with 11 input features


In [None]:
model, train_losses, val_losses = train_model2(
        model, train_loader, val_loader, device, 
        epochs=EPOCHS, lr=LEARNING_RATE
    )

TypeError: train_model2() missing 1 required positional argument: 'device'

In [None]:
if __name__ == "__main__":
    # Replace with your actual CSV file path
    filepath = "../../dataset/dataset.csv"
    model, preprocessor, metrics = main(filepath)
    
    # Example of saving the model for later use
    torch.save({
        'model_state_dict': model.state_dict(),
        'metrics': metrics
    }, 'gnn_model.pth')
    
    print("Model training and evaluation complete!")

Using device: cpu
Loading data from ../../dataset/dataset.csv
Categorical columns: []
Numeric columns: ['CHF_F30', 'HICHOLRP', 'INCONT', 'BKBONMOM', 'PREG', 'AGE', 'ETHNICNIH', 'F45CALC', 'F60ALCWK', 'F60CALC', 'ANYFX']
Data loaded and preprocessed. Features shape: (74204, 11), Target shape: (74204,)
Graph dataset created with 74204 samples
Dataset split: 44522 train, 14841 validation, 14841 test
Model created with 11 input features
Training model...


TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'torch_geometric.data.data.Data'>