## Imports

In [10]:
import numpy as np
import pandas as pd

!pip install torch torch_geometric

# import geomstats.datasets.utils as data_utils
import matplotlib as plt




### Load data

#### Training

In [4]:
# Every training input lives in the same folder. Keeping that prefix in one
# constant means relocating the data only requires editing a single line.
TRAIN_DIR = "/Users/Haley/Desktop/WiDs Datathon/widsdatathon2025/TRAIN_NEW"

# Vectorised functional connectomes (CSV); later cells treat each row as one participant.
file_path_trainFCM_new = f"{TRAIN_DIR}/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv"
train_FCM = pd.read_csv(file_path_trainFCM_new)

# Categorical metadata (Excel).
file_path_trainC_new = f"{TRAIN_DIR}/TRAIN_CATEGORICAL_METADATA_new.xlsx"
train_cat_new = pd.read_excel(file_path_trainC_new)

# Quantitative metadata (Excel).
file_path_trainQ = f"{TRAIN_DIR}/TRAIN_QUANTITATIVE_METADATA_new.xlsx"
train_Quant = pd.read_excel(file_path_trainQ)

# ADHD and Sex solutions dataframe for model training
file_path_trainS = f"{TRAIN_DIR}/TRAINING_SOLUTIONS.xlsx"
train_Solutions = pd.read_excel(file_path_trainS)

#### Test

In [None]:
# Categorical metadata from the TEST folder (Excel), read into a DataFrame.
file_path_testC = "/Users/Haley/Desktop/WiDs Datathon/widsdatathon2025/TEST/TEST_CATEGORICAL.xlsx"
test_cat = pd.read_excel(file_path_testC)


Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,1,1
1,CPaeQkhcjg7d,1,0
2,Nb4EetVPm3gs,1,0
3,p4vPhVu91o4b,1,1
4,M09PXs7arQ5E,1,1


### Turn vectorized connectomes back into matrices

In [5]:


def flatten_to_square_matrix(flattened_fcm, size=200):
    """Rebuild a symmetric, zero-diagonal square matrix from its upper triangle.

    Args:
        flattened_fcm: Sequence of the strictly-upper-triangular entries
            (i < j), in the order produced by ``np.triu_indices(size, k=1)``.
            Must contain exactly ``size * (size - 1) // 2`` values.
        size: Side length of the square matrix to build (default 200).

    Returns:
        A ``(size, size)`` NumPy float array whose upper and lower triangles
        mirror each other and whose diagonal is zero.

    Raises:
        ValueError: If the number of supplied values does not match ``size``.
    """
    expected_elements = size * (size - 1) // 2
    num_elements = len(flattened_fcm)
    if num_elements != expected_elements:
        raise ValueError(f"Flattened matrix size mismatch. Expected {expected_elements} elements, got {num_elements}")

    square = np.zeros((size, size))

    # k=1 skips the diagonal, so only pairs with row < col are returned.
    upper_rows, upper_cols = np.triu_indices(size, k=1)

    # Write each value at (row, col) and again at (col, row) to make the result symmetric.
    square[upper_rows, upper_cols] = flattened_fcm
    square[upper_cols, upper_rows] = flattened_fcm

    return square

# Example for the first participant
flattened_fcm = train_FCM.iloc[0, 1:].values  # Skip the participant_id column
fcm_matrix = flatten_to_square_matrix(flattened_fcm)
print(fcm_matrix.shape)  # Should print (200, 200), matching the default size=200


(200, 200)


In [7]:
# Build one symmetric connectivity matrix per row of train_FCM.
#
# The numeric columns are converted to a float array once, up front. Pulling
# rows one at a time with .iloc from a frame that also contains the ID column
# builds a fresh object-dtype Series on every iteration, which is much slower.
# The comprehension also avoids overwriting the `flattened_fcm` / `fcm_matrix`
# variables created by the single-participant example.
train_fcm_values = train_FCM.iloc[:, 1:].to_numpy(dtype=float)  # Skip the participant_id column

connectivity_matrices = np.array(
    [flatten_to_square_matrix(row) for row in train_fcm_values]
)
print(connectivity_matrices.shape)  # Should print (N, 200, 200), where N is the number of participants


(1213, 200, 200)


array([[[ 0.        ,  0.2229301 ,  0.52790285, ...,  0.25914074,
          0.04393227,  0.12452861],
        [ 0.2229301 ,  0.        , -0.10883984, ..., -0.0927899 ,
         -0.24121283,  0.02421868],
        [ 0.52790285, -0.10883984,  0.        , ...,  0.22211525,
          0.11465865,  0.24847324],
        ...,
        [ 0.25914074, -0.0927899 ,  0.22211525, ...,  0.        ,
          0.56186395,  0.47117019],
        [ 0.04393227, -0.24121283,  0.11465865, ...,  0.56186395,
          0.        ,  0.36522073],
        [ 0.12452861,  0.02421868,  0.24847324, ...,  0.47117019,
          0.36522073,  0.        ]],

       [[ 0.        ,  0.61476485,  0.57725539, ..., -0.1469424 ,
         -0.19799274, -0.20565338],
        [ 0.61476485,  0.        ,  0.26085785, ..., -0.04644964,
         -0.17542383, -0.15507986],
        [ 0.57725539,  0.26085785,  0.        , ..., -0.14718288,
         -0.17831505, -0.19954765],
        ...,
        [-0.1469424 , -0.04644964, -0.14718288, ...,  

In [None]:

# Build the connectivity matrices for the TEST participants.
#
# FIX: this cell used to loop over train_FCM, so `connectivity_matrices_test`
# was silently a copy of the training matrices. It now reads the test connectomes.
# NOTE(review): the file name below is assumed from the naming of the other
# TEST file loaded in this notebook — confirm it matches the local copy.
file_path_testFCM = "/Users/Haley/Desktop/WiDs Datathon/widsdatathon2025/TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv"
test_FCM = pd.read_csv(file_path_testFCM)

# Convert once to a float array instead of slicing the DataFrame row by row.
test_fcm_values = test_FCM.iloc[:, 1:].to_numpy(dtype=float)  # Skip the participant_id column

connectivity_matrices_test = np.array(
    [flatten_to_square_matrix(row) for row in test_fcm_values]
)
print(connectivity_matrices_test.shape)

In [11]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data, Dataset, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool

import numpy as np
import pandas as pd

def matrix_to_edges(matrix, threshold=0.0):
    """Convert a square connectivity matrix into a sparse, weighted edge list.

    A directed edge ``i -> j`` is kept when ``matrix[i, j] > threshold`` and
    ``i != j``. Self-loops are always dropped, and so is every entry at or
    below the threshold (which includes all negative entries whenever
    ``threshold >= 0``).

    Args:
        matrix: Square 2-D array-like of edge strengths.
        threshold: Strict lower bound an entry must exceed to become an edge.

    Returns:
        A tuple ``(edge_index, edge_weight)``:
            * ``edge_index`` – ``torch.long`` tensor of shape ``(2, E)`` holding
              source indices in row 0 and target indices in row 1, ordered
              row by row through the matrix.
            * ``edge_weight`` – ``torch.float`` tensor of shape ``(E,)`` with
              the matching matrix entries.
        When no entry passes the threshold, ``E == 0`` and ``edge_index``
        still has shape ``(2, 0)``.
    """
    weights = np.asarray(matrix, dtype=float)

    # Boolean mask of retained edges; comparing creates a new array, so
    # clearing its diagonal does not touch the caller's matrix.
    keep = weights > threshold
    np.fill_diagonal(keep, False)

    # np.nonzero walks the mask in row-major order, i.e. the same order a
    # nested `for i ... for j ...` loop would visit the entries.
    src, dst = np.nonzero(keep)

    # Stacking always yields a (2, E) array, even when E == 0.
    edge_index = torch.from_numpy(np.stack([src, dst])).to(torch.long)
    edge_weight = torch.tensor(weights[src, dst], dtype=torch.float)
    return edge_index, edge_weight

class ConnectomeDataset(Dataset):
    """Graph dataset that converts each connectivity matrix into a graph on access.

    Every item is a ``Data`` object with identity node features, the
    thresholded edge list from ``matrix_to_edges`` (weights in ``edge_attr``),
    and, when supplied, ``y_adhd`` / ``y_sex`` targets and a ``participant_id``.

    Args:
        connectomes: Sequence or array of square connectivity matrices.
        adhd_labels: Optional per-sample ADHD labels (list, array or Series).
            Omit for unlabeled data.
        sex_labels: Optional per-sample sex labels (list, array or Series).
            Omit for unlabeled data.
        threshold: Passed to ``matrix_to_edges``; entries must exceed it to
            become edges.
        participant_ids: Optional per-sample identifiers. When given, each
            graph carries ``participant_id`` (as ``str``), which ``predict``
            reads back from the batch.

    Raises:
        ValueError: If any supplied per-sample sequence differs in length
            from ``connectomes``.
    """

    def __init__(self, connectomes, adhd_labels=None, sex_labels=None, threshold=0.0,
                 participant_ids=None):
        super().__init__()
        self.connectomes = connectomes  # list or array of matrices
        n_items = len(connectomes)
        self.adhd_labels = self._as_positional(adhd_labels, "adhd_labels", n_items)
        self.sex_labels = self._as_positional(sex_labels, "sex_labels", n_items)
        self.participant_ids = self._as_positional(participant_ids, "participant_ids", n_items)
        self.threshold = threshold

    @staticmethod
    def _as_positional(values, name, expected_len):
        """Return ``values`` as a NumPy array so ``[idx]`` is always positional."""
        if values is None:
            return None
        # A pandas Series indexed with [] looks up by *label*; after any
        # filtering or reordering that is no longer the row position.
        # Converting to an array removes the ambiguity.
        arr = np.asarray(values)
        if len(arr) != expected_len:
            raise ValueError(
                f"{name} has {len(arr)} entries but there are {expected_len} connectomes"
            )
        return arr

    def len(self):
        return len(self.connectomes)

    def get(self, idx):
        matrix = self.connectomes[idx]
        edge_index, edge_weight = matrix_to_edges(matrix, threshold=self.threshold)

        n_nodes = matrix.shape[0]
        x = torch.eye(n_nodes)  # node features: one-hot identity, one row per node

        data = Data(x=x, edge_index=edge_index, edge_attr=edge_weight)
        data.num_nodes = n_nodes

        # Targets and IDs are attached only when available, so the same class
        # can serve labeled training data and unlabeled data.
        if self.adhd_labels is not None:
            data.y_adhd = torch.tensor(self.adhd_labels[idx], dtype=torch.long)
        if self.sex_labels is not None:
            data.y_sex = torch.tensor(self.sex_labels[idx], dtype=torch.long)
        if self.participant_ids is not None:
            data.participant_id = str(self.participant_ids[idx])
        return data


In [12]:
class GNN(torch.nn.Module):
    """Two-layer GCN with mean pooling and two independent 2-class heads.

    Args:
        hidden_channels: Width of both graph-convolution layers and of the
            pooled graph embedding.
        num_nodes: Input feature width of the first convolution. It is named
            after the node count because the dataset in this notebook uses
            identity (one-hot) node features, whose width equals the number
            of nodes.
    """

    def __init__(self, hidden_channels, num_nodes):
        super().__init__()
        self.conv1 = GCNConv(num_nodes, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)

        self.lin_adhd = torch.nn.Linear(hidden_channels, 2)  # ADHD: binary
        self.lin_sex = torch.nn.Linear(hidden_channels, 2)   # Sex: binary (or adjust if >2 classes)

    def forward(self, x, edge_index, batch, edge_weight=None):
        """Compute per-graph logits for both tasks.

        Args:
            x: Node feature matrix.
            edge_index: Edge list passed to both GCN layers.
            batch: Graph-assignment vector used by ``global_mean_pool``.
            edge_weight: Optional per-edge weights forwarded to both GCN
                layers. With the default ``None`` every edge counts equally,
                exactly as when the argument is omitted.

        Returns:
            ``(adhd_out, sex_out)``: two tensors of raw logits, one row per
            graph and two columns each.
        """
        x = F.relu(self.conv1(x, edge_index, edge_weight))
        x = F.relu(self.conv2(x, edge_index, edge_weight))

        # Global pooling: aggregate node features into one vector per graph
        x = global_mean_pool(x, batch)

        adhd_out = self.lin_adhd(x)
        sex_out = self.lin_sex(x)
        return adhd_out, sex_out


In [14]:
def train(model, loader, optimizer, device):
    """Run one optimisation pass over ``loader``.

    For every batch the two cross-entropy losses (ADHD and sex) are added
    with equal weight, back-propagated, and followed by one optimizer step.

    Args:
        model: Module called as ``model(x, edge_index, batch)`` and returning
            ``(adhd_logits, sex_logits)``.
        loader: Iterable of batches exposing ``to(device)``, ``x``,
            ``edge_index``, ``batch``, ``y_adhd`` and ``y_sex``; must support ``len()``.
        optimizer: Optimizer whose gradients are reset and stepped per batch.
        device: Target passed to each batch's ``to``.

    Returns:
        The sum of the per-batch losses divided by ``len(loader)``.
    """
    model.train()
    batch_losses = []

    for graph_batch in loader:
        graph_batch = graph_batch.to(device)
        optimizer.zero_grad()

        adhd_logits, sex_logits = model(
            graph_batch.x, graph_batch.edge_index, graph_batch.batch
        )

        # Unweighted sum of the two task losses.
        combined = (
            F.cross_entropy(adhd_logits, graph_batch.y_adhd)
            + F.cross_entropy(sex_logits, graph_batch.y_sex)
        )
        combined.backward()
        optimizer.step()

        batch_losses.append(combined.item())

    return sum(batch_losses) / len(loader)


In [17]:
# Derive the sample count from the data instead of hard-coding it.
n_samples = len(train_FCM)
n_regions = 200

# Model variables
# The labels must follow the same participant order as the rows of train_FCM
# (and therefore of connectivity_matrices). Taking the solution columns in
# their own row order would silently pair connectomes with the wrong labels
# whenever the two files are sorted differently, so look each participant up
# by ID instead.
fcm_participant_ids = train_FCM.iloc[:, 0].to_numpy()  # first column holds the participant ID
solutions_by_id = train_Solutions.set_index('participant_id')
aligned_solutions = solutions_by_id.loc[fcm_participant_ids]  # KeyError if a participant has no label row

adhd_labels = aligned_solutions['ADHD_Outcome'].reset_index(drop=True)
sex_labels = aligned_solutions['Sex_F'].reset_index(drop=True)

# Duplicate IDs in the solutions file would yield extra rows; fail loudly.
assert len(adhd_labels) == n_samples, "label rows do not match connectome rows one-to-one"

In [18]:
# Seed PyTorch's global RNG before anything random happens. Weight
# initialisation and the DataLoader's shuffling both draw from it, so without
# a seed the printed losses change on every Restart & Run All.
# (Some GPU kernels may still be non-deterministic.)
torch.manual_seed(42)

# Create dataset and dataloader
dataset = ConnectomeDataset(connectivity_matrices, adhd_labels, sex_labels, threshold=0.2)
loader = DataLoader(dataset, batch_size=16, shuffle=True)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GNN(hidden_channels=64, num_nodes=n_regions).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training: 20 passes over the data, reporting the mean batch loss of each.
for epoch in range(1, 21):
    loss = train(model, loader, optimizer, device)
    print(f'Epoch {epoch}, Loss: {loss:.4f}')

Epoch 1, Loss: 1.2975
Epoch 2, Loss: 1.2675
Epoch 3, Loss: 1.2671
Epoch 4, Loss: 1.2671
Epoch 5, Loss: 1.2710
Epoch 6, Loss: 1.2666
Epoch 7, Loss: 1.2693
Epoch 8, Loss: 1.2681
Epoch 9, Loss: 1.2685
Epoch 10, Loss: 1.2669
Epoch 11, Loss: 1.2683
Epoch 12, Loss: 1.2675
Epoch 13, Loss: 1.2682
Epoch 14, Loss: 1.2692
Epoch 15, Loss: 1.2689
Epoch 16, Loss: 1.2690
Epoch 17, Loss: 1.2666
Epoch 18, Loss: 1.2676
Epoch 19, Loss: 1.2678
Epoch 20, Loss: 1.2673


In [20]:
from sklearn.metrics import f1_score

def predict(model, loader, device):
    """Score every batch in ``loader`` and collect positive-class probabilities.

    Args:
        model: Module called as ``model(x, edge_index, batch)`` and returning
            ``(adhd_logits, sex_logits)`` with two columns each.
        loader: Iterable of batches exposing ``to(device)``, ``x``,
            ``edge_index``, ``batch`` and an iterable ``participant_id``.
        device: Target passed to each batch's ``to``.

    Returns:
        ``(participant_ids, adhd_preds, sex_preds)``: a list of IDs in loader
        order, and two NumPy arrays holding the softmax probability of
        class 1 for the ADHD and sex heads respectively.
    """
    model.eval()

    participant_ids = []
    adhd_chunks = []
    sex_chunks = []

    with torch.no_grad():
        for graph_batch in loader:
            graph_batch = graph_batch.to(device)
            adhd_logits, sex_logits = model(
                graph_batch.x, graph_batch.edge_index, graph_batch.batch
            )

            # Column 1 of the softmax is the probability of the positive class
            # (ADHD = 1 / Sex_F = 1).
            adhd_chunks.append(F.softmax(adhd_logits, dim=1)[:, 1].cpu())
            sex_chunks.append(F.softmax(sex_logits, dim=1)[:, 1].cpu())

            # Requires the dataset to attach participant IDs to each graph.
            participant_ids.extend(graph_batch.participant_id)

    adhd_preds = torch.cat(adhd_chunks).numpy()
    sex_preds = torch.cat(sex_chunks).numpy()
    return participant_ids, adhd_preds, sex_preds


In [21]:
def weighted_f1_score(y_true_adhd, y_pred_adhd, y_true_sex, y_pred_sex, sex_true):
    """Average of a sample-weighted ADHD F1 and a plain sex F1.

    Predicted probabilities are thresholded at 0.5. For the ADHD score every
    sample whose true ADHD label is 1 *and* whose true sex label is 1 gets
    weight 2; all other samples get weight 1. The weight applies wherever that
    sample lands in the confusion matrix, so a missed female-ADHD case costs
    twice as much as any other miss, just as a correct one counts twice.

    Args:
        y_true_adhd: True ADHD labels (0/1); list, array or Series.
        y_pred_adhd: Predicted ADHD probabilities.
        y_true_sex: True sex labels (0/1) used for the sex F1.
        y_pred_sex: Predicted sex probabilities.
        sex_true: True sex labels (0/1) used to build the ADHD sample weights.

    Returns:
        ``(weighted ADHD F1 + sex F1) / 2`` as a float. An F1 with no true or
        predicted positives is taken as 0.0.
    """
    # Coerce to arrays so lists and pandas Series (whatever their index)
    # are compared element by element, by position.
    y_true_adhd = np.asarray(y_true_adhd).ravel()
    y_true_sex = np.asarray(y_true_sex).ravel()
    sex_true = np.asarray(sex_true).ravel()

    # Threshold predictions
    y_pred_adhd_label = (np.asarray(y_pred_adhd).ravel() > 0.5).astype(int)
    y_pred_sex_label = (np.asarray(y_pred_sex).ravel() > 0.5).astype(int)

    # Female ADHD cases carry double weight in the ADHD score.
    adhd_weights = np.where((y_true_adhd == 1) & (sex_true == 1), 2.0, 1.0)

    f1_adhd_weighted = _weighted_binary_f1(y_true_adhd, y_pred_adhd_label, adhd_weights)
    f1_sex = _weighted_binary_f1(y_true_sex, y_pred_sex_label)

    final_score = (f1_adhd_weighted + f1_sex) / 2
    return final_score


def _weighted_binary_f1(y_true, y_pred, sample_weight=None):
    """F1 of the positive class (label 1) with optional per-sample weights."""
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if sample_weight is None:
        sample_weight = np.ones(y_true.shape, dtype=float)

    true_pos = (y_true == 1)
    pred_pos = (y_pred == 1)
    tp = sample_weight[pred_pos & true_pos].sum()
    fp = sample_weight[pred_pos & ~true_pos].sum()
    fn = sample_weight[~pred_pos & true_pos].sum()

    # F1 = 2TP / (2TP + FP + FN); defined as 0 when there are no positives at all.
    denom = 2 * tp + fp + fn
    if denom == 0:
        return 0.0
    return float(2 * tp / denom)


In [22]:
import pandas as pd

def save_submission(participant_ids, adhd_preds, sex_preds, filename='submission.csv'):
    """Write predictions to a three-column CSV and report where it was saved.

    Args:
        participant_ids: Values for the ``participant_id`` column.
        adhd_preds: Values for the ``ADHD_Outcome`` column.
        sex_preds: Values for the ``Sex_F`` column.
        filename: Destination path of the CSV (default ``'submission.csv'``).

    The file has a header row and no index column.
    """
    columns = {
        'participant_id': participant_ids,
        'ADHD_Outcome': adhd_preds,
        'Sex_F': sex_preds,
    }
    pd.DataFrame(columns).to_csv(filename, index=False)
    print(f"Saved submission to {filename}")
