### Load in the training data

In [1]:
import pandas as pd
import numpy as np

# Use read_excel and use participant_id as the index
train_cat = pd.read_excel("../../widsdatathon2025/TRAIN_NEW/TRAIN_CATEGORICAL_METADATA_new.xlsx")
train_func = pd.read_csv("../../widsdatathon2025/TRAIN_NEW/"
                         "TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv")
train_quant = pd.read_excel("../../widsdatathon2025/TRAIN_NEW/TRAIN_QUANTITATIVE_METADATA_new.xlsx")
train_soln = pd.read_excel("../../widsdatathon2025/TRAIN_NEW/TRAINING_SOLUTIONS.xlsx")

In [2]:
# Set index for merging
train_cat.set_index("participant_id", inplace=True)
train_quant.set_index("participant_id", inplace=True)
train_func.set_index("participant_id", inplace=True)
train_soln.set_index("participant_id", inplace=True)

In [3]:
train_cat.head()
# train_func.head()
# train_quant.head()
# train_soln.head()

Unnamed: 0_level_0,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
00aIpNTbG5uh,2019,4,1.0,0.0,3.0,21.0,45.0,,
00fV0OyyoLfw,2017,1,0.0,9.0,2.0,21.0,0.0,21.0,45.0
04X1eiS79T4B,2017,1,1.0,2.0,2.0,9.0,0.0,,
05ocQutkURd6,2018,1,3.0,8.0,2.0,18.0,10.0,18.0,0.0
06YUNBA9ZRLq,2018,1,0.0,1.0,2.0,12.0,0.0,,


## PCA for the functional connectome matrix to reduce dimension

In [4]:
from sklearn.decomposition import PCA

# Select only fMRI columns
fmri_data = train_func.drop(columns=['participant_id'], errors='ignore')

# Apply PCA to reduce to top N components (e.g. 50)
pca = PCA(n_components=50)
fmri_pca = pca.fit_transform(fmri_data)

train_func_pca = pd.DataFrame(fmri_pca, index=train_func.index,
                           columns=[f'fmri_pca_{i}' for i in range(1, 51)])
train_func_pca.head()

Unnamed: 0_level_0,fmri_pca_1,fmri_pca_2,fmri_pca_3,fmri_pca_4,fmri_pca_5,fmri_pca_6,fmri_pca_7,fmri_pca_8,fmri_pca_9,fmri_pca_10,...,fmri_pca_41,fmri_pca_42,fmri_pca_43,fmri_pca_44,fmri_pca_45,fmri_pca_46,fmri_pca_47,fmri_pca_48,fmri_pca_49,fmri_pca_50
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
70z8Q2xdTXM3,6.3228,-0.36568,-4.918161,-3.823232,-5.706925,-0.875168,1.567537,-1.347493,-1.717403,1.626701,...,0.805249,-0.708449,0.249169,0.27625,1.572329,-2.987128,-3.439043,0.645766,-0.433957,0.182288
WHWymJu6zNZi,5.468294,-3.985252,1.542399,0.104719,-0.529037,-2.951977,0.786366,0.794952,-2.772055,-2.40892,...,1.04238,0.61414,-0.807448,0.320061,-1.816425,-0.34149,0.986329,-0.596418,1.178288,-0.786373
4PAQp1M6EyAo,0.44719,0.733,3.051332,1.763902,1.932819,6.451154,-3.356874,1.946051,-1.165636,-2.876858,...,-2.169494,-0.191489,2.517191,0.890518,-0.377794,-1.691477,-0.28225,-1.066496,2.499244,-1.190171
obEacy4Of68I,-9.149799,-2.044626,0.519359,-2.703924,-4.444126,2.91693,1.036396,0.08034,-0.148185,0.384233,...,-0.135033,-1.807585,0.420512,-0.754358,-0.740293,-0.62012,-0.357497,1.086135,0.242517,1.022253
s7WzzDcmDOhF,0.812814,-1.933265,1.834524,0.383159,-0.512337,2.972219,3.151062,3.419848,0.283679,-2.997809,...,2.412607,-2.108522,-2.713573,-0.937563,-0.43074,0.34697,0.454387,-1.873252,1.095752,0.388524


## Interpolate missing values with mean for quant and cat dfs

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

# Check missing values *before* merging
missing = train_quant.isnull().sum()
missing_cat = train_cat.isnull().sum()
print("Missing values in train_quant:\n", missing[missing > 0])
print("Missing values in train_cat:\n", missing_cat[missing_cat > 0])

# Define columns with small gaps to impute
small_gap_cols = [
    'EHQ_EHQ_Total', 'ColorVision_CV_Score', 'APQ_P_APQ_P_CP', 'APQ_P_APQ_P_ID', 'APQ_P_APQ_P_INV',
    'APQ_P_APQ_P_OPD', 'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP', 'SDQ_SDQ_Conduct_Problems', 
    'SDQ_SDQ_Difficulties_Total', 'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Externalizing',
    'SDQ_SDQ_Generating_Impact', 'SDQ_SDQ_Hyperactivity', 'SDQ_SDQ_Internalizing', 
    'SDQ_SDQ_Peer_Problems', 'SDQ_SDQ_Prosocial', 'PreInt_Demos_Fam_Child_Ethnicity',
    'PreInt_Demos_Fam_Child_Race', 'MRI_Track_Scan_Location']

small_gap_cols_cat = ['Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ']

# Fill small gaps using column means
for col in small_gap_cols:
    if col in train_quant.columns:
        train_quant[col] = train_quant[col].fillna(train_quant[col].mean())
for col in small_gap_cols_cat:
    if col in train_cat.columns:
        train_cat[col] = train_cat[col].fillna(train_cat[col].mean())

# drop rows where age is missing to train
age_train = train_quant.dropna(subset=['MRI_Track_Age_at_Scan'])
X = age_train.drop(columns=['MRI_Track_Age_at_Scan'])
y = age_train['MRI_Track_Age_at_Scan']

# maybe use SimpleImputer to handle small missingness in X
X_imputed = SimpleImputer(strategy='mean').fit_transform(X)

# train regressor
reg = LinearRegression().fit(X_imputed, y)

# now predict missing ages
missing_age_rows = train_quant['MRI_Track_Age_at_Scan'].isna()
X_missing = train_quant.loc[missing_age_rows].drop(columns=['MRI_Track_Age_at_Scan'])
X_missing_imputed = SimpleImputer(strategy='mean').fit_transform(X_missing)

# predict and fill
train_quant.loc[missing_age_rows, 'MRI_Track_Age_at_Scan'] = reg.predict(X_missing_imputed)

train_cat.drop(columns=['Barratt_Barratt_P2_Occ', 'Barratt_Barratt_P2_Edu'], 
              inplace=True) # Dropped

Missing values in train_quant:
 EHQ_EHQ_Total                  13
ColorVision_CV_Score           23
APQ_P_APQ_P_CP                 12
APQ_P_APQ_P_ID                 12
APQ_P_APQ_P_INV                12
APQ_P_APQ_P_OPD                12
APQ_P_APQ_P_PM                 12
APQ_P_APQ_P_PP                 12
SDQ_SDQ_Conduct_Problems        9
SDQ_SDQ_Difficulties_Total      9
SDQ_SDQ_Emotional_Problems      9
SDQ_SDQ_Externalizing           9
SDQ_SDQ_Generating_Impact       9
SDQ_SDQ_Hyperactivity           9
SDQ_SDQ_Internalizing           9
SDQ_SDQ_Peer_Problems           9
SDQ_SDQ_Prosocial               9
MRI_Track_Age_at_Scan         360
dtype: int64
Missing values in train_cat:
 PreInt_Demos_Fam_Child_Ethnicity     43
PreInt_Demos_Fam_Child_Race          54
MRI_Track_Scan_Location               3
Barratt_Barratt_P1_Edu               15
Barratt_Barratt_P1_Occ               31
Barratt_Barratt_P2_Edu              198
Barratt_Barratt_P2_Occ              222
dtype: int64


In [6]:
# Check missing values *before* merging
missing = train_quant.isnull().sum()
print("Missing values in train_quant:\n", missing[missing > 0])

Missing values in train_quant:
 Series([], dtype: int64)


In [7]:
# Make sure participant_id is a column, not the index
train_quant = train_quant.reset_index()
train_cat = train_cat.reset_index()
train_soln = train_soln.reset_index()

# Merge everything on participant_id
X_full = train_quant.merge(train_cat, on='participant_id').merge(train_func_pca, on='participant_id')

# Target labels
y = train_soln[['participant_id', 'ADHD_Outcome', 'Sex_F']]

# Align rows across X and y
X_full = X_full.merge(y, on='participant_id')
y = X_full[['ADHD_Outcome', 'Sex_F']]
X_full = X_full.drop(columns=['ADHD_Outcome', 'Sex_F'])
X_full.set_index("participant_id", inplace=True)
X_full.head()

Unnamed: 0_level_0,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,...,fmri_pca_41,fmri_pca_42,fmri_pca_43,fmri_pca_44,fmri_pca_45,fmri_pca_46,fmri_pca_47,fmri_pca_48,fmri_pca_49,fmri_pca_50
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00aIpNTbG5uh,100.0,13.0,3.0,15.0,44.0,14.0,20.0,27.0,3.0,17.0,...,0.178457,-1.058854,-1.048715,-0.469334,-0.086935,-0.36965,0.54688,-1.148994,-3.277996,-1.433213
00fV0OyyoLfw,92.27,14.0,3.0,12.0,35.0,25.0,28.0,30.0,5.0,20.0,...,-0.691156,-0.259812,0.293702,-0.859555,0.222115,1.175482,-2.890521,0.459076,-0.618329,-1.56608
04X1eiS79T4B,86.67,14.0,3.0,21.0,37.0,18.0,26.0,28.0,3.0,24.0,...,3.421913,-2.837251,-1.43868,0.240602,-2.693957,-0.72908,0.63493,2.377795,0.206991,-1.956194
05ocQutkURd6,93.34,14.0,3.0,11.0,42.0,15.0,20.0,28.0,0.0,5.0,...,1.624708,0.220991,2.707044,-1.292513,-3.473682,-2.696214,1.372668,0.741779,-0.760894,-1.424114
06YUNBA9ZRLq,0.0,14.0,8.0,12.0,35.0,22.0,12.0,24.0,6.0,23.0,...,2.079449,1.469297,0.378476,-0.005113,-2.057296,1.791186,-1.085582,1.310953,-0.982533,0.36515


In [13]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_full)

# Convert to tensors
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32)

# Stratified train/test split
stratify_labels = y['ADHD_Outcome'].astype(str) + y['Sex_F'].astype(str)
X_train, X_test, y_train, y_test = train_test_split(
    X_tensor, y_tensor, test_size=0.2, random_state=42, stratify=stratify_labels)

train_ds = TensorDataset(X_train, y_train)
test_ds = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=32)
X_train

tensor([[-2.8209,  0.2776, -0.6192,  ...,  1.4297, -1.2718,  0.9630],
        [ 0.2795,  0.2776,  0.1365,  ...,  0.3559, -0.7471, -1.3930],
        [-1.0685,  0.2776, -0.6192,  ...,  0.4917, -0.5680, -0.1444],
        ...,
        [ 0.8186,  0.2776, -0.6192,  ..., -0.0194, -1.6136, -2.0732],
        [ 0.8186,  0.2776, -0.6192,  ...,  0.0346, -0.2227,  0.9131],
        [ 0.1447,  0.2776, -0.6192,  ..., -1.0130, -0.5518,  0.2717]])

In [ ]:
class DualOutputNN(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.shared = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        self.head_adhd = nn.Linear(128, 1)
        self.head_sex = nn.Linear(128, 1)

    def forward(self, x):
        shared_out = self.shared(x)
        adhd_logits = self.head_adhd(shared_out)
        sex_logits = self.head_sex(shared_out)
        return adhd_logits, sex_logits

class WeightedBCELoss(nn.Module):
    def __init__(self, weight_female_adhd=2.0):
        super(WeightedBCELoss, self).__init__()
        self.weight_female_adhd = weight_female_adhd
        self.bce = nn.BCEWithLogitsLoss(reduction='none')

    def forward(self, predictions, targets, sex):
        bce_loss = self.bce(predictions, targets)

        # Initialize weights to 1
        weight = torch.ones_like(bce_loss)

        # Apply weight for female ADHD cases (ADHD = 1 and sex = female)
        weight[(targets == 1) & (sex == 1)] = self.weight_female_adhd

        # Apply weights
        weighted_loss = weight * bce_loss
        return weighted_loss.mean()

# Initialize model and loss function
model = DualOutputNN(X_full.shape[1])
weighted_criterion = WeightedBCELoss(weight_female_adhd=2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, factor=0.5)

# Training loop with early stopping
best_val_loss = np.inf
patience = 5
epochs_no_improve = 0

for epoch in range(30):
    model.train()
    running_loss = 0
    for xb, yb in train_loader:
        adhd_pred, sex_pred = model(xb)
        # Extract ADHD and Sex labels
        adhd_labels = yb[:, 0]
        sex_labels = yb[:, 1]
        
        # Compute weighted loss for both ADHD and Sex
        loss_adhd = weighted_criterion(adhd_pred.squeeze(), adhd_labels, sex_labels)
        loss_sex = weighted_criterion(sex_pred.squeeze(), sex_labels, sex_labels)
        
        # Total loss
        loss = loss_adhd + loss_sex
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Validation
    model.eval()
    val_losses = []
    with torch.no_grad():
        for xb, yb in test_loader:
            adhd_pred, sex_pred = model(xb)
            loss_adhd = weighted_criterion(adhd_pred.squeeze(), yb[:, 0], yb[:, 1])
            loss_sex = weighted_criterion(sex_pred.squeeze(), yb[:, 1], yb[:, 1])
            val_loss = loss_adhd + loss_sex
            val_losses.append(val_loss.item())
    avg_val_loss = np.mean(val_losses)
    scheduler.step(avg_val_loss)

    print(f"Epoch {epoch+1} | Train Loss: {running_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_model = model.state_dict()
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("Early stopping triggered.")
            break

# Evaluation
model.load_state_dict(best_model)
model.eval()

all_preds, all_true = [], []

with torch.no_grad():
    for xb, yb in test_loader:
        adhd_logits, sex_logits = model(xb)
        adhd_probs = torch.sigmoid(adhd_logits)
        sex_probs = torch.sigmoid(sex_logits)
        preds = torch.cat([adhd_probs, sex_probs], dim=1)
        all_preds.append(preds)
        all_true.append(yb)

all_preds = torch.cat(all_preds).numpy()
all_true = torch.cat(all_true).numpy()

# Binary prediction
adhd_pred_labels = (all_preds[:, 0] > 0.5).astype(int)
sex_pred_labels = (all_preds[:, 1] > 0.5).astype(int)

# True labels
all_true_adhd = all_true[:, 0].astype(int)
all_true_sex = all_true[:, 1].astype(int)

# Compute weighted F1 Score with 2x weight for Female ADHD cases
def weighted_f1(y_true, y_pred, weight_column):
    # Assign a weight of 2x for Female ADHD cases
    weight = np.ones_like(y_true)
    weight[(y_true == 1) & (weight_column == 1)] = 3  # Apply 2x weight for Female ADHD
    
    f1 = f1_score(y_true, y_pred, average='weighted', sample_weight=weight)
    return f1

f1_adhd = weighted_f1(all_true_adhd, adhd_pred_labels, all_true_sex)
f1_sex = weighted_f1(all_true_sex, sex_pred_labels, all_true_sex)

print(f"\nADHD Weighted F1 Score: {f1_adhd:.4f}")
print(f"Sex Weighted F1 Score: {f1_sex:.4f}")

# Metrics for ADHD
def print_metrics(y_true, y_pred, name):
    print(f"\n {name} Metrics:")
    print(f"Accuracy:  {accuracy_score(y_true, y_pred):.4f}")
    print(f"F1 Score:  {f1_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall:    {recall_score(y_true, y_pred):.4f}")

print_metrics(all_true_adhd, adhd_pred_labels, "ADHD")
print_metrics(all_true_sex, sex_pred_labels, "Sex")

## Testing and Submission

In [14]:
# Use read_excel and use participant_id as the index
test_cat = pd.read_excel("../../widsdatathon2025/TEST/TEST_CATEGORICAL.xlsx")
test_func = pd.read_csv("../../widsdatathon2025/TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv")
test_quant = pd.read_excel("../../widsdatathon2025/TEST/TEST_QUANTITATIVE_METADATA.xlsx")
solutions_path = "../../widsdatathon2025/TEST/TEST_SOLUTIONS.xlsx"

In [15]:
# Set index for merging
test_cat.set_index("participant_id", inplace=True)
test_quant.set_index("participant_id", inplace=True)
test_func.set_index("participant_id", inplace=True)

# train_cat.head()
train_func.head()
# train_quant.head()

Unnamed: 0,participant_id,0throw_1thcolumn,0throw_2thcolumn,0throw_3thcolumn,0throw_4thcolumn,0throw_5thcolumn,0throw_6thcolumn,0throw_7thcolumn,0throw_8thcolumn,0throw_9thcolumn,...,195throw_196thcolumn,195throw_197thcolumn,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn
0,Cfwaf5FX7jWK,0.54848,0.713607,0.557319,0.524369,0.693364,0.770032,0.724406,0.390118,0.547912,...,0.080423,-0.054581,-0.088163,-0.028574,0.444847,0.350149,-0.012601,0.66575,0.560565,0.555732
1,vhGrzmvA3Hjq,0.42774,0.363022,0.402862,0.363003,0.534558,0.345347,0.409471,0.303328,0.402515,...,0.198009,-0.000724,0.083122,0.033043,0.687497,0.306229,0.717485,0.461809,0.559632,0.350027
2,ULliyEXjy4OV,0.139572,0.390106,-0.087041,0.196852,0.088148,0.023843,0.381782,0.068979,0.377488,...,0.051319,0.02363,-0.056819,0.117396,0.576086,0.517831,0.527044,0.605038,0.609856,0.750987
3,LZfeAb1xMtql,0.133561,0.778326,0.416355,0.47184,0.56846,0.63366,0.501113,0.345461,0.467943,...,0.046183,-0.238962,0.121868,-0.26097,0.646818,0.594902,0.608156,0.595459,0.683189,0.542296
4,EnFOUv0YK1RG,0.126699,0.575446,0.509422,0.363193,0.427544,0.449924,0.451796,0.223927,0.298248,...,0.315734,0.002234,0.290791,0.344149,0.480214,0.539824,0.447322,0.293088,0.148529,0.539823


In [16]:
from sklearn.decomposition import PCA

# Select only fMRI columns
fmri_data = test_func.drop(columns=['participant_id'], errors='ignore')

# Apply PCA to reduce to top N components (e.g. 50)
pca = PCA(n_components=50)
fmri_pca = pca.fit_transform(fmri_data)

test_func_pca = pd.DataFrame(fmri_pca, index=test_func.index,
                           columns=[f'fmri_pca_{i}' for i in range(1, 51)])
test_func_pca.head()

# Check missing values *before* merging
missing = test_quant.isnull().sum()
missing_cat = test_cat.isnull().sum()
print("Missing values in train_quant:\n", missing[missing > 0])
print("Missing values in train_cat:\n", missing_cat[missing_cat > 0])

# Define columns with small gaps to impute
small_gap_cols = [
    'EHQ_EHQ_Total', 'ColorVision_CV_Score',
    'APQ_P_APQ_P_CP', 'APQ_P_APQ_P_ID', 'APQ_P_APQ_P_INV',
    'APQ_P_APQ_P_OPD', 'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP',
    'SDQ_SDQ_Conduct_Problems', 'SDQ_SDQ_Difficulties_Total',
    'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Externalizing',
    'SDQ_SDQ_Generating_Impact', 'SDQ_SDQ_Hyperactivity',
    'SDQ_SDQ_Internalizing', 'SDQ_SDQ_Peer_Problems', 'SDQ_SDQ_Prosocial',
    'PreInt_Demos_Fam_Child_Ethnicity', 'PreInt_Demos_Fam_Child_Race',
    'MRI_Track_Scan_Location'
]

small_gap_cols_cat = [
    'Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ'
]

# Fill small gaps using column means
for col in small_gap_cols:
    if col in test_quant.columns:
        test_quant[col] = test_quant[col].fillna(test_quant[col].mean())
for col in small_gap_cols_cat:
    if col in test_cat.columns:
        test_cat[col] = test_cat[col].fillna(test_cat[col].mean())

test_quant.drop(columns=['MRI_Track_Age_at_Scan'], inplace=True) # Dropped
test_cat.drop(columns=['Barratt_Barratt_P2_Occ', 'Barratt_Barratt_P2_Edu'], 
              inplace=True) # Dropped

# Make sure participant_id is a column, not the index
test_quant = test_quant.reset_index()
test_cat = test_cat.reset_index()

Missing values in train_quant:
 EHQ_EHQ_Total                  1
ColorVision_CV_Score           9
APQ_P_APQ_P_CP                15
APQ_P_APQ_P_ID                15
APQ_P_APQ_P_INV               15
APQ_P_APQ_P_OPD               15
APQ_P_APQ_P_PM                15
APQ_P_APQ_P_PP                15
SDQ_SDQ_Conduct_Problems      30
SDQ_SDQ_Difficulties_Total    30
SDQ_SDQ_Emotional_Problems    30
SDQ_SDQ_Externalizing         30
SDQ_SDQ_Generating_Impact     30
SDQ_SDQ_Hyperactivity         30
SDQ_SDQ_Internalizing         30
SDQ_SDQ_Peer_Problems         30
SDQ_SDQ_Prosocial             30
dtype: int64
Missing values in train_cat:
 PreInt_Demos_Fam_Child_Ethnicity     3
PreInt_Demos_Fam_Child_Race          6
Barratt_Barratt_P1_Edu               1
Barratt_Barratt_P1_Occ               1
Barratt_Barratt_P2_Edu              36
Barratt_Barratt_P2_Occ              42
dtype: int64


In [20]:
# Standardize test data
X_test_scaled = scaler.transform(X_test_selected)

# Convert to PyTorch tensor
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

# Load into DataLoader
test_loader_final = DataLoader(X_test_tensor, batch_size=32)

# Set model to eval mode and load best weights
model.load_state_dict(best_model)
model.eval()

adhd_preds, sex_preds = [], []

with torch.no_grad():
    for xb in test_loader_final:
        adhd_out, sex_out = model(xb)
        adhd_preds.append(adhd_out)
        sex_preds.append(sex_out)

# Concatenate and threshold at 0.5
adhd_preds = torch.cat(adhd_preds).squeeze().numpy()
sex_preds = torch.cat(sex_preds).squeeze().numpy()

adhd_pred_labels = (adhd_preds > 0.5).astype(int)
sex_pred_labels = (sex_preds > 0.5).astype(int)

# Create test_soln submission DataFrame
test_soln = pd.DataFrame({
    'participant_id': X_full['participant_id'],  # or test_func['participant_id']
    'ADHD_Outcome': adhd_pred_labels,
    'Sex_F': sex_pred_labels
})

test_soln.to_csv("submission.csv", index=False)
test_soln

Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,Cfwaf5FX7jWK,1,0
1,vhGrzmvA3Hjq,1,1
2,ULliyEXjy4OV,1,0
3,LZfeAb1xMtql,1,0
4,EnFOUv0YK1RG,1,0
...,...,...,...
299,UadZfjdEg7eG,1,0
300,IUEHiLmQAqCi,1,1
301,cRySmCadYFRO,1,0
302,E3MvDUtJadc5,1,0
