### Load in the training data

In [None]:
import pandas as pd
import numpy as np

# Locations of the four raw training tables.
categorical_path = "../../widsdatathon2025/TRAIN_NEW/TRAIN_CATEGORICAL_METADATA_new.xlsx"
connectome_path = ("../../widsdatathon2025/TRAIN_NEW/"
                   "TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv")
quantitative_path = "../../widsdatathon2025/TRAIN_NEW/TRAIN_QUANTITATIVE_METADATA_new.xlsx"
solutions_path = "../../widsdatathon2025/TRAIN_NEW/TRAINING_SOLUTIONS.xlsx"

# The metadata and solution tables are Excel workbooks; the connectome matrix
# is a CSV. participant_id is still an ordinary column right after loading.
train_cat = pd.read_excel(categorical_path)
train_func = pd.read_csv(connectome_path)
train_quant = pd.read_excel(quantitative_path)
train_soln = pd.read_excel(solutions_path)

In [None]:
# Index every table by participant_id so rows can be aligned across tables.
# The frames are modified in place, so this cell must run exactly once per
# load (a second run raises KeyError because the column is already the index).
for table in (train_cat, train_quant, train_func, train_soln):
    table.set_index("participant_id", inplace=True)

# Preview the connectome table; swap in train_cat / train_quant / train_soln
# to inspect one of the others instead.
train_func.head()

## PCA for the functional connectome matrix to reduce dimension

In [None]:
from sklearn.decomposition import PCA

# Keep only the fMRI connectivity columns. When participant_id has already been
# moved into the index this drop is a no-op; errors='ignore' keeps it harmless.
fmri_data = train_func.drop(columns=['participant_id'], errors='ignore')

# Number of principal components to keep. The output column names are derived
# from the same constant, so the two can no longer drift apart.
N_FMRI_COMPONENTS = 50

# random_state pins the randomized SVD solver that scikit-learn may pick for
# wide matrices, making the components reproducible across runs.
# NOTE(review): the PCA is fitted on every training row before any train/test
# split, so held-out rows influence the components — confirm this is acceptable.
pca = PCA(n_components=N_FMRI_COMPONENTS, random_state=42)
fmri_pca = pca.fit_transform(fmri_data)

# One column per component, named fmri_pca_1 ... fmri_pca_N, aligned to the
# original participant index.
train_func_pca = pd.DataFrame(
    fmri_pca,
    index=train_func.index,
    columns=[f'fmri_pca_{i}' for i in range(1, N_FMRI_COMPONENTS + 1)],
)
train_func_pca.head()

## Impute missing values in the quant and cat dfs with a per-column fill value

In [None]:
# Count missing values per column *before* imputing or merging anything.
missing = train_quant.isnull().sum()
missing_cat = train_cat.isnull().sum()
print("Missing values in train_quant:\n", missing[missing > 0])
print("Missing values in train_cat:\n", missing_cat[missing_cat > 0])

# Columns with only a few gaps, cheap to impute.
small_gap_cols = [
    'EHQ_EHQ_Total', 'ColorVision_CV_Score',
    'APQ_P_APQ_P_CP', 'APQ_P_APQ_P_ID', 'APQ_P_APQ_P_INV',
    'APQ_P_APQ_P_OPD', 'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP',
    'SDQ_SDQ_Conduct_Problems', 'SDQ_SDQ_Difficulties_Total',
    'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Externalizing',
    'SDQ_SDQ_Generating_Impact', 'SDQ_SDQ_Hyperactivity',
    'SDQ_SDQ_Internalizing', 'SDQ_SDQ_Peer_Problems', 'SDQ_SDQ_Prosocial',
    'PreInt_Demos_Fam_Child_Ethnicity', 'PreInt_Demos_Fam_Child_Race',
    'MRI_Track_Scan_Location'
]

small_gap_cols_cat = [
    'Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ'
]

# Quantitative scores: fill gaps with the column mean.
for col in small_gap_cols:
    if col in train_quant.columns:
        train_quant[col] = train_quant[col].fillna(train_quant[col].mean())

# Categorical codes: a mean would invent a value that is not a valid category,
# so fill with the most frequent category instead. Any name in small_gap_cols
# that is not a train_quant column is also tried against train_cat, so it is
# no longer skipped silently by the membership check above.
cat_cols_to_fill = small_gap_cols_cat + [
    col for col in small_gap_cols
    if col not in train_quant.columns and col not in small_gap_cols_cat
]
for col in cat_cols_to_fill:
    if col in train_cat.columns:
        modes = train_cat[col].mode(dropna=True)
        if not modes.empty:  # an all-NaN column has no mode; leave it untouched
            train_cat[col] = train_cat[col].fillna(modes.iloc[0])

# These columns are removed instead of imputed.
# NOTE(review): presumably because too many values are missing — confirm.
train_quant.drop(columns=['MRI_Track_Age_at_Scan'], inplace=True)
train_cat.drop(columns=['Barratt_Barratt_P2_Occ', 'Barratt_Barratt_P2_Edu'],
               inplace=True)

In [None]:
# Re-count the gaps in train_quant at this point in the notebook; only columns
# that still contain missing values are printed.
missing = train_quant.isna().sum()
still_missing = missing.loc[missing.gt(0)]
print("Missing values in train_quant:\n", still_missing)

In [None]:
# Show the first five rows of the quantitative table.
train_quant.head(n=5)

## Feature selection: correlation filter on train_quant, then random-forest ranking of all features

In [None]:
# The merges below join on a participant_id *column*, so move it back out of
# the index for the three tables that take part in them.
train_quant, train_cat, train_soln = (
    table.reset_index() for table in (train_quant, train_cat, train_soln)
)
train_quant.head()

In [None]:
# Show the first five rows of the categorical table.
train_cat.head(n=5)

In [None]:
# Minimum absolute correlation with a target for a quantitative feature to be kept.
MIN_ABS_CORR = 0.05

# Attach both targets to the quantitative features (temporary frame, used only
# for the correlation screen).
quant_with_targets = train_quant.merge(
    train_soln[['participant_id', 'ADHD_Outcome', 'Sex_F']], on='participant_id')

# Pairwise correlations between every feature and the two targets.
corrs = quant_with_targets.drop(columns='participant_id').corr()

# Absolute correlation of each column with each target, strongest first.
adhd_corr = corrs['ADHD_Outcome'].abs().sort_values(ascending=False)
sex_corr = corrs['Sex_F'].abs().sort_values(ascending=False)

# Keep the columns that clear the threshold for either target.
top_adhd_feats = adhd_corr[adhd_corr > MIN_ABS_CORR].index.tolist()
top_sex_feats = sex_corr[sex_corr > MIN_ABS_CORR].index.tolist()

# Union of both sets without the targets themselves. sorted() fixes the order:
# iterating a set of strings can give a different order in every interpreter
# session, which would change the downstream column order and make the seeded
# random forest irreproducible.
quant_selected = sorted(set(top_adhd_feats + top_sex_feats) - {'ADHD_Outcome', 'Sex_F'})
quant_selected

In [None]:
from sklearn.ensemble import RandomForestClassifier

# How many of the top-ranked features to keep for the final model.
N_TOP_FEATURES = 50

# Join the screened quantitative features, the categorical metadata and the
# fMRI principal components on participant_id.
X_full = (
    train_quant[['participant_id'] + quant_selected]
    .merge(train_cat, on='participant_id')
    .merge(train_func_pca, on='participant_id')
)

# Target labels
y = train_soln[['participant_id', 'ADHD_Outcome', 'Sex_F']]

# Merge the targets in and split them off again so X and y share row order.
X_full = X_full.merge(y, on='participant_id')
y = X_full[['ADHD_Outcome', 'Sex_F']]
X_full = X_full.drop(columns=['ADHD_Outcome', 'Sex_F'])

# Joint class label used only for ranking features. Encoding it as
# 2*ADHD + Sex keeps all four (ADHD, Sex) combinations distinct; a plain sum
# would merge (ADHD=1, Sex_F=0) and (ADHD=0, Sex_F=1) into the same class.
combo_target = 2 * y['ADHD_Outcome'] + y['Sex_F']

# Feature matrix for the forest: everything except the identifier.
rf_features = X_full.drop(columns=['participant_id'])

# NOTE(review): the ranking uses every row, including those that later land in
# the held-out split, so the reported test metrics are optimistic.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(rf_features, combo_target)

# Rank features by impurity-based importance and keep the strongest ones.
feat_importances = pd.Series(rf.feature_importances_, index=rf_features.columns)
top_features_final = (
    feat_importances.sort_values(ascending=False).head(N_TOP_FEATURES).index.tolist()
)

# Final X for model training
X_selected = X_full[top_features_final]
X_selected

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Seed PyTorch so weight initialisation and DataLoader shuffling are repeatable.
torch.manual_seed(42)

# Stratify on the joint (ADHD, Sex) label so all four groups keep their
# proportions in both splits.
stratify_labels = y['ADHD_Outcome'].astype(str) + y['Sex_F'].astype(str)
y_values = y[['ADHD_Outcome', 'Sex_F']].values

# Split BEFORE scaling: the scaler must only see training rows, otherwise the
# held-out rows leak their mean/variance into the features.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X_selected, y_values, test_size=0.2, random_state=42, stratify=stratify_labels)

# Standardize with statistics estimated on the training split only.
scaler = StandardScaler()
scaler.fit(X_train_raw)

# Convert to tensors
X_train = torch.tensor(scaler.transform(X_train_raw), dtype=torch.float32)
X_test = torch.tensor(scaler.transform(X_test_raw), dtype=torch.float32)
y_train = torch.tensor(y_train_raw, dtype=torch.float32)
y_test = torch.tensor(y_test_raw, dtype=torch.float32)

# Full-data arrays under their original names, for any code that still refers
# to them; they are scaled with the training-split statistics.
X_scaled = scaler.transform(X_selected)
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y_values, dtype=torch.float32)

# Mini-batch loaders: training batches are reshuffled every epoch, the
# held-out batches keep a fixed order.
train_ds = TensorDataset(X_train, y_train)
test_ds = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=32)

# Two-headed network: a shared trunk followed by one sigmoid head per target.
class BetterMultiOutputNN(nn.Module):
    """Multi-task classifier with a shared trunk and two binary heads.

    Args:
        input_dim: number of input features per sample.

    ``forward`` returns a tuple ``(adhd_prob, sex_prob)``. Each element has
    shape ``(batch, 1)`` and has already been passed through a sigmoid, so the
    values are probabilities rather than logits and should be paired with a
    probability-based loss.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Trunk: input -> 256 (batch norm, ReLU, dropout) -> 128 (ReLU, dropout).
        trunk_layers = [
            nn.Linear(input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
        ]
        self.shared = nn.Sequential(*trunk_layers)
        # One single-unit head per task on top of the 128-d trunk output.
        self.head_adhd = nn.Linear(128, 1)
        self.head_sex = nn.Linear(128, 1)

    def forward(self, x):
        features = self.shared(x)
        adhd_prob = torch.sigmoid(self.head_adhd(features))
        sex_prob = torch.sigmoid(self.head_sex(features))
        return adhd_prob, sex_prob

# Weighted BCE loss: samples where `targets` and `sex` are both 1 count extra.
class WeightedBCELoss(nn.Module):
    """Binary cross-entropy on probabilities with an up-weighted subgroup.

    Args:
        weight_female_adhd: multiplier applied to the per-sample loss wherever
            both ``targets == 1`` and ``sex == 1`` (default 2).

    ``forward(predictions, targets, sex)`` expects ``predictions`` to be
    probabilities in [0, 1] (i.e. already passed through a sigmoid), with the
    same shape as ``targets``. ``sex`` must be broadcastable to that shape.
    Returns the mean of the weighted per-sample losses as a scalar tensor.
    """

    def __init__(self, weight_female_adhd=2):
        super().__init__()
        self.weight_female_adhd = weight_female_adhd

    def forward(self, predictions, targets, sex):
        # The inputs are probabilities, so use plain BCE. The logits variant
        # would apply a second sigmoid on top of the one already applied,
        # squashing every prediction into (0.5, 0.73) and distorting the loss.
        bce_loss = nn.functional.binary_cross_entropy(
            predictions, targets, reduction='none')

        # Per-sample weights: the boosted value where both flags are set, 1 elsewhere.
        boosted = (targets == 1) & (sex == 1)
        weight = torch.where(
            boosted,
            torch.full_like(bce_loss, self.weight_female_adhd),
            torch.ones_like(bce_loss),
        )

        return (weight * bce_loss).mean()

# Initialize model, loss function, optimizer and LR scheduler
model = BetterMultiOutputNN(X_selected.shape[1])
weighted_criterion = WeightedBCELoss(weight_female_adhd=2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, factor=0.5)


def _snapshot_weights(net):
    """Return an independent copy of ``net``'s parameters and buffers.

    ``state_dict()`` hands back references to the live tensors, so storing it
    directly would keep tracking later optimizer updates; cloning freezes the
    values at the moment of the call.
    """
    return {name: tensor.detach().clone() for name, tensor in net.state_dict().items()}


def _multitask_loss(net, xb, yb):
    """Sum of the ADHD-head and sex-head weighted losses for one batch.

    ``yb`` holds the ADHD label in column 0 and the sex label in column 1.
    """
    adhd_pred, sex_pred = net(xb)
    adhd_labels = yb[:, 0]
    sex_labels = yb[:, 1]
    # squeeze(1) removes only the trailing unit dimension, so a final batch
    # holding a single sample keeps shape (1,) and still matches its labels.
    loss_adhd = weighted_criterion(adhd_pred.squeeze(1), adhd_labels, sex_labels)
    # The extra weight applies where both the target and the third argument are
    # 1. Passing the ADHD labels here boosts female ADHD cases for the sex head
    # too; passing the sex labels twice would boost every female sample.
    loss_sex = weighted_criterion(sex_pred.squeeze(1), sex_labels, adhd_labels)
    return loss_adhd + loss_sex


# Training loop with early stopping
# NOTE(review): the held-out split drives both early stopping and the final
# metrics, so those metrics are optimistic — consider a separate validation split.
best_val_loss = np.inf
patience = 5
epochs_no_improve = 0
best_model = _snapshot_weights(model)  # defined even if no epoch ever improves

for epoch in range(30):
    model.train()
    running_loss = 0.0
    for xb, yb in train_loader:
        loss = _multitask_loss(model, xb, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Report the per-batch mean so it is comparable with the validation loss.
    avg_train_loss = running_loss / max(len(train_loader), 1)

    # Validation
    model.eval()
    val_losses = []
    with torch.no_grad():
        for xb, yb in test_loader:
            val_losses.append(_multitask_loss(model, xb, yb).item())
    avg_val_loss = np.mean(val_losses)
    scheduler.step(avg_val_loss)

    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        # Store a copy: the live state_dict would be overwritten by later epochs.
        best_model = _snapshot_weights(model)
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("Early stopping triggered.")
            break

# Evaluation: restore the stored weights and score the held-out batches.
model.load_state_dict(best_model)
model.eval()

pred_batches = []
true_batches = []
with torch.no_grad():
    for features, labels in test_loader:
        adhd_prob, sex_prob = model(features)
        # Column 0 = ADHD output, column 1 = sex output (same layout as the labels).
        pred_batches.append(torch.cat((adhd_prob, sex_prob), dim=1))
        true_batches.append(labels)

all_preds = torch.cat(pred_batches).numpy()
all_true = torch.cat(true_batches).numpy()

# Threshold the model outputs at 0.5 to get hard 0/1 predictions.
adhd_pred_labels, sex_pred_labels = (
    (all_preds[:, col] > 0.5).astype(int) for col in (0, 1)
)

# Ground-truth labels as integers, one array per task.
all_true_adhd, all_true_sex = (
    all_true[:, col].astype(int) for col in (0, 1)
)

# Weighted F1 score with extra sample weight where y_true and weight_column are both 1
def weighted_f1(y_true, y_pred, weight_column, female_adhd_weight=2):
    """Class-weighted F1 with an up-weighted subgroup of samples.

    Args:
        y_true: array of true 0/1 labels.
        y_pred: array of predicted 0/1 labels.
        weight_column: array of 0/1 flags; a sample is boosted when both its
            ``y_true`` value and its ``weight_column`` value equal 1.
        female_adhd_weight: sample weight given to boosted samples (default 2,
            the "2x" the surrounding comments and the training loss refer to);
            every other sample has weight 1.

    Returns:
        ``f1_score(..., average='weighted')`` computed with those sample weights.
    """
    # Float weights so a fractional female_adhd_weight is not truncated.
    weight = np.ones(len(y_true), dtype=float)
    boosted = (np.asarray(y_true) == 1) & (np.asarray(weight_column) == 1)
    weight[boosted] = female_adhd_weight

    return f1_score(y_true, y_pred, average='weighted', sample_weight=weight)

# weighted_f1 boosts samples where its first and third arguments are both 1.
# For the ADHD task that means (ADHD=1, Sex_F=1). For the sex task the ADHD
# labels are passed as the third argument so the same female-ADHD group is
# boosted; passing the sex labels twice would boost every female sample.
f1_adhd = weighted_f1(all_true_adhd, adhd_pred_labels, all_true_sex)
f1_sex = weighted_f1(all_true_sex, sex_pred_labels, all_true_adhd)

print(f"\nADHD Weighted F1 Score: {f1_adhd:.4f}")
print(f"Sex Weighted F1 Score: {f1_sex:.4f}")

# Unweighted classification metrics for a single task
def print_metrics(y_true, y_pred, name):
    """Print accuracy, F1, precision and recall for one set of predictions.

    Args:
        y_true: true labels.
        y_pred: predicted labels.
        name: task name shown in the heading line.
    """
    print(f"\n {name} Metrics:")
    # Label text is padded so the printed values line up in one column.
    scorers = (
        ("Accuracy:  ", accuracy_score),
        ("F1 Score:  ", f1_score),
        ("Precision: ", precision_score),
        ("Recall:    ", recall_score),
    )
    for label, scorer in scorers:
        print(f"{label}{scorer(y_true, y_pred):.4f}")

# Print the unweighted metrics for each task in turn.
for task_name, truth, predicted in (
    ("ADHD", all_true_adhd, adhd_pred_labels),
    ("Sex", all_true_sex, sex_pred_labels),
):
    print_metrics(truth, predicted, task_name)