<a href="https://colab.research.google.com/github/jchen0000/widsdatathon2025/blob/main/python/Lasso_Neural_Network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Import

In [None]:
# prompt: Access data from google drive
from google.colab import drive
drive.mount('/content/drive')


# Import solution w outcome data (1213, 3)
train_sol = pd.read_excel("drive/My Drive/widsdatathon2025/TRAIN_NEW/TRAINING_SOLUTIONS.xlsx")

# Import train MRI data (1213, 19901)
train_mri_filepath = "drive/My Drive/widsdatathon2025/TRAIN_NEW/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv"
train_mri = pd.read_csv(train_mri_filepath)

# Remove first line, convert csv file to a functional connectome matrix
train_matrix_data = train_mri.iloc[:, 1:].to_numpy(dtype=float)  # (1213, 19900)

# Import Train Data
!wget "https://github.com/jchen0000/widsdatathon2025/blob/main/data/train/TRAIN_CATEGORICAL_METADATA_imputed_Jing.xlsx?raw=true" -O train_cat2.xlsx
train_cat2 = pd.read_excel("train_cat2.xlsx")      # (1213, 10)

!wget "https://github.com/jchen0000/widsdatathon2025/blob/main/data/train/TRAIN_QUANTITATIVE_METADATA_imputed_Jing.xlsx?raw=true" -O train_quant2.xlsx
train_quant2 = pd.read_excel("train_quant2.xlsx")  # (1213, 19)

# Merge train datasets on the participant ID
combined_train_data2 = pd.merge(train_cat2, train_quant2, on='participant_id')
train_with_outcome2 = pd.merge(combined_train_data2, train_sol, on='participant_id')  # (1213, 30)

# Combine metadata and fMRI into one feature matrix
features2 = np.hstack([combined_train_data2, train_matrix_data])

# Remove patient ID
combined_metadata_imputed = features2[:, 1:]
combined_metadata_imputed = combined_metadata_imputed.astype(float)  # (1213, 19927)



# Import test MRI data
test_mri_filepath = "drive/My Drive/widsdatathon2025/TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv"
test_mri = pd.read_csv(test_mri_filepath)

# Remove first line, convert csv file to a functional connectome matrix
test_matrix_data = test_mri.iloc[:, 1:].to_numpy(dtype=float)  # (304, 19900)

# Import test Data
!wget "https://github.com/jchen0000/widsdatathon2025/blob/main/data/train/TEST_CATEGORICAL_METADATA_imputed_Jing_new.xlsx?raw=true" -O test_cat2.xlsx
test_cat2 = pd.read_excel("test_cat2.xlsx")      # (304, 28)

!wget "https://github.com/jchen0000/widsdatathon2025/blob/main/data/train/TEST_QUANTITATIVE_METADATA_imputed_Jing_new.xlsx?raw=true" -O test_quant2.xlsx
test_quant2 = pd.read_excel("test_quant2.xlsx")  # (304, 28)

# Merge test datasets on the participant ID
combined_test_data2 = pd.merge(test_cat2, test_quant2, on='participant_id')
test_no_id2 = combined_test_data2.drop(columns=['participant_id'])

test_patient_ids = combined_test_data2['participant_id'].values  # Extract patient IDs using column name

# Combine metadata and fMRI into one feature matrix
test_features2 = np.hstack([combined_test_data2, test_matrix_data])


# Remove patient ID
combined_test_metadata_imputed = test_features2[:, 1:]
combined_test_metadata_imputed = combined_test_metadata_imputed.astype(float)  # (304, 19927)

# Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
import numpy as np

# --- Define the function to compute weighted F1 score ---
def compute_weighted_f1(y_true_adhd, y_pred_adhd, y_true_sex, y_pred_sex):
    """
    Compute the final weighted F1 score for the competition.

    Female ADHD cases (where ADHD_Outcome==1 and Sex_F==1) are given 2x weight
    when scoring ADHD_Outcome; Sex_F is scored without sample weights.

    Args:
        y_true_adhd: true ADHD_Outcome labels (0 or 1), array-like.
        y_pred_adhd: predicted ADHD_Outcome labels (0 or 1), array-like.
        y_true_sex: true Sex_F labels (0 or 1), array-like.
        y_pred_sex: predicted Sex_F labels (0 or 1), array-like.

    Returns:
        (final_f1, f1_adhd, f1_sex), where final_f1 is the mean of the
        weighted ADHD F1 and the unweighted Sex_F F1.
    """
    # Coerce the true labels to arrays first: with plain Python lists,
    # `y == 1` is a single False and the 2x weight would silently never apply.
    y_true_adhd = np.asarray(y_true_adhd)
    y_true_sex = np.asarray(y_true_sex)

    # Create sample weights for ADHD_Outcome: weight=2 for female ADHD cases, else 1.
    weights = np.ones_like(y_true_adhd, dtype=float)
    weights[(y_true_adhd == 1) & (y_true_sex == 1)] = 2.0

    # Calculate weighted F1 for ADHD_Outcome
    f1_adhd = f1_score(y_true_adhd, y_pred_adhd, sample_weight=weights)
    # Calculate standard F1 for Sex_F
    f1_sex = f1_score(y_true_sex, y_pred_sex)

    # Final leaderboard score: average of the two F1 scores
    final_f1 = (f1_adhd + f1_sex) / 2.0
    return final_f1, f1_adhd, f1_sex

# --- Prepare the data ---
# Use combined_metadata_imputed as input features.
X = combined_metadata_imputed  # (1213, 19927)
# Extract targets from train_with_outcome2 rather than train_sol:
# train_with_outcome2 is combined_train_data2 merged with train_sol on
# participant_id, so its rows follow the same participant order as the
# metadata that was stacked into X. train_sol is in its own file order.
y_adhd = train_with_outcome2['ADHD_Outcome'].values  # Binary labels (0 or 1)
y_sex  = train_with_outcome2['Sex_F'].values         # Binary labels (0 or 1)

# Split into training and validation sets.
# One call splits the features and both label vectors with the same shuffle,
# so the three stay aligned by construction.
(X_train_raw, X_val_raw,
 y_train_adhd_split, y_val_adhd_split,
 y_train_sex_split, y_val_sex_split) = train_test_split(
    X, y_adhd, y_sex, test_size=0.2, random_state=42
)

# Standardize features.
# Fit the scaler on the training rows only; fitting on all rows would leak
# validation-set means/variances into training and inflate validation scores.
scaler = StandardScaler()
X_train_split = scaler.fit_transform(X_train_raw)
X_val_split = scaler.transform(X_val_raw)

# Kept for code that still refers to X_scaled: all rows, transformed with the
# training-set statistics.
X_scaled = scaler.transform(X)

# --- Build the multi-output neural network model ---
# Shared trunk: two (Dense -> Dropout -> BatchNorm) stages of 128 and 64 units,
# followed by a 32-unit Dense layer. Every Dense layer is L2-regularised.
n_features = X_train_split.shape[1]
input_layer = Input(shape=(n_features,))

x = input_layer
for units in (128, 64):
    x = Dense(units, activation='relu', kernel_regularizer=l2(0.01))(x)
    x = Dropout(0.5)(x)
    x = BatchNormalization()(x)
x = Dense(32, activation='relu', kernel_regularizer=l2(0.01))(x)

# Two sigmoid heads on top of the shared trunk, one per binary target.
output_adhd = Dense(1, activation='sigmoid', name='adhd_output')(x)
output_sex  = Dense(1, activation='sigmoid', name='sex_output')(x)

model = Model(inputs=input_layer, outputs=[output_adhd, output_sex])

# Compile the model: the same binary cross-entropy loss is used for both
# heads, and accuracy is tracked per head.
adam = tf.keras.optimizers.Adam(learning_rate=0.0005)
per_head_metrics = {'adhd_output': 'accuracy', 'sex_output': 'accuracy'}
model.compile(optimizer=adam,
              loss='binary_crossentropy',
              metrics=per_head_metrics)


# Early stopping: stop after 10 epochs without val_loss improvement and
# roll back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# --- Train the model ---
# Targets are passed as dicts keyed by the output-layer names.
train_targets = {'adhd_output': y_train_adhd_split, 'sex_output': y_train_sex_split}
val_targets = {'adhd_output': y_val_adhd_split, 'sex_output': y_val_sex_split}

history = model.fit(
    X_train_split,
    train_targets,
    epochs=100,
    batch_size=32,
    validation_data=(X_val_split, val_targets),
    callbacks=[early_stopping]
)

# --- Evaluate the model ---
results = model.evaluate(X_val_split, val_targets)
print("Evaluation Results:", results)

# --- Make predictions on the validation set ---
# predictions[0] holds the ADHD_Outcome probabilities, predictions[1] the
# Sex_F probabilities; both are cut at 0.5 to get hard labels.
predictions = model.predict(X_val_split)
y_val_pred_adhd = (predictions[0].flatten() > 0.5).astype(int)
y_val_pred_sex  = (predictions[1].flatten() > 0.5).astype(int)

# --- Compute the weighted F1 score ---
final_f1, f1_adhd, f1_sex = compute_weighted_f1(
    y_val_adhd_split, y_val_pred_adhd,
    y_val_sex_split, y_val_pred_sex,
)

print(f"ADHD F1 Score (weighted): {f1_adhd:.4f}")
print(f"Sex_F F1 Score: {f1_sex:.4f}")
print(f"Final Weighted F1 Score: {final_f1:.4f}")



# Threshold Tuning

In [None]:
# Inputs to the threshold search below:
#    y_val_adhd_prob  - model probabilities for ADHD_Outcome, flattened to 1-D
#    y_val_sex_prob   - model probabilities for Sex_F, flattened to 1-D
#    y_val_adhd_split - true ADHD_Outcome labels (0 or 1) of the validation rows
#    y_val_sex_split  - true Sex_F labels (0 or 1) of the validation rows

# predictions[0] is the ADHD head, predictions[1] the Sex_F head.
y_val_adhd_prob, y_val_sex_prob = (head.flatten() for head in predictions[:2])

# --- Define the function to compute weighted F1 score ---
def compute_weighted_f1(y_true_adhd, y_pred_adhd, y_true_sex, y_pred_sex):
    """
    Compute the final weighted F1 score.
    Female ADHD cases (where ADHD_Outcome==1 and Sex_F==1) are given 2x weight.

    All four arguments are array-likes of 0/1 labels of equal length.

    Returns:
       final_f1: the average of the weighted ADHD F1 and the Sex_F F1.
       f1_adhd: F1 score for ADHD_Outcome using sample weights.
       f1_sex:  F1 score for Sex_F (unweighted).
    """
    # Coerce the true labels to arrays before comparing: with plain Python
    # lists, `y == 1` is a single False and no sample would get the 2x weight.
    y_true_adhd = np.asarray(y_true_adhd)
    y_true_sex = np.asarray(y_true_sex)

    weights = np.ones_like(y_true_adhd, dtype=float)
    weights[(y_true_adhd == 1) & (y_true_sex == 1)] = 2.0
    f1_adhd = f1_score(y_true_adhd, y_pred_adhd, sample_weight=weights)
    f1_sex = f1_score(y_true_sex, y_pred_sex)
    final_f1 = (f1_adhd + f1_sex) / 2.0
    return final_f1, f1_adhd, f1_sex

# --- Define a range of thresholds to test ---
#thresholds = np.linspace(0, 1, 101)  # from 0.00 to 1.00 in steps of 0.01
# NOTE: 98 evenly spaced points on [0.2, 0.99] give a step of about 0.0081,
# not 0.01, so the chosen thresholds are reported with 4 decimals below.
thresholds = np.linspace(0.2, 0.99, 98)

# The ADHD F1 depends only on the ADHD threshold and the Sex_F F1 only on the
# Sex_F threshold, so score every threshold once per task instead of
# re-scoring both tasks for each of the 98 x 98 pairs.
f1_by_thr_adhd = []
f1_by_thr_sex = []
for thr in thresholds:
    _, f1_a, f1_s = compute_weighted_f1(
        y_val_adhd_split, (y_val_adhd_prob > thr).astype(int),
        y_val_sex_split, (y_val_sex_prob > thr).astype(int),
    )
    f1_by_thr_adhd.append(f1_a)
    f1_by_thr_sex.append(f1_s)

best_final_f1 = 0.0
best_thr_adhd = 0.0
best_thr_sex  = 0.0
best_f1_adhd = 0.0
best_f1_sex  = 0.0

# --- Perform a 2D threshold sweep ---
# Same visiting order and strict ">" comparison as a direct nested sweep, so
# the first pair reaching the best score wins; only cached floats are combined.
for thr_a, f1_adhd in zip(thresholds, f1_by_thr_adhd):
    for thr_s, f1_sex in zip(thresholds, f1_by_thr_sex):
        final_f1 = (f1_adhd + f1_sex) / 2.0
        if final_f1 > best_final_f1:
            best_final_f1 = final_f1
            best_thr_adhd = thr_a
            best_thr_sex  = thr_s
            best_f1_adhd = f1_adhd
            best_f1_sex  = f1_sex

print(f"Best ADHD Threshold: {best_thr_adhd:.4f} with F1: {best_f1_adhd:.4f}")
print(f"Best Sex_F Threshold: {best_thr_sex:.4f} with F1: {best_f1_sex:.4f}")
print(f"Best Final Weighted F1: {best_final_f1:.4f}")



# Threshold Tuning Graph

In [None]:
# prompt: generate a threshold tuning graph for ADHD and Sex_F

import numpy as np
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# Assuming y_val_adhd_prob, y_val_sex_prob, y_val_adhd_split, and y_val_sex_split are defined as in your code.

# Function to compute the weighted F1 score (same as before)
def compute_weighted_f1(y_true_adhd, y_pred_adhd, y_true_sex, y_pred_sex):
    """
    Return (final_f1, f1_adhd, f1_sex) for a pair of binary predictions.

    f1_adhd is the F1 of ADHD_Outcome with samples where both true labels are 1
    weighted 2x; f1_sex is the unweighted F1 of Sex_F; final_f1 is their mean.
    All four arguments are array-likes of 0/1 labels of equal length.
    """
    # Coerce the true labels to arrays: comparing a plain list with `== 1`
    # yields a single False, which would leave every weight at 1.
    y_true_adhd = np.asarray(y_true_adhd)
    y_true_sex = np.asarray(y_true_sex)

    weights = np.ones_like(y_true_adhd, dtype=float)
    weights[(y_true_adhd == 1) & (y_true_sex == 1)] = 2.0
    f1_adhd = f1_score(y_true_adhd, y_pred_adhd, sample_weight=weights)
    f1_sex = f1_score(y_true_sex, y_pred_sex)
    final_f1 = (f1_adhd + f1_sex) / 2.0
    return final_f1, f1_adhd, f1_sex


# Score both tasks at the same cut-off for 101 cut-offs from 0 to 1.
thresholds = np.linspace(0, 1, 101)

score_triples = [
    compute_weighted_f1(
        y_val_adhd_split, (y_val_adhd_prob > cutoff).astype(int),
        y_val_sex_split, (y_val_sex_prob > cutoff).astype(int),
    )
    for cutoff in thresholds
]
# Each triple is (final_f1, f1_adhd, f1_sex); only the per-task scores are plotted.
f1_scores_adhd = [triple[1] for triple in score_triples]
f1_scores_sex = [triple[2] for triple in score_triples]

# Plotting the threshold tuning graph: one F1-vs-threshold curve per task.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(thresholds, f1_scores_adhd, label='ADHD F1 Score')
ax.plot(thresholds, f1_scores_sex, label='Sex_F F1 Score')
ax.set_xlabel('Threshold')
ax.set_ylabel('F1 Score')
ax.set_title('Threshold Tuning Graph for ADHD and Sex_F')
ax.legend()
ax.grid(True)
plt.show()
