<a href="https://colab.research.google.com/github/jchen0000/widsdatathon2025/blob/main/python/CNN_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np

# Neural Networks
reference:

### Load Data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

## suppose you've created a folder named "widsdatathon2025"
%cd /content/drive/MyDrive/widsdatathon2025/
%ls

Mounted at /content/drive
/content/drive/MyDrive/widsdatathon2025
'Data Dictionary.xlsx'   [0m[01;34mgnn[0m/   SAMPLE_SUBMISSION.xlsx   [01;34mTEST[0m/   [01;34mTRAIN[0m/   [01;34mWork[0m/


In [5]:
# training data
dir = './TRAIN/' #"./data/train/"
df_train_mri = pd.read_csv(dir+"TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv")
df_train_mri.set_index('participant_id', inplace=True)
print(df_train_mri.shape)
df_train_mri.head()

df_train_meta_quant = pd.read_excel(dir+"TRAIN_QUANTITATIVE_METADATA.xlsx")
df_train_meta_quant.set_index('participant_id', inplace=True)
print(df_train_meta_quant.shape)

df_train_meta_cat = pd.read_excel(dir+"TRAIN_CATEGORICAL_METADATA.xlsx")
df_train_meta_cat.set_index('participant_id', inplace=True)
print(df_train_meta_cat.shape)

df_train_y = pd.read_excel(dir+"TRAINING_SOLUTIONS.xlsx")
df_train_y.set_index('participant_id', inplace=True)
print(df_train_y.shape)

(1213, 19900)
(1213, 18)
(1213, 9)
(1213, 2)


In [6]:
# Merge all training data together
df_train_X = pd.concat([df_train_mri, df_train_meta_quant, df_train_meta_cat], axis=1)
print(df_train_X.shape)

# Combine both columns for prediction
# (0,0): 0
# (0,1): 1
# (1,0): 2
# (1,1): 3
df_train_y_sparse = pd.DataFrame()
df_train_y_sparse['sparse_category'] = df_train_y['ADHD_Outcome'] *2 + df_train_y['Sex_F']
df_train_y_sparse.head()

(1213, 19927)

In [180]:
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

df_train_X[df_train_X == "NA"] = np.nan

# Impute missing values with KNN imputation
imputer_metadata = KNNImputer(n_neighbors=5)  # Using KNN for imputation instead of mean
df_train_X_imputed = imputer_metadata.fit_transform(df_train_X)  # ((1213, 19927)
df_train_X_imputed = pd.DataFrame(df_train_X_imputed, columns=df_train_X.columns, index=df_train_X.index)
print(df_train_X_imputed.shape)

(1213, 19927)


In [184]:
# prompt: check NA value in the df_train_X_imputed variable

import numpy as np

# check NA in original dataset
na_count_old = np.isnan(df_train_X).sum()
print(f"Number of NA values in df_train_X: {na_count_old.sum()}")
print(f"NA value per column (>0): \n{na_count_old[na_count_old > 0]}")

# Check for NaN values in the imputed dataset
na_count = np.isnan(df_train_X_imputed).sum()
print(f"\n\nNumber of NA values in df_train_X_imputed: {na_count.sum()}")

# Check for NaN values in each column
na_per_column = np.isnan(df_train_X_imputed).sum(axis=0)
print(f"After Inputation: More than 1 NA columns: {sum(na_per_column > 0)}")
print(f"NA values per column:\n{na_per_column.values}")


Number of NA values in df_train_X: 371
NA value per column (>0): 
MRI_Track_Age_at_Scan               360
PreInt_Demos_Fam_Child_Ethnicity     11
dtype: int64


Number of NA values in df_train_X_imputed: 0
After Inputation: More than 1 NA columns: 0
NA values per column:
[0 0 0 ... 0 0 0]


In [82]:
# # prompt: normalize and standardize df_train_X

# from sklearn.preprocessing import MinMaxScaler, StandardScaler

# # # Normalize
# # scaler = MinMaxScaler()
# # df_train_X_normalized = scaler.fit_transform(df_train_X)
# # df_train_X_normalized = pd.DataFrame(df_train_X_normalized, columns=df_train_X.columns, index=df_train_X.index)

# # Standardize
# scaler = StandardScaler()
# df_train_X_standardized = scaler.fit_transform(df_train_X)
# df_train_X_standardized = pd.DataFrame(df_train_X_standardized, columns=df_train_X.columns, index=df_train_X.index)

# # Normalize
# scaler = MinMaxScaler()
# df_train_X_normalized = scaler.fit_transform(df_train_X_standardized)
# df_train_X_normalized = pd.DataFrame(df_train_X_normalized, columns=df_train_X.columns, index=df_train_X.index)

# df_train_X_normalized.iloc[:, -27:].describe()


In [7]:
# Test data
dir = './TEST/'
df_test_mri = pd.read_csv(dir+"TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv")
df_test_mri.set_index('participant_id', inplace=True)
print(df_test_mri.shape)

df_test_meta_quant = pd.read_excel(dir+"TEST_QUANTITATIVE_METADATA.xlsx")
df_test_meta_quant.set_index('participant_id', inplace=True)
print(df_test_meta_quant.shape)

df_test_meta_cat = pd.read_excel(dir+"TEST_CATEGORICAL.xlsx")
df_test_meta_cat.set_index('participant_id', inplace=True)
print(df_test_meta_cat.shape)

(304, 19900)
(304, 18)
(304, 9)


In [8]:
df_test_X = pd.concat([df_test_mri, df_test_meta_quant, df_test_meta_cat], axis=1)
df_test_X.shape

(304, 19927)

### Train-validation data splitting

In [185]:
# prompt: split df_train_X and df_train_y into train and validation set and convert df into arrays able to be input into keras model

from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(df_train_X_imputed, df_train_y, test_size=0.2, random_state=42)

# Convert DataFrames to NumPy arrays
X_train = X_train.values
X_val = X_val.values
y_train = y_train.values
y_val = y_val.values

print(y_val.shape)

(243, 2)


## Evaluation Metrics

In [219]:
# prompt: I have a y_true array with two columns and a y_pred array with two columns, I want to use both columns together to determine if it's correct or not. how to calculate the f1 score. customized the weight variable based on the y_true, setting each (1,1) pair in y_true as 2 in weight, all others are 1 in weight. use sklearn f1-score function if possible. construct the function to be used as the keras neural network training metrics. The input is two tensor array

import numpy as np
from sklearn.metrics import f1_score
import tensorflow as tf

def custom_f1(y_true, y_pred):
    y_true = tf.cast(y_true, tf.int32)
    y_pred = tf.cast(tf.round(y_pred), tf.int32)

    if y_true.shape[-1] == 2:
      # Combine both columns for prediction
      # (0,0): 0
      # (0,1): 1
      # (1,0): 2
      # (1,1): 3
      y_true = y_true[:,0] * 2 + y_true[:,1]
      y_pred = y_pred[:,0] * 2 + y_pred[:,1]
    elif y_true.shape[-1] == 1:
      y_true = tf.squeeze(y_true)
      y_pred = tf.squeeze(y_pred)
    else:
      raise ValueError("y_true should have shape (None, 2) or (None, 1)")

    # Calculate weights
    weights = tf.where(tf.equal(y_true, 3), 2.0, 1.0)

    # Reshape for sklearn f1_score
    y_true_reshaped = tf.reshape(y_true, [-1]).numpy()
    y_pred_reshaped = tf.reshape(y_pred, [-1]).numpy()
    weights_reshaped = tf.reshape(weights, [-1]).numpy()
    print(y_true_reshaped.shape)
    print(y_pred_reshaped.shape)
    print(weights_reshaped.shape)

    f1 = f1_score(y_true_reshaped, y_pred_reshaped, average='weighted', sample_weight=weights_reshaped)
    return f1


In [220]:
## fake y_pred
y_pred = np.random.randint(0, 2, size=(y_train.shape[0], 2))
y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.float32)
y_pred_tensor = tf.convert_to_tensor(y_pred, dtype=tf.float32)
print(y_train_tensor.shape)
print(y_pred_tensor.shape)

custom_f1(y_train, y_pred)

(970, 2)
(970, 2)
(970,)
(970,)
(970,)


0.2670525573366143

## Neural network

In [120]:
# # prompt: I have a y_true array with two columns and a y_pred array with two columns, I want to use both columns together to determine if it's correct or not. how to calculate the f1 score. customized the weight variable based on the y_true, setting each (1,1) pair in y_true as 2 in weight, all others are 1 in weight. use sklearn f1-score function if possible. construct the function to be used in the keras neural network training metrics. The input is two tensor array

# import tensorflow as tf
# from sklearn.metrics import f1_score

# def custom_f1(y_true, y_pred):
#     y_true = tf.cast(y_true, tf.float32)
#     y_pred = tf.cast(tf.math.round(y_pred), tf.float32)
#     print(y_true.shape)
#     print(y_pred.shape)

#     # Create weights based on y_true
#     weights = tf.where(tf.reduce_all(tf.equal(y_true, [[1.0, 1.0]]), axis=1), 2.0, 1.0)

#     # Calculate the F1 score using sklearn
#     f1 = tf.numpy_function(f1_score_with_weights,
#                            inp=[y_true, y_pred, weights],
#                            Tout=tf.float32)
#     return f1


# def f1_score_with_weights(y_true, y_pred, weights):
#     y_true = y_true.astype(int)
#     y_pred = y_pred.astype(int)
#     weights = weights.flatten()

#     # Combine both columns for prediction
#     # (0,0): 0
#     # (0,1): 1
#     # (1,0): 2
#     # (1,1): 3
#     combined_y_true = y_true[:,0] * 2 + y_true[:,1]
#     combined_y_pred = y_pred[:,0] * 2 + y_pred[:,1]

#     return f1_score(combined_y_true, combined_y_pred, average='weighted', sample_weight=weights)


In [121]:
# ## fake y_pred
# y_pred = np.random.randint(0, 2, size=(y_train.shape[0], 2))
# y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.float32)
# y_pred_tensor = tf.convert_to_tensor(y_pred, dtype=tf.float32)
# custom_f1(y_train_tensor, y_pred_tensor)

(970, 2)
(970, 2)


<tf.Tensor: shape=(), dtype=float32, numpy=0.26099467277526855>

In [212]:
# prompt: set up a neural network model with an input layer of 19927 cells, and output layer of 2 cells, add dropoff layer if needed, add activation functions. try use keras package to do that.

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Dense, Dropout

model = keras.Sequential([
    Dense(128, activation='relu', input_shape=(19927,), kernel_regularizer=l2(0.01)),  # Input layer with 19927 cells and ReLU activation
    Dropout(0.5),  # Dropout layer for regularization
    Dense(64, activation='relu', kernel_regularizer=l2(0.01)),  # Hidden layer with 64 cells and ReLU activation
    Dropout(0.3), # Another dropout layer
    Dense(2, activation='sigmoid', kernel_regularizer=l2(0.01))  # Output layer with 2 cells and sigmoid activation for binary classification
])

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3, clipvalue=1)
# f1_scorer = tf.keras.metrics.F1Score(average='weight')
model.compile(optimizer=optimizer,
              loss='binary_crossentropy', # Use binary_crossentropy for binary classification
              metrics=['f1_score']) # Include custom F1 metric

# Print model summary
model.summary()


In [123]:
# prompt: set up a neural network model with an input layer of 19927 cells, and output layer of 1 cells with 4 potential categories, add dropoff layer if needed, add activation functions. try use keras package to do that.

sparse_model = keras.Sequential([
    keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(4, activation='softmax') # Output layer with 4 categories and softmax activation
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
sparse_model.compile(optimizer=optimizer,
              loss='sparse_categorical_crossentropy', # Use sparse_categorical_crossentropy for integer labels
              metrics=['f1_score']) # Use accuracy as metric

sparse_model.summary()


In [101]:
# # prompt: set up a neural network model with an input layer of 19927 cells, and output layer of 2 cells, add dropoff layer if needed, add activation functions. try use keras package to do that.

# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras.layers import Dense, Dropout

# sparse_model = keras.Sequential([
#     Dense(128, activation='relu', input_shape=(19900,), kernel_initializer='he_normal'),  # Input layer with 19927 cells and ReLU activation
#     Dropout(0.5),  # Dropout layer for regularization
#     Dense(64, activation='relu'),  # Hidden layer with 64 cells and ReLU activation
#     Dropout(0.3), # Another dropout layer
#     Dense(1, activation='softmax')  # Output layer with 2 cells and sigmoid activation for binary classification
# ])

# # Compile the model
# optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4, clipvalue=1)
# # f1_scorer = tf.keras.metrics.F1Score(average='weight')
# sparse_model.compile(optimizer=optimizer,
#               loss='sparse_categorical_crossentropy', # Use binary_crossentropy for binary classification
#               metrics=['f1_score']) # Include custom F1 metric

# # Print model summary
# sparse_model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [35]:
# # prompt: set up a neural network model with an input layer of 19927 cells, and output layer of 2 cells, add dropoff layer if needed, add activation functions. try use keras package to do that. Use weighted F1 score as training metrics (2x in y=(1,1))

# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras.layers import Dense, Dropout
# from tensorflow.keras.models import Sequential
# from sklearn.metrics import f1_score

# # Assuming df_train_mri, df_train_meta_quant, df_train_meta_cat, and df_train_y are defined as in the previous code.
# # And that X_train and y_train are preprocessed appropriately

# # Define the model
# model = Sequential()
# model.add(Dense(128, activation='relu', input_shape=(19927,))) # Input layer with 19927 cells
# model.add(Dropout(0.5)) # Dropout layer for regularization
# model.add(Dense(64, activation='relu'))
# model.add(Dropout(0.3))
# model.add(Dense(2, activation='sigmoid')) # Output layer with 2 cells and softmax activation for classification

# model.compile(optimizer='adam',
#               loss='categorical_crossentropy', # Or categorical_crossentropy if y_train is one-hot encoded
#               metrics=['accuracy'])

# # Example training (replace with your actual data and training parameters)
# # Assuming X_train and y_train are numpy arrays
# # X_train = np.array(...)
# # y_train = np.array(...)

# # history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# # Evaluate the model
# # loss, weighted_f1_value = model.evaluate(X_test, y_test)
# # print('Weighted F1 Score:', weighted_f1_value)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [213]:
## including meta data
from tensorflow.keras.callbacks import EarlyStopping
# Set up early stopping to monitor validation accuracy and stop when it stops improving
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 87ms/step - f1_score: 0.4883 - loss: 4.4387 - val_f1_score: 0.4530 - val_loss: 4.5896
Epoch 2/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 50ms/step - f1_score: 0.6573 - loss: 4.4663 - val_f1_score: 0.4443 - val_loss: 4.6531
Epoch 3/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step - f1_score: 0.7189 - loss: 4.3012 - val_f1_score: 0.4723 - val_loss: 4.4671
Epoch 4/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - f1_score: 0.7453 - loss: 3.8346 - val_f1_score: 0.4823 - val_loss: 4.1784
Epoch 5/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step - f1_score: 0.7563 - loss: 3.4791 - val_f1_score: 0.4443 - val_loss: 3.8323
Epoch 6/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step - f1_score: 0.7450 - loss: 3.1081 - val_f1_score: 0.4482 - val_loss: 3.5413
Epoch 7/100
[1m25/25[0m [

In [140]:
## only mri data
history = model.fit(X_train, y_train, epochs=100, batch_size=64, validation_split=0.2)

Epoch 1/100


AttributeError: 'SymbolicTensor' object has no attribute 'numpy'

In [214]:
## only mri data with single y output
# history = sparse_model.fit(X_train, y_train, epochs=50, batch_size=64, validation_split=0.2)

In [215]:
loss, f1_score = model.evaluate(X_val, y_val)
print(f1_score)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - f1_score: 0.5236 - loss: 1.2395
tf.Tensor([0.7130435  0.31081074], shape=(2,), dtype=float32)


In [221]:
# prompt: predict a test set X_val and return the y_pred output

y_pred = model.predict(X_val)
y_pred

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step


array([[0.45012036, 0.6245119 ],
       [0.19051644, 0.39136204],
       [0.726145  , 0.22643347],
       [0.726012  , 0.2812463 ],
       [0.82815355, 0.18551928],
       [0.75762916, 0.32699674],
       [0.7137311 , 0.80004567],
       [0.38594168, 0.27984872],
       [0.63901997, 0.08362451],
       [0.7787992 , 0.15354931],
       [0.8769829 , 0.65035135],
       [0.77857727, 0.68517995],
       [0.95726097, 0.8579606 ],
       [0.52710074, 0.3931329 ],
       [0.5795346 , 0.6319059 ],
       [0.76435924, 0.71583474],
       [0.8506829 , 0.79897445],
       [0.5378481 , 0.72258604],
       [0.34419382, 0.2747322 ],
       [0.9289797 , 0.8571267 ],
       [0.6795474 , 0.08179553],
       [0.8016355 , 0.35278285],
       [0.6306865 , 0.3100822 ],
       [0.2820302 , 0.88591665],
       [0.9134137 , 0.40096116],
       [0.84276515, 0.14927608],
       [0.6098923 , 0.90851724],
       [0.8858541 , 0.6041616 ],
       [0.90122265, 0.06754637],
       [0.36660528, 0.57506144],
       [0.

In [225]:
# prompt: help me calculate the accuracy score of y_pred and y_val for each column separately

from sklearn.metrics import accuracy_score

# Assuming y_pred and y_val are numpy arrays
# and y_pred contains probabilities for each class

# Convert probabilities to class labels (0 or 1)
y_pred_classes = (y_pred > 0.5).astype(int)
print(y_val.shape)
print(y_pred_classes.shape)

# Calculate accuracy for each column separately
for i in range(y_val.shape[1]):
    accuracy = accuracy_score(y_val[:, i], y_pred_classes[:, i])
    print(f"Accuracy for column {i}: {accuracy}")

## weighted F1 score
# print(y_val.shape)
# print(y_pred_classes.shape)
y_val_tensor = tf.convert_to_tensor(y_val, dtype=tf.float32)
y_pred_tensor = tf.convert_to_tensor(y_pred_classes, dtype=tf.float32)
f1 = custom_f1(y_val_tensor, y_pred_tensor)
print(f"Weighted F1 Score: {f1}")

(243, 2)
(243, 2)
Accuracy for column 0: 0.6008230452674898
Accuracy for column 1: 0.49382716049382713
(243,)
(243,)
(243,)
Weighted F1 Score: 0.2971913925499785


(243, 2)
(243, 2)
(243,)
(243,)
(243,)


0.2971913925499785

## Convolutional Neural Network