In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report)
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
import xgboost as xgb
import catboost
from catboost import CatBoostClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Flatten


### Prepare data

In [2]:
train = pd.read_csv('/Users/keishavalenna/Desktop/WiDS/TRAIN_MERGED.csv')

In [3]:
train.shape

(1213, 19930)

In [49]:
# qualitative + quantitative predictors
df = train.iloc[:, :30]
df.drop(['participant_id'],axis=1,inplace=True)

In [50]:
# mri predictors
df_mri = train.drop(columns = train.iloc[:, 3:30].columns.to_list())
df_mri.drop(['participant_id'],axis=1,inplace=True)

In [51]:
def evaluate_model(y_true, y_pred, target_name):
    print(f"\n🔹 Metrics for {target_name}:")
    print(f"  - Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"  - Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"  - Recall: {recall_score(y_true, y_pred):.4f}")
    print(f"  - F1-Score: {f1_score(y_true, y_pred):.4f}")
    
    # Print full classification report
    print("\n" + classification_report(y_true, y_pred))

### EBM (qualitative + quantitative data)

In [64]:
X = df.iloc[:,2:]
y1 = df.iloc[:, 0]
y2 = df.iloc[:,1]

X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(X, y1, y2, test_size=0.2, random_state=42)

ebm1 = ExplainableBoostingClassifier(random_state=42)
ebm2 = ExplainableBoostingClassifier(random_state=42)

ebm1.fit(X_train, y1_train)
ebm2.fit(X_train, y2_train)

y1_pred = ebm1.predict(X_test)
y2_pred = ebm2.predict(X_test)

accuracy1 = accuracy_score(y1_test, y1_pred)
accuracy2 = accuracy_score(y2_test, y2_pred)

evaluate_model(y1_test, y1_pred, "Target 1")
evaluate_model(y2_test, y2_pred, "Target 2")

# Show feature importance for both models
print("\nFeature Importance for Target 1:")
show(ebm1.explain_global())

print("\nFeature Importance for Target 2:")
show(ebm2.explain_global())


Missing values detected. Our visualizations do not currently display missing values. To retain the glassbox nature of the model you need to either set the missing values to an extreme value like -1000 that will be visible on the graphs, or manually examine the missing value score in ebm.term_scores_[term_index][0]




🔹 Metrics for Target 1:
  - Accuracy: 0.8066
  - Precision: 0.8429
  - Recall: 0.9045
  - F1-Score: 0.8726

              precision    recall  f1-score   support

           0       0.67      0.54      0.60        65
           1       0.84      0.90      0.87       178

    accuracy                           0.81       243
   macro avg       0.76      0.72      0.74       243
weighted avg       0.80      0.81      0.80       243


🔹 Metrics for Target 2:
  - Accuracy: 0.7284
  - Precision: 0.5714
  - Recall: 0.3333
  - F1-Score: 0.4211

              precision    recall  f1-score   support

           0       0.76      0.89      0.82       171
           1       0.57      0.33      0.42        72

    accuracy                           0.73       243
   macro avg       0.67      0.61      0.62       243
weighted avg       0.70      0.73      0.70       243


Feature Importance for Target 1:



Feature Importance for Target 2:


### EBM (MRI data)

In [45]:
X = df_mri.iloc[:,2:]
y1 = df_mri.iloc[:, 0]
y2 = df_mri.iloc[:,1]

X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(X, y1, y2, test_size=0.2, random_state=42)

ebm1 = ExplainableBoostingClassifier(random_state=42)
ebm2 = ExplainableBoostingClassifier(random_state=42)

ebm1.fit(X_train, y1_train)
ebm2.fit(X_train, y2_train)

y1_pred = ebm1.predict(X_test)
y2_pred = ebm2.predict(X_test)

accuracy1 = accuracy_score(y1_test, y1_pred)
accuracy2 = accuracy_score(y2_test, y2_pred)

print(f"EBM Accuracy for Target 1: {accuracy1:.4f}")
print(f"EBM Accuracy for Target 2: {accuracy2:.4f}")

# Show feature importance for both models
print("\nFeature Importance for Target 1:")
show(ebm1.explain_global())

print("\nFeature Importance for Target 2:")
show(ebm2.explain_global())

KeyboardInterrupt: 

In [None]:
evaluate_model(y1_test, y1_pred, "Target 1")
evaluate_model(y2_test, y2_pred, "Target 2")

### XGBoost (Q+Q data)

In [52]:
X = df.iloc[:,2:]
y1 = df.iloc[:, 0]
y2 = df.iloc[:,1]

X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(X, y1, y2, test_size=0.2, random_state=42)

model1 = xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss", use_label_encoder=False)
model2 = xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss", use_label_encoder=False)

model1.fit(X_train, y1_train)
model2.fit(X_train, y2_train)

# Make predictions
y1_pred = model1.predict(X_test)
y2_pred = model2.predict(X_test)

# Evaluate accuracy
evaluate_model(y1_test, y1_pred, "Target 1")
evaluate_model(y2_test, y2_pred, "Target 2")


Parameters: { "use_label_encoder" } are not used.



Parameters: { "use_label_encoder" } are not used.





🔹 Metrics for Target 1:
  - Accuracy: 0.7819
  - Precision: 0.8415
  - Recall: 0.8652
  - F1-Score: 0.8532

              precision    recall  f1-score   support

           0       0.60      0.55      0.58        65
           1       0.84      0.87      0.85       178

    accuracy                           0.78       243
   macro avg       0.72      0.71      0.71       243
weighted avg       0.78      0.78      0.78       243


🔹 Metrics for Target 2:
  - Accuracy: 0.6790
  - Precision: 0.4375
  - Recall: 0.2917
  - F1-Score: 0.3500

              precision    recall  f1-score   support

           0       0.74      0.84      0.79       171
           1       0.44      0.29      0.35        72

    accuracy                           0.68       243
   macro avg       0.59      0.57      0.57       243
weighted avg       0.65      0.68      0.66       243



### XGBoost (MRI data)

In [73]:
X = df_mri.iloc[:,2:]
y1 = df_mri.iloc[:, 0]
y2 = df_mri.iloc[:,1]

X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(X, y1, y2, test_size=0.2, random_state=42)

model1 = xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss", use_label_encoder=False)
model2 = xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss", use_label_encoder=False)

model1.fit(X_train, y1_train)
model2.fit(X_train, y2_train)

# Make predictions
y1_pred = model1.predict(X_test)
y2_pred = model2.predict(X_test)

# Evaluate accuracy
evaluate_model(y1_test, y1_pred, "Target 1")
evaluate_model(y2_test, y2_pred, "Target 2")


Parameters: { "use_label_encoder" } are not used.



Parameters: { "use_label_encoder" } are not used.





🔹 Metrics for Target 1:
  - Accuracy: 0.7202
  - Precision: 0.7350
  - Recall: 0.9663
  - F1-Score: 0.8350

              precision    recall  f1-score   support

           0       0.33      0.05      0.08        65
           1       0.74      0.97      0.83       178

    accuracy                           0.72       243
   macro avg       0.53      0.51      0.46       243
weighted avg       0.63      0.72      0.63       243


🔹 Metrics for Target 2:
  - Accuracy: 0.7202
  - Precision: 0.5667
  - Recall: 0.2361
  - F1-Score: 0.3333

              precision    recall  f1-score   support

           0       0.74      0.92      0.82       171
           1       0.57      0.24      0.33        72

    accuracy                           0.72       243
   macro avg       0.65      0.58      0.58       243
weighted avg       0.69      0.72      0.68       243



### CatBoost (Q+Q)

In [63]:
X = df.iloc[:,2:]
y1 = df.iloc[:, 0]
y2 = df.iloc[:,1]

X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(X, y1, y2, test_size=0.2, random_state=42)

model1 = CatBoostClassifier(iterations=500, depth=6, learning_rate=0.1, loss_function='Logloss')
model1.fit(X_train, y1_train)

# Initialize the CatBoostClassifier for the second target
model2 = CatBoostClassifier(iterations=500, depth=6, learning_rate=0.1, loss_function='Logloss')
model2.fit(X_train, y2_train)

# Predict both targets
y1_pred = model1.predict(X_test)
y2_pred = model2.predict(X_test)

# Output predictions for both targets
evaluate_model(y1_test, y1_pred, "Target 1")
evaluate_model(y2_test, y2_pred, "Target 2")

0:	learn: 0.6404860	total: 2.97ms	remaining: 1.48s
1:	learn: 0.5958451	total: 4.18ms	remaining: 1.04s
2:	learn: 0.5598618	total: 5.29ms	remaining: 876ms
3:	learn: 0.5305377	total: 7.73ms	remaining: 959ms
4:	learn: 0.5084951	total: 9.59ms	remaining: 950ms
5:	learn: 0.4896320	total: 10.5ms	remaining: 864ms
6:	learn: 0.4766758	total: 11.4ms	remaining: 805ms
7:	learn: 0.4636270	total: 12.3ms	remaining: 756ms
8:	learn: 0.4515887	total: 14.4ms	remaining: 784ms
9:	learn: 0.4436291	total: 15.2ms	remaining: 743ms
10:	learn: 0.4349762	total: 16.6ms	remaining: 740ms
11:	learn: 0.4275197	total: 17.4ms	remaining: 708ms
12:	learn: 0.4228110	total: 18ms	remaining: 676ms
13:	learn: 0.4135610	total: 18.7ms	remaining: 651ms
14:	learn: 0.4064314	total: 19.9ms	remaining: 645ms
15:	learn: 0.4004319	total: 21.1ms	remaining: 639ms
16:	learn: 0.3931413	total: 28.9ms	remaining: 822ms
17:	learn: 0.3861929	total: 46.4ms	remaining: 1.24s
18:	learn: 0.3799724	total: 47.2ms	remaining: 1.2s
19:	learn: 0.3750599	tota

### Trying out CNN - LSTM model on fMRI data

In [72]:
X = df.iloc[:,2:]
y1 = df.iloc[:, 0]
y2 = df.iloc[:,1]

X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(X, y1, y2, test_size=0.2, random_state=42)

X_train = X_train.to_numpy().reshape(X_train.shape[0], 1, X_train.shape[1])
X_test = X_test.to_numpy().reshape(X_test.shape[0], 1, X_test.shape[1])

model1 = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(100, 16000)),
    MaxPooling1D(pool_size=2),
    LSTM(50, return_sequences=True),
    LSTM(25),
    Dense(20, activation='relu'),
    Dense(2, activation='linear')  # Two targets (Regression)
])

model1.compile(optimizer='adam', loss='mse', metrics=['mae'])
model1.summary()

model2 = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(100, 16000)),
    MaxPooling1D(pool_size=2),
    LSTM(50, return_sequences=True),
    LSTM(25),
    Dense(20, activation='relu'),
    Dense(2, activation='linear')  # Two targets (Regression)
])

model2.compile(optimizer='adam', loss='mse', metrics=['mae'])
model2.summary()

model1.fit(X_train, y1_train, epochs=10, batch_size=32, validation_data=(X_test, y1_test))
model2.fit(X_train, y2_train, epochs=10, batch_size=32, validation_data=(X_test, y2_test))

# Predict both targets
y1_pred = model1.predict(X_test)
y2_pred = model2.predict(X_test)

# Output predictions for both targets
evaluate_model(y1_test, y1_pred, "Target 1")
evaluate_model(y2_test, y2_pred, "Target 2")


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



Epoch 1/10


ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "conv1d_4" is incompatible with the layer: expected axis -1 of input shape to have value 16000, but received input with shape (None, 1, 27)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(None, 1, 27), dtype=float32)
  • training=True
  • mask=None