# ADHD Detection
[INTRODUCTION OF THE PROBLEM HERE]

## Dataset:
[ADD AND COMMENT ABOUT DATASETS HERE]

## Task:
[ADD TASK HERE]

## Team members:
Name, Matricola, Email

- Jan Elfes, 2040496, jan.elfes@studenti.unipd.it

- Santiago Víquez Segura, 2048722, santiago.viquezsegura@studenti.unipd.it


## Dependencies

In [1]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import nibabel as nib
import pandas as pd
from config_local import helpers

from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Build the helper object from the local `config_local` module; the next cell
# asks it for the data directory.
hp = helpers()

In [2]:
# Base data directory as returned by the config_local helper. Later cells
# append file names to it with plain string concatenation, so it is expected
# to end with a path separator.
_data_path = hp.get_data_path()
print(_data_path)

/media/jan/TOSHIBA EXT/SMHDD_22/data/


## Baseline

Before working with the fMRI data, we fit a logistic regression classifier on the phenotypic tabular data to establish a baseline. We expect any experiment that uses the fMRI data to outperform the following result.

In [3]:
# Load NYU_phenotypic.csv from the data directory and preview its first rows.
phenotypic_df = pd.read_csv(_data_path + "NYU_phenotypic.csv")
phenotypic_df.head()

Unnamed: 0,ScanDir ID,Site,Gender,Age,Handedness,DX,Secondary Dx,ADHD Measure,ADHD Index,Inattentive,...,Full4 IQ,Med Status,QC_Rest_1,QC_Rest_2,QC_Rest_3,QC_Rest_4,QC_Anatomical_1,QC_Anatomical_2,Fold,Partition
0,1000804,5,1.0,7.29,0.83,0,,2,40,41,...,109,1,1.0,0.0,,,1.0,,4.0,train
1,1023964,5,1.0,8.29,0.57,3,,2,60,56,...,123,-999,1.0,0.0,,,1.0,1.0,3.0,train
2,1057962,5,1.0,8.78,-999.0,1,,2,77,81,...,129,1,1.0,,,,1.0,0.0,1.0,train
3,1099481,5,0.0,8.04,0.5,1,,2,86,82,...,116,1,1.0,0.0,,,1.0,1.0,2.0,train
4,1127915,5,0.0,12.44,0.21,0,,2,42,43,...,124,1,1.0,1.0,,,1.0,1.0,4.0,train


We are going to use `"Gender", "ADHD Index", "Age", "Handedness", "Inattentive", "Hyper/Impulsive", "Verbal IQ","Performance IQ", "Full4 IQ", "Med Status", "QC_Rest_1", "QC_Rest_2", "QC_Anatomical_1", "QC_Anatomical_2"` as predictors and use the token `-999` to fill and recognize NaN values.

In [4]:
# Predictor columns used by the phenotypic baseline classifier.
features = [
    "Gender",
    "ADHD Index",
    "Age",
    "Handedness",
    "Inattentive",
    "Hyper/Impulsive",
    "Verbal IQ",
    "Performance IQ",
    "Full4 IQ",
    "Med Status",
    "QC_Rest_1",
    "QC_Rest_2",
    "QC_Anatomical_1",
    "QC_Anatomical_2",
]

# Keep the predictors plus the label ("DX") and the split bookkeeping columns,
# then replace every missing value with the -999 sentinel.
kept_columns = [*features, "DX", "Fold", "Partition"]
phenotypic_df = phenotypic_df.loc[:, kept_columns].fillna(-999)

In [5]:
# Cross-validate the phenotypic baseline using the fold assignment stored in
# the "Fold" column: each of the folds 1..5 is held out once for validation.
# "DX" is used as-is as the target (the preview above shows values 0, 1 and 3).
train_df = phenotypic_df[phenotypic_df["Partition"] == "train"]
test_df = phenotypic_df[phenotypic_df["Partition"] == "test"]
scores = []

for fold in range(1, 6):
    # Boolean mask selecting the rows of the held-out fold.
    in_val_fold = train_df["Fold"] == fold
    fit_rows = train_df[~in_val_fold]
    val_rows = train_df[in_val_fold]

    X_train, y_train = fit_rows[features], fit_rows["DX"]
    X_val, y_val = val_rows[features], val_rows["DX"]

    clf = LogisticRegression(
        max_iter=7000,
        solver="saga",
        penalty=None,
        random_state=42,
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)

    # Weighted F1: per-class F1 averaged with class-support weights.
    score = f1_score(y_val, y_pred, average="weighted")
    scores.append(score)
    print(f"Fold {fold} -- f1-Score (val): {score}")
print(f"\nAverage f1-Score (val): {np.mean(scores)}")

Fold 1 -- f1-Score (val): 0.544041184041184
Fold 2 -- f1-Score (val): 0.8856209150326796
Fold 3 -- f1-Score (val): 0.7632043808514397
Fold 4 -- f1-Score (val): 0.7854291037682042
Fold 5 -- f1-Score (val): 0.691800356506239

Average f1-Score (val): 0.7340191880399494


# Lasso

As a first model, we fit a simple Lasso regression to the fMRI data.

In [4]:
# Reload the complete phenotypic table (this section needs the "ScanDir ID"
# column) and mark missing values with the -999 sentinel.
phenotypic_df = pd.read_csv(_data_path + "NYU_phenotypic.csv").fillna(-999)

# Maps the zero-padded 7-character subject id to that subject's image object,
# voxel array, labels and split information.
dic = {}

for _, row in phenotypic_df.iterrows():

    fold = int(row['Fold'])
    if fold == -999:
        # Missing fold in the CSV (filled with the sentinel): no scan to load.
        continue

    # iterrows() does not preserve column dtypes, so go through int() before
    # formatting; a float value would otherwise render as e.g. "1000804.0".
    # The name `subject_id` (not `id`) keeps the builtin id() usable in the
    # rest of the notebook.
    subject_id = str(int(row['ScanDir ID'])).zfill(7)
    part = row['Partition']

    scan_path = _data_path + f"{part}/fold{fold}/wmean_mrda{subject_id}_session_1_rest_1.nii.gz"
    img = nib.load(scan_path)
    data = img.get_fdata()

    y = row['DX']
    # Binary label: any DX >= 1 counts as positive. Note that a missing DX
    # (filled with -999 above) ends up as 0 here.
    y_bin = int(y >= 1)

    dic[subject_id] = {"img": img, "data": data, "dx": y, "dx_bin": y_bin, "part": part, "fold": fold}
    
    

In [5]:
def train_loader(dic, i):
    """Split the training subjects into fit and validation arrays for fold ``i``.

    Parameters
    ----------
    dic : dict
        Maps a subject id to a dict providing at least the keys ``"data"``
        (NumPy array), ``"dx_bin"`` (label), ``"part"`` and ``"fold"``.
    i : int
        Fold that is held out for validation.

    Returns
    -------
    X_train, y_train, X_val, y_val : np.ndarray
        Each row of ``X_*`` is one subject's ``"data"`` flattened to 1-D and
        ``y_*`` holds the matching ``"dx_bin"`` values. Subjects whose
        ``"part"`` is not ``"train"`` are ignored.
    """
    X_train, y_train = [], []
    X_val, y_val = [], []

    for subj in dic.values():
        if subj["part"] != "train":
            continue

        # Route the subject to the validation lists when it belongs to the
        # held-out fold, otherwise to the fitting lists.
        if subj["fold"] == i:
            X_bucket, y_bucket = X_val, y_val
        else:
            X_bucket, y_bucket = X_train, y_train

        X_bucket.append(subj["data"].reshape(-1))
        y_bucket.append(subj["dx_bin"])

    return np.array(X_train), np.array(y_train), np.array(X_val), np.array(y_val)

In [6]:
# Create X and y array
for i in range(1, 6):
    X_train, y_train, X_val, y_val = train_loader(dic, i)
    print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

(136, 133574) (136,) (35, 133574) (35,)
(137, 133574) (137,) (34, 133574) (34,)
(137, 133574) (137,) (34, 133574) (34,)
(137, 133574) (137,) (34, 133574) (34,)
(137, 133574) (137,) (34, 133574) (34,)


In [21]:
# 5-fold cross-validation of a Lasso regression fitted on the flattened fMRI
# volumes, using the binary diagnosis as a 0/1 regression target.
scores = []

for fold in range(1, 6):

    X_train, y_train, X_val, y_val = train_loader(dic, fold)

    clf = Lasso(alpha=0.01,
                random_state=42,
                max_iter=7000).fit(X_train, y_train)

    # Lasso predicts unbounded real values. Rounding them can produce labels
    # outside {0, 1} (e.g. -1 or 2), which f1_score would count as additional
    # classes; thresholding at 0.5 always yields a valid binary label.
    y_pred = (clf.predict(X_val) >= 0.5).astype(int)

    score = f1_score(y_val, y_pred, average="weighted")
    scores.append(score)
    print(f"Fold {fold} -- f1-Score (val): {score}")
print(f"\nAverage f1-Score (val): {np.mean(scores)}")

Fold 1 -- f1-Score (val): 0.5413632119514472
Fold 2 -- f1-Score (val): 0.6470588235294118
Fold 3 -- f1-Score (val): 0.6470588235294118
Fold 4 -- f1-Score (val): 0.6803884780929257
Fold 5 -- f1-Score (val): 0.5013077593722755

Average f1-Score (val): 0.6034354192950944


## Model selection Lasso

In [34]:
from sklearn.metrics import mean_squared_error

In [46]:
# Grid search over the Lasso regularisation strength: for every lambda, run the
# same 5-fold cross-validation and record the per-fold and mean scores.
cv_lambda = np.logspace(-3, 2, 30)
coefs = []            # fitted coefficient vector of every (lambda, fold) model
f1_scores = []        # per-fold weighted F1, in (lambda, fold) order
f1_scores_mean = []   # mean weighted F1 per lambda
mse_scores = []       # per-fold MSE, in (lambda, fold) order
mse_scores_mean = []  # mean MSE per lambda

for lam in cv_lambda:
    # Scores of the current lambda only, so the means below do not depend on
    # slicing the global lists.
    fold_f1 = []
    fold_mse = []

    # NOTE(review): train_loader rebuilds identical arrays for every lambda;
    # consider iterating over folds in the outer loop if runtime becomes an issue.
    for fold in range(1, 6):

        X_train, y_train, X_val, y_val = train_loader(dic, fold)

        clf = Lasso(alpha=lam,
                    random_state=42,
                    max_iter=7000).fit(X_train, y_train)

        # Lasso predicts unbounded real values. Rounding them can produce
        # labels outside {0, 1} (e.g. -1 or 2), which distorts both the F1
        # score and the MSE; thresholding at 0.5 always yields a binary label.
        y_pred = (clf.predict(X_val) >= 0.5).astype(int)

        # With 0/1 labels on both sides, this MSE equals the misclassification rate.
        fold_f1.append(f1_score(y_val, y_pred, average="weighted"))
        fold_mse.append(mean_squared_error(y_val, y_pred))
        coefs.append(clf.coef_)

    f1_scores.extend(fold_f1)
    mse_scores.extend(fold_mse)
    f1_scores_mean.append(np.mean(fold_f1))
    mse_scores_mean.append(np.mean(fold_mse))

    print(f"\n Lambda = {np.round(lam, 4)}  \tAverage f1-Score (val): {f1_scores_mean[-1]}\n \t\t\tMean Squared Error: \t{mse_scores_mean[-1]}")


 Lambda = 0.001 	Average f1-Score (val): 0.5212004801920768
 		Mean Squared Error: 	0.5714285714285714

 Lambda = 0.003593813663804626 	Average f1-Score (val): 0.531281846071762
 		Mean Squared Error: 	0.5142857142857142

 Lambda = 0.01291549665014884 	Average f1-Score (val): 0.5346423013649905
 		Mean Squared Error: 	0.4952380952380952

 Lambda = 0.046415888336127795 	Average f1-Score (val): 0.5363225290116046
 		Mean Squared Error: 	0.48571428571428565

 Lambda = 0.1668100537200059 	Average f1-Score (val): 0.5319151660664265
 		Mean Squared Error: 	0.48571428571428565

 Lambda = 0.5994842503189409 	Average f1-Score (val): 0.5363958916900093
 		Mean Squared Error: 	0.4628571428571429

 Lambda = 2.1544346900318843 	Average f1-Score (val): 0.5421253762050726
 		Mean Squared Error: 	0.45714285714285713

 Lambda = 7.742636826811277 	Average f1-Score (val): 0.5102548502168995
 		Mean Squared Error: 	0.45714285714285713

 Lambda = 27.825594022071257 	Average f1-Score (val): 0.4783843242287

TODO: implement AIC/BIC for model selection. For Lasso in sklearn this seems to be `LassoLarsIC`, but we have not gotten it to work yet.

In [52]:
# NOTE(review): the training arrays printed earlier have far fewer samples
# (about 137) than features (133574). LassoLarsIC presumably needs an explicit
# `noise_variance` estimate in that regime, which may be why this draft failed
# -- confirm against the scikit-learn documentation.
# from sklearn.linear_model import LassoLarsIC

# clf = LassoLarsIC(criterion="aic")
# clf.fit(X_train, y_train)

# Multiclass Lasso

Running all 5 folds takes about 1 hour, so the cell below is commented out; the output shown comes from an earlier run.

In [9]:
# Disabled on purpose: uncomment to re-run the 5-fold cross-validation of an
# L1-penalised logistic regression on the flattened fMRI volumes.
# NOTE(review): train_loader returns the binary label `dx_bin`, so this fits a
# binary classifier -- confirm whether a multiclass target was intended.
# scores = []

# for fold in range(1, 6):

#     X_train, y_train, X_val, y_val = train_loader(dic, fold)

#     clf = LogisticRegression(max_iter=7000,
#                              solver="saga",
#                              penalty="l1",
#                              C=1/0.1,
#                              random_state=42).fit(X_train, y_train)
#     y_pred = clf.predict(X_val)
    
#     score = f1_score(y_val, y_pred, average="weighted")
#     scores.append(score)
#     print(f"Fold {fold} -- f1-Score (val): {score}")
# print(f"\nAverage f1-Score (val): {np.mean(scores)}")                    

Fold 1 -- f1-Score (val): 0.5966666666666668
Fold 2 -- f1-Score (val): 0.7058823529411765
Fold 3 -- f1-Score (val): 0.4668730650154798
Fold 4 -- f1-Score (val): 0.5892014776925263
Fold 5 -- f1-Score (val): 0.47058823529411764

Average f1-Score (val): 0.5658423595219935
