Loading my dataset their way

In [5]:
import numpy as np
from sklearn.metrics import roc_auc_score as auc
from sklearn.model_selection import GridSearchCV, cross_val_score

In [6]:
from mriqc_learn.datasets import load_dataset
from mriqc_learn.models import preprocess as pp
from mriqc_learn.models.production import init_pipeline
from mriqc_learn.model_selection import split

## Load some data
We first load the SHIP183 dataset using MRIQC-learn's dataset loader.

In [7]:
# Request the "SHIP183" tables with split_strategy="none"; the second pair
# returned by load_dataset is not used in this notebook.
(train_x, train_y), (_, _) = load_dataset(
    dataset="SHIP183",
    split_strategy="none",
)
# Copy the `site` column from the label table into the feature table.
train_x["site"] = train_y["site"]

Let's pick the ratings from the "rating" column and binarize them into only two classes (accept vs. excluded).
We can also see that the dataset is unbalanced.

In [8]:
# Extract the "rating" column as a flat integer vector.
# Note: this rebinds `train_y` from the label table to an array, so the cell
# cannot be re-run without reloading the dataset first.
train_y = train_y[["rating"]].values.squeeze().astype(int)

# Collapse every rating >= 1 into the positive ("excluded") class *before*
# reporting, so the percentages describe the labels the classifier is actually
# trained on and always cover all samples.
train_y[train_y >= 1] = 1

print(f'Excluded={100 * (train_y == 1).sum() / len(train_y) :.2f}%')
print(f'Accept={100 * (train_y == 0).sum() / len(train_y) :.2f}%')

Excluded=24.59%
Accept=75.41%


Let's print out a pretty view of the data table:

In [9]:
# Render the feature table, which now also carries the "site" column.
train_x

Unnamed: 0,cjv,cnr,efc,fber,fwhm_avg,fwhm_x,fwhm_y,fwhm_z,icvs_csf,icvs_gm,...,summary_wm_median,summary_wm_n,summary_wm_p05,summary_wm_p95,summary_wm_stdv,tpm_overlap_csf,tpm_overlap_gm,tpm_overlap_wm,wm2max,site
0,0.489250,2.732287,0.579359,14688.645257,3.726930,3.58919,3.97218,3.61942,0.276925,0.324761,...,1000.134934,185258.000000,924.807739,1101.936190,54.448439,0.171807,0.373188,0.484403,0.487104,SHIP
1,0.410418,3.376646,0.586394,33378.752649,3.594793,3.39952,3.90242,3.48244,0.183945,0.390542,...,1000.058241,280061.000000,934.175681,1082.185815,45.223944,0.179215,0.448654,0.528051,0.585126,SHIP
2,0.417318,3.329145,0.597023,60210.142545,3.703273,3.59519,3.96207,3.55256,0.188880,0.383136,...,1000.086168,300476.000000,935.074246,1080.726230,44.762158,0.179264,0.444732,0.520380,0.569838,SHIP
3,0.436022,3.148955,0.567828,43001.150452,3.418750,3.29147,3.71746,3.24732,0.198440,0.388336,...,1000.077165,227734.000000,929.955451,1087.233951,48.068587,0.181540,0.445394,0.518943,0.648944,SHIP
4,0.483053,2.862426,0.632164,19353.297357,3.756670,3.69458,3.94944,3.62599,0.279497,0.323268,...,1000.103278,193360.000000,918.812823,1098.299117,55.069951,0.164981,0.377402,0.450177,0.477213,SHIP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,1.071183,0.756806,0.513089,13223.535072,3.531503,3.36379,3.87357,3.35715,0.253178,0.342339,...,938.043566,543476.888780,793.057316,1064.632432,82.741266,0.221584,0.472521,0.510402,0.417705,SHIP
179,0.949581,0.770895,0.590186,9357.126340,3.701913,3.57988,3.94455,3.58131,0.238270,0.337652,...,969.531438,681362.703797,816.917181,1104.197065,87.644512,0.219535,0.455707,0.511997,0.473652,SHIP
180,1.135211,0.723645,0.620137,13638.551828,3.603803,3.42537,3.88281,3.50323,0.248058,0.362409,...,976.266379,665705.859447,824.627711,1108.937886,85.997866,0.222957,0.499861,0.516171,0.472185,SHIP
181,0.820651,0.860715,0.591436,17756.528093,3.678903,3.64377,3.79037,3.60257,0.248584,0.351158,...,967.798643,702074.431075,790.234133,1090.030241,92.344735,0.223956,0.487787,0.504300,0.483482,SHIP


## Cross-validation of the default classifier
Let's cross-validate the performance of our classifier using a Leave-one-site-out strategy.

In [None]:
# Define a splitting strategy
# Site-wise splitter from mriqc_learn; the first argument is presumably the
# number of sites held out per fold (P=1) — TODO confirm. The effect of
# `robust=True` is not visible from this notebook.
# NOTE(review): the table preview above lists only the "SHIP" site — confirm
# that holding out one site still leaves data to train on.
outer_cv = split.LeavePSitesOut(1, robust=True)

We can now feed the model into the cross-validation loop:

In [None]:
# Cross-validate a freshly initialised pipeline: 5 folds, ROC AUC as the
# metric, up to 16 parallel jobs.
cv_score = cross_val_score(
    init_pipeline(), train_x, train_y,
    cv=5, scoring="roc_auc", n_jobs=16,
)

After one or two minutes, the scores have been calculated for each of the 5 cross-validation folds.
The average performance is AUC=0.885.

In [None]:
# Print the per-fold ROC AUC values, then show their mean as the cell output.
print(cv_score)
cv_score.mean()

In [None]:
# Manual site-wise cross-validation: one AUC per held-out site, keyed by site.
# NOTE(review): AUC here is computed from hard `predict` output, whereas the
# `scoring="roc_auc"` runs in this notebook go through scikit-learn's scorer —
# the two may not be directly comparable; confirm this is intended.
custom_cv_score = dict()
for train_index, (site, test_index) in outer_cv.split(train_x, y=train_y, return_key=True):
    print(f"Validating on left-out site ({site})...")
    # Fit a brand-new pipeline on the training portion of this split.
    model_split = init_pipeline().fit(train_x.iloc[train_index], train_y[train_index])
    # Score the predictions for the held-out portion.
    held_out_pred = model_split.predict(train_x.iloc[test_index])
    custom_cv_score[site] = auc(train_y[test_index], held_out_pred)

In [None]:
# Print the per-site AUC from the manual loop, then show the average across sites.
print(custom_cv_score)
np.mean(list(custom_cv_score.values()))

We now train the model on all available training data:

In [10]:
# Final model: a fresh pipeline fitted on the complete training table.
model = init_pipeline().fit(X=train_x, y=train_y)

In [11]:
from joblib import dump
# Persist the fitted pipeline to disk; `dump` returns the list of written file names.
# NOTE(review): absolute, machine-specific path — this cell will likely fail on
# any other machine; consider a configurable output directory.
dump(model, "/mnt/sda1/Repos/mriqc/mriqc-learn/mriqc-learn/mriqc_learn/data/classifier_N183_NoBrainIQMs.joblib")

['/mnt/sda1/Repos/mriqc/mriqc-learn/mriqc-learn/mriqc_learn/data/classifier_N183_NoBrainIQMs.joblib']

We can easily see the effects of overfitting by evaluating the classifier on the same folds we used for cross-validation.

In [None]:
# Re-score the same site-wise folds with `model`, which was fitted on all of
# the training data and has therefore already seen each "held-out" sample.
overfit_cv_score = dict()
for train_index, (site, test_index) in outer_cv.split(train_x, y=train_y, return_key=True):
    print(f"Validating on left-out site ({site})...")
    site_pred = model.predict(train_x.iloc[test_index])
    overfit_cv_score[site] = auc(train_y[test_index], site_pred)

In [None]:
# Per-site gap between the all-data model's AUC and the properly held-out AUC.
print([
    overfit_cv_score[site_key] - custom_cv_score[site_key]
    for site_key in overfit_cv_score
])

In [None]:
from sklearn.metrics import classification_report

# Report computed on the training data itself, i.e. the same samples `model` was fitted on.
print(classification_report(train_y, model.predict(train_x)))

## Evaluating on held-out dataset
We first load the held-out dataset in, and evaluate:

In [None]:
# Request the "ds030" tables with split_strategy="none"; the second pair
# returned by load_dataset is not used.
(test_x, test_y), (_, _) = load_dataset(
    "ds030",
    split_strategy="none",
)
# Copy the `site` column from the label table into the feature table.
test_x["site"] = test_y["site"]
test_x

In [None]:
# Boolean mask built from the `has_ghost` column; later cells use it to
# evaluate on the subset of samples without that flag.
has_ghost = test_y.has_ghost.values.astype(bool)

# Extract the "rater_1" column as a flat integer vector and report the
# three-way rating distribution (-1 / 0 / 1) before collapsing it.
test_y = test_y[["rater_1"]].values.squeeze().astype(int)
print(f"Discard={100 * (test_y == -1).sum() / len(test_y)}")
print(f"Doubtful={100 * (test_y == 0).sum() / len(test_y)}")
print(f"Accept={100 * (test_y == 1).sum() / len(test_y)}")

# Binarize with the SAME polarity as the training labels in this notebook,
# where 1 means "excluded" and 0 means "accept". The previous
# `test_y[test_y < 1] = 0` kept accept=1, i.e. the opposite encoding, which
# would make every held-out metric score the model against flipped labels.
test_y = (test_y < 1).astype(int)

In [None]:
# ROC AUC on the full held-out set, computed from hard predictions (`predict`).
auc(test_y, model.predict(test_x))

In [None]:
# Same metric, restricted to the samples where `has_ghost` is False.
auc(test_y[~has_ghost], model.predict(test_x[~has_ghost]))

In [None]:
# Per-class precision/recall/F1 on the full held-out set.
print(classification_report(test_y, model.predict(test_x)))

In [None]:
# Per-class precision/recall/F1 on the held-out samples where `has_ghost` is False.
print(classification_report(test_y[~has_ghost], model.predict(test_x[~has_ghost])))

## Nested cross-validation

In [None]:
# Hyper-parameter grid for the search: a single sub-grid in which four
# boolean switches are varied and the three "svc__" settings are held fixed.
p_grid = [
    dict(
        scale__unit_variance=[True, False],
        scale__with_centering=[True, False],
        site_pred__disable=[False, True],
        winnow__disable=[False, True],
        svc__kernel=["rbf"],
        svc__C=[10],
        svc__gamma=[0.1],
    ),
]

In [None]:
# Nested CV with parameter optimization
# Inner splitter used by the grid search to compare parameter combinations.
inner_cv = split.LeavePSitesOut(1, robust=True)
inner_cv.get_n_splits(X=train_x, y=train_y)

# The original cell passed `pipe`, a name that is never defined in this
# notebook, and used GridSearchCV without importing it. The search now runs
# over a fresh default pipeline, as every other cell here does.
# NOTE(review): confirm that the step names used in `p_grid` ("scale",
# "site_pred", "winnow", "svc") match the steps of `init_pipeline()`.
clf = GridSearchCV(
    estimator=init_pipeline(),
    param_grid=p_grid,
    cv=inner_cv,
    n_jobs=30,
    scoring="roc_auc",
)
# `clf` is not fitted in this cell; the direct fit is kept commented out:
# clf.fit(train_x, y=train_y)

In [None]:
# Outer loop of the nested cross-validation: the grid search `clf` is
# evaluated on each `outer_cv` split with ROC AUC, with progress logging.
nested_score = cross_val_score(
    clf, train_x, train_y,
    cv=outer_cv, scoring="roc_auc", verbose=10, n_jobs=16,
)
nested_score.mean()

In [None]:
# `cross_val_score` fits clones of the estimator it is given, so `clf` itself
# has not been fitted by the nested CV and has no `cv_results_` yet.
# Run the grid search on the full training set before inspecting its results.
clf.fit(train_x, y=train_y)
clf.cv_results_

In [None]:
# Parameter combination selected by the grid search; this attribute only
# exists once `clf` itself has been fitted.
clf.best_params_