Loading my dataset their way

In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score as auc

In [None]:
from mriqc_learn.datasets import load_dataset
from mriqc_learn.models import preprocess as pp
from mriqc_learn.models.production import init_pipeline
from mriqc_learn.model_selection import split

## Load some data
We first load the SHIP183 dataset through MRIQC-learn's `load_dataset` helper.

In [None]:
# Load the SHIP183 samples as one (features, labels) pair; the second pair
# returned by the loader is discarded.
# NOTE(review): presumably split_strategy="none" means no train/test split is
# made inside the loader — confirm against load_dataset.
(train_x, train_y), (_, _) = load_dataset(
    dataset="SHIP183",
    split_strategy="none",
)
# Copy the site label into the feature table (presumably what the site-wise
# splitter used later reads — verify).
train_x["site"] = train_y["site"]

Let's pick the ratings from the "rating" column and binarize them into only two categories.
We can also see that the dataset is unbalanced.

In [None]:
# Extract the "rating" column as a flat integer array.
train_y = train_y[["rating"]].values.squeeze().astype(int)
# Binarize BEFORE reporting the class balance: every rating >= 1 becomes the
# positive ("excluded") class. Reporting first would leave any rating above 1
# out of the summary, so the two percentages would not describe the labels
# that are actually used for training.
train_y[train_y >= 1] = 1
print(f'Excluded={100 * (train_y == 1).sum() / len(train_y) :.2f}%')
print(f'Accept={100 * (train_y == 0).sum() / len(train_y) :.2f}%')

Let's print out a pretty view of the data table:

In [None]:
# Last expression of the cell: renders the feature table (including the
# "site" column added when the data were loaded).
train_x

## Cross-validation of the default classifier
Let's cross-validate the performance of our classifier using a Leave-one-site-out strategy.

In [None]:
# Define a splitting strategy: leave-P-sites-out with P=1, i.e. one site is
# held out per fold.
# NOTE(review): the effect of robust=True is not visible from this notebook —
# confirm against mriqc_learn.model_selection.split.LeavePSitesOut.
outer_cv = split.LeavePSitesOut(1, robust=True)

We can now feed the model into the cross-validation loop:

In [None]:
# Cross-validate the default pipeline with the leave-one-site-out splitter
# (`outer_cv`), so each fold is scored on a site that was not seen during
# fitting. Passing a plain integer such as cv=5 would make scikit-learn use
# its default K-fold scheme and ignore the site structure altogether.
cv_score = cross_val_score(
    init_pipeline(),
    X=train_x,
    y=train_y,
    cv=outer_cv,
    scoring="roc_auc",
    n_jobs=16,
)

After one or two minutes, the scores have been calculated for each of the 14 folds our splitter created.
The average performance is AUC=0.885.

In [None]:
# Print the per-fold ROC AUC values, then show their mean as the cell output.
print(cv_score)
cv_score.mean()

In [None]:
# Manual leave-one-site-out loop: for every held-out site, fit a fresh
# pipeline on the remaining rows and store the AUC obtained on that site.
# NOTE(review): AUC is computed here from hard `predict` labels, whereas the
# "roc_auc" scorer used by cross_val_score works on continuous scores, so the
# two sets of numbers are not directly comparable — confirm this is intended.
custom_cv_score = {}
for fit_rows, (site, held_out_rows) in outer_cv.split(train_x, y=train_y, return_key=True):
    # Validate on test fold
    print(f"Validating on left-out site ({site})...")
    model_split = init_pipeline().fit(train_x.iloc[fit_rows], train_y[fit_rows])
    held_out_pred = model_split.predict(train_x.iloc[held_out_rows])
    custom_cv_score[site] = auc(train_y[held_out_rows], held_out_pred)

In [None]:
# Print the site -> AUC mapping, then show the mean AUC across sites as the
# cell output.
print(custom_cv_score)
np.mean(list(custom_cv_score.values()))

We now train the model on all available training data:

In [None]:
# Build a fresh default pipeline and fit it on the complete training set.
model = init_pipeline()
model = model.fit(X=train_x, y=train_y)

In [None]:
from pathlib import Path

from joblib import dump

import mriqc_learn

# Store the fitted model in the `data` folder of the installed mriqc_learn
# package. The location is derived from the package itself instead of a
# machine-specific absolute path, so the cell also runs on checkouts that do
# not live under the original author's directory layout.
model_path = Path(mriqc_learn.__file__).parent / "data" / "classifier_N183_NoBrainIQMs.joblib"
dump(model, model_path)

We can easily see the effects of overfitting by evaluating the classifier on the same folds we used for cross-validation.

In [None]:
# Score the model that was fitted on ALL training data on each site-wise test
# fold; these rows were part of its training set, so the scores are optimistic.
overfit_cv_score = {}
for _fit_rows, (site, held_out_rows) in outer_cv.split(train_x, y=train_y, return_key=True):
    print(f"Validating on left-out site ({site})...")
    seen_pred = model.predict(train_x.iloc[held_out_rows])
    overfit_cv_score[site] = auc(train_y[held_out_rows], seen_pred)

In [None]:
# Per-site difference between the score of the model fitted on all data and
# the score of the model that never saw that site.
score_gain = [overfit_cv_score[site_key] - custom_cv_score[site_key] for site_key in overfit_cv_score]
print(score_gain)

In [None]:
from sklearn.metrics import classification_report

# Classification report of the final model on its own training data.
train_pred = model.predict(train_x)
print(classification_report(train_y, train_pred))

## Evaluating on held-out dataset
We first load the held-out dataset in, and evaluate:

In [None]:
# Load the held-out "ds030" samples as one (features, labels) pair; the second
# pair returned by the loader is discarded.
(test_x, test_y), (_, _) = load_dataset(
    "ds030",
    split_strategy="none",
)
# Copy the site label into the feature table, mirroring the training data.
test_x["site"] = test_y["site"]
test_x

In [None]:
# Boolean mask built from the `has_ghost` column (presumably scans annotated
# with ghosting artifacts — verify); used below to evaluate without them.
has_ghost = test_y.has_ghost.values.astype(bool)
# Ratings of "rater_1" as a flat integer array: -1 / 0 / 1 per the labels
# printed below.
test_y = test_y[["rater_1"]].values.squeeze().astype(int)
print(f"Discard={100 * (test_y == -1).sum() / len(test_y)}")
print(f"Doubtful={100 * (test_y == 0).sum() / len(test_y)}")
print(f"Accept={100 * (test_y == 1).sum() / len(test_y)}")
# Binarize with the SAME polarity as the training labels, where 0 = accept and
# 1 = excluded: "discard" (-1) and "doubtful" (0) become the positive class 1,
# "accept" (1) becomes 0. Mapping accept to 1 instead would evaluate the model
# against inverted labels.
test_y = (test_y < 1).astype(int)

In [None]:
# AUC of the final model on the whole held-out set (from hard predictions).
test_pred = model.predict(test_x)
auc(test_y, test_pred)

In [None]:
# Same evaluation restricted to the rows where `has_ghost` is False.
ghost_free = ~has_ghost
auc(test_y[ghost_free], model.predict(test_x[ghost_free]))

In [None]:
# Classification report of the final model on the whole held-out set.
held_out_report = classification_report(test_y, model.predict(test_x))
print(held_out_report)

In [None]:
# Classification report restricted to the rows where `has_ghost` is False.
ghost_free = ~has_ghost
print(classification_report(test_y[ghost_free], model.predict(test_x[ghost_free])))

## Nested cross-validation

In [None]:
# Hyper-parameter grid for the search: one dictionary whose keys follow the
# "<step>__<parameter>" convention and whose values list the candidates.
# Only the four boolean switches vary; the SVC settings are pinned to a
# single value each.
p_grid = [
    dict(
        scale__unit_variance=[True, False],
        scale__with_centering=[True, False],
        site_pred__disable=[False, True],
        winnow__disable=[False, True],
        svc__kernel=["rbf"],
        svc__C=[10],
        svc__gamma=[0.1],
    )
]

In [None]:
# GridSearchCV is not imported by the import cells at the top of the
# notebook, so bring it into scope here to avoid a NameError.
from sklearn.model_selection import GridSearchCV

# Nested CV with parameter optimization: the inner loop also leaves one site
# out per fold.
inner_cv = split.LeavePSitesOut(1, robust=True)
inner_cv.get_n_splits(X=train_x, y=train_y)

# NOTE(review): `pipe` is not defined in any cell visible in this notebook; it
# must be a pipeline whose step names match the keys of `p_grid` ("scale",
# "site_pred", "winnow", "svc"). Define it before running this cell.
clf = GridSearchCV(
    estimator=pipe,
    param_grid=p_grid,
    cv=inner_cv,
    n_jobs=30,
    scoring="roc_auc",
)
# The search object is not fitted here.
# clf.fit(train_x, y=train_y)

In [None]:
# Outer loop of the nested cross-validation: the grid search `clf` is re-run
# inside every leave-one-site-out fold and scored with ROC AUC on the site
# that was left out. The mean across folds is shown as the cell output.
nested_score = cross_val_score(clf, train_x, train_y, cv=outer_cv, scoring="roc_auc", n_jobs=16, verbose=10)
nested_score.mean()

In [None]:
# cross_val_score fits clones of `clf`, so `clf` itself may still be unfitted
# and would raise AttributeError on `cv_results_`. Fit it on the full training
# set first when that is the case.
if not hasattr(clf, "cv_results_"):
    clf.fit(train_x, y=train_y)
clf.cv_results_

In [None]:
# `best_params_` only exists once the search object itself has been fitted;
# cross_val_score works on clones and does not fit `clf`. Fit it on the full
# training set first when needed.
if not hasattr(clf, "best_params_"):
    clf.fit(train_x, y=train_y)
clf.best_params_