In [1]:
from mriqc_learn.datasets import load_dataset

# Load SHIP183 with split_strategy="none" and keep only the first (features, targets) pair.
(train_x, train_y), (_, _) = load_dataset(dataset="SHIP183", split_strategy="none")
# Carry the acquisition site over from the targets table into the features table.
train_x["site"] = train_y.site

# Flatten the "rating" column into an integer array.
train_y = train_y[["rating"]].values.squeeze().astype(int)
n_ratings = len(train_y)
accept_mask = train_y >= 1
# Report the class balance (in percent): ratings below 1 vs. ratings of 1 or more.
print(f'Excluded={100 * (~accept_mask).sum() / n_ratings}')
print(f'Accept={100 * accept_mask.sum() / n_ratings}')
# Collapse every rating >= 1 into the single label 1; ratings below 1 are left as they are.
train_y[accept_mask] = 1

Excluded=73.22404371584699
Accept=26.775956284153004


Load model 

In [2]:
from joblib import load

# Location of the serialized, already-trained model.
model_path = "/mnt/sda1/Repos/mriqc/mriqc-learn/mriqc-learn/mriqc_learn/data/classifier2.joblib"
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
model = load(model_path)

Load dataset

In [3]:
import pandas as pd

# Load the new dataset (tab-separated table).
ds_aux = pd.read_csv(
    "/mnt/sda1/Repos/mriqc/mriqc-learn/mriqc-learn/mriqc_learn/datasets/SHIP1210.tsv",
    sep="\t",
)
# Keep everything from the second column onwards; the first column of `ds_aux` is
# looked up later to recover the BIDS name of each row.
# Use a start index of 2 if the dataset contains ratings, 1 if not.
ds = ds_aux.iloc[:, 1:]
# Reorder the columns so that "site" is the last one.
non_site_columns = [col for col in ds.columns if col != "site"]
ds = ds[non_site_columns + ["site"]]

Preprocessing

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier as RFC
from mriqc_learn.models import preprocess as pp

# Preprocess the new dataset with a two-step pipeline: "scale" followed by "site_pred".
# Steps that are deliberately left disabled here:
#   - "drop_ft":   pp.DropColumns over the size_x/y/z and spacing_x/y/z columns
#   - "winnow":    pp.NoiseWinnowFeatSelect(use_classifier=True)
#   - "drop_site": pp.DropColumns(drop=["site"])
#   - "rfc":       RFC(bootstrap=True, class_weight=None, criterion="gini", max_depth=10,
#                      max_features="sqrt", max_leaf_nodes=None, min_impurity_decrease=0.0,
#                      min_samples_leaf=10, min_samples_split=10,
#                      min_weight_fraction_leaf=0.0, n_estimators=400, oob_score=True)
site_scaler = pp.SiteRobustScaler(with_centering=True, with_scaling=True)
site_selector = pp.SiteCorrelationSelector()
preprocessor = Pipeline(steps=[("scale", site_scaler), ("site_pred", site_selector)])

# NOTE(review): the prediction cells below call the model on `ds`, not on
# `ds_processed` — confirm whether this output is still needed.
ds_processed = preprocessor.fit_transform(ds)

## Predict

Predict with model.predict (th=0.5)

In [17]:
# Predict the class label of every row of the new dataset with the loaded model.
# The following cell treats label 0 as "excluded".
y_pred = model.predict(ds)

In [18]:
# Row positions whose predicted label is 0 (0 is excluded).
excluded = [row for row, label in enumerate(y_pred) if label == 0]

Predict with model.predict_proba (custom threshold)

In [5]:
# Per-class probabilities for every row of the new dataset.
class_probabilities = model.predict_proba(ds)
# Column 0 is taken as the "excluded" score, according to the model training
# (NOTE(review): confirm against model.classes_).
y_scores = class_probabilities[:, 0]

In [6]:
# Count how many entries of y_scores are strictly above the chosen threshold.
threshold = 0.45
above_threshold = y_scores > threshold
count = above_threshold.sum()
print(count)

40


In [7]:
# Positions (along the first axis) of the y_scores entries strictly above the threshold.
flagged = y_scores > threshold
y_pred_idx = flagged.nonzero()[0]

In [22]:
# Row positions whose score is strictly above the threshold; these are the excluded ones.
excluded = [row for row, score in enumerate(y_scores) if score > threshold]

## Excluded subjects

In [23]:
# "bids_name" of the indeces in "excluded" in "ds_aux"
# BIDS names: first column of `ds_aux` at each excluded row position.
excluded_bids = [ds_aux.iloc[row, 0] for row in excluded]

Eye report names of the excluded subjects

In [24]:
# Table with (at least) a 'sub' column holding the subject number and a 'name'
# column holding the corresponding report name.
bids_csv = pd.read_csv("/home/jaimebarranco/Desktop/MRI-QC/fetal/fetalqc_non-labeled-dataset/bids_csv.csv")

# Report name of every entry of `excluded_bids`, in the same order.
excluded_names = []  # report names
for bids_name in excluded_bids:
    # e.g. "sub-001_T1w" -> "001" -> 1
    sub = int(bids_name.split('-')[1].split('_')[0])
    # Select by boolean mask with .loc instead of feeding index labels to the
    # positional .iloc, which is only correct when the index is a default RangeIndex.
    matches = bids_csv.loc[bids_csv['sub'] == sub, 'name']
    if matches.empty:
        # Same exception type the original lookup raised, but with an actionable message.
        raise IndexError(f"No row in bids_csv with sub == {sub} (from {bids_name!r})")
    # When several rows match, the first one is used.
    report = matches.values[0]
    excluded_names.append(report)

In [25]:
# One row per excluded subject: its BIDS name next to its report name.
df_excluded = pd.DataFrame({'bids_name': excluded_bids, 'name': excluded_names})
# The total is the number of rows of the input table; this does not depend on which
# prediction cell (predict or predict_proba) was run to build `excluded`.
print(f"Excluded subjects: {len(df_excluded)}/{len(ds_aux)}")
print(df_excluded)

Excluded subjects: 40/1210
       bids_name       name
0    sub-001_T1w  sub-APUKS
1    sub-037_T1w  sub-VCVZH
2    sub-053_T1w  sub-XKICG
3    sub-096_T1w  sub-HMEFJ
4   sub-1078_T1w  sub-DGECA
5    sub-108_T1w  sub-EQKHT
6   sub-1105_T1w  sub-JSOOZ
7   sub-1130_T1w  sub-GDWSJ
8   sub-1144_T1w  sub-LPXED
9   sub-1156_T1w  sub-ZSLLA
10  sub-1171_T1w  sub-XFRWS
11  sub-1183_T1w  sub-MYHXL
12   sub-126_T1w  sub-JASYK
13   sub-132_T1w  sub-EPUIZ
14   sub-137_T1w  sub-UBIBB
15   sub-141_T1w  sub-IFMPW
16   sub-150_T1w  sub-GIHHF
17   sub-167_T1w  sub-FXFHX
18   sub-206_T1w  sub-UTSYQ
19   sub-275_T1w  sub-INDZM
20   sub-299_T1w  sub-MMFOS
21   sub-326_T1w  sub-WRQIQ
22   sub-342_T1w  sub-XIGDR
23   sub-363_T1w  sub-MHRGQ
24   sub-389_T1w  sub-GSNPW
25   sub-394_T1w  sub-VNVWN
26   sub-439_T1w  sub-ZEYHM
27   sub-451_T1w  sub-LYRAA
28   sub-484_T1w  sub-SFVHB
29   sub-551_T1w  sub-UDSBS
30   sub-641_T1w  sub-RMSIX
31   sub-665_T1w  sub-ZSFPW
32   sub-687_T1w  sub-OPBWJ
33   sub-690_T1w  sub

Copy subjects' reports to a folder

In [15]:
import os, shutil

reports_folder = '/home/jaimebarranco/Desktop/MRI-QC/fetal/fetalqc_non-labeled-dataset'
output_folder = '/home/jaimebarranco/Downloads/excluded_mriqclearn_th045'
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(output_folder, exist_ok=True)

# List the reports folder once instead of once per excluded subject.
report_files = os.listdir(reports_folder)

# For every excluded subject, copy the files whose name starts with "<name>_report"
# from reports_folder to output_folder.
for subject in df_excluded['name'].values:
    report_prefix = f'{subject}_report'
    for filename in report_files:
        if filename.startswith(report_prefix):
            shutil.copy(
                os.path.join(reports_folder, filename),
                os.path.join(output_folder, filename),
            )

Compare excluded subjects

In [16]:
folder1 = '/home/jaimebarranco/Downloads/excluded_mriqclearn2'
folder2 = '/home/jaimebarranco/Downloads/excluded_mriqclearn_th045'

# List each folder exactly once; the listings are reused for counting, looping and lookup.
files_folder1 = os.listdir(folder1)
files_folder2 = os.listdir(folder2)

# number of files in folders
num_files_folder1 = len(files_folder1)
num_files_folder2 = len(files_folder2)

# Loop over the folder with fewer files (folder1 on a tie) and look each file name up
# in the other folder. A set makes each membership check constant-time.
if num_files_folder1 <= num_files_folder2:
    loop_label, loop_files, other_files = 'folder1', files_folder1, set(files_folder2)
else:
    loop_label, loop_files, other_files = 'folder2', files_folder2, set(files_folder1)

# compare the files in folder1 and folder2
count = 0
for filename in loop_files:
    if filename in other_files:
        print(f'{filename} is in both folders')
        count += 1
    else:
        # Name the folder that is actually being iterated (the original always said folder1).
        print(f'{filename} is only in {loop_label}')
print(f'\n{count}/{len(loop_files)} html reports are in both folders')


sub-APUKS_report.html is in both folders
sub-DGECA_report.html is in both folders
sub-JASYK_report.html is in both folders
sub-XKICG_report.html is in both folders
sub-KRZDQ_report.html is in both folders
sub-JSOOZ_report.html is in both folders
sub-WRQIQ_report.html is in both folders
sub-GDWSJ_report.html is in both folders
sub-IFMPW_report.html is in both folders
sub-GIHHF_report.html is in both folders
sub-UBIBB_report.html is in both folders
sub-SFVHB_report.html is in both folders
sub-LPXED_report.html is in both folders
sub-ZSLLA_report.html is in both folders
sub-LYRAA_report.html is in both folders
sub-UDSBS_report.html is in both folders
sub-OPBWJ_report.html is in both folders
sub-WXSQP_report.html is in both folders
sub-MHRGQ_report.html is in both folders
sub-XIGDR_report.html is in both folders
sub-OJULJ_report.html is in both folders
sub-HMEFJ_report.html is in both folders
sub-EQKHT_report.html is in both folders
sub-MYHXL_report.html is in both folders
sub-XFRWS_report