Load model 

In [1]:
from joblib import load

# Location of the serialized classifier inside the local mriqc-learn checkout.
model_path = "/mnt/sda1/Repos/mriqc/mriqc-learn/mriqc-learn/mriqc_learn/data/classifier_N183_NoBrainIQMs.joblib"

# Deserialize the trained model.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
model = load(model_path)

Load dataset

In [2]:
import pandas as pd

# Read the tab-separated table of the new dataset; `ds_aux` keeps every column as stored on disk.
ds_aux = pd.read_csv(
    "/mnt/sda1/Repos/mriqc/mriqc-learn/mriqc-learn/mriqc_learn/datasets/SHIP1027.tsv",
    sep="\t",
)

# Skip the leading column(s): start at 1 when the table has no ratings column, at 2 when it does.
ds = ds_aux.iloc[:, 1:]

# Reorder the columns so that "site" comes last; every other column keeps its relative order.
non_site_columns = [col for col in ds.columns if col != "site"]
ds = ds[non_site_columns + ["site"]]

Excluded subjects from ds_aux

In [None]:
# Collect into a list the 'bids_name' of every subject in 'ds_aux' whose 'rating' is 1.
# NOTE(review): left disabled -- presumably because this dataset has no 'rating' column; confirm.
# excluded_dsaux = ds_aux.loc[ds_aux['rating'] == 1, 'bids_name'].tolist()
# print("Excluded subjects: ", len(excluded_dsaux))
# print(excluded_dsaux)

Preprocessing

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier as RFC
from mriqc_learn.models import preprocess as pp

# Image-geometry columns (size_x/y/z, then spacing_x/y/z) dropped before scaling.
geometry_columns = [f"{prefix}_{ax}" for prefix in ("size", "spacing") for ax in "xyz"]

# Columns dropped after scaling. The commented-out entries are the columns
# that are deliberately kept.
brain_iqm_columns = [
    "cjv",
    "cnr",
    # "efc",
    # "fber",
    # "fwhm_avg",
    # "fwhm_x",
    # "fwhm_y",
    # "fwhm_z",
    "icvs_csf",
    "icvs_gm",
    "icvs_wm",
    # "inu_med",
    # "inu_range",
    "qi_1",
    # "qi_2",
    "rpve_csf",
    "rpve_gm",
    "rpve_wm",
    "snr_csf",
    "snr_gm",
    "snr_total",
    "snr_wm",
    "snrd_csf",
    "snrd_gm",
    # "snrd_total",
    "snrd_wm",
    "summary_bg_k",
    "summary_bg_mad",
    "summary_bg_mean",
    "summary_bg_median",
    "summary_bg_n",
    "summary_bg_p05",
    "summary_bg_p95",
    "summary_bg_stdv",
    "summary_csf_k",
    "summary_csf_mad",
    "summary_csf_mean",
    "summary_csf_median",
    "summary_csf_n",
    "summary_csf_p05",
    "summary_csf_p95",
    "summary_csf_stdv",
    "summary_gm_k",
    "summary_gm_mad",
    "summary_gm_mean",
    "summary_gm_median",
    "summary_gm_n",
    "summary_gm_p05",
    "summary_gm_p95",
    "summary_gm_stdv",
    "summary_wm_k",
    "summary_wm_mad",
    "summary_wm_mean",
    "summary_wm_median",
    "summary_wm_n",
    "summary_wm_p05",
    "summary_wm_p95",
    "summary_wm_stdv",
    "tpm_overlap_csf",
    "tpm_overlap_gm",
    "tpm_overlap_wm",
    "wm2max",
]

# Preprocessing applied to the new dataset, in order:
#   1. drop the geometry columns,
#   2. apply the site-wise robust scaler with centering and scaling enabled,
#   3. drop the columns listed in `brain_iqm_columns`.
preprocessing_steps = [
    ("drop_ft", pp.DropColumns(drop=geometry_columns)),
    ("scale", pp.SiteRobustScaler(with_centering=True, with_scaling=True)),
    ("drop_brainIQMs", pp.DropColumns(drop=brain_iqm_columns)),
]
preprocessor = Pipeline(preprocessing_steps)

# Fit the preprocessor on the new dataset and transform it in one go.
# NOTE(review): the prediction cells below pass `ds`, not `ds_processed`, to the model;
# confirm whether this result is meant to be used.
ds_processed = preprocessor.fit_transform(ds)

## Predict

Predict with model.predict (th=0.5)

In [None]:
# Predict the classes of the new dataset
# NOTE(review): the raw `ds` is passed here rather than `ds_processed` from the preprocessing
# cell -- presumably the loaded model preprocesses its input internally; confirm.
y_pred = model.predict(ds)

In [None]:
# Row positions of the samples predicted as class 0 (0 is the "excluded" class).
excluded = [position for position, label in enumerate(y_pred) if label == 0]

Predict with model.predict_proba (th)

In [3]:
import numpy as np

# Per-class probabilities for every row of the new dataset.
class_probabilities = model.predict_proba(ds)
# The column index selects which class probability is used as the score. Per the original note:
# 0 for "excluded" according to the model training, or 1 if the model was trained with the SHIP dataset.
y_scores = class_probabilities[:, 1]

# Summary statistics of the scores; the 95th percentile is the value used as threshold below.
print(f"Median score: {np.median(y_scores):.3f}")
print(f"P95 score: {np.percentile(y_scores, 95):.3f}")

Median score: 0.203
P95 score: 0.389


In [29]:
# Score cut-off (hard-coded) and the number of samples whose score is strictly above it.
threshold = 0.389
above_threshold = y_scores > threshold
count = above_threshold.sum()
print(count)

54


In [30]:
# Positions in y_scores whose value is strictly greater than the threshold.
is_above = y_scores > threshold
y_pred_idx = is_above.nonzero()[0]

In [31]:
# Row positions of the samples whose score is strictly above the threshold.
excluded = [position for position, score in enumerate(y_scores) if score > threshold]

## Excluded subjects

In [32]:
# "bids_name" of the indeces in "excluded" in "ds_aux"
# Value of the first column of ds_aux for every excluded row position.
excluded_bids = [ds_aux.iloc[row, 0] for row in excluded]

Eye reports names of the excluded subjects

In [33]:
bids_csv = pd.read_csv("/home/jaimebarranco/Desktop/MRI-QC/fetal/fetalqc_non-labeled-dataset/bids_csv.csv")

# Translate every excluded BIDS name into the report name stored in bids_csv.
excluded_names = []  # report names, in the same order as excluded_bids
for bids_name in excluded_bids:
    # "sub-034_T1w" -> "034" -> 34: the subject number sits between the first '-' and the next '_'
    sub = int(bids_name.split('-')[1].split('_')[0])
    # Select by boolean mask with .loc, so the lookup does not depend on bids_csv having a
    # default 0..n-1 index (the previous code fed index labels to the positional .iloc).
    matching_reports = bids_csv.loc[bids_csv['sub'] == sub, 'name']
    if matching_reports.empty:
        # Same exception type as before, but naming the subject that could not be resolved
        raise IndexError(f"No row with sub == {sub} in bids_csv (needed for {bids_name!r})")
    # When several rows match, the first one is used
    excluded_names.append(matching_reports.iloc[0])

In [34]:
# Table of the excluded subjects (BIDS name + report name), ordered by report name.
excluded_columns = {'bids_name': excluded_bids, 'name': excluded_names}
df_excluded = pd.DataFrame(excluded_columns).sort_values(by=['name'])

# Sorting does not change the number of rows, so the summary can be printed after it.
print(f"Excluded subjects: {len(df_excluded)}/{len(y_scores)}") # y_scores or y_pred
print(df_excluded)

Excluded subjects: 54/1027
       bids_name       name
0    sub-001_T1w  sub-APUKS
19   sub-473_T1w  sub-BDGIB
27   sub-650_T1w  sub-BGQKW
13   sub-331_T1w  sub-CXYLE
10   sub-274_T1w  sub-DCIUE
44   sub-871_T1w  sub-DESIW
9    sub-235_T1w  sub-EDVET
43   sub-868_T1w  sub-ENOOW
37   sub-764_T1w  sub-EXYUC
35   sub-718_T1w  sub-FKYUF
20   sub-483_T1w  sub-FUJML
24   sub-583_T1w  sub-GDCYF
2    sub-034_T1w  sub-GHDKQ
6    sub-150_T1w  sub-GIHHF
15   sub-450_T1w  sub-GUAIG
42   sub-865_T1w  sub-HLOOZ
25   sub-596_T1w  sub-HQGMI
30   sub-678_T1w  sub-INKRS
45   sub-892_T1w  sub-IVFIO
8    sub-224_T1w  sub-JPPEO
34   sub-713_T1w  sub-JTGMM
36   sub-746_T1w  sub-KJSIW
3   sub-1151_T1w  sub-KVDKS
16   sub-451_T1w  sub-LYRAA
17   sub-452_T1w  sub-MDZXS
49   sub-938_T1w  sub-MIFZE
41   sub-862_T1w  sub-MXPDD
11   sub-281_T1w  sub-NLMGC
33   sub-699_T1w  sub-ODLZQ
38   sub-772_T1w  sub-OJULJ
5    sub-147_T1w  sub-OOJUF
31   sub-687_T1w  sub-OPBWJ
28   sub-653_T1w  sub-ORXOC
32   sub-690_T1w  sub

Compare to subjective rating (Meri, Bene)

In [None]:
## print 'bids_name' of the subjects that are both in df_excluded and in excluded_dsaux
## NOTE(review): disabled; it needs 'excluded_dsaux', which is only built by the (also disabled) ratings cell.
# common_subs = []
# for i in df_excluded['bids_name']:
#     if i in excluded_dsaux:
#         common_subs.append(i)
# print(f"Common subjects: {len(common_subs)}/{len(excluded_dsaux)}")
# print(common_subs)

Excluded subjects to an excel file

In [35]:
import os

# df_excluded to an excel file.
# The same workbook is read back further down, where a 'my_rate' column is expected that
# df_excluded does not contain (presumably added by hand afterwards). Re-running this cell must
# therefore not overwrite an existing workbook; delete or rename the file to regenerate it.
excluded_xlsx = "/home/jaimebarranco/Downloads/excluded_mriqclearn_N183_NoBrainIQMs_th0389.xlsx"
if os.path.exists(excluded_xlsx):
    print(f"{excluded_xlsx} already exists; not overwriting it")
else:
    df_excluded.to_excel(excluded_xlsx, index=False)

Copy subjects' reports to a folder

In [36]:
import os, shutil

reports_folder = '/home/jaimebarranco/Desktop/MRI-QC/fetal/fetalqc_non-labeled-dataset'
output_folder = '/home/jaimebarranco/Downloads/excluded_mriqclearn_N183_NoBrainIQMs_th0389'
# exist_ok avoids the separate existence check (and the race between check and creation)
os.makedirs(output_folder, exist_ok=True)

# copy html reports from reports_folder that match the subjects in excluded dataframe to output_folder
# The prefixes come straight from df_excluded, so the loop no longer relies on the separate
# `excluded` list having the same length as the dataframe.
report_prefixes = tuple(f'{subject}_report' for subject in df_excluded['name'])
# The folder is listed once instead of once per subject; str.startswith accepts a tuple of
# prefixes and returns False for an empty tuple, so no subjects means nothing is copied.
for filename in os.listdir(reports_folder):
    if filename.startswith(report_prefixes):
        shutil.copy(os.path.join(reports_folder, filename), os.path.join(output_folder, filename))

Compare excluded subjects by folder

In [37]:
import os

folder1 = '/home/jaimebarranco/Downloads/excluded_N183_NoBrainIQMs_th043'
folder2 = '/home/jaimebarranco/Downloads/excluded_mriqclearn_N183_NoBrainIQMs_th0389'

# List each folder exactly once; the previous code re-listed the folders on every iteration.
files_folder1 = os.listdir(folder1)
files_folder2 = os.listdir(folder2)

# number of files in folders
num_files_folder1 = len(files_folder1)
num_files_folder2 = len(files_folder2)

# Iterate over the folder with fewer files (folder1 on ties) and test membership in the other
# one; a set makes each membership test constant-time.
if num_files_folder1 <= num_files_folder2:
    looped_label, looped_files, other_files = 'folder1', files_folder1, set(files_folder2)
else:
    looped_label, looped_files, other_files = 'folder2', files_folder2, set(files_folder1)

# compare the html reports in folder1 and folder2
count = 0
for filename in looped_files:
    if filename in other_files:
        print(f'{filename} is in both folders')
        count += 1
    else:
        print(f'{filename} is only in {looped_label}')
print(f'\n{count}/{len(looped_files)} html reports are in both folders')


sub-APUKS_report.html is in both folders
sub-DESIW_report.html is in both folders
sub-OKQBK_report.html is only in folder1
sub-HHUUT_report.html is only in folder1
sub-XHWHZ_report.html is only in folder1
sub-ODLZQ_report.html is in both folders
sub-CYBAC_report.html is only in folder1
sub-RSMQO_report.html is in both folders
sub-BTBKI_report.html is only in folder1
sub-OQIYU_report.html is only in folder1
sub-TEOHS_report.html is only in folder1
sub-HLOOZ_report.html is in both folders
sub-YYCTV_report.html is in both folders
sub-GDCYF_report.html is in both folders
sub-ZSFPW_report.html is in both folders
sub-BGQKW_report.html is in both folders
sub-PCLII_report.html is only in folder1
sub-DGQKQ_report.html is only in folder1
sub-PXFKB_report.html is in both folders
sub-JZZEH_report.html is only in folder1
sub-LVQIG_report.html is only in folder1
sub-KJSIW_report.html is in both folders
sub-EHCJB_report.html is only in folder1
sub-ORXOC_report.html is in both folders
sub-BDGIB_report

Compare excluded subjects by threshold

If we increase the threshold, we would have fewer excluded subjects. But were those subjects also excluded by me? Let's see...

In [38]:
excel1 = pd.read_excel('/home/jaimebarranco/Downloads/excluded_mriqclearn_N183_NoBrainIQMs_th0389.xlsx')
excel2 = pd.read_excel('/home/jaimebarranco/Downloads/excluded_mriqclearn_N183_NoBrainIQMs_th043.xlsx')
# excel2 = df_excluded

print(f'{len(excel1)} subjects in excel1')
print(f'{len(excel2)} subjects in excel2')

# subdataframe of those subjects with equal 'bids_name' in both dataframes
common_subs1 = excel1[excel1['bids_name'].isin(excel2['bids_name'])]
common_subs2 = excel2[excel2['bids_name'].isin(excel1['bids_name'])] # to check intra-rater reliability

# percentage of excluded subjects (my_rate == 0) in 'common_subs2'
# Counting with a boolean mask gives 0 when nobody is rated 0, where value_counts()[0] raised
# KeyError. The format spec is '.2f' (two decimals); the former ':2f' only set a minimum width.
n_common = len(common_subs2)
n_excluded = int((common_subs2['my_rate'] == 0).sum())
pct_excluded = n_excluded / n_common * 100 if n_common else float('nan')
print(f"Number of excluded subjects: {n_excluded}/{n_common} ==> {pct_excluded:.2f}%")

# Pair the two ratings of each subject by 'bids_name' rather than by row position, so the
# comparison stays correct even if the two workbooks list the subjects in a different order.
# Assumes 'bids_name' is unique within each workbook.
paired = common_subs1.merge(
    common_subs2[['bids_name', 'my_rate']],
    on='bids_name',
    suffixes=('_excel1', '_excel2'),
)
same_rate = paired['my_rate_excel1'] == paired['my_rate_excel2']

# how many of those subjects had the same 'my_rate' in both dataframes
count = int(same_rate.sum())
print(f'\n{count}/{len(paired)} out of {len(excel2)} subjects have the same my_rate in both dataframes \n{len(paired)-count} subjects have different my_rate \n')

# which of them don't have the same 'my_rate' in both dataframes
differing = paired[~same_rate]
for sub_name, rate1, rate2 in zip(differing['name'], differing['my_rate_excel1'], differing['my_rate_excel2']):
    print(f'{sub_name} has {rate1} in excel1 and {rate2} in excel2')

54 subjects in excel1
53 subjects in excel2
Number of excluded subjects: 7/30 ==> 23.333333%

30/30 out of 53 subjects have the same my_rate in both dataframes 
0 subjects have different my_rate 



In [39]:
# Subjects present in both workbooks, with the ratings stored in excel1
common_subs = common_subs1

# Subset rated 0 (excluded). Its length is the number of zeros in 'my_rate'; filtering instead
# of value_counts()[0] also works (giving 0) when no subject is rated 0.
common_excluded = common_subs[common_subs['my_rate'] == 0]
zeros = len(common_excluded)
# Guard the percentage against an empty table
pct_zeros = zeros / len(common_subs) * 100 if len(common_subs) else float('nan')

print(f'Excluded: {zeros}/{len(common_subs)} subjects. {pct_zeros:.2f}%')
print(common_subs)

# list and percentage of the common subjects rated 0
print(f'Excluded: {len(common_excluded)}/{len(common_subs)} subjects. {pct_zeros:.2f}%')
print(common_excluded)

Excluded: 7/30 subjects. 23.33%
      bids_name       name  my_rate comments
0   sub-001_T1w  sub-APUKS        1      NaN
1   sub-473_T1w  sub-BDGIB        0      NaN
2   sub-650_T1w  sub-BGQKW        1      NaN
5   sub-871_T1w  sub-DESIW        1      NaN
8   sub-764_T1w  sub-EXYUC        1      NaN
9   sub-718_T1w  sub-FKYUF        0      NaN
10  sub-483_T1w  sub-FUJML        1      NaN
11  sub-583_T1w  sub-GDCYF        1      NaN
15  sub-865_T1w  sub-HLOOZ        0      NaN
17  sub-678_T1w  sub-INKRS        1      NaN
20  sub-713_T1w  sub-JTGMM        1      NaN
21  sub-746_T1w  sub-KJSIW        1      NaN
25  sub-938_T1w  sub-MIFZE        0      NaN
26  sub-862_T1w  sub-MXPDD        1      NaN
28  sub-699_T1w  sub-ODLZQ        1      NaN
29  sub-772_T1w  sub-OJULJ        0      NaN
32  sub-653_T1w  sub-ORXOC        1      NaN
33  sub-690_T1w  sub-PMJIC        1      NaN
35  sub-962_T1w  sub-PXFKB        1      NaN
36  sub-633_T1w  sub-QDFPC        1      NaN
37  sub-971_T1w  sub-QO

Are they really excluded? - My rate

In [40]:
excel_file = "/home/jaimebarranco/Downloads/excluded_mriqclearn_N183_NoBrainIQMs_th0389.xlsx"
my_rate_df = pd.read_excel(excel_file)#, sheet_name="045")

# count the number of '0' in "my_rate" column
# A boolean mask yields 0 when no row is rated 0, where value_counts()[0] raised KeyError.
zeros = int((my_rate_df['my_rate'] == 0).sum())
# Guard the percentage against an empty workbook
pct_zeros = zeros / len(my_rate_df) * 100 if len(my_rate_df) else float('nan')
print(f'Excluded: {zeros}/{len(my_rate_df)} subjects ==> {pct_zeros:.2f}%')

Excluded: 13/54 subjects ==> 24.07%
