Load model 

In [None]:
from joblib import load

# Load the previously trained classifier from disk.
# NOTE(review): joblib artifacts are pickle-based, so loading one can run
# arbitrary code — only load files from a trusted source. The absolute path
# is machine-specific; presumably it should come from a configurable base dir.
model = load("/mnt/sda1/Repos/mriqc/mriqc-learn/mriqc-learn/mriqc_learn/data/classifier2.joblib")

Load dataset

In [None]:
import pandas as pd

# Read the tab-separated table of the new (unlabelled) dataset.
ds_aux = pd.read_csv(
    "/mnt/sda1/Repos/mriqc/mriqc-learn/mriqc-learn/mriqc_learn/datasets/SHIP1210.tsv",
    sep="\t",
)
# Keep everything from the 2nd column onwards (slice from 2 instead of 1 when
# the table also carries a ratings column).
ds = ds_aux.iloc[:, 1:]
# Reorder the columns so that "site" ends up last.
ds = ds.loc[:, [col for col in ds.columns if col != "site"] + ["site"]]

Excluded subjects from ds_aux

In [None]:
# add subjects whose 'rating' is 1 in 'ds_aux' to a list
# excluded_dsaux = ds_aux.loc[ds_aux['rating'] == 1, 'bids_name'].tolist()
# print("Excluded subjects: ", len(excluded_dsaux))
# print(excluded_dsaux)

Preprocessing

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier as RFC
from mriqc_learn.models import preprocess as pp

# Build a preprocessing-only pipeline for the new dataset.
# Active steps: "scale" (pp.SiteRobustScaler with centering and scaling
# enabled) and "site_pred" (pp.SiteCorrelationSelector). Every other step
# below is commented out and therefore not part of the pipeline.
preprocessor = Pipeline(
    [
        # (
        #     "drop_ft",
        #     pp.DropColumns(
        #         drop=[f"size_{ax}" for ax in "xyz"] + [f"spacing_{ax}" for ax in "xyz"]
        #     ),
        # ),
        (
            "scale",
            pp.SiteRobustScaler(
                with_centering=True,
                with_scaling=True,
            ),
        ),
        ("site_pred", pp.SiteCorrelationSelector()),
        # ("winnow", pp.NoiseWinnowFeatSelect(use_classifier=True)),
        # ("drop_site", pp.DropColumns(drop=["site"])),
        # (
        #     "rfc",
        #     RFC(
        #         bootstrap=True,
        #         class_weight=None,
        #         criterion="gini",
        #         max_depth=10,
        #         max_features="sqrt",
        #         max_leaf_nodes=None,
        #         min_impurity_decrease=0.0,
        #         min_samples_leaf=10,
        #         min_samples_split=10,
        #         min_weight_fraction_leaf=0.0,
        #         n_estimators=400,
        #         oob_score=True,
        #     ),
        # ),
    ]
)

# Fit the pipeline on the new dataset and transform it in one go.
# NOTE(review): `ds_processed` is not referenced by any later cell in this
# notebook — the prediction cells pass the raw `ds` to `model`. Presumably
# `model` already embeds its own preprocessing; confirm, and if so this cell
# (which re-fits the scaler on the new data) may be unnecessary.
ds_processed = preprocessor.fit_transform(ds)

## Predict

Predict model.predict (th=0.5)

In [None]:
# Predict a hard class label for every row of the new dataset.
# Note that the raw `ds` is passed to the model here, not `ds_processed`.
y_pred = model.predict(ds)

In [None]:
# Row positions whose predicted label is 0 (label 0 means "excluded").
excluded = [row for row, label in enumerate(y_pred) if label == 0]

Predict model.predict_proba (th)

In [None]:
import numpy as np
# Per-row score taken from column 0 of predict_proba, then summarised with
# its median and 95th percentile.
# NOTE(review): column 0 is treated as the "excluded" class per the original
# comment; presumably this matches the class order of the fitted model — confirm.
y_scores = model.predict_proba(ds)[:, 0] # 0 for excluded according to the model training
print(f"Median score: {np.median(y_scores):.3f}")
print(f"P95 score: {np.percentile(y_scores, 95):.3f}")

In [None]:
# Count the rows whose score is strictly greater than the threshold.
# NOTE(review): 0.78 is a hand-picked value; output file names elsewhere in
# this notebook encode a threshold (e.g. "th078", "th080") and have to be kept
# in sync by hand.
threshold = 0.78
count = (y_scores > threshold).sum()
print(count)

In [None]:
# Positions in y_scores whose score is strictly above the threshold.
y_pred_idx = np.flatnonzero(y_scores > threshold)

In [None]:
# Row positions whose score is strictly above the threshold. This replaces
# the `excluded` list that was built from the hard class predictions.
excluded = [row for row, score in enumerate(y_scores) if score > threshold]

## Excluded subjects

In [None]:
# First-column value of ds_aux (the "bids_name") for every excluded row position.
excluded_bids = [ds_aux.iloc[row, 0] for row in excluded]

Eye reports names of the excluded subjects

In [None]:
bids_csv = pd.read_csv("/home/jaimebarranco/Desktop/MRI-QC/fetal/fetalqc_non-labeled-dataset/bids_csv.csv")
# Translate every excluded BIDS name into the report name stored in bids_csv.
excluded_names = [] # reports names
for bids_name in excluded_bids:
    # The subject number is the text between the first '-' and the next '_'.
    sub = int(bids_name.split('-')[1].split('_')[0])
    # Select by boolean mask with .loc: the previous code fed index *labels*
    # to the positional .iloc, which is only right for a default RangeIndex.
    matches = bids_csv.loc[bids_csv['sub'] == sub, 'name']
    if matches.empty:
        # Same exception type as before, but says which subject is missing.
        raise IndexError(f"no row with sub == {sub} in bids_csv (from {bids_name!r})")
    # When several rows match, keep the first one, as before.
    excluded_names.append(matches.iloc[0])

In [None]:
# Pair each excluded BIDS name with its report name, ordered by report name.
df_excluded = pd.DataFrame(
    {'bids_name': excluded_bids, 'name': excluded_names}
).sort_values(by='name')
# Sorting does not change the row count, so the summary line is unaffected.
print(f"Excluded subjects: {len(df_excluded)}/{len(y_pred)}")
print(df_excluded)

Compare to subjective rating (Meri, Bene)

In [None]:
# print 'bids_name' of the subjects that are both in df_excluded and in excluded_dsaux
# common_subs = []
# for i in df_excluded['bids_name']:
#     if i in excluded_dsaux:
#         common_subs.append(i)
# print(f"Common subjects: {len(common_subs)}/{len(excluded_dsaux)}")
# print(common_subs)

Excluded subjects to an excel file

In [None]:
# Write the excluded-subjects table to an Excel file, without the index column.
# NOTE(review): the file name encodes "th078" while the reports folder used
# later in this notebook is named "...th080" — confirm which threshold each
# artifact is meant to represent.
df_excluded.to_excel("/home/jaimebarranco/Downloads/excluded_mriqclearn_trN183_th078.xlsx", index=False)

Copy subjects' reports to a folder

In [None]:
import os, shutil

reports_folder = '/home/jaimebarranco/Desktop/MRI-QC/fetal/fetalqc_non-labeled-dataset'
# NOTE(review): the folder name says "th080" while the Excel export above is
# named "th078" — confirm the intended threshold tag.
output_folder = '/home/jaimebarranco/Downloads/excluded_mriqclearn_trN183_th080'
# exist_ok=True replaces the separate exists() check.
os.makedirs(output_folder, exist_ok=True)

# Copy every report whose file name starts with "<name>_report" for a subject
# listed in df_excluded. The directory is listed once (copies go to a
# different folder, so the listing does not change), and the loop runs over
# df_excluded itself rather than over the length of the separate `excluded` list.
report_files = os.listdir(reports_folder)
for subject in df_excluded['name']:
    prefix = f'{subject}_report'
    for filename in report_files:
        if filename.startswith(prefix):
            shutil.copy(
                os.path.join(reports_folder, filename),
                os.path.join(output_folder, filename),
            )

Compare excluded subjects by folder

In [None]:
folder1 = '/home/jaimebarranco/Downloads/excluded_mriqclearn_trN183_th0753'
folder2 = '/home/jaimebarranco/Downloads/excluded_mriqclearn_th045'

# List each folder once instead of on every membership test.
files_folder1 = os.listdir(folder1)
files_folder2 = os.listdir(folder2)

# number of files in folders
num_files_folder1 = len(files_folder1)
num_files_folder2 = len(files_folder2)

# Loop over the folder with fewer files and look each name up in the other one.
if num_files_folder1 <= num_files_folder2:
    smaller_label, smaller_files, other_files = 'folder1', files_folder1, set(files_folder2)
else:
    smaller_label, smaller_files, other_files = 'folder2', files_folder2, set(files_folder1)

# compare the html reports in folder1 and folder2
count = 0
for filename in smaller_files:
    if filename in other_files:
        print(f'{filename} is in both folders')
        count += 1
    else:
        # Names the folder actually being iterated; this used to say
        # "folder1" even when looping over folder2.
        print(f'{filename} is only in {smaller_label}')
print(f'\n{count}/{len(smaller_files)} html reports are in both folders')


Compare excluded subjects by threshold

If we increase the threshold, fewer subjects are excluded. But were those subjects also excluded by me? Let's see...

In [None]:
excel1 = pd.read_excel('/home/jaimebarranco/Downloads/excluded_mriqclearn_trN183_th0753.xlsx')
excel2 = pd.read_excel('/home/jaimebarranco/Downloads/excluded_mriqclearn_trN183_th080.xlsx')

# Rows of excel1 whose 'bids_name' also appears in excel2.
# NOTE(review): a 'my_rate' column is expected in excel1; the export cell in
# this notebook only writes 'bids_name' and 'name', so presumably the rating
# column is added by hand — confirm.
common_subs = excel1[excel1['bids_name'].isin(excel2['bids_name'])]

# Count the rows rated 0. A boolean sum returns 0 when no row is rated 0,
# whereas value_counts()[0] raises KeyError in that case.
zeros = int((common_subs['my_rate'] == 0).sum())
total = len(common_subs)
# Guard the percentage against an empty intersection.
percentage = zeros / total * 100 if total else 0.0
print(f'Excluded: {zeros}/{total} subjects. {percentage:.2f}%')

print(common_subs)

Are they really excluded? - My rate

In [None]:
excel_file = "/home/jaimebarranco/Downloads/excluded_mriqclearn_trN183_th0753.xlsx"
my_rate_df = pd.read_excel(excel_file)#, sheet_name="045")

# Count the rows rated 0 in the "my_rate" column. A boolean sum returns 0 when
# no row is rated 0, whereas value_counts()[0] raises KeyError in that case.
zeros = int((my_rate_df['my_rate'] == 0).sum())
print(f'Excluded: {zeros}/{len(my_rate_df)} subjects')