In [2]:
# from cmbnet.preprocessing.loading.
import pandas as pd
import numpy as np
import nibabel as nib
import nilearn as nl
import os
from cmbnet.preprocessing.loading import get_metadata_from_cmb_format
import cmbnet.utils.utils_general as utils_general
import cmbnet.utils.utils_plotting as utils_plotting

from cmbnet.preprocessing.datasets.cerebriu import enrich_reprocessmetadata_with_processed
from tqdm import tqdm
from copy import deepcopy


ModuleNotFoundError: No module named 'seaborn'

### Load all studies  data

In [61]:
all_scans = pd.read_csv("../../data-misc/csv/ALL_studies.csv")
all_scans.columns

Index(['seriesUID', 'series', 'n_CMB_new', 'seq_type', 'res_level', 'healthy',
       'healthy_all', 'field_strength', 'TE', 'subject', 'patientUID',
       'Dataset', 'n_CMB_old', 'old_shape', 'new_shape', 'old_voxel_dim',
       'new_voxel_dim', 'old_orientation', 'new_orientation', 'studyUID_old',
       'n_indataset', 'newCMB', 'diffCMB', 'nCMB_avg', 'CMB_level'],
      dtype='object')

### Generate CMB metadata

In [62]:
data_dirs = [f'/storage/evo1/jorge/datasets/cmb/{d}/Data' for d in ['cmb_train', 'cmb_dou', 'cmb_crb']]

# CRB specific paths
rawdir = "/storage/evo1/jorge/datasets/cmb/raw/CEREBRIU"
processed_dir = "/storage/evo1/jorge/datasets/cmb/processed/CRB/Data"
reprocessed_dir = "/storage/evo1/jorge/datasets/cmb/processed_final/CRB/Data"

# Load all metadata
metadata_CMBs = {}
for d_dir in data_dirs:
    studies = [s for s in os.listdir(d_dir) if os.path.isdir(os.path.join(d_dir, s))]
    for s in studies:
        metadata_CMBs[s] = get_metadata_from_cmb_format(data_dir=d_dir, sub_id=s)

cmb_metadata = []

for serieuid, met_s in tqdm(metadata_CMBs.items()):
    cmb_new_meta = deepcopy(met_s['CMBs_new'])
    
    for id_num, cmb_meta in cmb_new_meta.items():
        if serieuid.startswith("CRB-"):
            subject_id = all_scans[all_scans['seriesUID'] == serieuid]['subject'].values[0]
            cmb_meta_rich = enrich_reprocessmetadata_with_processed(
                subject_id,
                rawdir,
                processed_dir,
                reprocessed_dir
            )
            cmb_meta = cmb_meta_rich['CMBs_new'][id_num]
        
        cmb_row = {
            "seriesUID": serieuid,
            "cmb_id": id_num,
            **deepcopy(cmb_meta)  # Ensure changes to cmb_row do not affect original cmb_meta
        }
        
        cmb_metadata.append(cmb_row)

df_cmb_metadata = pd.DataFrame(cmb_metadata)

100%|██████████| 5025/5025 [00:01<00:00, 3596.58it/s] 


In [63]:
# Expand attributes from RB
df_cmb_metadata['RB_metadata'] = df_cmb_metadata['RB_metadata'].apply(lambda x: x.get('attributes', {}) if isinstance(x, dict) else {})
attributes_df = df_cmb_metadata['RB_metadata'].apply(pd.Series)
df_cmb_metadata.drop('RB_metadata', axis=1, inplace=True)
df_cmb_metadata = pd.concat([df_cmb_metadata, attributes_df], axis=1)


In [64]:
# Add metadata study-level
df_cmb_metadata = (df_cmb_metadata
    .pipe(lambda df: pd.merge(df, all_scans[['seriesUID', 'seq_type', 'res_level', 'field_strength', 'TE', 'subject', 'patientUID', 'Dataset']], on='seriesUID', how='left'))
)

In [65]:
df_cmb_metadata[~df_cmb_metadata['RB_label'].isnull()].head()

Unnamed: 0,seriesUID,cmb_id,CM,size,radius,processed_id,RB_label,Location,Multiple,<5mm,Cause,Uncertain,Other,seq_type,res_level,field_strength,TE,subject,patientUID,Dataset
37642,CRB-1.2.826.1.3680043.9.5282.150415.34194.3419...,0,"[67, 203, 196]",248,3.9,8,3,Cortex / grey-white junction,True,True,,,,T2S,high,1.5/3,32.5,1.2.826.1.3680043.9.5282.150415.34194.34194222...,1.2.826.1.3680043.9.5282.150415.34194.34194222...,CRB
37643,CRB-1.2.826.1.3680043.9.5282.150415.34194.3419...,1,"[81, 239, 162]",269,4.0,4,2,Basal ganglia grey matter,True,True,Hypertension,,,T2S,high,1.5/3,32.5,1.2.826.1.3680043.9.5282.150415.34194.34194222...,1.2.826.1.3680043.9.5282.150415.34194.34194222...,CRB
37644,CRB-1.2.826.1.3680043.9.5282.150415.34194.3419...,2,"[94, 87, 246]",683,5.46,12,4,Cortex / grey-white junction,True,,CAA,True,,T2S,high,1.5/3,32.5,1.2.826.1.3680043.9.5282.150415.34194.34194222...,1.2.826.1.3680043.9.5282.150415.34194.34194222...,CRB
37645,CRB-1.2.826.1.3680043.9.5282.150415.34194.3419...,3,"[99, 104, 59]",239,3.85,0,1,Cerebellum,,True,Other,True,,T2S,high,1.5/3,32.5,1.2.826.1.3680043.9.5282.150415.34194.34194222...,1.2.826.1.3680043.9.5282.150415.34194.34194222...,CRB
37646,CRB-1.2.826.1.3680043.9.5282.150415.34194.3419...,4,"[100, 128, 263]",440,4.72,11,4,Cortex / grey-white junction,True,,CAA,True,,T2S,high,1.5/3,32.5,1.2.826.1.3680043.9.5282.150415.34194.34194222...,1.2.826.1.3680043.9.5282.150415.34194.34194222...,CRB


In [66]:
df_cmb_metadata['Location'].value_counts()


Location
Cortex / grey-white junction    71
Subcortical white matter        21
Basal ganglia grey matter       13
Brainstem                       10
Thalamus                         6
Cerebellum                       5
Name: count, dtype: int64

In [67]:
df_cmb_metadata['Dataset'].value_counts()

Dataset
sMOMENI    36812
RODEJA       357
VALDO        253
MOMENI       146
CRB          127
DOU           74
Name: count, dtype: int64

In [71]:
df_cmb_metadata.to_csv("../../data-misc/csv/CMB_metadata_all.csv", index=False)

# CMB analysis

In [None]:
df_cmb_metadata = pd.read_csv("../../data-misc/csv/CMB_metadata_all.csv", index=False)

## Radius distribution

In [87]:
utils_plotting.create_boxplot(df_cmb_metadata, 'radius', 'Dataset', 'Distribution of Radii per Dataset')


AttributeError: module 'cmbnet.utils.utils_plotting' has no attribute 'create_boxplot'

## Number of microbleeds distribution

In [82]:
all_scans['CMB_level'].unique()
all_scans['res_level'].unique()

array(['high', 'low'], dtype=object)

# Scans analysis

In [85]:
grouped = all_scans.groupby(['Dataset']).agg(
    n_scans=pd.NamedAgg(column='seriesUID', aggfunc='nunique'),
    n_scans_cmb=pd.NamedAgg(column='seriesUID', aggfunc=lambda x: x[all_scans['healthy'] == 'no'].nunique()),
    n_patients=pd.NamedAgg(column='patientUID', aggfunc=lambda x: x.nunique()),
    n_patients_cmb=pd.NamedAgg(column='patientUID', aggfunc=lambda x: x[all_scans['healthy'] == 'no'].nunique()),
    n_CMB=pd.NamedAgg(column='n_CMB_new', aggfunc='sum'),
    avgCMB=pd.NamedAgg(column='n_CMB_new', aggfunc=lambda x: x.mean().astype(int)),
    avgCMB_no0=pd.NamedAgg(column='n_CMB_new', aggfunc=lambda x: (mean := x[all_scans.loc[x.index, 'healthy'] == 'no'].mean(), int(mean) if pd.notna(mean) else None)[1]),
    perc_patients_low_cmb=pd.NamedAgg(column='CMB_level', aggfunc=lambda x: round(100*((x == 'low').sum()/len(x)))),
    perc_scans_low_res=pd.NamedAgg(column='res_level', aggfunc=lambda x: round(100*((x == 'low').sum()/len(x))))
).reset_index()

grouped
# # Calculate totals for all studies combined
# totals = all_studies.agg(
#     n_patients=('patientUID', 'nunique'),
#     n_patients_cmb=('patientUID', lambda x: x[all_studies['healthy'] == 'no'].nunique()),
#     n_patients_h=('patientUID', lambda x: x[all_studies['healthy'] == 'yes'].nunique()),
#     n_series=('seriesUID', 'nunique'),
#     n_series_cmb=('seriesUID', lambda x: x[all_studies['healthy'] == 'no'].nunique()),
#     n_series_h=('seriesUID', lambda x: x[all_studies['healthy'] == 'yes'].nunique()),
#     n_CMB=('n_CMB_new', 'sum')
# )

# # Append totals to the grouped data
# totals['Dataset'] = 'Total'
# totals['seq_type'] = '-'
# totals_df = pd.DataFrame([totals], columns=grouped.columns)  # Ensure matching columns
# summary = pd.concat([grouped, totals_df], ignore_index=True)


Unnamed: 0,Dataset,n_scans,n_scans_cmb,n_patients,n_patients_cmb,n_CMB,avgCMB,avgCMB_no0,perc_patients_low_cmb,perc_scans_low_res
0,CRB,18,18,18,18,127,7,7.0,44,44
1,CRBneg,742,0,742,0,0,0,,0,40
2,DOU,20,20,20,20,74,3,3.0,70,0
3,MOMENI,370,57,118,30,146,0,2.0,21,100
4,RODEJA,103,61,103,61,357,3,5.0,41,15
5,VALDO,72,50,72,50,253,3,5.0,56,38
6,sMOMENI,3700,3700,118,118,36812,9,9.0,0,100
