In [1]:
import os
import re
import pandas as pd
from pathlib import Path
import numpy as np
import shutil
import nibabel as nib
from tqdm import tqdm
from collections import defaultdict
from datetime import datetime

### Helper functions

In [2]:
# get patient_ids from a csv file
def extract_unique_patient_ids_from_csv(csv_path):
    cohort = "CBTN"
    id_column = "CBTN Subject ID"
    df = pd.read_csv(csv_path)
    ids = set(df[id_column])
 
    print(f"Length of {cohort} cohort is {len(ids)}.")
    return ids

# get the treatment date for all scans before treatment
def get_first_treatment_dates_new(clinical_csv_path):
    clinical_df = pd.read_csv(clinical_csv_path)
    treatment_ages = {}
    for _, row in clinical_df.iterrows():
        patient_id = row['CBTN Subject ID']
        age = row['Age at First Treatment']
        if pd.isna(age):  # Check if treatment date is not NaN
            treatment_ages[patient_id] = None  # No treatment date recorded
        else: treatment_ages[patient_id] = int(age)
    return treatment_ages

# get the unique ids from the directory where the files were moved
def extract_unique_patient_ids_from_directory(directory_path):
    """
    Extracts the unique patient IDs from the directory where the files were moved.
    """
    patient_ids = set()
    for file in os.listdir(directory_path):
        if file.endswith(".nii.gz"):
            patient_id = file.split("_")[1]
            patient_ids.add(patient_id)
    return patient_ids

def count_files(directory):
    return len([name for name in os.listdir(directory) if os.path.isfile(os.path.join(directory, name))])


proccessed_dir = Path("/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/accepted")
ids = extract_unique_patient_ids_from_directory(proccessed_dir)
print(f"Length of processed cohort is {len(ids)}.")
files = count_files(proccessed_dir)
scans = int(files/2)
print(f"Length of processed files is {files}, {scans} scans.")

Length of processed cohort is 0.
Length of processed files is 0, 0 scans.


### Get the treatment dates from csv, compare set of IDs in directory with the ones

In [15]:

def move_files_to_treatment_folders(clinical_csv_path, source_directory, destination_directory, treatment_ages):
    clinical_df = pd.read_csv(clinical_csv_path)
    os.makedirs(os.path.join(destination_directory, 'pre_treatment'), exist_ok=True)
    os.makedirs(os.path.join(destination_directory, 'post_treatment'), exist_ok=True)
    count = 0 
    pats_failed = set()
    clinical_df['CBTN Subject ID'] = clinical_df['CBTN Subject ID'].astype(str)
    # print(clinical_df['CBTN Subject ID'].values)
    for file_name in os.listdir(source_directory):
        if file_name.endswith('.nii.gz') or file_name.endswith('_mask.nii.gz'):
            parts = file_name.split('_')
            patient_id = parts[1]
            scan_id = parts[2]
            if scan_id.endswith('_mask.nii.gz'):
                scan_id = int(scan_id.split('_')[0])
            else:
                scan_id = int(scan_id.split('.')[0])
            
            #scan_id = age !
                                                                     
            if str(patient_id) not in clinical_df['CBTN Subject ID'].values:
                print(f"Patient ID {patient_id} not found in clinical data. Skipping file {file_name}.")
                count = count + 1
                pats_failed.add(patient_id)
                continue
            
            first_treatment_date = treatment_ages[patient_id]
            if first_treatment_date is None:
                treatment_period = 'pre_treatment'
            else:
                treatment_period = 'pre_treatment' if scan_id < first_treatment_date else 'post_treatment'
                
            # Move the file to the appropriate subfolder
            src_path = os.path.join(source_directory, file_name)
            dest_path = os.path.join(destination_directory, treatment_period, file_name)
            shutil.move(src_path, dest_path)
            print(f"Moved {file_name} to {treatment_period} folder.")
    print(f"Failed on {count} files.  {len(pats_failed)} patients failed: {pats_failed}")


clinical_csv_path = Path("/home/jc053/GIT/mri_longitudinal_analysis/data/input/clinical/cbtn_filtered_pruned_treatment_513.csv")
source_directory = destination_directory = Path("/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/accepted")
treatment_ages = get_first_treatment_dates_new(clinical_csv_path)
move_files_to_treatment_folders(clinical_csv_path, source_directory, destination_directory, treatment_ages)

Moved image65_C1077234_6664_mask.nii.gz to pre_treatment folder.
Moved image57_C1077234_5012_mask.nii.gz to pre_treatment folder.
Moved image9_C1003434_4457_mask.nii.gz to pre_treatment folder.
Moved image33_C1042056_2972_mask.nii.gz to pre_treatment folder.
Moved image73_C1077357_3040_mask.nii.gz to pre_treatment folder.
Moved image137_C123861_7461_mask.nii.gz to pre_treatment folder.
Moved image169_C136161_2399_mask.nii.gz to post_treatment folder.
Moved image49_C1047222_5968_mask.nii.gz to pre_treatment folder.
Moved image145_C123861_8538_mask.nii.gz to pre_treatment folder.
Moved image1_C136161_4634_mask.nii.gz to post_treatment folder.
Moved image289_C2313384_5967_mask.nii.gz to pre_treatment folder.
Moved image225_C136161_6244_mask.nii.gz to post_treatment folder.
Moved image241_C140466_6132_mask.nii.gz to post_treatment folder.
Moved image257_C140466_7336_mask.nii.gz to post_treatment folder.
Moved image209_C136161_4830_mask.nii.gz to post_treatment folder.
Moved image281_C19557

### Compare IDs between folders and move files from exclusion list

In [3]:


pre_treatment_directory = Path("/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/accepted/pre_treatment")
post_treatment_directory = Path("/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/accepted/post_treatment")
ids_pre = extract_unique_patient_ids_from_directory(pre_treatment_directory)
ids_post = extract_unique_patient_ids_from_directory(post_treatment_directory)
files_pre = count_files(pre_treatment_directory)
files_post = count_files(post_treatment_directory)
print(f"Unique patient IDs from the post-treatment directory: {len(ids_post)}, Files: {files_post}")
print(f"Unique patient IDs from the pre-treatment directory: {len(ids_pre)}, Files: {files_pre}")

# from the unique ids in the pre-treatment directory, filter out ids and scans belonging to certain patients that are recorded in a exclusion list
def filter_out_excluded_patients(directory_path, exclusion_list):
    """
    Filter out the patients and their scans that are in the exclusion list.
    """
    os.makedirs(os.path.join(directory_path, "filtered"), exist_ok=True)
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith(".nii.gz") or file.endswith("_mask.nii.gz"):
                patient_id = file.split("_")[1]
                if patient_id in exclusion_list:
                    file_path = os.path.join(root, file)
                    dst_path = os.path.join(directory_path, "filtered", file)
                    shutil.move(file_path, dst_path)
                    print(f"Moved {file_path} from the directory.")

#exclusion_list = ["4791484", "298046", "299321", "335968", "4636143", "4159396", "4192047", "233126"]
#filter_out_excluded_patients(pre_treatment_directory, exclusion_list)
#filter_out_excluded_patients(post_treatment_directory, exclusion_list)


Unique patient IDs from the post-treatment directory: 37, Files: 462
Unique patient IDs from the pre-treatment directory: 91, Files: 1258


### Filter patients with less than 3 scans available

In [4]:
from datetime import datetime, timedelta
# analyze the number of scans per patient in the pre-treatment directory and make sure there are at least three scans per patient
def analyze_scans_per_patient(directory_path):
    """
    Analyze the number of scans per patient in the directory.
    """
    patient_scan_counts = defaultdict(int)
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith(".nii.gz"):
                patient_id = file.split("_")[1]
                patient_scan_counts[patient_id] += 1
    return patient_scan_counts

# for patients with less than 3 scans, move them to a different directory
def move_patients_with_less_than_three_scans(directory_path, pre_treatment_scan_counts, min_scans=6):
    """
    Move the patients with less than the minimum number of scans to a different directory.
    """
    count = 0 
    os.makedirs(os.path.join(directory_path, "insufficient_scans"), exist_ok=True)
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith(".nii.gz"):
                patient_id = file.split("_")[1]
                if pre_treatment_scan_counts[patient_id] < min_scans:
                    file_path = os.path.join(root, file)
                    dst_path = os.path.join(directory_path, "insufficient_scans", file)
                    shutil.move(file_path, dst_path)
                    # print(f"Moved {file_path} to insufficient_scans directory.")
                    count = count + 1
    print(f"Moved {count} files.")


# remember to divide by two since there are two files per scan
pre_treatment_scan_counts = analyze_scans_per_patient(pre_treatment_directory)
print("Number of scans per patient in the pre-treatment directory:", pre_treatment_scan_counts)
move_patients_with_less_than_three_scans(pre_treatment_directory, pre_treatment_scan_counts)
ids_pre = extract_unique_patient_ids_from_directory(pre_treatment_directory)
ids_removed = extract_unique_patient_ids_from_directory(os.path.join(pre_treatment_directory, "insufficient_scans"))
print("Unique patient IDs from the pre-treatment directory after 3 scans condition:", len(ids_pre))
print("Unique patient IDs from the removed directory:", len(ids_removed))

Number of scans per patient in the pre-treatment directory: defaultdict(<class 'int'>, {'C1077234': 22, 'C1003434': 6, 'C1042056': 32, 'C1077357': 10, 'C123861': 56, 'C1047222': 8, 'C2313384': 2, 'C19557': 6, 'C2354097': 58, 'C2334909': 4, 'C2380788': 14, 'C2617194': 8, 'C1095315': 10, 'C1264809': 2, 'C19188': 4, 'C1026558': 4, 'C1060998': 4, 'C2568978': 12, 'C2313876': 4, 'C243540': 4, 'C1042179': 6, 'C2318181': 4, 'C2647698': 4, 'C2334663': 10, 'C1232829': 8, 'C1064934': 4, 'C2697882': 4, 'C2795544': 4, 'C311190': 4, 'C154980': 2, 'C324597': 32, 'C3396522': 60, 'C2855076': 8, 'C3400212': 6, 'C2859135': 4, 'C324351': 36, 'C3557160': 8, 'C3399720': 20, 'C24969': 2, 'C28413': 4, 'C2832936': 4, 'C3684711': 22, 'C36654': 24, 'C270477': 2, 'C33825': 4, 'C38868': 108, 'C2653971': 4, 'C3684588': 6, 'C2819775': 2, 'C3817551': 10, 'C2819898': 2, 'C3615216': 4, 'C4065273': 32, 'C3971547': 20, 'C4095900': 14, 'C4311150': 2, 'C41082': 14, 'C46863': 28, 'C52644': 16, 'C53013': 14, 'C62730': 50, 'C

### Follow-up should be at least 1 year!

In [24]:
# make sure the first and last scan per paritent are at least 1 year apart, print the patients that do not meet this criteria
def check_time_between_first_and_last_scan(directory_path, min_time_difference=365):
    """
    Check the time difference between the first and last scan for each patient.
    """
    patient_first_last_scan_dates = {}
    fail_ids = []
    for file in os.listdir(directory_path):
        if file.endswith(".nii.gz"):
            patient_id = file.split("_")[1]
            age = file.split("_")[2].split(".")[0]
            
            if patient_id not in patient_first_last_scan_dates:
                patient_first_last_scan_dates[patient_id] = []
            patient_first_last_scan_dates[patient_id].append(int(age))

    for patient_id, ages in patient_first_last_scan_dates.items():
        if len(ages) > 1:
            ages.sort()
            time_difference = ages[-1] - ages[0]
            if abs(time_difference) < min_time_difference:
                print(f"Patient {patient_id} has scans less than 1 year apart: {ages[-1]} - {ages[0]}")
                fail_ids.append(patient_id)

    return fail_ids

fail_ids = check_time_between_first_and_last_scan(pre_treatment_directory)
# count the number of scans (files have format imageXYZ_patientID_scanID.nii.gz) of those patients listed the patient_first_last_scan_dates dictionary as keys above
scan_counts = {}
for file in os.listdir(pre_treatment_directory):
    if file.endswith(".nii.gz"):
        patient_id = file.split("_")[1]
        if patient_id in fail_ids:
            if patient_id not in scan_counts:
                scan_counts[patient_id] = 1
            else:
                scan_counts[patient_id] += 1
print(scan_counts)

#total numebr of files that should be moved due to the time difference condition
total_files = sum(scan_counts.values())
print(f"Total number of files that should be moved due to the time difference condition: {total_files}")

Patient C1042179 has scans less than 1 year apart: 1319 - 1256
Patient C1077357 has scans less than 1 year apart: 3195 - 3034
Patient C19557 has scans less than 1 year apart: 743 - 610
Patient C2617194 has scans less than 1 year apart: 5351 - 5332
Patient C3400212 has scans less than 1 year apart: 752 - 676
Patient C3557160 has scans less than 1 year apart: 2096 - 1849
Patient C3684588 has scans less than 1 year apart: 5967 - 5876
Patient C39606 has scans less than 1 year apart: 1367 - 1301
Patient C520905 has scans less than 1 year apart: 3691 - 3470
Patient C663339 has scans less than 1 year apart: 7245 - 7074
{'C1042179': 6, 'C1077357': 10, 'C19557': 6, 'C2617194': 8, 'C3400212': 6, 'C3557160': 8, 'C3684588': 6, 'C39606': 6, 'C520905': 8, 'C663339': 6}
Total number of files that should be moved due to the time difference condition: 70


In [25]:
# move the patients that do not meet the time difference criteria to a different directory
def move_patients_with_less_than_min_time_difference(directory_path, fail_ids):
    """
    Move the patients with less than the minimum time difference between first and last scan to a different directory.
    """
    os.makedirs(os.path.join(directory_path, "time_difference"), exist_ok=True)
    count = 0
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith(".nii.gz"):
                patient_id = file.split("_")[1]
                if patient_id in fail_ids:
                    file_path = os.path.join(root, file)
                    dst_path = os.path.join(directory_path, "time_difference", file)
                    shutil.move(file_path, dst_path)
                    count = count + 1
    print(f"Moved {count} files.")
    
move_patients_with_less_than_min_time_difference(pre_treatment_directory, fail_ids)

Moved 140 files.


### Final count

In [26]:
# Replace these paths with the actual paths of your directories
final_dir = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/accepted/pre_treatment"

# Function to extract patientID and scanID
def get_identifier(file_name):
    parts = file_name.split('_')
    return parts[1], parts[2]

# Extract identifiers from both directories
identifiers_ = set()
ids_ = set()
for file_name in os.listdir(final_dir):
    if file_name.endswith(".nii.gz"):
        patid, scanid = get_identifier(file_name)
        identifiers_.add((patid, scanid))
        ids_.add(patid)


print(f"Number of unique patients in final CBTN set: {len(ids_)}")
print(f"Number of total files in final CBTN data: {len(identifiers_)}, scans {int(len(identifiers_) /2)}")



Number of unique patients in final CBTN set: 45
Number of total files in final CBTN data: 1064, scans 532


In [27]:
patient_ids = []
files_per_patient = {}

for file in os.listdir(final_dir):
    if file.endswith(".nii.gz"):
        patient_id = file.split("_")[1]
        patient_ids.append(patient_id)
        if patient_id not in files_per_patient:
            files_per_patient[patient_id] = 1
        else:
            files_per_patient[patient_id] += 1

total_patients = len(set(patient_ids))

print("Total number of patients:", total_patients)
print("Number of files per patient:")
for patient_id, file_count in files_per_patient.items():
    print(f"Patient {patient_id}: {int(file_count / 2)} scans")


Total number of patients: 45
Number of files per patient:
Patient C1077234: 11 scans
Patient C1003434: 3 scans
Patient C1042056: 16 scans
Patient C123861: 28 scans
Patient C1047222: 4 scans
Patient C2354097: 29 scans
Patient C2380788: 7 scans
Patient C1095315: 5 scans
Patient C2568978: 6 scans
Patient C2334663: 5 scans
Patient C1232829: 4 scans
Patient C3396522: 30 scans
Patient C324597: 16 scans
Patient C2855076: 4 scans
Patient C324351: 18 scans
Patient C3399720: 10 scans
Patient C3684711: 11 scans
Patient C36654: 12 scans
Patient C38868: 54 scans
Patient C3817551: 5 scans
Patient C4065273: 16 scans
Patient C3971547: 10 scans
Patient C46863: 14 scans
Patient C52644: 8 scans
Patient C53013: 7 scans
Patient C62730: 25 scans
Patient C63099: 4 scans
Patient C66420: 12 scans
Patient C735540: 12 scans
Patient C4312872: 3 scans
Patient C4095900: 7 scans
Patient C77490: 5 scans
Patient C4309797: 4 scans
Patient C744642: 4 scans
Patient C41082: 7 scans
Patient C860508: 14 scans
Patient C83689