In [5]:
import os
import re
import pandas as pd
from pathlib import Path
import numpy as np
import shutil
import nibabel as nib
from tqdm import tqdm
from collections import defaultdict
from datetime import datetime

### Helper functions

In [2]:
def prefix_zeros_to_six_digit_ids(patient_id):
    """
    Adds 0 to the beginning of 6-digit patient IDs.
    """
    str_id = str(patient_id)
    if len(str_id) == 6:
        # print(f"Found a 6-digit ID: {str_id}. Prefixing a '0'.")
        patient_id = "0" + str_id

    else:
        patient_id = str_id
    return patient_id

def extract_unique_patient_ids_from_csv(csv_path):
    cohort = "BCH"
    id_column = "BCH MRN"
    df = pd.read_csv(csv_path)
    ids = set(df[id_column])
 
    print(f"Length of {cohort} cohort is {len(ids)}.")
    return ids

### Clinical .csv comparison

In [3]:
##############################################
# EDIT THESE VARIABLES BEFORE RUNNING SCRIPT #
##############################################
csv_path_old = Path("/home/jc053/GIT/mri_longitudinal_analysis/data/input/clinical/redcap_full_108_cohort.csv")
csv_path_new = Path("/home/jc053/GIT/mri_longitudinal_analysis/data/input/clinical/bch_filtering_68_.csv")
##############################################

ids_old = extract_unique_patient_ids_from_csv(csv_path_old)
ids_new = extract_unique_patient_ids_from_csv(csv_path_new)
#print(f"These are the old ids:", ids_old)
#print(f"These are the new ids:", ids_new)

ids_zero_filled_old = []
ids_zero_filled_new = []
for id in ids_old:
    ids_zero_filled_old.append(prefix_zeros_to_six_digit_ids(id))
for id in ids_new:
    ids_zero_filled_new.append(prefix_zeros_to_six_digit_ids(id))
    
#print(f"These are the old ids with 0s added:", ids_zero_filled_old)
#print(f"These are the new ids with 0s added:", ids_zero_filled_new)

diff = list(set(ids_zero_filled_old) - set(ids_zero_filled_new))
#print(f"The difference between the two lists is:", len(diff))


# Calculating the difference: IDs in the old set but not in the new set
ids_in_old_not_in_new = list(set(ids_zero_filled_old) - set(ids_zero_filled_new))
print(f"IDs in old set but not in new set: {len(ids_in_old_not_in_new)}")
print(ids_in_old_not_in_new)

# Calculating the difference: IDs in the new set but not in the old set
ids_in_new_not_in_old = list(set(ids_zero_filled_new) - set(ids_zero_filled_old))
print(f"IDs in new set but not in old set: {len(ids_in_new_not_in_old)}")
print(ids_in_new_not_in_old)

Length of BCH cohort is 108.
Length of BCH cohort is 68.
IDs in old set but not in new set: 40
['1053918', '1144789', '4319063', '4446126', '4485510', '5127658', '4073188', '4565140', '4179167', '1148595', '1115940', '2268068', '1132366', '4199911', '1071544', '2126809', '0238268', '4300567', '2184255', '4022683', '4394032', '0973766', '0233126', '1109676', '4099295', '2270579', '2104688', '4864792', '2326050', '5210065', '1017646', '2124457', '2088643', '2158479', '4032520', '4490520', '2249514', '1182323', '4466091', '1153221']
IDs in new set but not in old set: 0
[]


### Get the treatment dates and move files into pre/post treatment folder -> old, first iteration

In [28]:
# filter by the treatment date to get all scans before treatment
def extract_treatment_dates(clinical_df, patient_id):
    """
    Extract the dates of treatments from the clinical data for a specific patient.
    Parameters:
    - patient_id (str): The ID of the patient.
    Returns:
    - treatment_dates (dict): A dictionary of treatment types and their corresponding
    dates for the specified patient.
    """
    patient_data = clinical_df[clinical_df["BCH MRN"] == patient_id]
    if patient_data.empty:
        return None
    
    patient_data = patient_data.iloc[0]

    treatment_dates = {}

    if patient_data["Surgical Resection"] == "Yes":
        treatment_dates["Surgery"] = patient_data["Date of first surgery"]

    if patient_data["Systemic therapy before radiation"] == "Yes":
        treatment_dates["Chemotherapy"] = patient_data["Date of Systemic Therapy Start"]

    if patient_data["Radiation as part of initial treatment"] == "Yes":
        treatment_dates["Radiation"] = patient_data["Start Date of Radiation"]

    # print(f"\tPatient {patient_id} - Treatment Dates: {treatment_dates}")
    treatment_dates = [
        pd.to_datetime(date, dayfirst=True)
        for date in treatment_dates.values()
        if pd.notnull(date)
    ]

    first_treatment_date = min(treatment_dates, default=pd.NaT)
    return first_treatment_date    

def move_files_to_treatment_folders(clinical_csv_path, source_directory, destination_directory):
    clinical_df = pd.read_csv(clinical_csv_path)
    os.makedirs(os.path.join(destination_directory, 'pre_treatment'), exist_ok=True)
    os.makedirs(os.path.join(destination_directory, 'post_treatment'), exist_ok=True)
    count = 0 
    pats_failed = set()
    for id in clinical_df['BCH MRN']:
        id = str(id)
        if len(id) == 6:
            if id == '233126' or id == '973766':
                id = str(id)
                clinical_df['BCH MRN'] = clinical_df['BCH MRN'].replace(int(id), id)
                continue
            id = "0" + id
            clinical_df['BCH MRN'] = clinical_df['BCH MRN'].replace(int(id), id)
        else:
            id = str(id)
            clinical_df['BCH MRN'] = clinical_df['BCH MRN'].replace(int(id), id)

    print(clinical_df['BCH MRN'].values)
    for file_name in os.listdir(source_directory):
        if file_name.endswith('.nii.gz') or file_name.endswith('_mask.nii.gz'):
            parts = file_name.split('_')
            patient_id = parts[1]
            scan_id = parts[2]
            if scan_id.endswith('_mask.nii.gz'):
                scan_id = scan_id.split('_')[0]
            else:
                scan_id = scan_id.split('.')[0]
            scan_id = pd.to_datetime(scan_id)

                                                                     
            if str(patient_id) not in clinical_df['BCH MRN'].values:
                print(f"Patient ID {patient_id} not found in clinical data. Skipping file {file_name}.")
                count = count + 1
                pats_failed.add(patient_id)
                continue
            
            first_treatment_date = extract_treatment_dates(clinical_df, patient_id)
            if first_treatment_date is None:
                print(f"Patient ID {patient_id} has no treatment date.")
                treatment_period = 'pre_treatment'
            else:
                treatment_period = 'pre_treatment' if scan_id < first_treatment_date else 'post_treatment'
                
            # Move the file to the appropriate subfolder
            src_path = os.path.join(source_directory, file_name)
            dest_path = os.path.join(destination_directory, treatment_period, file_name)
            shutil.move(src_path, dest_path)
            # print(f"Moved {file_name} to {treatment_period} folder.")
    print(f"Failed on {count} files.  {len(pats_failed)} patients failed: {pats_failed}")

clinical_csv_path = Path("/home/jc053/GIT/mri_longitudinal_analysis/data/input/clinical/redcap_full_108_cohort.csv")
source_directory = destination_directory = Path("/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/old_seg_accepted")
move_files_to_treatment_folders(clinical_csv_path, source_directory, destination_directory)

['0137476' '0238268' '1013946' '1058916' '233126' '1071544' '1138934'
 '1148595' '1109676' '1194890' '1232179' '1132366' '2001398' '2088643'
 '2103993' '1017646' '1144789' '2088116' '1053918' '0135939' '2124457'
 '4199911' '4394032' '4393612' '2306428' '4416410' '4132691' '4319063'
 '973766' '2260520' '4466091' '4032520' '4345209' '4489651' '2147101'
 '1115940' '4304956' '5002720' '2183847' '4300567' '2158479' '2184255'
 '5048067' '2249514' '4022683' '4624899' '2126809' '2173072' '2268068'
 '2270579' '2280828' '2104688' '4695947' '4092758' '5019569' '5208771'
 '4362479' '4635148' '4565140' '4446126' '4485510' '4490520' '2326050'
 '4802764' '4303399' '1182323' '4305171' '2297464' '2316922' '4073188'
 '4015437' '4052777' '1153221' '4099295' '4098993' '4108745' '4571440'
 '4137900' '2004560' '2113964' '4155943' '4179167' '4803246' '4931993'
 '5412608' '2261605' '4228140' '4252068' '4318694' '4348109' '4450936'
 '4455045' '4478592' '4505982' '4572857' '4647390' '4690530' '4857369'
 '486479

### Get the treatment dates for the new iteration where a 'First Treatment' column is defined

In [None]:
def get_first_treatment_dates_new(clinical_csv_path):
    clinical_df = pd.read_csv(clinical_csv_path)
    treatment_dates = {}
    for _, row in clinical_df.iterrows():
        patient_id = row['BCH MRN']
        date_str = row['First Treatment']
        if pd.isna(date_str):  # Check if treatment date is not NaN
            treatment_dates[patient_id] = None  # No treatment date recorded
        else:
            treatment_dates[patient_id] = datetime.strptime(date_str, '%d/%m/%Y')
    return treatment_dates

### Compare IDs between folders and move files from exclusion list

In [6]:
# get the unique ids from the directory where the files were moved
def extract_unique_patient_ids_from_directory(directory_path):
    """
    Extracts the unique patient IDs from the directory where the files were moved.
    """
    patient_ids = set()
    for file in os.listdir(directory_path):
        if file.endswith(".nii.gz"):
            patient_id = file.split("_")[1]
            patient_ids.add(patient_id)
    return patient_ids

def count_files(directory):
    return len([name for name in os.listdir(directory) if os.path.isfile(os.path.join(directory, name))])


pre_treatment_directory = Path("/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/old_seg_accepted/pre_treatment")
post_treatment_directory = Path("/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/old_seg_accepted/post_treatment")
initial_after_qa_directory = Path("/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/old_seg_accepted_copy")
initial_before_qa_directory = Path("/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/old_all_files_before_seg_review")
ids_pre = extract_unique_patient_ids_from_directory(pre_treatment_directory)
ids_post = extract_unique_patient_ids_from_directory(post_treatment_directory)
ids_initial_after_qa = extract_unique_patient_ids_from_directory(initial_after_qa_directory)
ids_initial_before_qa = extract_unique_patient_ids_from_directory(initial_before_qa_directory)
files_pre = count_files(pre_treatment_directory)
files_post = count_files(post_treatment_directory)
files_initial_after_qa = count_files(initial_after_qa_directory)
files_initial_before_qa = count_files(initial_before_qa_directory)
print(f"Unique patient IDs from the post-treatment directory: {len(ids_post)}, Files: {files_post}")
print(f"Unique patient IDs from the pre-treatment directory: {len(ids_pre)}, Files: {files_pre}")
print(f"Unique patient IDs from the initial directory after q&a: {len(ids_initial_after_qa)}, Files: {files_initial_after_qa}")
print(f"Unique patient IDs from the initial directory before q&a: {len(ids_initial_before_qa)}, Files: {files_initial_before_qa}")

# from the unique ids in the pre-treatment directory, filter out ids and scans belonging to certain patients that are recorded in a exclusion list
def filter_out_excluded_patients(directory_path, exclusion_list):
    """
    Filter out the patients and their scans that are in the exclusion list.
    """
    os.makedirs(os.path.join(directory_path, "filtered"), exist_ok=True)
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith(".nii.gz") or file.endswith("_mask.nii.gz"):
                patient_id = file.split("_")[1]
                if patient_id in exclusion_list:
                    file_path = os.path.join(root, file)
                    dst_path = os.path.join(directory_path, "filtered", file)
                    shutil.move(file_path, dst_path)
                    print(f"Moved {file_path} from the directory.")

exclusion_list = ["4791484", "298046", "299321", "335968", "4636143", "4159396", "4192047", "233126"]
filter_out_excluded_patients(pre_treatment_directory, exclusion_list)
filter_out_excluded_patients(post_treatment_directory, exclusion_list)


Unique patient IDs from the post-treatment directory: 84, Files: 1772
Unique patient IDs from the pre-treatment directory: 26, Files: 358
Unique patient IDs from the initial directory after q&a: 90, Files: 2200
Unique patient IDs from the initial directory before q&a: 102, Files: 3270
Moved /mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/old_seg_accepted/post_treatment/filtered/image1170_233126_19981209.nii.gz from the directory.
Moved /mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/old_seg_accepted/post_treatment/filtered/image1170_233126_19981209_mask.nii.gz from the directory.
Moved /mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/old_seg_accepted/post_treatment/filtered/image1171_233126_19991105.nii.gz from the directory.
Moved /mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/old_seg_accepted/post_treatment/filtered/image1171_233126_19991105_mask.nii.gz from

### Filter patients with less than 3 scans available

In [7]:
from datetime import datetime, timedelta
# analyze the number of scans per patient in the pre-treatment directory and make sure there are at least three scans per patient
def analyze_scans_per_patient(directory_path):
    """
    Analyze the number of scans per patient in the directory.
    """
    patient_scan_counts = defaultdict(int)
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith(".nii.gz"):
                patient_id = file.split("_")[1]
                patient_scan_counts[patient_id] += 1
    return patient_scan_counts

pre_treatment_scan_counts = analyze_scans_per_patient(pre_treatment_directory)
print("Number of scans per patient in the pre-treatment directory:", pre_treatment_scan_counts)
# for patients with less than 3 scans, move them to a different directory
def move_patients_with_less_than_three_scans(directory_path, pre_treatment_scan_counts, min_scans=6):
    """
    Move the patients with less than the minimum number of scans to a different directory.
    """
    count = 0 
    os.makedirs(os.path.join(directory_path, "insufficient_scans"), exist_ok=True)
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith(".nii.gz"):
                patient_id = file.split("_")[1]
                if pre_treatment_scan_counts[patient_id] < min_scans:
                    file_path = os.path.join(root, file)
                    dst_path = os.path.join(directory_path, "insufficient_scans", file)
                    shutil.move(file_path, dst_path)
                    # print(f"Moved {file_path} to insufficient_scans directory.")
                    count = count + 1
    print(f"Moved {count} files.")

move_patients_with_less_than_three_scans(pre_treatment_directory, pre_treatment_scan_counts)

ids_pre = extract_unique_patient_ids_from_directory(pre_treatment_directory)
ids_removed = extract_unique_patient_ids_from_directory(os.path.join(pre_treatment_directory, "insufficient_scans"))
print("Unique patient IDs from the pre-treatment directory after 3 scans condition:", len(ids_pre))
print("Unique patient IDs from the removed directory:", len(ids_removed))

# remember to divide by two the values in the dictionery since the masks are also counted

Number of scans per patient in the pre-treatment directory: defaultdict(<class 'int'>, {'0137476': 12, '1058916': 6, '1148595': 6, '2088643': 8, '2103993': 18, '2173072': 10, '2280828': 6, '2316922': 8, '4015437': 16, '4108745': 34, '4137900': 6, '4155943': 14, '4450936': 8, '4455045': 18, '4478592': 6, '4505982': 14, '4572857': 8, '4647390': 54, '4857369': 10, '4923951': 10, '5046466': 18, '5238412': 6, '5531498': 20, '2124457': 30, '2158479': 6, '4490520': 6, '1013946': 2, '1138934': 2, '2126809': 2, '2270579': 2, '4864792': 2, '4975776': 2, '5127658': 2, '5210065': 2, '2184255': 2, '2326050': 2, '4022683': 2, '4032520': 2, '4300567': 2, '4446126': 2, '4466091': 2, '4485510': 2, '2001398': 4, '4252068': 4})
Moved 40 files.
Unique patient IDs from the pre-treatment directory after 3 scans condition: 26
Unique patient IDs from the removed directory: 18


### Follow-up should be at least 1 year!

In [8]:
# make sure the first and last scan per paritent are at least 1 year apart, print the patients that do not meet this criteria
def check_time_between_first_and_last_scan(directory_path, min_time_difference=timedelta(days=365)):
    """
    Check the time difference between the first and last scan for each patient.
    """
    patient_first_last_scan_dates = {}
    fail_ids = []
    for file in os.listdir(directory_path):
        if file.endswith(".nii.gz"):
            patient_id = file.split("_")[1]
            scan_date = file.split("_")[2].split(".")[0]
            scan_date = datetime.strptime(scan_date, "%Y%m%d")
            if patient_id not in patient_first_last_scan_dates:
                patient_first_last_scan_dates[patient_id] = [scan_date, scan_date]
            else:
                if scan_date < patient_first_last_scan_dates[patient_id][0]:
                    patient_first_last_scan_dates[patient_id][0] = scan_date
                if scan_date > patient_first_last_scan_dates[patient_id][1]:
                    patient_first_last_scan_dates[patient_id][1] = scan_date

    for patient_id, dates in patient_first_last_scan_dates.items():
        first_scan_date, last_scan_date = dates
        time_difference = last_scan_date - first_scan_date
        if time_difference < min_time_difference:
            print(f"Patient {patient_id} has scans less than 1 year apart: {first_scan_date} - {last_scan_date}")
            fail_ids.append(patient_id)

    return fail_ids

fail_ids = check_time_between_first_and_last_scan(pre_treatment_directory)
# count the number of scans (files have format imageXYZ_patientID_scanID.nii.gz) of those patients listed the patient_first_last_scan_dates dictionary as keys above
scan_counts = {}
for file in os.listdir(pre_treatment_directory):
    if file.endswith(".nii.gz"):
        patient_id = file.split("_")[1]
        if patient_id in fail_ids:
            if patient_id not in scan_counts:
                scan_counts[patient_id] = 1
            else:
                scan_counts[patient_id] += 1
print(scan_counts)


Patient 1058916 has scans less than 1 year apart: 2000-07-06 00:00:00 - 2001-04-27 00:00:00
Patient 4450936 has scans less than 1 year apart: 2011-10-08 00:00:00 - 2012-09-12 00:00:00
Patient 5238412 has scans less than 1 year apart: 2018-09-19 00:00:00 - 2019-08-28 00:00:00
Patient 4490520 has scans less than 1 year apart: 2012-12-13 00:00:00 - 2013-08-21 00:00:00
{'1058916': 6, '4450936': 8, '5238412': 6, '4490520': 6}


### Final comparison with new dataset

In [10]:
# Replace these paths with the actual paths of your directories
directory1_path = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/new_review/after_review_before_pp/output/seg_predictions"
directory2_path = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/old_seg_accepted/pre_treatment"


# Function to extract patientID and scanID
def get_identifier(file_name, from_directory1=True):
    parts = file_name.split('_')
    if from_directory1:
        # From the first directory (patientID_ScanID_hash.nii.gz)
        return parts[0], parts[1]
    else:
        # From the second directory (imageXX_patientID_scanID.nii.gz)
        return parts[1], parts[2]

# Extract identifiers from both directories
identifiers_dir1 = set()
ids_dir1 = set()
for file_name in os.listdir(directory1_path):
    if file_name.endswith(".nii.gz"):
        patid, scanid = get_identifier(file_name)
        identifiers_dir1.add((patid, scanid))
        ids_dir1.add(patid)

identifiers_dir2 = set()
ids_dir2 = set()
for file_name in os.listdir(directory2_path):
    if file_name.endswith("_mask.nii.gz"):
        patid, scanid = get_identifier(file_name, from_directory1=False)
        identifiers_dir2.add((patid, scanid))
        ids_dir2.add(patid)


print("Number of total scans in dir 1:", len(identifiers_dir1))
print(f"Number of unique patients in Directory 1: {len(ids_dir1)}")
common_patients_set_1 = ids_dir1.intersection(final_set)
mismatch = final_set - common_patients_set_1
print(f"Common patients between the dir1 and the final set: {len(common_patients_set_1)}")
print(f"Mismatch between the final set and the dir1: {len(mismatch), mismatch}")

print("Numebr of total scans in dir 2:", len(identifiers_dir2))
print(f"Number of unique patients in Directory 2: {len(ids_dir2)}")
common_patients_set_2 = ids_dir2.intersection(final_set)
mismatch = final_set - common_patients_set_2
print(f"Common patients between the dir2 and the final set: {len(common_patients_set_2)}")
print(f"Mismatch between the final set and the dir2: {len(mismatch), mismatch}")

#unique_to_dir1 = identifiers_dir1 - identifiers_dir2
#unique_to_dir2 = identifiers_dir2 - identifiers_dir1
#common_identifiers = identifiers_dir1.intersection(identifiers_dir2)
#common_patients = ids_dir1.intersection(ids_dir2)
#print(f"Unique identifiers to Directory 1: {len(unique_to_dir1)}")
#print(f"Unique identifiers to Directory 2: {len(unique_to_dir2)}")
#print(f"Common identifiers between the directories: {len(common_identifiers)}")
#print(f"Common patients between the directories: {len(common_patients)}")

Number of total scans in dir 1: 1407
Number of unique patients in Directory 1: 85


NameError: name 'final_set' is not defined