In [37]:
import os
import re
import pandas as pd
from pathlib import Path
import numpy as np
import shutil
from tqdm import tqdm
from collections import defaultdict
from datetime import datetime

## BCH Data (Additional): Change dates in clinical csv dataset to right format


In [13]:
# Function to convert dates to DD/MM/YYYY format
def convert_date(date_item):
    # Check for NaN values or empty strings and return as is
    if pd.isnull(date_item) or date_item == '':
        return np.nan

    # Convert to string and remove time part if present
    date_str = str(date_item).strip().split(' ')[0]

    # Define date formats for parsing
    date_formats = ("%m/%d/%Y", "%m/%d/%y", "%Y-%m-%d")

    for fmt in date_formats:
        try:
            # Parse date
            dt = datetime.strptime(date_str, fmt)

            if fmt == "%m/%d/%y":
                if dt.year >= 70:  # Cutoff year: 1970
                    dt = dt.replace(year=dt.year + 1900)
                else:
                    dt = dt.replace(year=dt.year + 2000)

            # Check if the month is greater than 12, then swap month and day
            if dt.month > 12:
                dt = dt.replace(month=dt.day, day=dt.month)

            # Convert to desired format
            return "'" + dt.strftime("%d/%m/%Y")
        except ValueError:
            continue  # if the format does not match, it will pass to the next format



# Function to process the Excel file
def process_excel(df, column_name):
    # Read the Excel file
    print("Column: ", column_name)
    # Check if the column exists
    if column_name not in df.columns:
        raise ValueError(f"The column {column_name} does not exist in the Excel file.")

    # Convert all dates in the column to DD/MM/YYYY format
    df[column_name] = df[column_name].apply(lambda x: convert_date(str(x)) if not pd.isnull(x) else x)


In [14]:
dates = ["8/26/82", "10/24/83", "02/10/1987", "07/12/1983"]
converted_dates = [convert_date(d) for d in dates]

for original, converted in zip(dates, converted_dates):
    print(f"Original: {original}, Converted: {converted}")

Original: 8/26/82, Converted: '26/08/3882
Original: 10/24/83, Converted: '24/10/3883
Original: 02/10/1987, Converted: '10/02/1987
Original: 07/12/1983, Converted: '12/07/1983


In [15]:
input_excel_path = '/home/jc053/GIT/mri_longitudinal_analysis/data/redcap/bch_new_redcap.xlsx'
output_excel_path = 'revised_dates.xlsx'
date_column_names = ['Date of Birth', 'Date of last clinical follow-up', 'Date of diagnosis (Path, and MRI if no biopsy)', 'Date of MRI diagnosis','Date of first surgery', 'Date of Systemic Therapy Start','Start Date of Radiation'  ]

# Call the function with the paths and column name
df = pd.read_excel(input_excel_path)
for column in date_column_names:
    process_excel(df, column)
    
# Save the revised dates back to the Excel file
#df.to_excel(output_excel_path, index=False)

Column:  Date of Birth
Column:  Date of last clinical follow-up
Column:  Date of diagnosis (Path, and MRI if no biopsy)
Column:  Date of MRI diagnosis
Column:  Date of first surgery
Column:  Date of Systemic Therapy Start
Column:  Start Date of Radiation


## BCH Data: Get the set of all patients for given .csv files

In [38]:
def prefix_zeros_to_six_digit_ids(patient_id):
    """
    Adds 0 to the beginning of 6-digit patient IDs.
    """
    str_id = str(patient_id)
    if len(str_id) == 6:
        # print(f"Found a 6-digit ID: {str_id}. Prefixing a '0'.")
        patient_id = "0" + str_id

    else:
        patient_id = str_id
    return patient_id

def extract_unique_patient_ids_from_csv(csv_path):
    cohort = "BCH"
    id_column = "BCH MRN"
    df = pd.read_csv(csv_path)
    ids = set(df[id_column])
 
    print(f"Length of {cohort} cohort is {len(ids)}.")
    

In [40]:
csv_path_bch_init = "/home/jc053/GIT/mri_longitudinal_analysis/data/input/clinical/bch_filtered_88.csv"
extract_unique_patient_ids_from_csv(csv_path_bch_init)

Length of BCH cohort is 88.


In [6]:
##############################################
# EDIT THESE VARIABLES BEFORE RUNNING SCRIPT #
##############################################
csv_path_old = Path("/home/jc053/GIT/mri_longitudinal_analysis/data/input/clinical/redcap_full_108_cohort.csv")
csv_path_new = Path("/home/jc053/GIT/mri_longitudinal_analysis/data/input/clinical/bch_initial.csv")
##############################################





print(f"Length of old {cohort} cohort is {len(ids_old)}.")
print(f"Length of new {cohort} cohort is {len(ids_new)}.")
print(f"These are the old ids:", ids_old)
print(f"These are the new ids:", ids_new)

ids_zero_filled_old = []
ids_zero_filled_new = []
for id in ids_old:
    ids_zero_filled_old.append(prefix_zeros_to_six_digit_ids(id))
for id in ids_new:
    ids_zero_filled_new.append(prefix_zeros_to_six_digit_ids(id))
    
print(f"These are the old ids with 0s added:", ids_zero_filled_old)
print(f"These are the new ids with 0s added:", ids_zero_filled_new)

diff = list(set(ids_zero_filled_old) - set(ids_zero_filled_new))
print(f"The difference between the two lists is:", len(diff))


# Calculating the difference: IDs in the old set but not in the new set
ids_in_old_not_in_new = list(set(ids_zero_filled_old) - set(ids_zero_filled_new))
print(f"IDs in old set but not in new set: {len(ids_in_old_not_in_new)}")
print(ids_in_old_not_in_new)

# Calculating the difference: IDs in the new set but not in the old set
ids_in_new_not_in_old = list(set(ids_zero_filled_new) - set(ids_zero_filled_old))
print(f"IDs in new set but not in old set: {len(ids_in_new_not_in_old)}")
print(ids_in_new_not_in_old)


Length of old BCH cohort is 108.
Length of new BCH cohort is 86.
These are the old ids: [137476, 238268, 1013946, 1058916, 233126, 1071544, 1138934, 1148595, 1109676, 1194890, 1232179, 1132366, 2001398, 2088643, 2103993, 1017646, 1144789, 2088116, 1053918, 135939, 2124457, 4199911, 4394032, 4393612, 2306428, 4416410, 4132691, 4319063, 973766, 2260520, 4466091, 4032520, 4345209, 4489651, 2147101, 1115940, 4304956, 5002720, 2183847, 4300567, 2158479, 2184255, 5048067, 2249514, 4022683, 4624899, 2126809, 2173072, 2268068, 2270579, 2280828, 2104688, 4695947, 4092758, 5019569, 5208771, 4362479, 4635148, 4565140, 4446126, 4485510, 4490520, 2326050, 4802764, 4303399, 1182323, 4305171, 2297464, 2316922, 4073188, 4015437, 4052777, 1153221, 4099295, 4098993, 4108745, 4571440, 4137900, 2004560, 2113964, 4155943, 4179167, 4803246, 4931993, 5412608, 2261605, 4228140, 4252068, 4318694, 4348109, 4450936, 4455045, 4478592, 4505982, 4572857, 4647390, 4690530, 4857369, 4864792, 4923951, 4975776, 5029974

## Check that the set of patient has available imaging
Available imgaging is considered when in the folder with the data there is a patient_ID.csv with the paths and metadata and a patient_ID folder with the images.

In [7]:
def check_missing_patient_files_and_folders(patient_ids_set, directory_path):
    # List to store the result strings for missing files/folders
    missing_items = []

    # Iterate over each unique patient ID
    for patient_id in patient_ids_set:
        # Construct the expected csv file and folder name
        expected_csv_file = os.path.join(directory_path, f"{patient_id}.csv")
        expected_folder = os.path.join(directory_path, f"{patient_id}")

        # Check for existence of the expected csv file and folder
        has_csv = os.path.isfile(expected_csv_file)
        has_folder = os.path.isdir(expected_folder)

        # Prepare the result string if items are missing
        if not has_csv or not has_folder:
            missing_str = f"Patient ID: {patient_id} -"
            if not has_csv and not has_folder:
                missing_str += " Both CSV and Folder missing"
            elif not has_csv:
                missing_str += " CSV missing"
            elif not has_folder:
                missing_str += " Folder missing"
            missing_items.append(missing_str)

    return missing_items

In [8]:
##############################################
# EDIT THESE VARIABLES BEFORE RUNNING SCRIPT #
##############################################
data_path = Path("/mnt/an225/Anna/longitudinal_nifti_BCH/curated_BCH/")
missing_patient_items = check_missing_patient_files_and_folders(ids_zero_filled, data_path)

# Output the results
if len(missing_patient_items) == 0:
    print("No missing items found.")
else:
    for missing_item in missing_patient_items:
        print(missing_item)
##############################################

No missing items found.


## Count paitents before and after segmentation review

In [9]:
def count_patient_ids(folder_path):
    """counts the number of unique patient IDs in a given folder based on the filenames."""
    # Regular expression to match the patient ID in the file name
    pattern = re.compile(r"_(\d{7})_")

    # Set to store unique patient IDs
    patient_ids = set()

    # Iterate over files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith(".nii.gz"):
            match = pattern.search(filename)
            if match:
                patient_ids.add(match.group(1))

    # Return the count of unique patient IDs
    return patient_ids

In [10]:
folder_path_bch_before_review = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/all_files_before_review"
folder_path_bch_after_review = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/accepted"
pat_ids_before_review = count_patient_ids(folder_path_bch_before_review)
pat_ids_after_review = count_patient_ids(folder_path_bch_after_review)
print("IDs before review:",len(pat_ids_before_review))
print("IDs after review:", len(pat_ids_after_review))

IDs before review: 100
IDs after review: 88


In [11]:
overlapping_ids_before = pat_ids_before_review.intersection(ids_zero_filled)
overlapping_ids_after = pat_ids_after_review.intersection(ids_zero_filled)
print(f"Overlapping patient IDs Before: {len(overlapping_ids_before)}")
print(f"Overlapping patient IDs After: {len(overlapping_ids_after)}")

Overlapping patient IDs Before: 98
Overlapping patient IDs After: 86


## ------------------------------------------------------------------------------------------------------

## CBTN Data: Review

In [2]:
# get the csv with the data to review
def read_file(csv_file):
    """
    Read and return a dataframe from a CSV file.

    Args:
        csv_file (str): Path to the CSV file.

    Returns:
        pd.DataFrame: Loaded dataframe with the CSV data.
    """
    column_names = ["Image_Name", "Quality"]
    d_f = pd.read_csv(csv_file, names=column_names)
    return d_f

def delete_entries(df):
    print(df["Quality"].unique())
    cleaned_df = df[df["Quality"] == 5]
    print(cleaned_df.head(5))
    cleaned_df.to_csv("cleaned.csv", index=False, header=False)

In [3]:
csv_path = "/home/jc053/GIT/mri_longitudinal_analysis/mri_longitudinal_analysis/utils/annotations_after_1st_review_converted.csv"
df_cbtn = read_file(csv_path)
delete_entries(df_cbtn)

[1 5 2]
               Image_Name  Quality
37   C1042056_2383.nii.gz        2
43   C1042056_2972.nii.gz        2
49   C1042056_3597.nii.gz        2
69    C1072314_305.nii.gz        2
101  C1095315_6554.nii.gz        2


## CBTN Data: Patient IDs after Pre / Post Separation

In [17]:
def extract_unique_patient_ids_from_csv(csv_path):
    cohort = "CBTN"
    id_column = "Patient_ID"
    df = pd.read_csv(csv_path)
    ids = set(df[id_column])
 
    print(f"Length of {cohort} cohort is {len(ids)}.")
    
csv_path_cbtn = "/home/jc053/GIT/mri_longitudinal_analysis/data/output/clinical_data/cbtn_pre_event.csv"
extract_unique_patient_ids_from_csv(csv_path_cbtn)


Length of CBTN cohort is 440.


## CBTN Data: Patient IDs after 1st and 2nd review

In [31]:
unique_ids_1st_review = set()
unique_ids_2nd_review = set()
# Function to process a single directory
def process_directory(directory, unique_set):
    for filename in os.listdir(directory):
        if filename.endswith(".nii.gz"):
            # Extracting the part before the first underscore
            patient_id = filename.split('_')[0]
            # Check if the ID starts with 'C' followed by digits
            if patient_id.startswith('C') and patient_id[1:].isdigit():
                unique_set.add(patient_id)

def extract_unique_patient_ids(dir1, dir2, dir3, unique_set):
    # Process each directory
    process_directory(dir1, unique_set)
    process_directory(dir2, unique_set)
    process_directory(dir3, unique_set)

    return unique_set

# Example usage
dir1 = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/1"  
dir2 = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/2"
dir3 = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/5"
dir4 = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/2 - Found replacement"
dir5 = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/5 - Found replacement"

unique_patient_ids_after_1st_review = extract_unique_patient_ids(dir1, dir2, dir3, unique_ids_1st_review)
unique_patient_ids_after_2nd_review = extract_unique_patient_ids(dir1, dir4, dir5, unique_ids_2nd_review)

print(f"Unique patient IDs after 1st review: {len(unique_patient_ids_after_1st_review)}")
print(f"Unique patient IDs after 2nd review: {len(unique_patient_ids_after_2nd_review)}")


Unique patient IDs after 1st review: 440
Unique patient IDs after 2nd review: 437


## CBTN Data: Filter Patients with less than 3 sessions / scans and move to folder before pp

In [34]:
def has_minimum_sessions(source_dirs, min_sessions=3):
    """
    Checks if each patient has at least a minimum number of session IDs across the source directories.

    Parameters:
    source_dirs (list): List of source directory paths.
    min_sessions (int): Minimum number of sessions required for each patient.

    Returns:
    set: Set of patient IDs that meet the criteria.
    """
    session_count = defaultdict(set)
    for source_dir in source_dirs:
        for filename in os.listdir(source_dir):
            if filename.endswith(".nii.gz"):
                parts = filename.split('_')
                if len(parts) >= 2 and parts[0].startswith('C') and parts[0][1:].isdigit():
                    patient_id, session_id = parts[0], parts[1]
                    session_count[patient_id].add(session_id)

    # Filter patients who meet the minimum session requirement
    return {patient for patient, sessions in session_count.items() if len(sessions) >= min_sessions}

# copy patients after 2nd review and which meet the criteria of at least 3 folders
def copy_files_from_dirs_to_new_dir(source_dirs, target_dir, valid_patient_ids):
    """
    Copy files from the given source directories to the target directory.
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    total_files = sum([len(os.listdir(source_dir)) for source_dir in source_dirs])
    with tqdm(total= total_files,desc="Copying files") as pbar:
        for source_dir in source_dirs:
            for filename in os.listdir(source_dir):
                if filename.endswith(".nii.gz"):
                    patient_id = filename.split('_')[0]
                    if patient_id in valid_patient_ids:
                        source_file = os.path.join(source_dir, filename)
                        if os.path.isfile(source_file):
                            target_file = os.path.join(target_dir, filename)
                            shutil.copyfile(source_file, target_file)
                            pbar.update(1)  
                
source_dirs = [dir1, dir4, dir5]
target_dir = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/after_review_before_pp"
valid_patient_ids = has_minimum_sessions(source_dirs)
copy_files_from_dirs_to_new_dir(source_dirs, target_dir, valid_patient_ids)

Copying files: 100%|█████████▉| 1840/1841 [00:11<00:00, 162.35it/s]


In [35]:
unique_ids_after_3_scan_filtering = set()
process_directory(target_dir, unique_ids_after_3_scan_filtering)
print(f"Unique patient IDs after 3 scan filtering: {len(unique_ids_after_3_scan_filtering)}")

Unique patient IDs after 3 scan filtering: 115


## CBTN Data: PP comparison

In [12]:
def compare_image_sets(path1, preprocessed_path):
    # Get the set of image file names in the first path
    images_set1 = {file for file in os.listdir(path1) if file.endswith('.nii.gz')}
    
    # Get the set of image file names in the second path
    preprocessed_images = {file.replace('.nii_0000.nii.gz', '.nii.gz') for file in os.listdir(preprocessed_path) if '.nii_0000.nii.gz' in file}    
    
    # Find the difference between the two sets
    missing_images = images_set1.difference(preprocessed_images)
    
    # Return the set of missing image file names
    return missing_images

In [13]:
cbtn_path_before_pp = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/after_review_before_pp"
cbtn_path_after_registration = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/output/T2W_registration/"
cbtn_path_after_brain_extraction = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/output/T2W_brain_extraction/"
missing_scans_registration = compare_image_sets(cbtn_path_before_pp, cbtn_path_after_registration)
print(f"Number of missing scans after registration: {len(missing_scans_registration)}")
print(f"Missing scans after registration: {missing_scans_registration}")
missing_scans_brain_extraction = compare_image_sets(cbtn_path_after_registration, cbtn_path_after_brain_extraction)
print(f"Number of missing scans after brain extraction: {len(missing_scans_brain_extraction)}")
print(f"Missing scans after brain extraction: {missing_scans_brain_extraction}")

Number of missing scans after registration: 14
Missing scans after registration: {'C1047222_5759.nii.gz', 'C19188_4132.nii.gz', 'C62730_5329.nii.gz', 'C2647698_3958.nii.gz', 'C62730_8431.nii.gz', 'C883263_6041.nii.gz', 'C1047222_5968.nii.gz', 'C883263_5813.nii.gz', 'C3557160_2096.nii.gz', 'C1003434_3708.nii.gz', 'C3600948_1088.nii.gz', 'C3400212_695.nii.gz', 'C1003434_4182.nii.gz', 'C62730_5975.nii.gz'}
Number of missing scans after brain extraction: 1878
Missing scans after brain extraction: {'C3817551_1771.nii_0000.nii.gz', 'C4065273_3034.nii_0000.nii.gz', 'C88683_1357.nii_0000.nii.gz', 'C4127757_786.nii_0000.nii.gz', 'C3528132_950.nii_0000.nii.gz', 'C123861_9255.nii_0000.nii.gz', 'C140466_7588.nii_0000.nii.gz', 'C801960_5763.nii_0000.nii.gz', 'C3944610_4249.nii_0000.nii.gz', 'C38868_3284.nii_0000.nii.gz', 'C41451_1228.nii_0000.nii.gz', 'C38868_6977.nii_0000.nii.gz', 'C3422967_5355.nii_0000.nii.gz', 'C4256415_3925.nii_0000.nii.gz', 'C431607_5012.nii_0000.nii.gz', 'C36654_2992.nii_000

## CBTN Data (Additional): Rename images in case pp messed up

In [14]:
def rename_files(directory):
    count = 0
    for filename in os.listdir(directory):
        if filename.endswith('.nii_0000.nii.gz'):
            # Construct the new file name
            new_filename = filename.replace('.nii_0000.nii.gz', '_0000.nii.gz')
            # Construct full file paths
            old_file = os.path.join(directory, filename)
            new_file = os.path.join(directory, new_filename)
            # Rename the file
            os.rename(old_file, new_file)
            print(f'Renamed: {filename} to {new_filename}')
            count += 1
    print(f"Renamed {count} files.")

In [16]:
#rename_files(cbtn_path_after_registration)
#rename_files(cbtn_path_after_brain_extraction)

Renamed: C1003434_3593.nii_0000.nii.gz to C1003434_3593_0000.nii.gz
Renamed: C1003434_3825.nii_0000.nii.gz to C1003434_3825_0000.nii.gz
Renamed: C1003434_4205.nii_0000.nii.gz to C1003434_4205_0000.nii.gz
Renamed: C1003434_4260.nii_0000.nii.gz to C1003434_4260_0000.nii.gz
Renamed: C1003434_4355.nii_0000.nii.gz to C1003434_4355_0000.nii.gz
Renamed: C1003434_4457.nii_0000.nii.gz to C1003434_4457_0000.nii.gz
Renamed: C1003434_4599.nii_0000.nii.gz to C1003434_4599_0000.nii.gz
Renamed: C1003434_4817.nii_0000.nii.gz to C1003434_4817_0000.nii.gz
Renamed: C1003434_5048.nii_0000.nii.gz to C1003434_5048_0000.nii.gz
Renamed: C1003434_5331.nii_0000.nii.gz to C1003434_5331_0000.nii.gz
Renamed: C1003434_5706.nii_0000.nii.gz to C1003434_5706_0000.nii.gz
Renamed: C1003557_2317.nii_0000.nii.gz to C1003557_2317_0000.nii.gz
Renamed: C1003680_3385.nii_0000.nii.gz to C1003680_3385_0000.nii.gz
Renamed: C1003680_3394.nii_0000.nii.gz to C1003680_3394_0000.nii.gz
Renamed: C102459_1983.nii_0000.nii.gz to C102459

## CBTN Data: Make Segmentations and files ready for Q&A of clinian 