In [1]:
%pip install nibabel

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import re
import pandas as pd
from pathlib import Path
import numpy as np
import shutil
import nibabel as nib
from tqdm import tqdm
from collections import defaultdict
from datetime import datetime

## BCH Data (Additional): Change dates in clinical csv dataset to right format


In [13]:
# Function to convert dates to DD/MM/YYYY format
def convert_date(date_item):
    # Check for NaN values or empty strings and return as is
    if pd.isnull(date_item) or date_item == '':
        return np.nan

    # Convert to string and remove time part if present
    date_str = str(date_item).strip().split(' ')[0]

    # Define date formats for parsing
    date_formats = ("%m/%d/%Y", "%m/%d/%y", "%Y-%m-%d")

    for fmt in date_formats:
        try:
            # Parse date
            dt = datetime.strptime(date_str, fmt)

            if fmt == "%m/%d/%y":
                if dt.year >= 70:  # Cutoff year: 1970
                    dt = dt.replace(year=dt.year + 1900)
                else:
                    dt = dt.replace(year=dt.year + 2000)

            # Check if the month is greater than 12, then swap month and day
            if dt.month > 12:
                dt = dt.replace(month=dt.day, day=dt.month)

            # Convert to desired format
            return "'" + dt.strftime("%d/%m/%Y")
        except ValueError:
            continue  # if the format does not match, it will pass to the next format



# Function to process the Excel file
def process_excel(df, column_name):
    # Read the Excel file
    print("Column: ", column_name)
    # Check if the column exists
    if column_name not in df.columns:
        raise ValueError(f"The column {column_name} does not exist in the Excel file.")

    # Convert all dates in the column to DD/MM/YYYY format
    df[column_name] = df[column_name].apply(lambda x: convert_date(str(x)) if not pd.isnull(x) else x)


In [14]:
dates = ["8/26/82", "10/24/83", "02/10/1987", "07/12/1983"]
converted_dates = [convert_date(d) for d in dates]

for original, converted in zip(dates, converted_dates):
    print(f"Original: {original}, Converted: {converted}")

Original: 8/26/82, Converted: '26/08/3882
Original: 10/24/83, Converted: '24/10/3883
Original: 02/10/1987, Converted: '10/02/1987
Original: 07/12/1983, Converted: '12/07/1983


In [15]:
input_excel_path = '/home/jc053/GIT/mri_longitudinal_analysis/data/redcap/bch_new_redcap.xlsx'
output_excel_path = 'revised_dates.xlsx'
date_column_names = ['Date of Birth', 'Date of last clinical follow-up', 'Date of diagnosis (Path, and MRI if no biopsy)', 'Date of MRI diagnosis','Date of first surgery', 'Date of Systemic Therapy Start','Start Date of Radiation'  ]

# Call the function with the paths and column name
df = pd.read_excel(input_excel_path)
for column in date_column_names:
    process_excel(df, column)
    
# Save the revised dates back to the Excel file
#df.to_excel(output_excel_path, index=False)

Column:  Date of Birth
Column:  Date of last clinical follow-up
Column:  Date of diagnosis (Path, and MRI if no biopsy)
Column:  Date of MRI diagnosis
Column:  Date of first surgery
Column:  Date of Systemic Therapy Start
Column:  Start Date of Radiation


## BCH Data: Get the set of all patients for a given .csv file

In [3]:
def prefix_zeros_to_six_digit_ids(patient_id):
    """
    Adds 0 to the beginning of 6-digit patient IDs.
    """
    str_id = str(patient_id)
    if len(str_id) == 6:
        # print(f"Found a 6-digit ID: {str_id}. Prefixing a '0'.")
        patient_id = "0" + str_id

    else:
        patient_id = str_id
    return patient_id

def extract_unique_patient_ids_from_csv(csv_path):
    cohort = "BCH"
    id_column = "BCH MRN"
    df = pd.read_csv(csv_path)
    ids = set(df[id_column])
 
    print(f"Length of {cohort} cohort is {len(ids)}.")
    return ids
    

In [4]:
csv_path_bch_init = "/home/jc053/GIT/mri_longitudinal_analysis/data/input/clinical/bch_filtered_88.csv"
extract_unique_patient_ids_from_csv(csv_path_bch_init)

Length of BCH cohort is 88.


{135939,
 137476,
 233126,
 238268,
 298046,
 299321,
 335968,
 1013946,
 1017646,
 1058916,
 1138934,
 1153221,
 1194890,
 1232179,
 2001398,
 2004560,
 2088116,
 2088643,
 2103993,
 2104688,
 2113964,
 2124457,
 2147101,
 2158479,
 2173072,
 2183847,
 2249514,
 2260520,
 2261605,
 2280828,
 2297464,
 2306428,
 2316922,
 4015437,
 4052777,
 4092758,
 4098993,
 4108745,
 4132691,
 4137900,
 4155943,
 4159396,
 4179167,
 4192047,
 4228140,
 4252068,
 4303399,
 4304956,
 4305171,
 4318694,
 4319063,
 4345209,
 4348109,
 4362479,
 4393612,
 4416410,
 4446126,
 4450936,
 4455045,
 4478592,
 4489651,
 4505982,
 4565140,
 4571440,
 4572857,
 4624899,
 4635148,
 4636143,
 4647390,
 4690530,
 4695947,
 4791484,
 4802764,
 4803246,
 4857369,
 4923951,
 4931993,
 4975776,
 5002720,
 5019569,
 5029974,
 5046466,
 5048067,
 5132625,
 5208771,
 5238412,
 5412608,
 5531498}

In [7]:
##############################################
# EDIT THESE VARIABLES BEFORE RUNNING SCRIPT #
##############################################
csv_path_old = Path("/home/jc053/GIT/mri_longitudinal_analysis/data/input/clinical/redcap_full_108_cohort.csv")
csv_path_new = Path("/home/jc053/GIT/mri_longitudinal_analysis/data/input/clinical/bch_filtered_88.csv")
##############################################

ids_old = extract_unique_patient_ids_from_csv(csv_path_old)
ids_new = extract_unique_patient_ids_from_csv(csv_path_new)
print(f"These are the old ids:", ids_old)
print(f"These are the new ids:", ids_new)

ids_zero_filled_old = []
ids_zero_filled_new = []
for id in ids_old:
    ids_zero_filled_old.append(prefix_zeros_to_six_digit_ids(id))
for id in ids_new:
    ids_zero_filled_new.append(prefix_zeros_to_six_digit_ids(id))
    
print(f"These are the old ids with 0s added:", ids_zero_filled_old)
print(f"These are the new ids with 0s added:", ids_zero_filled_new)

diff = list(set(ids_zero_filled_old) - set(ids_zero_filled_new))
print(f"The difference between the two lists is:", len(diff))


# Calculating the difference: IDs in the old set but not in the new set
ids_in_old_not_in_new = list(set(ids_zero_filled_old) - set(ids_zero_filled_new))
print(f"IDs in old set but not in new set: {len(ids_in_old_not_in_new)}")
print(ids_in_old_not_in_new)

# Calculating the difference: IDs in the new set but not in the old set
ids_in_new_not_in_old = list(set(ids_zero_filled_new) - set(ids_zero_filled_old))
print(f"IDs in new set but not in old set: {len(ids_in_new_not_in_old)}")
print(ids_in_new_not_in_old)


Length of BCH cohort is 108.
Length of BCH cohort is 88.
These are the old ids: {4624899, 4032520, 4635148, 4857369, 2326050, 4303399, 2260520, 4155943, 4228140, 4923951, 4394032, 4304956, 2184255, 2004560, 5029974, 4690530, 1058916, 2261605, 1182323, 2297464, 4450936, 2316922, 4478592, 4455045, 4393612, 5238412, 2173072, 4565140, 4975776, 233126, 2183847, 2124457, 1109676, 4803246, 1148595, 2088116, 2103993, 1013946, 4572857, 238268, 5046466, 2088643, 5208771, 1153221, 4802764, 4348109, 1053918, 4099295, 4179167, 4073188, 4362479, 1138934, 5412608, 135939, 137476, 5048067, 4305171, 4300567, 4490520, 4864792, 2147101, 1115940, 4052777, 2249514, 1017646, 4571440, 1232179, 4015437, 1132366, 5132625, 4132691, 4092758, 4319063, 5531498, 2104688, 2270579, 4345209, 2280828, 2306428, 4505982, 4485510, 1194890, 4695947, 2158479, 4931993, 4416410, 4022683, 2268068, 4252068, 4466091, 4137900, 2113964, 4446126, 5019569, 4098993, 4489651, 1071544, 973766, 4108745, 5210065, 1144789, 2126809, 464739

## BCH Data: Check that the set of patients has available imaging
Available imgaging is considered when in the folder with the data there is a patient_ID.csv with the paths and metadata and a patient_ID folder with the images.

In [8]:
def check_missing_patient_files_and_folders(patient_ids_set, directory_path):
    # List to store the result strings for missing files/folders
    missing_items = []

    # Iterate over each unique patient ID
    for patient_id in patient_ids_set:
        # Construct the expected csv file and folder name
        expected_csv_file = os.path.join(directory_path, f"{patient_id}.csv")
        expected_folder = os.path.join(directory_path, f"{patient_id}")

        # Check for existence of the expected csv file and folder
        has_csv = os.path.isfile(expected_csv_file)
        has_folder = os.path.isdir(expected_folder)

        # Prepare the result string if items are missing
        if not has_csv or not has_folder:
            missing_str = f"Patient ID: {patient_id} -"
            if not has_csv and not has_folder:
                missing_str += " Both CSV and Folder missing"
            elif not has_csv:
                missing_str += " CSV missing"
            elif not has_folder:
                missing_str += " Folder missing"
            missing_items.append(missing_str)

    return missing_items

In [9]:
##############################################
# EDIT THESE VARIABLES BEFORE RUNNING SCRIPT #
##############################################
data_path = Path("/mnt/an225/Anna/longitudinal_nifti_BCH/curated_BCH/")
missing_patient_items = check_missing_patient_files_and_folders(ids_zero_filled_new, data_path)

# Output the results
if len(missing_patient_items) == 0:
    print("No missing items found.")
else:
    for missing_item in missing_patient_items:
        print(missing_item)
##############################################

No missing items found.


## BCH Data: Pre / post treatment separation
Requirement: Have run the sequence inference before to know the available T2 Sequences

In [16]:
# first take the predictions and the paths and filter according to those
def filter_csv_files(folder_path):
    for filename in os.listdir(folder_path):
        if filename == "ids.csv":
            continue
        if filename.endswith(".csv"):
            csv_path = os.path.join(folder_path, filename)
            df = pd.read_csv(csv_path)

            # Apply the filtering criteria
            filtered_df = df[((df['Prediction'] == 'T2') | df['Path'].str.contains('t2', case=False)) 
                            & ~df['Path'].str.contains('flair', case=False)  
                            & ~df['Path'].str.contains('spine', case=False)
                            & ~df['Path'].str.contains('dti', case=False) 
                            & ~df['Path'].str.contains('trace', case=False)
                            & ~df['Path'].str.contains('tensor', case=False) 
                            & (df['Prediction'] != 'NO PREDICTION - DIMS')
                            & (df['Prediction'] != 'NO PREDICTION - METADATA') 
                            & (df['Prediction'] != 'NO PREDICTION - FILE ERROR ')
                            & ~df['Path'].str.contains('dw', case=False)]


            new_filename = filename.replace(".csv", "_filtered.csv")
            new_path = os.path.join(folder_path, new_filename)
            # Save the filtered DataFrame back to the CSV file
            filtered_df.to_csv(new_path, index=False)
            print(f"Processed and saved: {filename}")

folder_path = Path("/home/jc053/GIT/mri-sequence-classification/data_csv/long/bch_longitudinal_dataset")
filter_csv_files(folder_path)

Processed and saved: 1017646_file_paths.csv
Processed and saved: 5048067_file_paths.csv
Processed and saved: 4132691_file_paths.csv
Processed and saved: 2183847_file_paths.csv
Processed and saved: 4015437_file_paths.csv
Processed and saved: 4108745_file_paths.csv
Processed and saved: 1194890_file_paths.csv
Processed and saved: 5002720_file_paths.csv
Processed and saved: 1232179_file_paths.csv
Processed and saved: 5238412_file_paths.csv
Processed and saved: 4228140_file_paths.csv
Processed and saved: 4252068_file_paths.csv
Processed and saved: 4348109_file_paths.csv
Processed and saved: 4416410_file_paths.csv
Processed and saved: 4052777_file_paths.csv
Processed and saved: 0137476_file_paths.csv
Processed and saved: 4923951_file_paths.csv
Processed and saved: 4393612_file_paths.csv
Processed and saved: 4159396_file_paths.csv
Processed and saved: 2306428_file_paths.csv
Processed and saved: 1013946_file_paths.csv
Processed and saved: 5029974_file_paths.csv
Processed and saved: 4857369_fil

In [17]:
# filter by the treatment date to get all scans before treatment
def get_first_treatment_dates(clinical_csv_path):
    clinical_df = pd.read_csv(clinical_csv_path)
    treatment_dates = {}
    for _, row in clinical_df.iterrows():
        patient_id = row['BCH MRN']
        date_str = row['First Treatment']
        if pd.isna(date_str):  # Check if treatment date is not NaN
            treatment_dates[patient_id] = None  # No treatment date recorded
        else:
            treatment_dates[patient_id] = datetime.strptime(date_str, '%d/%m/%Y')
    return treatment_dates

def filter_by_treatment_date(folder_path, treatment_dates):
    for filename in os.listdir(folder_path):
        if filename.endswith("_filtered.csv"):
            csv_path = os.path.join(folder_path, filename)
            df = pd.read_csv(csv_path)

            def is_before_treatment(row):
                path_parts = row['Path'].split('/')
                patient_id = path_parts[-3]
                treatment_date = treatment_dates.get(patient_id)

                if treatment_date is None:
                    return True
                scan_date = datetime.strptime(path_parts[-2], '%Y%m%d')
                # Compare with treatment date
                return scan_date <= treatment_dates.get(patient_id, datetime.max)

            # Apply the filtering function
            filtered_df = df[df.apply(is_before_treatment, axis=1)]

            new_path = os.path.join(folder_path, filename.replace("_filtered.csv", "_filtered_treatment.csv"))
            # Save the filtered DataFrame back to the CSV file
            filtered_df.to_csv(new_path, index=False)
            print(f"Processed and saved: {filename}")

In [18]:
clinical_csv_path = Path("/home/jc053/GIT/mri_longitudinal_analysis/data/input/clinical/bch_filtered_88.csv")
folder_path = Path("/home/jc053/GIT/mri-sequence-classification/data_csv/long/bch_longitudinal_dataset")
treatment_dates = get_first_treatment_dates(clinical_csv_path)
filter_by_treatment_date(folder_path, treatment_dates)

Processed and saved: 4802764_file_paths_filtered.csv
Processed and saved: 5412608_file_paths_filtered.csv
Processed and saved: 4228140_file_paths_filtered.csv
Processed and saved: 1058916_file_paths_filtered.csv
Processed and saved: 4572857_file_paths_filtered.csv
Processed and saved: 2124457_file_paths_filtered.csv
Processed and saved: 2158479_file_paths_filtered.csv
Processed and saved: 0335968_file_paths_filtered.csv
Processed and saved: 4362479_file_paths_filtered.csv
Processed and saved: 5046466_file_paths_filtered.csv
Processed and saved: 2103993_file_paths_filtered.csv
Processed and saved: 4192047_file_paths_filtered.csv
Processed and saved: 2173072_file_paths_filtered.csv
Processed and saved: 2113964_file_paths_filtered.csv
Processed and saved: 4305171_file_paths_filtered.csv
Processed and saved: 4690530_file_paths_filtered.csv
Processed and saved: 4565140_file_paths_filtered.csv
Processed and saved: 2104688_file_paths_filtered.csv
Processed and saved: 4635148_file_paths_filter

In [19]:
# move filtered files to a new folder
target_dir = Path("/home/jc053/GIT/mri-sequence-classification/data_csv/long/bch_longitudinal_dataset_filtered")
os.makedirs(target_dir, exist_ok=True)
for file in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file)
    if file.endswith("_filtered_treatment.csv"):
        new_file_path = os.path.join(target_dir, file)
        shutil.copyfile(file_path, new_file_path)
        

# count the new number of patients


In [20]:
# Optional if you want to remove the old files
# remove files if they have a _filtered.csv in their name
def remove_filtered_files(folder_path):
    for filename in os.listdir(folder_path):
        if filename.endswith("_filtered.csv") or filename.endswith("_filtered_treatment.csv"):
            file_path = os.path.join(folder_path, filename)
            os.remove(file_path)
            print(f"Removed: {filename}")
remove_filtered_files(folder_path)

Removed: 4802764_file_paths_filtered.csv
Removed: 5412608_file_paths_filtered.csv
Removed: 2183847_file_paths_filtered_treatment.csv
Removed: 4228140_file_paths_filtered.csv
Removed: 1058916_file_paths_filtered.csv
Removed: 4572857_file_paths_filtered.csv
Removed: 2173072_file_paths_filtered_treatment.csv
Removed: 2124457_file_paths_filtered.csv
Removed: 2297464_file_paths_filtered_treatment.csv
Removed: 2158479_file_paths_filtered.csv
Removed: 0335968_file_paths_filtered.csv
Removed: 4362479_file_paths_filtered.csv
Removed: 5046466_file_paths_filtered.csv
Removed: 2103993_file_paths_filtered.csv
Removed: 4695947_file_paths_filtered_treatment.csv
Removed: 4192047_file_paths_filtered.csv
Removed: 2173072_file_paths_filtered.csv
Removed: 1017646_file_paths_filtered_treatment.csv
Removed: 2113964_file_paths_filtered.csv
Removed: 2088643_file_paths_filtered_treatment.csv
Removed: 4505982_file_paths_filtered_treatment.csv
Removed: 4305171_file_paths_filtered.csv
Removed: 4446126_file_paths_

## BCH Data: Obtain the best image out of the _filtered_treatment.csv files

In [21]:
def is_file_openable(filepath):
    """
    Check if a .nii.gz file is openable using nibabel.
    """
    try:
        nib.load(filepath)
        return True
    except Exception:
        return False

def extract_patient_id_and_scan_date(path):
    parts = path.split('/')
    patient_id = parts[-3]
    scan_date = parts[-2]
    return patient_id, scan_date

def parse_resolution(resolution_str):
    # Parse the resolution string and convert to a tuple of floats
    if isinstance(resolution_str, str):
        resolution = tuple(map(float, resolution_str.strip("()").split(',')))
        return resolution
    else:
        return (0.0, 0.0, 0.0)

def has_ax_or_tra(filename):
    return 'ax' in filename.lower() or 'tra' in filename.lower()

def select_best_image_per_scan(csv_path):
    df = pd.read_csv(csv_path)
    # Add columns for patient ID and scan date
    df[['Patient ID', 'Scan Date']] = df.apply(lambda row: pd.Series(extract_patient_id_and_scan_date(row['Path'])), axis=1)
    df['Resolution'] = df['Image Spacing (x,y,z)'].apply(parse_resolution)
    df['AxOrTra'] = df['Path'].apply(has_ax_or_tra)
    df['Openable'] = df['Path'].apply(is_file_openable)
    df = df[df['Openable']]

    df.sort_values(by=['AxOrTra', 'Resolution'], ascending=[False, False], inplace=True)
    best_images_df = df.groupby(['Patient ID', 'Scan Date']).first().reset_index()
    return best_images_df



In [22]:
folder_path = '/home/jc053/GIT/mri-sequence-classification/data_csv/long/bch_longitudinal_dataset_filtered'

for file in os.listdir(folder_path):
    if file.endswith("_filtered_treatment.csv"):
        csv_path = os.path.join(folder_path, file)
        best_images_df = select_best_image_per_scan(csv_path)
        best_images_file = os.path.join(folder_path, file.replace("_filtered_treatment.csv", "_filtered_treatment_best.csv"))
        best_images_df.to_csv(best_images_file, index=False)



In [23]:
remove_filtered_files(folder_path)

Removed: 2183847_file_paths_filtered_treatment.csv
Removed: 2173072_file_paths_filtered_treatment.csv
Removed: 2297464_file_paths_filtered_treatment.csv
Removed: 4695947_file_paths_filtered_treatment.csv
Removed: 1017646_file_paths_filtered_treatment.csv
Removed: 2088643_file_paths_filtered_treatment.csv
Removed: 4505982_file_paths_filtered_treatment.csv
Removed: 4446126_file_paths_filtered_treatment.csv
Removed: 4931993_file_paths_filtered_treatment.csv
Removed: 4975776_file_paths_filtered_treatment.csv
Removed: 4319063_file_paths_filtered_treatment.csv
Removed: 1232179_file_paths_filtered_treatment.csv
Removed: 2001398_file_paths_filtered_treatment.csv
Removed: 2260520_file_paths_filtered_treatment.csv
Removed: 4305171_file_paths_filtered_treatment.csv
Removed: 4092758_file_paths_filtered_treatment.csv
Removed: 1194890_file_paths_filtered_treatment.csv
Removed: 2261605_file_paths_filtered_treatment.csv
Removed: 4098993_file_paths_filtered_treatment.csv
Removed: 1153221_file_paths_fil

In [24]:
def check_unique_scan_dates_per_patient(csv_path):
    df = pd.read_csv(csv_path)

    # Group by Patient ID and check for duplicates in Scan Date
    grouped = df.groupby('Patient ID')
    duplicate_dates = {patient_id: scans['Scan Date'].duplicated().any() 
                       for patient_id, scans in grouped}

    # Patients with duplicate scan dates
    patients_with_duplicates = [patient for patient, has_duplicates in duplicate_dates.items() if has_duplicates]

    if patients_with_duplicates:
        print(f"Patient with duplicate scan dates: {patients_with_duplicates}")
        return False
    else:
        return True

def check_all_files_in_folder(folder_path):
    all_unique = True
    for file in os.listdir(folder_path):
        if file.endswith(".csv"):
            csv_path = os.path.join(folder_path, file)
            if not check_unique_scan_dates_per_patient(csv_path):
                all_unique = False
    return all_unique

all_files_checked = check_all_files_in_folder(folder_path)
if all_files_checked:
    print("All files have unique scan dates per patient.")
else:
    print("There are files with duplicate scan dates.")

All files have unique scan dates per patient.


In [25]:
# move files from the .csv's to a new folder
target_dir = Path("/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset")
folder_path = Path('/home/jc053/GIT/mri-sequence-classification/data_csv/long/bch_longitudinal_dataset_filtered')
def move_best_images(csv_path, target_dir):
    os.makedirs(target_dir, exist_ok=True)

    # Read the CSV file
    df = pd.read_csv(csv_path)

    # Loop through each row and move the file
    for _, row in df.iterrows():
        source_path = row['Path']  # Adjust this if the column name is different
        filename = os.path.basename(source_path)
        patient_id = source_path.split('/')[-3]
        scan_id = source_path.split('/')[-2]
        new_filename = f"{patient_id}_{scan_id}_{filename}"
        destination_path = os.path.join(target_dir, new_filename)

        # Move the file
        shutil.copyfile(source_path, destination_path)
        # print(f"Copied {source_path} to {destination_path}")

for file in tqdm(os.listdir(folder_path), desc="Copying files"):
    if file.endswith("_filtered_treatment_best.csv"):
        csv_path = os.path.join(folder_path, file)
        move_best_images(csv_path, target_dir)

Copying files: 100%|██████████| 88/88 [02:53<00:00,  1.97s/it]


## BCH Data: Manual Review of sequences
Requirement: review all of the data with Slicer and SegmentationReview.

In [None]:
# get the csv with the data to review
def read_file(csv_file):
    """
    Read and return a dataframe from a CSV file.

    Args:
        csv_file (str): Path to the CSV file.

    Returns:
        pd.DataFrame: Loaded dataframe with the CSV data.
    """
    column_names = ["Image_Name", "Quality", "Empty"]
    d_f = pd.read_csv(csv_file, names=column_names, header=None, delimiter=',')
    print(d_f.head(5))
    return d_f

def delete_entries(df):
    print(df["Quality"].unique())
    cleaned_df = df[df["Quality"] ==5]
    print(cleaned_df.head(5))
    cleaned_df.to_csv("cleaned.csv", index=False, header=False)

In [None]:
csv_path = "/home/jc053/GIT/mri_longitudinal_analysis/mri_longitudinal_analysis/utils/annotations_after_1st_review_converted.csv"
df_cbtn = read_file(csv_path)
delete_entries(df_cbtn)

## BCH Data Patient IDs after 1st and 2nd review
Have run the review_t2w script in order to separate the folders.

In [4]:
unique_ids_1st_review = set()
unique_ids_2nd_review = set()
# Function to process a single directory
def process_directory(directory, unique_set):
    for filename in os.listdir(directory):
        if filename.endswith(".nii.gz"):
            # Extracting the part before the first underscore
            patient_id = filename.split('_')[0]
            # Check if the ID starts with 'C' followed by digits
            if len(patient_id) == 7:
                unique_set.add(patient_id)

def extract_unique_patient_ids(dir1, dir2, unique_set):
    # Process each directory
    process_directory(dir1, unique_set)
    process_directory(dir2, unique_set)

    return unique_set

# Example usage
dir1 = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/new_review/1"  
dir2 = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/new_review/5"
dir3 = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/new_review/5 - Found replacement"

unique_patient_ids_after_1st_review = extract_unique_patient_ids(dir1, dir2, unique_ids_1st_review)
unique_patient_ids_after_2nd_review = extract_unique_patient_ids(dir1, dir3, unique_ids_2nd_review)

print(f"Unique patient IDs after 1st review: {len(unique_patient_ids_after_1st_review)}")
print(f"Unique patient IDs after 2nd review: {len(unique_patient_ids_after_2nd_review)}")

Unique patient IDs after 1st review: 88
Unique patient IDs after 2nd review: 88


## BCH Data: Filter patients with less than 3 scans and move to fold for pp

In [45]:
def has_minimum_sessions(source_dirs, min_sessions=3):
    """
    Checks if each patient has at least a minimum number of session IDs across the source directories.

    Parameters:
    source_dirs (list): List of source directory paths.
    min_sessions (int): Minimum number of sessions required for each patient.

    Returns:
    set: Set of patient IDs that meet the criteria.
    """
    session_count = defaultdict(set)
    for source_dir in source_dirs:
        for filename in os.listdir(source_dir):
            if filename.endswith(".nii.gz"):
                parts = filename.split('_')
                if len(parts) >= 2 and parts[0][1:].isdigit():
                    patient_id, session_id = parts[0], parts[1]
                    session_count[patient_id].add(session_id)

    # Filter patients who meet the minimum session requirement
    return {patient for patient, sessions in session_count.items() if len(sessions) >= min_sessions}

# copy patients after 2nd review and which meet the criteria of at least 3 folders
def copy_files_from_dirs_to_new_dir(source_dirs, target_dir, valid_patient_ids):
    """
    Copy files from the given source directories to the target directory.
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    total_files = sum([len(os.listdir(source_dir)) for source_dir in source_dirs])
    with tqdm(total= total_files,desc="Copying files") as pbar:
        for source_dir in source_dirs:
            for filename in os.listdir(source_dir):
                if filename.endswith(".nii.gz"):
                    patient_id = filename.split('_')[0]
                    if patient_id in valid_patient_ids:
                        source_file = os.path.join(source_dir, filename)
                        if os.path.isfile(source_file):
                            target_file = os.path.join(target_dir, filename)
                            shutil.copyfile(source_file, target_file)
                            pbar.update(1)  
                
source_dirs = [dir1, dir3]
target_dir = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/new_review/after_review_before_pp"
valid_patient_ids = has_minimum_sessions(source_dirs)
copy_files_from_dirs_to_new_dir(source_dirs, target_dir, valid_patient_ids)

unique_ids_after_3_scan_filtering = set()
process_directory(target_dir, unique_ids_after_3_scan_filtering)
print(f"Unique patient IDs after 3 scan filtering: {len(unique_ids_after_3_scan_filtering)}")

Copying files: 100%|█████████▉| 1409/1414 [00:17<00:00, 80.41it/s]

Unique patient IDs after 3 scan filtering: 85





## BCH Data (Additional): In case a segmentation already review took place, only analyze the truly new scans

In [14]:


import os
import shutil

# Replace these paths with the actual paths of your directories
directory1_path = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/new_review/after_review_before_pp/output/seg_predictions"
directory2_path = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/old_seg_accepted"
qa_dir = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/new_review/to_pp_before_qa"
final_dir = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/new_review/final"
additional_dir = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/old_seg_accepted/no_longer_included"

# Ensure the output directories exist
# os.makedirs(qa_dir, exist_ok=True)
# os.makedirs(final_dir, exist_ok=True)
# os.makedirs(additional_dir, exist_ok=True)

# Function to extract patientID and scanID
def get_identifier(file_name, from_directory1=True):
    parts = file_name.split('_')
    if from_directory1:
        # From the first directory (patientID_ScanID_hash.nii.gz)
        return parts[0], parts[1]
    else:
        # From the second directory (imageXX_patientID_scanID.nii.gz)
        return parts[1], parts[2]

# Extract identifiers from both directories
identifiers_dir1 = set()
ids_dir1 = set()
for file_name in os.listdir(directory1_path):
    if file_name.endswith(".nii.gz"):
        patid, scanid = get_identifier(file_name)
        identifiers_dir1.add((patid, scanid))
        ids_dir1.add(patid)

identifiers_dir2 = set()
ids_dir2 = set()
for file_name in os.listdir(directory2_path):
    if not file_name.endswith("_mask.nii.gz"):
        patid, scanid = get_identifier(file_name, from_directory1=False)
        identifiers_dir2.add((patid, scanid))
        ids_dir2.add(patid)

print("Number of total scans in dir 1:", len(identifiers_dir1))
print(f"Number of unique patients in Directory 1: {len(ids_dir1)}")

print("Numebr of total scans in dir 2:", len(identifiers_dir2))
print(f"Number of unique patients in Directory 2: {len(ids_dir2)}")

unique_to_dir1 = identifiers_dir1 - identifiers_dir2
unique_to_dir2 = identifiers_dir2 - identifiers_dir1
common_identifiers = identifiers_dir1.intersection(identifiers_dir2)
common_patients = ids_dir1.intersection(ids_dir2)
print(f"Unique identifiers to Directory 1: {len(unique_to_dir1)}")
print(f"Unique identifiers to Directory 2: {len(unique_to_dir2)}")
print(f"Common identifiers between the directories: {len(common_identifiers)}")
print(f"Common patients between the directories: {len(common_patients)}")

# Compare and copy files
# for file_name in tqdm(os.listdir(directory1_path), desc="Processing Directory 2 QA and FINAL"):
#     if file_name.endswith(".nii.gz"):
#         identifier = get_identifier(file_name)
#         if identifier in identifiers_dir2:
#             final_path = os.path.join(final_dir, file_name)
#             shutil.copyfile(os.path.join(directory1_path, file_name), final_path)
#         else:
#             qa_path = os.path.join(qa_dir, file_name)
#             shutil.copyfile(os.path.join(directory1_path, file_name), qa_path)

# Count files in the 'final' and 'qa' folders
#final_count = len([name for name in os.listdir(final_dir) if name.endswith(".nii.gz")])
#qa_count = len([name for name in os.listdir(qa_dir) if name.endswith(".nii.gz")])


#print(f"Total files in 'final': {final_count}")
#print(f"Total files in 'qa': {qa_count}")
#print(f"Sum of 'final' and 'qa': {final_count + qa_count}")

# Check if the sum matches
#if dir1_count == final_count + qa_count:
    #print("The sum of files in 'final' and 'qa' matches the number of files in Directory 1.")
#else:
    #print("There is a mismatch in the file counts.")




Number of total scans in dir 1: 1407
Number of unique patients in Directory 1: 85
Numebr of total scans in dir 2: 1205
Number of unique patients in Directory 2: 91
Unique identifiers to Directory 1: 1320
Unique identifiers to Directory 2: 1118
Common identifiers between the directories: 87
Common patients between the directories: 64


## BCH Data: Count patients before and after segmentation review

In [9]:
def count_patient_ids(folder_path):
    """counts the number of unique patient IDs in a given folder based on the filenames."""
    # Regular expression to match the patient ID in the file name
    pattern = re.compile(r"_(\d{7})_")

    # Set to store unique patient IDs
    patient_ids = set()

    # Iterate over files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith(".nii.gz"):
            match = pattern.search(filename)
            if match:
                patient_ids.add(match.group(1))

    # Return the count of unique patient IDs
    return patient_ids

In [10]:
folder_path_bch_before_review = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/all_files_before_review"
folder_path_bch_after_review = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/bch_longitudinal_dataset/accepted"
pat_ids_before_review = count_patient_ids(folder_path_bch_before_review)
pat_ids_after_review = count_patient_ids(folder_path_bch_after_review)
print("IDs before review:",len(pat_ids_before_review))
print("IDs after review:", len(pat_ids_after_review))

IDs before review: 100
IDs after review: 88


In [11]:
overlapping_ids_before = pat_ids_before_review.intersection(ids_zero_filled_new)
overlapping_ids_after = pat_ids_after_review.intersection(ids_zero_filled_new)
print(f"Overlapping patient IDs Before: {len(overlapping_ids_before)}")
print(f"Overlapping patient IDs After: {len(overlapping_ids_after)}")

Overlapping patient IDs Before: 98
Overlapping patient IDs After: 86


## ------------------------------------------------------------------------------------------------------

## CBTN Data: Review
Requirement: Have run the cbtn_parsing.py script before. Reviewed all of the data after the parsing script with Slicer and SegmentationReview.

In [17]:
# get the csv with the data to review
def read_file(csv_file):
    """
    Read and return a dataframe from a CSV file.

    Args:
        csv_file (str): Path to the CSV file.

    Returns:
        pd.DataFrame: Loaded dataframe with the CSV data.
    """
    column_names = ["Image_Name", "Quality", "Empty"]
    d_f = pd.read_csv(csv_file, names=column_names, header=None, delimiter=',')
    print(d_f.head(5))
    return d_f

def delete_entries(df):
    print(df["Quality"].unique())
    cleaned_df = df[df["Quality"] ==5]
    print(cleaned_df.head(5))
    cleaned_df.to_csv("cleaned.csv", index=False, header=False)

In [18]:
csv_path = "/home/jc053/GIT/mri_longitudinal_analysis/mri_longitudinal_analysis/utils/annotations_after_1st_review_converted.csv"
df_cbtn = read_file(csv_path)
delete_entries(df_cbtn)

                               Image_Name  Quality  Empty
0     0135939_19990319_2_ax_fse_t2.nii.gz        1    NaN
1     0135939_19990420_2_ax_fse_t2.nii.gz        1    NaN
2  0135939_19990811_2_axial_fse_t2.nii.gz        1    NaN
3  0135939_19991201_2_axial_fse_t2.nii.gz        1    NaN
4     0135939_20000405_2_ax_fse_t2.nii.gz        1    NaN
[1 5]
                                   Image_Name  Quality  Empty
9      0135939_20070517_8_axial_t2_tse.nii.gz        5    NaN
10            0135939_20070520_5_ax_t2.nii.gz        5    NaN
11     0135939_20090514_15_c_ax_t1_tsp.nii.gz        5    NaN
12    0135939_20120926_22_ax_t2_t-sp_7.nii.gz        5    NaN
13  0135939_20130311_22_ax_t2_frfse_t8.nii.gz        5    NaN


## CBTN Data: Patient IDs after Pre / Post Separation

In [17]:
def extract_unique_patient_ids_from_csv(csv_path):
    cohort = "CBTN"
    id_column = "Patient_ID"
    df = pd.read_csv(csv_path)
    ids = set(df[id_column])
 
    print(f"Length of {cohort} cohort is {len(ids)}.")
    
csv_path_cbtn = "/home/jc053/GIT/mri_longitudinal_analysis/data/output/clinical_data/cbtn_pre_event.csv"
extract_unique_patient_ids_from_csv(csv_path_cbtn)


Length of CBTN cohort is 440.


## CBTN Data: Patient IDs after 1st and 2nd review

In [31]:
unique_ids_1st_review = set()
unique_ids_2nd_review = set()
# Function to process a single directory
def process_directory(directory, unique_set):
    for filename in os.listdir(directory):
        if filename.endswith(".nii.gz"):
            # Extracting the part before the first underscore
            patient_id = filename.split('_')[0]
            # Check if the ID starts with 'C' followed by digits
            if patient_id.startswith('C') and patient_id[1:].isdigit():
                unique_set.add(patient_id)

def extract_unique_patient_ids(dir1, dir2, dir3, unique_set):
    # Process each directory
    process_directory(dir1, unique_set)
    process_directory(dir2, unique_set)
    process_directory(dir3, unique_set)

    return unique_set

# Example usage
dir1 = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/1"  
dir2 = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/2"
dir3 = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/5"
dir4 = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/2 - Found replacement"
dir5 = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/5 - Found replacement"

unique_patient_ids_after_1st_review = extract_unique_patient_ids(dir1, dir2, dir3, unique_ids_1st_review)
unique_patient_ids_after_2nd_review = extract_unique_patient_ids(dir1, dir4, dir5, unique_ids_2nd_review)

print(f"Unique patient IDs after 1st review: {len(unique_patient_ids_after_1st_review)}")
print(f"Unique patient IDs after 2nd review: {len(unique_patient_ids_after_2nd_review)}")


Unique patient IDs after 1st review: 440
Unique patient IDs after 2nd review: 437


## CBTN Data: Filter Patients with less than 3 sessions / scans and move to folder before pp

In [34]:
def has_minimum_sessions(source_dirs, min_sessions=3):
    """
    Checks if each patient has at least a minimum number of session IDs across the source directories.

    Parameters:
    source_dirs (list): List of source directory paths.
    min_sessions (int): Minimum number of sessions required for each patient.

    Returns:
    set: Set of patient IDs that meet the criteria.
    """
    session_count = defaultdict(set)
    for source_dir in source_dirs:
        for filename in os.listdir(source_dir):
            if filename.endswith(".nii.gz"):
                parts = filename.split('_')
                if len(parts) >= 2 and parts[0].startswith('C') and parts[0][1:].isdigit():
                    patient_id, session_id = parts[0], parts[1]
                    session_count[patient_id].add(session_id)

    # Filter patients who meet the minimum session requirement
    return {patient for patient, sessions in session_count.items() if len(sessions) >= min_sessions}

# copy patients after 2nd review and which meet the criteria of at least 3 folders
def copy_files_from_dirs_to_new_dir(source_dirs, target_dir, valid_patient_ids):
    """
    Copy files from the given source directories to the target directory.
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    total_files = sum([len(os.listdir(source_dir)) for source_dir in source_dirs])
    with tqdm(total= total_files,desc="Copying files") as pbar:
        for source_dir in source_dirs:
            for filename in os.listdir(source_dir):
                if filename.endswith(".nii.gz"):
                    patient_id = filename.split('_')[0]
                    if patient_id in valid_patient_ids:
                        source_file = os.path.join(source_dir, filename)
                        if os.path.isfile(source_file):
                            target_file = os.path.join(target_dir, filename)
                            shutil.copyfile(source_file, target_file)
                            pbar.update(1)  
                
source_dirs = [dir1, dir4, dir5]
target_dir = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/after_review_before_pp"
valid_patient_ids = has_minimum_sessions(source_dirs)
copy_files_from_dirs_to_new_dir(source_dirs, target_dir, valid_patient_ids)

Copying files: 100%|█████████▉| 1840/1841 [00:11<00:00, 162.35it/s]


In [35]:
unique_ids_after_3_scan_filtering = set()
process_directory(target_dir, unique_ids_after_3_scan_filtering)
print(f"Unique patient IDs after 3 scan filtering: {len(unique_ids_after_3_scan_filtering)}")

Unique patient IDs after 3 scan filtering: 115


## CBTN Data: PP comparison

In [12]:
def compare_image_sets(path1, preprocessed_path):
    # Get the set of image file names in the first path
    images_set1 = {file for file in os.listdir(path1) if file.endswith('.nii.gz')}
    
    # Get the set of image file names in the second path
    preprocessed_images = {file.replace('.nii_0000.nii.gz', '.nii.gz') for file in os.listdir(preprocessed_path) if '.nii_0000.nii.gz' in file}    
    
    # Find the difference between the two sets
    missing_images = images_set1.difference(preprocessed_images)
    
    # Return the set of missing image file names
    return missing_images

In [13]:
cbtn_path_before_pp = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/after_review_before_pp"
cbtn_path_after_registration = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/output/T2W_registration/"
cbtn_path_after_brain_extraction = "/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/output/T2W_brain_extraction/"
missing_scans_registration = compare_image_sets(cbtn_path_before_pp, cbtn_path_after_registration)
print(f"Number of missing scans after registration: {len(missing_scans_registration)}")
print(f"Missing scans after registration: {missing_scans_registration}")
missing_scans_brain_extraction = compare_image_sets(cbtn_path_after_registration, cbtn_path_after_brain_extraction)
print(f"Number of missing scans after brain extraction: {len(missing_scans_brain_extraction)}")
print(f"Missing scans after brain extraction: {missing_scans_brain_extraction}")

Number of missing scans after registration: 14
Missing scans after registration: {'C1047222_5759.nii.gz', 'C19188_4132.nii.gz', 'C62730_5329.nii.gz', 'C2647698_3958.nii.gz', 'C62730_8431.nii.gz', 'C883263_6041.nii.gz', 'C1047222_5968.nii.gz', 'C883263_5813.nii.gz', 'C3557160_2096.nii.gz', 'C1003434_3708.nii.gz', 'C3600948_1088.nii.gz', 'C3400212_695.nii.gz', 'C1003434_4182.nii.gz', 'C62730_5975.nii.gz'}
Number of missing scans after brain extraction: 1878
Missing scans after brain extraction: {'C3817551_1771.nii_0000.nii.gz', 'C4065273_3034.nii_0000.nii.gz', 'C88683_1357.nii_0000.nii.gz', 'C4127757_786.nii_0000.nii.gz', 'C3528132_950.nii_0000.nii.gz', 'C123861_9255.nii_0000.nii.gz', 'C140466_7588.nii_0000.nii.gz', 'C801960_5763.nii_0000.nii.gz', 'C3944610_4249.nii_0000.nii.gz', 'C38868_3284.nii_0000.nii.gz', 'C41451_1228.nii_0000.nii.gz', 'C38868_6977.nii_0000.nii.gz', 'C3422967_5355.nii_0000.nii.gz', 'C4256415_3925.nii_0000.nii.gz', 'C431607_5012.nii_0000.nii.gz', 'C36654_2992.nii_000

## CBTN Data (Additional): Rename images in case pp messed up

In [14]:
def rename_files(directory):
    count = 0
    for filename in os.listdir(directory):
        if filename.endswith('.nii_0000.nii.gz'):
            # Construct the new file name
            new_filename = filename.replace('.nii_0000.nii.gz', '_0000.nii.gz')
            # Construct full file paths
            old_file = os.path.join(directory, filename)
            new_file = os.path.join(directory, new_filename)
            # Rename the file
            os.rename(old_file, new_file)
            print(f'Renamed: {filename} to {new_filename}')
            count += 1
    print(f"Renamed {count} files.")

In [16]:
#rename_files(cbtn_path_after_registration)
#rename_files(cbtn_path_after_brain_extraction)

Renamed: C1003434_3593.nii_0000.nii.gz to C1003434_3593_0000.nii.gz
Renamed: C1003434_3825.nii_0000.nii.gz to C1003434_3825_0000.nii.gz
Renamed: C1003434_4205.nii_0000.nii.gz to C1003434_4205_0000.nii.gz
Renamed: C1003434_4260.nii_0000.nii.gz to C1003434_4260_0000.nii.gz
Renamed: C1003434_4355.nii_0000.nii.gz to C1003434_4355_0000.nii.gz
Renamed: C1003434_4457.nii_0000.nii.gz to C1003434_4457_0000.nii.gz
Renamed: C1003434_4599.nii_0000.nii.gz to C1003434_4599_0000.nii.gz
Renamed: C1003434_4817.nii_0000.nii.gz to C1003434_4817_0000.nii.gz
Renamed: C1003434_5048.nii_0000.nii.gz to C1003434_5048_0000.nii.gz
Renamed: C1003434_5331.nii_0000.nii.gz to C1003434_5331_0000.nii.gz
Renamed: C1003434_5706.nii_0000.nii.gz to C1003434_5706_0000.nii.gz
Renamed: C1003557_2317.nii_0000.nii.gz to C1003557_2317_0000.nii.gz
Renamed: C1003680_3385.nii_0000.nii.gz to C1003680_3385_0000.nii.gz
Renamed: C1003680_3394.nii_0000.nii.gz to C1003680_3394_0000.nii.gz
Renamed: C102459_1983.nii_0000.nii.gz to C102459

## CBTN Data: Histologies