In [2]:
import os
import pandas as pd
from pathlib import Path

### Main Functions

In [12]:
def get_identifier(file_name):
    """Extracts patient ID and scan ID from the file name."""
    # file format are:
    # imageXYZ_patientID_scanID_mask.nii.gz
    # imageXYZ_patientID_scanID.nii.gz
    parts = file_name.split("_")
    return parts[1], parts[2]

def get_patient_data(clinical_path, histologies_path, file_path) -> pd.DataFrame:
    """
    Get the patient IDs from the clinical data and 
    the histologies data that are present in the segmentation directory.
    """
    #dfs
    df_clinical = pd.read_csv(clinical_path)
    df_histologies = pd.read_csv(histologies_path)
    
    ids_dir = set()
    for file_name in os.listdir(file_path):
        if file_name.endswith("_mask.nii.gz"):
            patient_id, _ = get_identifier(file_name)
            ids_dir.add(patient_id)
    
    # assertation that is known from before
    assert len(ids_dir) == 45
    
    # make sure the IDs are strings
    df_clinical["CBTN Subject ID"] = df_clinical["CBTN Subject ID"].astype(str)
    df_histologies["cohort_participant_id"] = df_histologies["cohort_participant_id"].astype(str)
    
    df_cl = df_clinical[df_clinical["CBTN Subject ID"].isin(ids_dir)]
    assert len(df_cl) == len(ids_dir)

    df_his = df_histologies[df_histologies["cohort_participant_id"].isin(ids_dir)].drop_duplicates("cohort_participant_id", keep='first')

    mismatch_ids_cl = len(df_clinical) - len(df_cl)
    mismatch_ids_his = len(df_histologies) - len(df_his)    
    print(f"Number of unique patient IDs in the original clinical CSV: {len(df_clinical)}")
    print(f"Number of unique patient IDs in the directory: {len(ids_dir)}")
    print(f"Number of unique patient IDs in the filtered clinical CSV: {len(df_cl)}")
    print(f"Number of reduced patient IDs: {mismatch_ids_cl}")
    print(f"Number of unique patient IDs in the original histologies CSV: {len(df_histologies)}")
    print(f"Number of unique patient IDs in the final histologies CSV: {len(df_his)}")
    print(f"Number of reduced patient IDs: {mismatch_ids_his}")    
    
    return df_cl, df_his, ids_dir

def export_ids_and_diagnosis_to_csv(df, filepath):
    """
    Exports patient IDs and pathology free text diagnoses to a new CSV file.
    
    Parameters:
    - df: pandas DataFrame containing patient IDs and 'pathology_free_text_diagnosis'.
    - filepath: String representing the path to the output CSV file.
    """
    columns_to_export = ['cohort_participant_id', 'pathology_diagnosis', 'pathology_free_text_diagnosis', "primary_site"]
    if not set(columns_to_export).issubset(df.columns):
        raise ValueError(f"The DataFrame must contain the following columns: {columns_to_export}")
    df_to_export = df[columns_to_export]
    df_to_export.to_csv(filepath, index=False)
    print(f"Data exported successfully to {filepath}")

### Execution

In [13]:
clinical_data = Path('/home/jc053/GIT/mri_longitudinal_analysis/data/input/clinical/cbtn_filtered_pruned_treatment_513.csv')    
histologies = Path('/home/jc053/GIT/mri_longitudinal_analysis/data/input/clinical/cbtn_histologies.csv')
seg_dir = Path("/mnt/93E8-0534/JuanCarlos/mri-classification-sequences/cbtn_longitudinal_dataset/pre_event/accepted/pre_treatment")
output_dir = Path("/home/jc053/GIT/mri_longitudinal_analysis/data/output/clinical_data")
    
_, histologies_df, _  = get_patient_data(clinical_data, histologies, seg_dir)
export_ids_and_diagnosis_to_csv(histologies_df, os.path.join(output_dir, "cbtn_histologies_data.csv"))
    


Number of unique patient IDs in the original clinical CSV: 513
Number of unique patient IDs in the directory: 45
Number of unique patient IDs in the filtered clinical CSV: 45
Number of reduced patient IDs: 468
Number of unique patient IDs in the original histologies CSV: 6192
Number of unique patient IDs in the final histologies CSV: 9
Number of reduced patient IDs: 6183
Data exported successfully to /home/jc053/GIT/mri_longitudinal_analysis/data/output/clinical_data/cbtn_histologies_data.csv
