# Introduction
In this notebook, I will describe all the steps that I have taken to make a new dataset for Weekly CTs.

The process consists of six steps:

1. Navigating the folders that may contain weeklyCTs. These folders may already exist on this computer, or a user can create them by downloading new patients from MIRADA or other UMCG datasets.

2. Extracting only the weeklyCTs from these folders and making an Excel file from them.

3. Transferring the newly found weeklyCTs into a destination folder (either an existing weeklyCT folder or a new folder).

4. Making an Excel report with information about the weeklyCTs in the destination folder and some clinical information about the patients who have these weeklyCTs.

5. Making a panel that shows different information about the weeklyCT dataset.

6. Running a watchdog that keeps track of all additions to the destination folder and saves them in a log file.

In [2]:
# General Libraries
import glob
import os
import shutil
import math
import re
import numpy as np
import pandas as pd
from random import randint
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from datetime import time, datetime, date

# DICOM Libraries
import pydicom as pdcm
from pydicom.tag import Tag

# 1. Navigation Phase
### DICOM Files
All kinds of CTs were stored in the form of DICOM files. DICOM, which stands for Digital Imaging and Communications in Medicine, is a standard for transmitting, storing, and sharing medical images. DICOM files contain information about medical images, such as X-rays, CT scans, MRIs, and ultrasound. This standard ensures the interoperability of medical imaging equipment from different manufacturers. Some key features are:

**Metadata:** DICOM files store not only the pixel data of the medical images but also a wealth of metadata. This metadata includes patient information, imaging device details, acquisition parameters, and more.

**Interoperability:** DICOM enables the exchange of medical images and related information between different devices and systems. This interoperability is crucial in healthcare settings where various imaging modalities and equipment are used.

**Structured Data:** DICOM files use a structured format for information, allowing for consistency and ease of interpretation by different systems. This makes it possible for healthcare professionals to access and understand the data regardless of the equipment used to capture or generate the images.

For more information about the different tags and their definitions, see the following links: [Wiki](https://en.wikipedia.org/wiki/DICOM), [link](https://dicom.innolitics.com/ciods)


In [13]:
def get_folder_name(image, subf):
    """Return the Series Description of a DICOM image, or the folder's own name.

    Parameters
    ----------
    image : DICOM dataset (as returned by ``pydicom.dcmread``)
    subf : str
        Path of the folder the image was read from; its last component is
        used when the Series Description (0008,103E) cannot be read.

    Returns
    -------
    The Series Description value, or the last path component of ``subf``.
    """
    try:
        return image[Tag(0x0008103e)].value
    except Exception:
        # No readable Series Description: fall back to the folder name below.
        pass

    # The tags used in the warning may be missing as well; a missing tag must
    # not turn the fallback into a crash.
    try:
        study = image[Tag(0x00081030)].value
    except Exception:
        study = None

    try:
        patient_id = image[Tag(0x00100020)].value
    except Exception:
        patient_id = None

    print(f'Warning: folder {study} with {patient_id} ID does NOT have Series Description')

    # Accept both Windows and POSIX separators when taking the last component.
    return re.split(r'[\\/]', subf)[-1]

def get_patient_id(image):
    """Return the value of tag (0010,0020) of a DICOM image, or None.

    A warning is printed when the tag cannot be read.
    """
    try:
        return image[Tag(0x00100020)].value
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C still interrupts a long scan.
        print('Warning: There is NO patient ID')
        return None

def get_probable_weklyct_name(name, number, names_list):
    """Return the weekly-CT label carried by one token of a folder name.

    Parameters
    ----------
    name : str
        The token being inspected.
    number : int
        Position of ``name`` inside ``names_list``.
    names_list : list of str
        All whitespace-separated tokens of the folder name.

    Returns
    -------
    str or None
        * ``name`` when it contains 'rct' or 'w' together with a digit,
        * ``name`` when it contains 'wk..' (no digit),
        * ``name`` joined with the next token when it contains 'w', has no
          digit, and the next token contains neither '2.0' nor '2,',
        * ``name`` when it matches 'rct' followed by '.' or '#' (no digit),
        * ``None`` otherwise.
    """
    lowercase_name = name.lower()

    if re.search(r'\d', name):
        # With a digit only the 'rct<number>' / 'w<number>' form qualifies.
        if 'rct' in lowercase_name or 'w' in lowercase_name:
            return name
        return None

    if 'wk..' in lowercase_name:
        return name

    if 'w' in lowercase_name and number + 1 < len(names_list):
        following = names_list[number + 1]
        # NOTE(review): why tokens containing '2.0' / '2,' are excluded is not
        # visible here. Previously this case left the result unassigned and
        # raised UnboundLocalError; it now returns None.
        if '2.0' not in following and '2,' not in following:
            return name + str(following)
        return None

    if re.search(r'rct.*[.#]', lowercase_name):
        return name

    return None
    
def get_hd_fov(name):
    """Return 1 when `name` contains 'hd' or 'fov' (case-insensitive), else 0."""
    lowered = name.lower()
    return int('hd' in lowered or 'fov' in lowered)

def get_fraction(name):
    """Return the first number in `name` as an int when `name` contains 'rct'.

    The 'rct' check is case-insensitive. None is returned when 'rct' is
    absent or the token holds no digit.
    """
    lowered = name.lower()
    digit_runs = re.findall(r'\d+', name)

    if 'rct' in lowered and digit_runs:
        return int(digit_runs[0])

    return None

def get_date_information(image):
    """Return (date, weekday, ISO week number) parsed from tag (0008,0020).

    The tag value is parsed with the format ``%Y%m%d``. ``weekday`` is
    1-based (Monday == 1). All three values are None when the tag is missing
    or cannot be parsed.
    """
    try:
        study_datetime_CT = datetime.strptime(image[Tag(0x00080020)].value, "%Y%m%d")
    except Exception:
        return None, None, None

    date_info = study_datetime_CT.date()
    weekday = study_datetime_CT.weekday() + 1
    # Index access works on every Python 3 version; the `.week` attribute only
    # exists from 3.9 on, and its absence used to be swallowed silently.
    week_num = study_datetime_CT.isocalendar()[1]

    return date_info, weekday, week_num

def get_slice_thickness(image):
    """Return the value stored under key '00180050' of `image`, or None.

    None is returned whenever the element cannot be read.
    """
    try:
        return image['00180050'].value
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C still interrupts a long scan.
        return None

def get_contrast(image):
    """Return 1 when tag (0018,0010) can be read from `image`, else 0.

    Only the presence of the element is tested; its value is not inspected.
    """
    try:
        image[Tag(0x00180010)].value
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C still interrupts a long scan.
        return 0

    return 1

def get_pixel_spacing(image):
    """Return the value of tag (0028,0030) of `image`, or None if unreadable."""
    try:
        return image[Tag(0x00280030)].value
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C still interrupts a long scan.
        return None

def get_ref_uid(image):
    """Return the UID stored under key '00200052' of `image`, or None.

    None is returned whenever the element cannot be read.
    """
    try:
        return image['00200052'].value
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C still interrupts a long scan.
        return None

In [16]:
def navigate_folder(path_folder, output_path, file_name):
    """Scan a folder tree for CT series and summarise them in a CSV file.

    Every sub-directory below ``path_folder`` holding more than
    ``min_slice_num`` ``*.DCM`` files is inspected through its first DICOM
    file. Series whose modality is not 'CT', or whose folder name contains one
    of the excluded keywords, are skipped.

    Parameters
    ----------
    path_folder : str
        Root of the tree to scan.
    output_path : str
        Folder in which the CSV summary is written.
    file_name : str
        Name of the CSV file.

    Returns
    -------
    pandas.DataFrame
        One row per accepted series with the columns ID, folder_name, date,
        week_day, week_num, info_header, fraction, HD_FoV, slice_thickness,
        num_slices, pixel_spacing, contrast, UID and path.
    """
    exclusion_set = {'detail', 'ac_ct', 'ld_ct', 'ld ct', 'ac ct'}  # CONFIG File
    min_slice_num = 50  # CONFIG File
    modality = 'CT'  # CONFIG File

    group = []

    for root, dirs, _files in os.walk(path_folder):
        for folder in dirs:
            subf = os.path.join(root, folder)

            # List the DICOM files once; the count and the sample file both use it.
            dicom_files = glob.glob(subf + "/*.DCM")
            slice_num = len(dicom_files)
            if slice_num <= min_slice_num:
                continue

            # The first file is taken as representative of the whole series.
            image = pdcm.dcmread(dicom_files[0], force=True)
            folder_name = get_folder_name(image, subf)

            # A file without a Modality element is skipped instead of aborting the scan.
            if getattr(image, 'Modality', None) != modality:
                continue

            lowercase_folder = folder_name.lower()
            if any(keyword in lowercase_folder for keyword in exclusion_set):
                continue

            patient_id = get_patient_id(image)

            # Split the folder name into whitespace-separated tokens.
            names_list = folder_name.split()

            print(patient_id, folder_name)

            # Combine the per-token results: the first token that yields a
            # header / fraction wins, and HD_FoV is set when any token matches.
            # Previously each token overwrote the previous result, so only the
            # last token counted, and an empty folder name left the values
            # undefined or carried over from the previous series.
            saver = None
            fraction = None
            hd_fov = 0
            for number, name in enumerate(names_list):
                candidate = get_probable_weklyct_name(name, number, names_list)
                if saver is None and candidate is not None:
                    saver = candidate

                token_fraction = get_fraction(name)
                if fraction is None and token_fraction is not None:
                    fraction = token_fraction

                if get_hd_fov(name):
                    hd_fov = 1

            date_info, weekday, week_num = get_date_information(image)
            slice_thickness = get_slice_thickness(image)
            contrast = get_contrast(image)
            pixel_spacing = get_pixel_spacing(image)
            uid = get_ref_uid(image)

            group.append({
                'ID': patient_id, 'folder_name': folder_name, 'date': date_info,
                'week_day': weekday, 'week_num': week_num, 'info_header': saver,
                'fraction': fraction, 'HD_FoV': hd_fov, 'slice_thickness': slice_thickness,
                'num_slices': slice_num, 'pixel_spacing': pixel_spacing, 'contrast': contrast,
                'UID': uid, 'path': subf,
            })

    # Build the dataframe and store it next to the data.
    df = pd.DataFrame(group)
    df.to_csv(os.path.join(output_path, file_name), index=False)

    return df

In [None]:
# Folder tree that is scanned for DICOM series.
path_folder = '//zkh/appdata/RTDicom/Projectline_HNC_modelling/OPC_data/DICOM_data_organized'
# Folder that receives the CSV summary (the same location as the scanned tree).
output_path = '//zkh/appdata/RTDicom/Projectline_HNC_modelling/OPC_data/DICOM_data_organized'
file_name = 'output.csv'
# Scan the tree, write the summary CSV and keep the resulting dataframe.
df = navigate_folder(path_folder, output_path, file_name)

Based on our knowledge of weeklyCTs, we know that they are only available after 2014, so we can remove all scans made before that time. Moreover, since this program simply navigates through all the folders, there may be duplicated data in those folders, so the duplicates need to be removed from the dataset.

In [None]:
def clean_dataframe(df):
    """Return a cleaned copy of the series overview.

    Steps:
    1. keep only the rows dated on or after ``time_limit`` (rows without a
       parsable date are dropped by the comparison),
    2. reduce the 'date' column to plain dates (no time of day),
    3. drop duplicated folders, identified by 'ID', 'folder_name' and 'date'.

    Parameters
    ----------
    df : pandas.DataFrame
        Overview with at least the columns 'ID', 'folder_name' and 'date'.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy with a fresh 0..n-1 index; ``df`` is not modified.
    """
    df_copy = df.copy()

    # Nothing to clean (an empty overview may not even have a 'date' column).
    if df_copy.empty:
        return df_copy

    time_limit = pd.Timestamp('2014-01-01')  # CONFIG File

    # Parse once; this works for date objects, timestamps and date strings.
    parsed_dates = pd.to_datetime(df_copy['date'])

    # Keep the scans made *after* the limit. The comparison used to be
    # inverted and kept only the scans made before it.
    recent = parsed_dates >= time_limit
    df_copy = df_copy[recent].copy()

    # Erase the hours from the 'date' column.
    df_copy['date'] = parsed_dates[recent].dt.date

    # Drop the duplicated folders.
    df_copy = df_copy.drop_duplicates(subset=['ID', 'folder_name', 'date'],
                                      keep='first', ignore_index=True)

    return df_copy

In [None]:
# Replace the raw overview with its cleaned version.
df = clean_dataframe(df)

In this stage, I will drop all the remaining CTs that are not weeklyCTs.

In [None]:
def find_matching_header(info_headers):
    """Return the first header that looks like a weekly-CT label, else None.

    A header qualifies when it contains 'rct' or 'w' (case-insensitive) and
    either holds a digit, contains 'wk..', or matches 'rct' followed by '.'
    or '#'. Headers that raise while being inspected (for example None) are
    reported on stdout and skipped.
    """
    for header in info_headers:
        try:
            lowered = header.lower()

            # Every accepted form contains 'rct' or 'w'.
            if not ('rct' in lowered or 'w' in lowered):
                continue

            # Numbered form, e.g. 'rct13' or 'wk3'.
            if re.search(r'\d', header):
                return header

            # Un-numbered placeholders such as 'wk..', 'rct..' or 'rct#'.
            if 'wk..' in lowered or re.search(r'rct.*[..]|rct.*[#]', lowered):
                return header

        except Exception as e:
            print(f"An exception occurred: {e}")

    return None

def get_weeklycts_names(df, date_list):
    """Return one weekly-CT header per session date, padded with None to 9.

    The first entry of `date_list` is skipped; for every later date the
    'info_header' values of the rows with that date are passed to
    `find_matching_header`.
    """
    later_sessions = date_list[1:]

    header_list = [
        find_matching_header(df[df.date == session].info_header.tolist())
        for session in later_sessions
    ]

    # Pad with None so the list has at least 9 elements.
    missing = 9 - len(header_list)
    header_list.extend([None] * missing)

    return header_list


In [None]:
def extract_weeklyct_folders(df, output_path, file_name):
    """Build one row per patient describing the baseline CT and the weekly CTs.

    For each patient ID the rows of the earliest date are kept together with
    every row whose folder name contains 'rct'. The earliest date becomes
    'Baseline'; the later dates fill 'Session1'..'Session9' and their headers
    fill 'Fraction1'..'Fraction9' (padded with None).

    Parameters
    ----------
    df : pandas.DataFrame
        Overview with the columns 'ID', 'folder_name', 'date', 'week_day'
        and 'info_header'.
    output_path : str
        Folder in which the CSV file is written.
    file_name : str
        Name of the CSV file.

    Returns
    -------
    pandas.DataFrame
        One row per patient; it is also written to ``output_path/file_name``.
    """
    max_sessions = 9
    group = []

    # Grouping by a single column label gives a plain ID as key on every
    # pandas version (a list key gives a tuple on newer versions only).
    for patient_id, patient_df in df.groupby('ID'):

        # Keep the baseline rows plus everything that looks like a repeat CT.
        is_baseline = patient_df['date'] == patient_df['date'].min()
        is_rct = patient_df['folder_name'].str.lower().str.contains('rct', na=False)
        candidates = patient_df[is_baseline | is_rct]

        date_list = sorted(candidates['date'].unique())
        if not date_list:
            # No usable date for this patient at all.
            continue

        rtstart = date_list[0]  # Extract RTSTART
        durations = date_list[1:]
        weekly_ct_num = len(durations)

        # Week day of the first weekly CT. A patient with only a baseline scan
        # has none; this case used to raise IndexError.
        if durations:
            first_day = candidates[candidates['date'] == durations[0]].iloc[0].week_day
        else:
            first_day = None

        header_list = get_weeklycts_names(candidates, date_list)

        # Ensure there are at least `max_sessions` session entries.
        durations += [None] * (max_sessions - len(durations))

        row = {'ID': patient_id, 'Baseline': rtstart}
        for session in range(1, max_sessions + 1):
            row[f'Session{session}'] = durations[session - 1]
        for session in range(1, max_sessions + 1):
            row[f'Fraction{session}'] = header_list[session - 1]
        row['First_day'] = first_day
        row['Number_of_CTs'] = candidates.shape[0]
        row['Number_of_weeklyCTs'] = weekly_ct_num

        group.append(row)

    # Build the final dataframe and save it.
    df_final = pd.DataFrame(group)
    df_final.to_csv(os.path.join(output_path, file_name), index=False)

    return df_final

In [None]:
# Summarise the weekly CTs per patient and write the result next to the data.
weekly_file_name = 'weeklyct_output.csv'
weeklyct_df = extract_weeklyct_folders(df, output_path, weekly_file_name)

In the last step of the Navigation Phase, I will replace all the strings in the header part of the dataframe with fraction numbers. There are multiple cases here. Some patients have the fraction number in their headers, e.g. 'rct13'; others have a week number such as 'wk3', or only a part of the repeated-CT name such as 'rct..', 'wk', 'wk..', and so on. For the first group, I simply use the fraction number in the header. For the second and third groups, however, I calculate the probable number of fractions using the following criteria.
If the patient has an accelerated RT plan, I assume that they receive 1.2 fractions per day (on working days only), which means 6 fractions per week. For patients with other types of treatment, I assume that they receive 1 fraction per working day, so 5 per week in total.

In [None]:
# Define a custom function to extract numbers only if 'wk' is not present
def extract_numbers(text):
    """Return the digits of `text` as a float, or `text` itself.

    Only strings that contain a digit and do not contain 'wk' are converted;
    every other value is returned unchanged.
    """
    if not isinstance(text, str) or 'wk' in text or not re.search(r'\d', text):
        return text

    digits = [char for char in text if char.isdigit()]
    return float(''.join(digits))

def get_existing_fractions(df):
    """Convert the headers that already carry a fraction number into floats.

    `extract_numbers` is applied to the columns at positions 12 to 20. The
    dataframe is modified in place and also returned.
    """
    fraction_columns = df.columns[12:21]

    for column in fraction_columns:
        df[column] = df[column].apply(extract_numbers)

    return df

def get_coef(accelerated_rt):
    """Return the fractions-per-working-day coefficient.

    1 when `accelerated_rt` equals 0, otherwise 1.2.
    """
    return 1 if accelerated_rt == 0 else 1.2

def calculate_fraction(raw, fraction, fraction_num, coef, counter):
    """Return the fraction number of one session, read or estimated.

    Parameters
    ----------
    raw : pandas.Series
        Patient row. The estimate reads 'RTSTART', 'Session1'..'Session9' and
        'Number_of_weeklyCTs'.
        NOTE(review): `extract_weeklyct_folders` names the start-date column
        'Baseline'; confirm that 'RTSTART' exists in the dataframe used here,
        otherwise the first session silently keeps `fraction_num`.
    fraction : str, number, None or NaN
        Header value of the session.
    fraction_num : number
        Fraction number of the previous session (0 before the first one).
    coef : number
        Fractions delivered per working day.
    counter : int
        0-based session index.

    Returns
    -------
    number
        * a numeric `fraction` is returned as is;
        * a text header containing 'wk', a text header without any digit, or
          a missing header of an existing session is estimated: the business
          days elapsed since the previous session (since RTSTART for the
          first session) times `coef` are added to `fraction_num`; the first
          session additionally counts the start day itself (+ 1);
        * in every other case, and whenever the estimate fails, `fraction_num`
          is returned unchanged.
    """

    def _elapsed_fractions():
        # Evaluated lazily so that the numeric case never touches `raw`.
        if counter == 0:
            first, last = raw['RTSTART'], raw['Session1']
        else:
            first, last = raw[f'Session{counter}'], raw[f'Session{counter+1}']

        # bdate_range includes both end points, hence the "- 1".
        return (len(pd.bdate_range(first, last)) - 1) * coef

    try:
        if isinstance(fraction, str):
            if 'wk' in fraction:
                if counter == 0:
                    fraction_num = _elapsed_fractions() + 1
                else:
                    fraction_num += _elapsed_fractions()

            # This part does not work if the rct.. or rct# is separated from other part
            elif not re.search(r'\d', fraction):
                if counter == 0:
                    fraction_num += _elapsed_fractions() + 1
                else:
                    fraction_num += _elapsed_fractions()

        # pd.isna also recognises None and NaN objects other than `np.nan`
        # itself, which the former identity test (`is np.nan`) missed.
        elif pd.isna(fraction) and counter < raw.Number_of_weeklyCTs:
            if counter == 0:
                fraction_num = _elapsed_fractions() + 1
            else:
                fraction_num += _elapsed_fractions()

        elif isinstance(fraction, (int, float)):
            fraction_num = fraction

        return fraction_num

    except Exception:
        # Best effort: keep the running value when a date or column is missing.
        return fraction_num

In [None]:
def add_fractions(df, output_path, file_name):
    """Replace the nine fraction headers of every patient by fraction numbers.

    Headers that already carry a number are converted first; the remaining
    ones are estimated session by session with `calculate_fraction`.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-patient overview. The fraction headers are expected at column
        positions 12 to 20 and an 'accelerated_rt' column is read.
        NOTE(review): these positions and the 'accelerated_rt' column are not
        produced by `extract_weeklyct_folders` alone; confirm the dataframe
        layout against the caller.
    output_path : str
        Folder in which the result is written.
    file_name : str
        Name of the output file (written in CSV format).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the fraction columns filled in.
    """
    # Work on a copy so the caller's dataframe stays untouched.
    df_copy = df.copy()

    # Find all the existing fractions in the dataset.
    df_copy = get_existing_fractions(df_copy)

    # Iterate through patients. The row *position* is tracked separately
    # because `iloc` needs a position, while iterrows yields index labels,
    # which differ as soon as the index is not 0..n-1.
    for position, (_, raw) in enumerate(df_copy.iterrows()):

        fraction_list = []
        fraction_num = 0

        # Fractions per working day for this patient.
        coef = get_coef(raw.accelerated_rt)

        # Each session builds on the fraction number of the previous one.
        for counter, fraction in enumerate(raw.iloc[12:21]):
            fraction_num = calculate_fraction(raw, fraction, fraction_num, coef, counter)
            fraction_list.append(fraction_num)

        df_copy.iloc[position, 12:21] = fraction_list

    # Save the dataframe.
    df_copy.to_csv(os.path.join(output_path, file_name), index=False)

    return df_copy


In [None]:
# NOTE(review): add_fractions writes this file with DataFrame.to_csv, so the
# content is CSV although the name ends in '.xlsx' — confirm the intended format.
file_name = 'Final_weeklyCT_df.xlsx'
# Fill in the fraction numbers and store the final overview.
weeklyct_df = add_fractions(weeklyct_df, output_path, file_name)

---

In [31]:
# Semicolon-separated CSV that is read here and written back by a later cell.
path = '//zkh/appdata/RTDicom/Projectline_HNC_modelling/Users/Hooman Bahrdo/Models/Deep_Learning/DL_NTCP_Xerostomia/datasets/dataset_old_v2/stratified_sampling_test_manual_94.csv'

# Drop the saved index column ('Unnamed: 0') while loading.
dff = pd.read_csv(path, sep=';').drop(columns=['Unnamed: 0'])

In [13]:
# Divide the age column by 100.
# NOTE(review): not idempotent — re-running this cell divides the values again.
dff.age = dff.age / 100.

In [25]:
# NOTE(review): self-assignment, has no effect.
dff = dff

In [28]:
# Overwrite the file that was loaded from `path`; the index is written as the
# first (unnamed) column.
dff.to_csv(path, sep=';')

In [32]:
# Display the dataframe as the cell output.
dff

Unnamed: 0,ID,surface_bsl_dlc,surface_wk3_dlc,OAR,Contra_Dmean,sex,age,Modality_adjusted,xer_bsl,Loctum2,...,delta_surf_dlc,xer_bsl_citor,xer_wk1_not_at_all,xer_wk1_little,xer_wk1_moderate_to_severe,xer_bsl_not_at_all,xer_bsl_little,xer_bsl_moderate_to_severe,sqr_parotid_Dmean,Split
0,20715,8108.746885,7070.864868,Parotid_R,17.946843,1,0.56,Chemoradiation,1,Oropharynx,...,10.378820,1,0,1,0,0,1,0,9.353827,train_val
1,21879,7874.630591,7585.310254,Parotid_R,26.628884,1,0.67,Chemoradiation,1,Oropharynx,...,2.893203,1,0,1,0,0,1,0,11.298645,train_val
2,52277,7960.561507,7511.293366,Parotid_L,29.492172,1,0.50,Chemoradiation,0,Oropharynx,...,4.492681,0,1,0,0,1,0,0,11.448122,train_val
3,59896,6660.874235,6301.546221,Parotid_L,30.214213,1,0.51,Chemoradiation,1,Oropharynx,...,3.593280,1,0,1,0,0,1,0,11.256083,train_val
4,70426,6068.357955,5823.534682,Parotid_R,21.277559,1,0.75,Conventional RT,1,Oropharynx,...,2.448233,1,1,0,0,0,1,0,11.156976,train_val
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,9899837,6851.795819,6120.349395,Parotid_R,24.439093,0,0.54,Accelerated RT,1,Oropharynx,...,7.314464,2,0,1,0,0,0,1,10.281808,train_val
298,9916161,6328.879356,6485.986032,Parotid_R,5.502483,0,0.65,Chemoradiation,1,Neus(bij)holte,...,-1.571067,2,0,0,1,0,0,1,4.536815,train_val
299,9946201,6701.471322,6357.803318,Parotid_R,13.855585,0,0.65,Bioradiation,0,Oropharynx,...,3.436680,0,0,0,1,1,0,0,11.422918,test
300,9956433,8802.968090,8901.740425,Parotid_R,15.743012,1,0.66,Chemoradiation,1,Oropharynx,...,-0.987723,2,0,1,0,0,0,1,10.014044,train_val


In [33]:
# Print every age value on its own line.
for x in dff.age:
    print(x)

0.56
0.67
0.5
0.51
0.75
0.48
0.67
0.63
0.52
0.55
0.73
0.69
0.61
0.65
0.62
0.54
0.55
0.5
0.55
0.62
0.57
0.53
0.66
0.72
0.52
0.72
0.3
0.61
0.53
0.69
0.67
0.45
0.46
0.63
0.75
0.49
0.64
0.54
0.6
0.79
0.48
0.58
0.7
0.59
0.51
0.71
0.57
0.67
0.67
0.73
0.52
0.62
0.57
0.63
0.68
0.79
0.64
0.51
0.66
0.49
0.7
0.55
0.67
0.53
0.59
0.69
0.71
0.63
0.54
0.53
0.57
0.78
0.68
0.68
0.72
0.61
0.79
0.76
0.63
0.57
0.73
0.71
0.49
0.53
0.84
0.76
0.64
0.71
0.71
0.71
0.83
0.47
0.48
0.6
0.65
0.59
0.59
0.66
0.57
0.7
0.55
0.46
0.66
0.63
0.66
0.73
0.56
0.71
0.58
0.53
0.5
0.72
0.74
0.72
0.68
0.68
0.59
0.76
0.63
0.7
0.7
0.7
0.55
0.63
0.56
0.66
0.64
0.74
0.73
0.59
0.67
0.62
0.68
0.72
0.65
0.78
0.66
0.69
0.66
0.6
0.69
0.75
0.66
0.64
0.29
0.53
0.49
0.74
0.86
0.58
0.78
0.51
0.68
0.75
0.66
0.58
0.76
0.7
0.68
0.7
0.67
0.67
0.72
0.77
0.73
0.67
0.72
0.67
0.75
0.8
0.56
0.53
0.6
0.56
0.65
0.81
0.66
0.58
0.7
0.66
0.64
0.87
0.55
0.52
0.88
0.82
0.64
0.52
0.55
0.56
0.71
0.64
0.81
0.78
0.63
0.73
0.6
0.65
0.64
0.64
0.47
0.59
0.86
0.55