# Introduction
In this notebook, I will describe all the steps that I have taken to make a new dataset for Weekly CTs.

Basically, the process contains six different steps:

1. Navigating the folders that may contain weeklyCTs. These folders can already be on this computer, or a user can create them by downloading new patients from MIRADA or other UMCG datasets.

2. Extracting only weeklyCTs from these folders and make an excel file from them.

3. Transferring the newly found weeklyCTs into a destination folder (it can be an existing folder for the weeklyCTs or a new folder).

4. Making a report excel file of some information about the weeklyCTs in the destination file and some clinical information from the patients who have these weeklyCTs.

5. Making a panel that contains different information about the WeeklyCT dataset.

6. A watchdog keeps track of all the additions to the destination folder and saves them in a log file.

In [1108]:
# General Libraries
import os
import re
import glob
import math
import shutil
import numpy as np
import pandas as pd
from random import randint
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from datetime import time, datetime, date

# DICOM Libraries
import pydicom as pdcm
from pydicom.tag import Tag

# 1. Navigation Phase
### DICOM Files
All kinds of CTs were stored in the form of DICOM files. DICOM, which stands for Digital Imaging and Communications in Medicine, is a standard for transmitting, storing, and sharing medical images. DICOM files contain information about medical images, such as X-rays, CT scans, MRIs, and ultrasound. This standard ensures the interoperability of medical imaging equipment from different manufacturers. Some key features are:

**Metadata:** DICOM files store not only the pixel data of the medical images but also a wealth of metadata. This metadata includes patient information, imaging device details, acquisition parameters, and more.

**Interoperability:** DICOM enables the exchange of medical images and related information between different devices and systems. This interoperability is crucial in healthcare settings where various imaging modalities and equipment are used.

**Structured Data:** DICOM files use a structured format for information, allowing for consistency and ease of interpretation by different systems. This makes it possible for healthcare professionals to access and understand the data regardless of the equipment used to capture or generate the images.

For information of different tags and the definitions one can use the following links: [Wiki](https://en.wikipedia.org/wiki/DICOM), [link](https://dicom.innolitics.com/ciods)


In [1109]:
def get_folder_name(image, subf):
    """
    Return the name used to describe a DICOM series folder.

    The Series Description tag (0008,103E) is preferred. When it cannot be read,
    a warning is printed and the last component of the folder path is used instead.

    Args:
        image: object indexed by DICOM tag whose elements expose ``.value``
            (here the result of ``pdcm.dcmread``).
        subf (str): path of the folder the image was read from.

    Returns:
        The Series Description value, or the last path component of ``subf``.
    """
    try:
        return image[Tag(0x0008103e)].value
    except Exception:
        # Best effort: any failure to read the tag falls back to the folder name.
        pass

    def _value_or_none(tag_number):
        # The warning must never crash just because another tag is missing too.
        try:
            return image[Tag(tag_number)].value
        except Exception:
            return None

    study = _value_or_none(0x00081030)
    patient_id = _value_or_none(0x00100020)
    print(f'Warning: folder {study} with {patient_id} ID does NOT have Series Description')

    # Accept both '\\' and '/' separators and ignore a trailing separator.
    return re.split(r'[\\/]', subf.rstrip('\\/'))[-1]

def get_patient_id(image):
    """
    Return the patient ID stored in tag (0010,0020) as an integer.

    Args:
        image: object indexed by DICOM tag whose elements expose ``.value``.

    Returns:
        int | None: the ID, or None (after printing a warning) when the tag is
        missing or its value cannot be converted to an integer.
    """
    try:
        return int(image[Tag(0x00100020)].value)
    except Exception:
        # `Exception` instead of a bare except: Ctrl-C must still stop a long scan.
        print('Warning: There is NO patient ID')
        return None

def get_probable_weklyct_name(name, number, names_list, saver):
    """
    Decide whether one word of a folder name looks like a weeklyCT marker.

    Args:
        name (str): the word being examined.
        number (int): position of ``name`` inside ``names_list``.
        names_list (list[str]): all words of the folder name.
        saver: the marker found so far; returned unchanged when ``name`` does not qualify.

    Returns:
        ``name`` (or ``name`` joined with the following word) when it qualifies,
        otherwise ``saver``.
    """
    lowered = name.lower()
    contains_w = 'w' in lowered

    # Words with a digit qualify only when they also contain 'rct' or 'w'.
    if re.search(r'\d', name):
        if contains_w or 'rct' in lowered:
            return name
        return saver

    # From here on the word has no digit.
    if 'wk..' in lowered:
        return name

    if contains_w and number + 1 < len(names_list):
        following = names_list[number + 1]
        # Glue the next word on, unless it contains '2.0' or '2,'.
        if '2.0' in following or '2,' in following:
            return saver
        return name + str(following)

    if re.search('rct.*[..]|rct.*[#]', lowercase_name := lowered):
        return name

    return saver
    
def get_hd_fov(name, hd_fov):
    """
    Flag a folder-name word that mentions 'hd' or 'fov' (case-insensitive).

    Returns 1 when the word contains either marker, otherwise the incoming ``hd_fov``.
    """
    lowered = name.lower()
    return 1 if any(marker in lowered for marker in ('hd', 'fov')) else hd_fov

def get_fraction(name, fraction):
    """
    Read the fraction number from a word that contains 'rct' and at least one digit.

    Returns the first run of digits as an int, or the incoming ``fraction`` when
    the word does not qualify.
    """
    if 'rct' not in name.lower():
        return fraction

    if not re.search(r'\d', name):
        return fraction

    first_number = re.findall(r'\d+', name)[0]
    return int(first_number)

def get_date_information(image):
    """
    Extract the date, week day and week number from tag (0008,0020).

    Args:
        image: object indexed by DICOM tag whose elements expose ``.value``;
            the value is parsed with the format ``%Y%m%d``.

    Returns:
        tuple: ``(date, weekday, week_num)`` where ``weekday`` is 1 for Monday
        through 7 for Sunday and ``week_num`` is the ISO week number.
        ``(None, None, None)`` when the tag is missing or cannot be parsed.
    """
    try:
        study_datetime_CT = datetime.strptime(image[Tag(0x00080020)].value, "%Y%m%d")
    except Exception:
        # `Exception` instead of a bare except: Ctrl-C must still stop a long scan.
        return None, None, None

    date_info = study_datetime_CT.date()
    weekday = study_datetime_CT.weekday() + 1  # weekday() is 0-based
    week_num = study_datetime_CT.isocalendar()[1]

    return date_info, weekday, week_num

def get_slice_thickness(image):
    """
    Return the value stored under key '00180050', or None when it cannot be read.

    Args:
        image: object indexed by that key whose elements expose ``.value``.
    """
    try:
        return image['00180050'].value
    except Exception:
        # `Exception` instead of a bare except: Ctrl-C must still stop a long scan.
        return None

def get_contrast(image):
    """
    Report whether tag (0018,0010) can be read from the image.

    Args:
        image: object indexed by DICOM tag whose elements expose ``.value``.

    Returns:
        int: 1 when the element is present and readable, 0 otherwise.
    """
    # NOTE(review): an element that is present but empty still yields 1 -
    # confirm this is the intended meaning of "contrast".
    try:
        image[Tag(0x00180010)].value
    except Exception:
        # `Exception` instead of a bare except: Ctrl-C must still stop a long scan.
        return 0

    return 1

def get_pixel_spacing(image):
    """
    Return the value of tag (0028,0030), or None when it cannot be read.

    Args:
        image: object indexed by DICOM tag whose elements expose ``.value``.
    """
    try:
        return image[Tag(0x00280030)].value
    except Exception:
        # `Exception` instead of a bare except: Ctrl-C must still stop a long scan.
        return None

def get_ref_uid(image):
    """
    Return the UID stored under key '00200052', or None when it cannot be read.

    Args:
        image: object indexed by that key whose elements expose ``.value``.
    """
    try:
        return image['00200052'].value
    except Exception:
        # `Exception` instead of a bare except: Ctrl-C must still stop a long scan.
        return None

In [1110]:
def navigate_folder(path_folder, output_path, file_name,
                    exclusion_set=None, min_slice_num=50, modality='CT'):
    """
    Walk a folder tree, describe every folder that holds a CT series and save the result.

    A sub-folder is reported when it holds more than ``min_slice_num`` ``*.DCM`` files,
    the first of those files has the requested ``modality``, and the folder name
    contains none of the excluded keywords. The starting folder itself is not
    examined, only the folders below it.

    Args:
        path_folder (str): root folder to scan.
        output_path (str): folder in which the Excel report is written.
        file_name (str): name of the Excel report.
        exclusion_set (set[str] | None): lowercase keywords that disqualify a folder
            name; defaults to the set used so far in this notebook.
        min_slice_num (int): a folder needs more than this many ``*.DCM`` files.
        modality (str): required value of the image's ``Modality`` attribute.

    Returns:
        pandas.DataFrame: one row per reported folder; also written to
        ``os.path.join(output_path, file_name)``.
    """
    if exclusion_set is None:
        exclusion_set = {'detail', 'ac_ct', 'ld_ct', 'ld ct', 'ac ct'} # CONFIG File

    # Fixed column list so that an empty scan still produces a frame with the usual header.
    columns = ['ID', 'folder_name', 'date', 'week_day', 'week_num', 'info_header',
               'fraction', 'HD_FoV', 'slice_thickness', 'num_slices', 'pixel_spacing',
               'contrast', 'UID', 'path']

    group = list()

    for root, dir_names, _ in os.walk(path_folder):
        for dir_name in dir_names:
            subf = os.path.join(root, dir_name)

            # List the slices once; the list gives both the count and the first file.
            dicom_files = glob.glob(subf + "/*.DCM")
            slice_num = len(dicom_files)
            if slice_num <= min_slice_num:
                continue

            # Only header tags are used below, so the pixel data is not loaded.
            image = pdcm.dcmread(dicom_files[0], force=True, stop_before_pixels=True)
            folder_name = get_folder_name(image, subf)

            # force=True can return a dataset without Modality; treat that as "not a match".
            if getattr(image, 'Modality', None) != modality:
                continue
            lowered_folder_name = folder_name.lower()
            if any(keyword in lowered_folder_name for keyword in exclusion_set):
                continue

            patient_id = get_patient_id(image)

            # Split the name of the folder into words and scan each of them
            names_list = folder_name.split()
            saver = None
            hd_fov = 0
            fraction = None

            for number, name in enumerate(names_list):
                saver = get_probable_weklyct_name(name, number, names_list, saver)
                hd_fov = get_hd_fov(name, hd_fov)
                fraction = get_fraction(name, fraction)

            date_info, weekday, week_num = get_date_information(image)

            group.append({
                'ID': patient_id, 'folder_name': folder_name, 'date': date_info,
                'week_day': weekday, 'week_num': week_num, 'info_header': saver,
                'fraction': fraction, 'HD_FoV': hd_fov,
                'slice_thickness': get_slice_thickness(image),
                'num_slices': slice_num, 'pixel_spacing': get_pixel_spacing(image),
                'contrast': get_contrast(image),
                'UID': get_ref_uid(image), 'path': subf
            })

    df = pd.DataFrame(group, columns=columns)

    # Save the dataframe
    df.to_excel(os.path.join(output_path, file_name), index=False)

    return df

In [1111]:
# Folder that is scanned for DICOM series, and folder in which the Excel report is written.
path_folder = '//zkh/appdata/RTDicom/Projectline_HNC_modelling/OPC_data/ART_DATA1'
output_path = '//zkh/appdata/RTDicom/Projectline_HNC_modelling/OPC_data/ART_DATA1'

# TODO: in the main code, build this name as 'General_information_{folder_name}.xlsx'.
file_name = 'General_information_ART_DATA1.xlsx' 
df = navigate_folder(path_folder, output_path, file_name)

Based on our knowledge about weeklyCTs, we know that they are only available after 2014, so we can remove all scans made before that time. Moreover, since this program navigates through all the folders, some folders may contain duplicated data, so these duplicates need to be removed from the dataset.

In [1112]:
def clean_dataframe(df):
    """
    Return a cleaned copy of the dataset.

    Rows dated on or before 2014-01-01 (or without a usable date) are dropped,
    then duplicated (ID, folder_name, date) rows are removed, keeping the first.
    The input frame is left untouched and the result gets a fresh 0..n-1 index.
    """
    time_limit = pd.Timestamp('2014-01-01') # CONFIG File

    after_limit = pd.to_datetime(df.date) > time_limit
    recent_rows = df[after_limit]

    return recent_rows.drop_duplicates(
        subset=['ID', 'folder_name', 'date'],
        keep='first',
        ignore_index=True,
    )

In [1113]:
# Replace the raw scan result by its cleaned version (old scans and duplicates removed).
df = clean_dataframe(df)

In this stage, I will drop all the remained CTs that are not WeeklyCTs.

In [1114]:
def get_firstday(df, date_list):
    """
    Return the ``week_day`` of the first row dated ``date_list[1]``.

    Args:
        df (pandas.DataFrame): rows with ``date`` and ``week_day`` columns.
        date_list (list): sorted dates; element 1 is the date that is looked up.

    Returns:
        The ``week_day`` value, or None when ``date_list`` has fewer than two
        entries or no row matches.
    """
    try:
        if len(date_list) < 2:
            return None
        return df[df.date == date_list[1]].iloc[0].week_day
    except Exception:
        # `Exception` instead of a bare except: Ctrl-C must still stop a long run.
        return None

def find_matching_header(info_headers):
    """
    Return the first header that looks like a weeklyCT marker.

    A header qualifies when it
    - contains a digit together with 'rct' or 'w', or
    - has no digit and contains 'wk..', or
    - has no digit and contains 'rct' followed later by '.' or '#'.

    Entries that are not strings (None, NaN) are skipped silently: they are an
    expected input here, so nothing is printed for them.

    Args:
        info_headers (iterable): candidate headers.

    Returns:
        str | None: the first qualifying header, or None when there is none.
    """
    for header in info_headers:
        if not isinstance(header, str):
            continue

        lowercase_header = header.lower()

        if re.search(r'\d', header):
            if 'rct' in lowercase_header or 'w' in lowercase_header:
                return header
            continue

        if 'wk..' in lowercase_header:
            return header

        if re.search(r'rct.*[..]|rct.*[#]', lowercase_header):
            return header

    return None

def get_weeklycts_names(df, date_list):
    """
    Collect one weeklyCT header per session date.

    The first entry of ``date_list`` is skipped; for each remaining date the
    headers of that day are passed to ``find_matching_header``. The result is
    padded with None up to nine entries (longer lists are returned as they are).
    """
    headers = [
        find_matching_header(df[df.date == session_date].info_header.tolist())
        for session_date in date_list[1:]
    ]

    # A negative padding length yields an empty list, so long lists stay untouched.
    padding = [None] * (9 - len(headers))
    headers.extend(padding)

    return headers

def get_accelerated_rt(patient_id, clinical_df):
    """
    Look up the ``Modality_adjusted`` value of a patient in the clinical table.

    Args:
        patient_id: patient identifier; it is converted with ``int`` before the lookup.
        clinical_df (pandas.DataFrame): table with ``UMCG`` and ``Modality_adjusted`` columns.

    Returns:
        The value of the first matching row, or 'Not Mentioned' when the ID cannot
        be converted or no row matches.
    """
    try:
        matches = clinical_df[clinical_df.UMCG == int(patient_id)]
        return matches.Modality_adjusted.values[0]
    except Exception:
        # `Exception` instead of a bare except: Ctrl-C must still stop a long run.
        return 'Not Mentioned'


In [1115]:
def extract_weeklyct_folders(df, output_path, clinical_df_path=None):
    """
    Build one row per patient describing the baseline CT and up to nine weeklyCT sessions.

    For every patient the earliest date is taken as the baseline. Later dates are
    kept only when the folder name contains 'rct'. Patients without any such later
    date are dropped from the result.

    Args:
        df (pandas.DataFrame): cleaned scan table with at least the columns
            ``ID``, ``folder_name``, ``date``, ``week_day`` and ``info_header``.
        output_path (str): currently unused (the intermediate result is not saved).
        clinical_df_path (str | None): Excel file with the clinical data; defaults
            to the path used so far in this notebook.

    Returns:
        pandas.DataFrame with the columns ID, Baseline, Session1..9, Fraction1..9,
        First_day, Number_of_CTs, Number_of_weeklyCTs, modality_adjusted (in that
        order; later steps address the Fraction columns by position).
    """
    if clinical_df_path is None:
        clinical_df_path = '//zkh/appdata/RTDicom/Projectline_HNC_modelling/OPC_data/ART Hooman/Xerostomia_dataset.xlsx' # CONFIG File
    clinical_df = pd.read_excel(clinical_df_path)

    session_columns = [f'Session{i}' for i in range(1, 10)]
    fraction_columns = [f'Fraction{i}' for i in range(1, 10)]
    columns = (['ID', 'Baseline'] + session_columns + fraction_columns
               + ['First_day', 'Number_of_CTs', 'Number_of_weeklyCTs', 'modality_adjusted'])

    group = list()

    # Group by the column NAME (not a one-element list): with a list, recent pandas
    # versions yield tuple keys and int(id_num) would fail.
    for id_num, patient_df in df.groupby('ID'):

        # Keep the baseline rows plus every row whose folder name mentions 'rct'
        is_baseline = patient_df['date'] == patient_df['date'].min()
        is_rct = patient_df['folder_name'].str.lower().str.contains('rct', na=False)
        patient_df = patient_df[is_rct | is_baseline]

        date_list = sorted(patient_df.date.unique())
        rtstart = date_list[0]
        first_day = get_firstday(patient_df, date_list)
        header_list = get_weeklycts_names(patient_df, date_list)

        durations = date_list[1:]
        weekly_ct_num = len(durations)
        durations += [None] * (9 - len(durations)) # Ensure it has 9 elements

        row = {'ID': int(id_num), 'Baseline': rtstart}
        row.update(zip(session_columns, durations))      # zip stops after nine entries
        row.update(zip(fraction_columns, header_list))
        row.update({'First_day': first_day,
                    'Number_of_CTs': patient_df.shape[0],
                    'Number_of_weeklyCTs': weekly_ct_num,
                    'modality_adjusted': get_accelerated_rt(id_num, clinical_df)})
        group.append(row)

    # Explicit columns keep the layout stable even when no patient was found.
    df_final = pd.DataFrame(group, columns=columns)

    # Drop the patients who do not have weeklyCTs
    df_final = df_final[df_final.Number_of_weeklyCTs != 0].reset_index(drop=True)

    return df_final

In [1116]:
# No output file name is needed here: this is an intermediate step whose result is not saved.
weeklyct_df = extract_weeklyct_folders(df, output_path)

In the last step of the Navigation Phase, I replace all the strings in the header columns of the dataframe with fraction numbers. There are multiple cases here. Some patients have the fraction number in their headers, e.g. 'rct13'; others have a week number such as 'wk3', or only a part of the repeated-CT name such as 'rct..', 'wk', 'wk..', and so on. For the first group, I just use the fraction number found in the header. For the second and third groups, I calculate the probable number of fractions using the following criteria.
If the patient has an accelerated RT plan, I assume that they get 1.2 fractions per day (only on working days), which means 6 fractions per week. For patients with other types of treatment, I assume that they get 1 fraction per working day, so 5 fractions per week in total.

In [1117]:
# Define a custom function to extract numbers only if 'wk' is not present
def extract_numbers(text):
    """
    Turn a header into a number when it carries a fraction number.

    Strings that contain a digit and do not contain 'wk' are reduced to their
    digit characters and returned as a float. Anything else is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    if 'wk' in text or not re.search(r'\d', text):
        return text

    digits_only = ''.join(filter(str.isdigit, text))
    return float(digits_only)

def get_existing_fractions(df):
    """
    Convert, in place, the headers in columns 11 to 19 (by position) into fraction numbers.

    Every cell of those columns is passed through ``extract_numbers``. The same
    frame object is returned.
    """
    fraction_columns = df.columns[11:20]

    for column_name in fraction_columns:
        converted = df[column_name].apply(extract_numbers)
        df[column_name] = converted

    return df

def get_coef(Modality_adjusted):
    """
    Return the number of fractions assumed per working day for a treatment type.

    1.0 for 'Chemoradiation' and 'Conventional RT', 1.2 for 'Accelerated RT' and
    'Bioradiation', 0.0 for anything else.
    """
    not_accelerated = ('Chemoradiation', 'Conventional RT') # CONFIG File
    accelerated = ('Accelerated RT', 'Bioradiation') # CONFIG File

    if Modality_adjusted in not_accelerated:
        return 1.0

    return 1.2 if Modality_adjusted in accelerated else 0.0

def calculate_fraction(raw, fraction, fraction_num, coef, counter):
    """
    Return the fraction number of one session, estimating it when the header has none.

    The value is estimated from the number of business days since the previous
    session (or since the baseline for the first session) when ``fraction`` is
    - a string containing 'wk', or
    - a string without any digit (e.g. 'rct..'), or
    - missing (None or NaN) while the session exists
      (``counter < raw.Number_of_weeklyCTs``).
    A numeric ``fraction`` is returned as it is. Anything else gives None.

    Args:
        raw (pandas.Series): patient row with ``Baseline``, ``Session1``..``Session9``
            and ``Number_of_weeklyCTs``.
        fraction: header value of this session.
        fraction_num: fraction number of the previous session (running total).
        coef (float): fractions per business day.
        counter (int): 0-based session index.

    Returns:
        The fraction number, None, or the unchanged ``fraction_num`` when the
        estimate cannot be computed (for example because a session date is missing).
    """
    try:
        if isinstance(fraction, str):
            # NOTE: rct.. / rct# are not recognised when they were split from the rest of the name.
            needs_estimate = 'wk' in fraction or not re.search(r'\d', fraction)
        else:
            # Missing values are detected by value: an identity test against np.nan
            # misses None and NaN floats that are not that exact object.
            is_missing = fraction is None or (isinstance(fraction, float) and math.isnan(fraction))
            needs_estimate = is_missing and counter < raw.Number_of_weeklyCTs

        if needs_estimate:
            if counter == 0:
                elapsed = len(pd.bdate_range(raw['Baseline'], raw['Session1'])) - 1
                # The first session starts the count: the baseline day itself is fraction 1.
                return elapsed * coef + 1

            elapsed = len(pd.bdate_range(raw[f'Session{counter}'], raw[f'Session{counter+1}'])) - 1
            return fraction_num + elapsed * coef

        if isinstance(fraction, (int, float)):
            return fraction

        return None

    except Exception:
        # Best effort: keep the previous running value when the estimate fails.
        return fraction_num

In [1118]:
def add_fractions(df, output_path, file_name):
    """
    Fill the fraction columns of the weeklyCT table and save the result.

    Headers that already contain a fraction number are converted first; the
    remaining ones are estimated session by session with ``calculate_fraction``.
    A ``Coefficient`` column with the fractions-per-day value of each patient is added.

    Args:
        df (pandas.DataFrame): weeklyCT table whose columns 11 to 19 (by position)
            are the fraction headers.
        output_path (str): folder in which the Excel file is written.
        file_name (str): name of the Excel file.

    Returns:
        pandas.DataFrame: a modified copy of ``df``; the input is not changed.
    """
    df_copy = get_existing_fractions(df.copy())
    coef_list = list()

    # enumerate gives the row POSITION needed by .iloc; the index label from
    # iterrows is only equal to it when the index is a clean 0..n-1 range.
    for position, (_, raw) in enumerate(df_copy.iterrows()):

        # extract_weeklyct_folders names this column 'modality_adjusted';
        # the capitalised spelling is accepted as well.
        modality = raw.get('modality_adjusted', raw.get('Modality_adjusted'))
        coef = get_coef(modality)

        fraction_list = list()
        fraction_num = 0

        for counter, fraction in enumerate(raw.iloc[11:20]):
            fraction_num = calculate_fraction(raw, fraction, fraction_num, coef, counter)
            fraction_list.append(fraction_num)

        df_copy.iloc[position, 11:20] = fraction_list
        coef_list.append(coef)

    df_copy['Coefficient'] = coef_list

    # Save the dataframe
    df_copy.to_excel(os.path.join(output_path, file_name), index=False)

    return df_copy


In [1046]:
# TODO: in the main code, build this name as 'WeeklyCTs_fraction_{folder_name}.xlsx'.
file_name = 'WeeklyCTs_fraction_ART_DATA1.xlsx'
weeklyct_df = add_fractions(weeklyct_df, output_path, file_name)

The last part of the first phase is extracting the information of a specific week, e.g. week 3. To achieve this, I will write a function that can be called to return a dataframe of the patients who have a fraction inside that specific week.

In [1194]:
def get_a_week_information(main_df, weeklyct_df, week_name):
    """
    Return the scans whose fraction number falls inside one treatment week.

    For every patient, the values in columns 11 to 19 (by position) of
    ``weeklyct_df`` are compared with the fraction range of ``week_name``
    (lower bound excluded, upper bound included). The range depends on whether
    the patient's ``modality_adjusted`` is in the accelerated or the
    not-accelerated list; patients in neither list are skipped.

    Args:
        main_df (pandas.DataFrame): scan table; merged on ``ID`` and ``date``.
        weeklyct_df (pandas.DataFrame): weeklyCT table with ``ID``, ``SessionN``,
            the fraction columns and ``modality_adjusted``.
        week_name (str): 'week1' .. 'week8'.

    Returns:
        pandas.DataFrame: matching rows merged with ``main_df`` (without its
        ``fraction`` column); an empty frame when nothing matches.

    Raises:
        KeyError: when ``week_name`` is not one of the known weeks.
    """
    accelerated_list = ['Accelerated RT', 'Bioradiation'] # CONFIG File
    not_accelerated_list = ['Chemoradiation', 'Conventional RT'] # CONFIG File
    fraction_range_dict = {'week1': {'not_accelerated':[0.0, 5.0], 'accelerated': [0.0, 6.0]}, # Config File
                           'week2': {'not_accelerated':[5.0, 10.0], 'accelerated': [6.0, 12.0]},
                           'week3': {'not_accelerated':[10.0, 15.0], 'accelerated': [12.0, 18.0]},
                           'week4': {'not_accelerated':[15.0, 20.0], 'accelerated': [18.0, 24.0]},
                           'week5': {'not_accelerated':[20.0, 25.0], 'accelerated': [24.0, 30.0]},
                           'week6': {'not_accelerated':[25.0, 30.0], 'accelerated': [30.0, 36.0]},
                           'week7': {'not_accelerated':[30.0, 35.0], 'accelerated': [36.0, 42.0]},
                           'week8': {'not_accelerated':[35.0, 40.0], 'accelerated': [42.0, 48.0]}}
    week_ranges = fraction_range_dict[week_name]
    week_columns = ['ID', 'date', 'treatment_week', 'Fraction_num',
                    'Fraction_magnitude', 'modality_adjusted']
    week_list = list()

    # Iterate through patients
    for _, raw in weeklyct_df.iterrows():

        if raw.modality_adjusted in not_accelerated_list:
            lower, upper = week_ranges['not_accelerated']
        elif raw.modality_adjusted in accelerated_list:
            lower, upper = week_ranges['accelerated']
        else:
            continue

        for column in raw.iloc[11:20].index:
            value = raw[column]
            try:
                in_week = value is not None and lower < value <= upper
            except TypeError:
                # Left-over text headers cannot be compared with a number.
                continue

            if not in_week:
                continue

            # The last character of the column name is the session number.
            week_num = column[-1]
            week_list.append({'ID': raw.ID,
                              'date': raw[f'Session{week_num}'],
                              'treatment_week': week_name,
                              'Fraction_num': column,
                              'Fraction_magnitude': value,
                              'modality_adjusted': raw.modality_adjusted})

    if not week_list:
        # Merging a frame without columns would raise; return an empty result
        # that has the usual header instead.
        extra_columns = [column for column in main_df.columns
                         if column not in ('ID', 'date', 'fraction')]
        return pd.DataFrame(columns=week_columns + extra_columns)

    week_df = pd.DataFrame(week_list, columns=week_columns)
    final_df = week_df.merge(main_df, on=['ID', 'date']).drop(columns=['fraction'], errors='ignore')

    return final_df

In [None]:
# Example: collect the scans that fall inside treatment week 6.
week_df = get_a_week_information(df, weeklyct_df, 'week6')

# Report Phase
In this phase, the dataframes from different folders (one or more) are gathered together to make one total dataframe for the whole dataset.

In [1196]:
def read_dataframe(name, output_path=None):
    """
    Read an Excel or CSV file from the output folder into a dataframe.

    CSV files may be comma- or semicolon-separated. Columns whose name contains
    'unnamed' (saved index columns) are removed.

    Args:
        name (str): file name; the type is recognised from '.xlsx' / '.csv' in
            the name, regardless of case.
        output_path (str | None): folder that holds the file; defaults to the
            folder used so far in this notebook.

    Returns:
        pandas.DataFrame | None: the table, or None (after printing a message)
        when the file is missing, unreadable or of an unsupported type.
    """
    if output_path is None:
        output_path = '//zkh/appdata/RTDicom/Projectline_HNC_modelling/OPC_data/ART_DATA1' # CONFIG File

    file_path = os.path.join(output_path, name)
    lowered_name = name.lower()

    try:
        if '.xlsx' in lowered_name:
            df = pd.read_excel(file_path)

        elif '.csv' in lowered_name:
            df = pd.read_csv(file_path) # Comma separated

            # A semicolon inside the first header means the wrong separator was used
            if ';' in str(df.columns[0]):
                df = pd.read_csv(file_path, sep=';')

        else:
            # Without this branch `df` would stay unbound and the function would crash.
            print(f'File {name} is not supported by this program.')
            return None

    except FileNotFoundError:
        print(f'Warning: file {name} was not found')
        return None

    except ValueError:
        print(f'File {name} is not supported by this program.')
        return None

    # Erase the index columns if there are any (column names are not always strings)
    excess_column_names = [col_name for col_name in df.columns if 'unnamed' in str(col_name).lower()]
    if excess_column_names:
        df = df.drop(columns=excess_column_names)

    return df


def concat_dataframes(df_name_list):
    """
    Read several Excel/CSV files and stack them into one dataframe sorted by ID.

    CSV files can be comma-separated or semicolon-separated. Files that cannot be
    read are skipped. When the first file name contains 'weeklyct', only the first
    row of every ID is kept.

    Args:
        df_name_list (list[str]): file names understood by ``read_dataframe``.

    Returns:
        pandas.DataFrame: the combined table with a fresh 0..n-1 index; an empty
        frame when no file could be read.
    """
    frames = list()

    for name in df_name_list:
        df = read_dataframe(name)

        if df is None:
            # read_dataframe has already reported why this file was skipped
            continue

        frames.append(df)

    if not frames:
        return pd.DataFrame()

    # One concat at the end instead of growing the frame inside the loop
    final_df = pd.concat(frames, ignore_index=True)

    # Drop duplicated patients
    if 'weeklyct' in df_name_list[0].lower():
        final_df = final_df.drop_duplicates(subset=['ID'])

    return final_df.sort_values('ID').reset_index(drop=True)

In the next step, I will use the above functions to make two final datasets: WeeklyCT_dataset and General_dataset.

### WeeklyCT Final Dataframe
This dataset contains clinical and some technical information about the patients who have WeeklyCTs. This dataset will be used further in plotting phase.

In [1199]:
def find_dataframes(desired_file, output_path=None):
    """
    List the files in the output folder whose lowercase name contains ``desired_file``.

    Args:
        desired_file (str): lowercase text to look for, e.g. 'weeklyct' or 'general'.
        output_path (str | None): folder to search; defaults to the folder used so
            far in this notebook.

    Returns:
        list[str]: matching file names in alphabetical order. Excel lock files
        (names starting with '~$') are left out.
    """
    if output_path is None:
        output_path = '//zkh/appdata/RTDicom/Projectline_HNC_modelling/OPC_data/ART_DATA1' # CONFIG File

    # os.listdir returns names in arbitrary order; sorting makes "the first file"
    # (which concat_dataframes inspects) the same on every run.
    return sorted(
        file_name for file_name in os.listdir(output_path)
        if desired_file in file_name.lower() and not file_name.startswith('~$')
    )

def call_clinical_dataframe():
    """
    Load the clinical table and keep only the columns of interest under new names.

    The file 'Xerostomia_dataset.xlsx' is read with ``read_dataframe``; the source
    columns listed in the mapping below are selected and renamed to their targets.
    """
    # Source column name -> name used in the rest of the pipeline
    column_mapping = {'UMCG': 'ID', # CONFIG File
                      'GESLACHT': 'gender', 
                      'LEEFTIJD': 'age',
                      'Loctum2': 'tumor_location',
                      'N_stage': 'n_stage',
                      'TSTAD_DEF': 't_stage',
                      'HN35_Xerostomia_M06': 'xer_06',
                      'HN35_Xerostomia_M12': 'xer_12'}   

    full_table = read_dataframe('Xerostomia_dataset.xlsx') # CONFIG File

    source_columns = list(column_mapping)
    selected = full_table.loc[:, source_columns]

    return selected.rename(columns=column_mapping)


def make_weeklyct_dataframe():
    """
    Build and save the final weeklyCT dataframe.

    All weeklyCT files of the output folder are stacked, merged with the clinical
    data on ``ID`` and saved as 'Overview_weeklyCT_patients.xlsx'. Optionally one
    additional file per label is saved that holds only the patients for whom that
    label is known.

    Returns:
        pandas.DataFrame: the merged table.
    """
    make_label_df = True # Config File
    label_list = ['xer_06_y', 'xer_12_y'] # Config File
    # Same folder that find_dataframes / read_dataframe read from, so this
    # function no longer depends on a notebook-level variable.
    output_path = '//zkh/appdata/RTDicom/Projectline_HNC_modelling/OPC_data/ART_DATA1' # CONFIG File

    # The overview files written below also contain 'weeklyct' in their name;
    # leave them out so that a second run does not read its own output back in.
    file_names = [file_name for file_name in find_dataframes('weeklyct')
                  if not file_name.lower().startswith('overview_weeklyct_patients')]
    df = concat_dataframes(file_names)
    clinical_df = call_clinical_dataframe()
    final_weeklyct_df = df.merge(clinical_df, on='ID')

    # Save the dataframe
    final_weeklyct_df.to_excel(os.path.join(output_path, 'Overview_weeklyCT_patients.xlsx'), index=False)

    # If dataframe based on labels is needed
    if make_label_df:
        for label in label_list:
            # The '_y' suffix only exists when both merged tables had the column;
            # fall back to the plain name otherwise.
            if label in final_weeklyct_df.columns:
                column = label
            elif label.endswith('_y') and label[:-2] in final_weeklyct_df.columns:
                column = label[:-2]
            else:
                print(f'Warning: label column {label} was not found, its file is not written')
                continue

            label_df = final_weeklyct_df[final_weeklyct_df[column].notnull()]
            label_df.to_excel(os.path.join(output_path, f'Overview_weeklyCT_patients_{label}.xlsx'), index=False)

    return final_weeklyct_df


In [1200]:
# Build the weeklyCT overview and write its Excel file(s).
weekly_df = make_weeklyct_dataframe()

### General Dataframe
This dataframe contains the information of the available weeklyCT folder. This dataframe will be used further in transferring phase.

In [1207]:
def make_general_dataframe():
    """
    Build and save the table of all scans that belong to a treatment week.

    The general files of the output folder are stacked, the weeklyCT overview is
    read back from 'Overview_weeklyCT_patients.xlsx', and for every week the
    matching scans are collected with ``get_a_week_information``. The result is
    sorted by ``ID`` and saved as 'General_information.xlsx'.

    Returns:
        pandas.DataFrame: the combined table (empty when no week had any scan).
    """
    week_list = ['week1', 'week2', 'week3', 'week4', 'week5', 'week6', 'week7', 'week8'] # CONFIG File (It can be week dictionary key list)
    output_path = '//zkh/appdata/RTDicom/Projectline_HNC_modelling/OPC_data/ART_DATA1' # CONFIG File
    result_name = 'General_information.xlsx'

    # The result file itself contains 'general' in its name; leave it out so that
    # a second run does not read its own output back in.
    file_names = [file_name for file_name in find_dataframes('general')
                  if file_name.lower() != result_name.lower()]
    general_df = concat_dataframes(file_names)
    weekly_df = pd.read_excel(os.path.join(output_path, 'Overview_weeklyCT_patients.xlsx'))

    week_frames = list()

    for week_name in week_list:
        try:
            week_df = get_a_week_information(general_df, weekly_df, week_name)
        except KeyError as error:
            # The helper may raise when a week has no matching patient; one empty
            # week should not abort the whole report.
            print(f'Warning: {week_name} was skipped ({error})')
            continue

        week_frames.append(week_df)

    if week_frames:
        final_general_df = pd.concat(week_frames, ignore_index=True)
    else:
        final_general_df = pd.DataFrame()

    # Sort the dataset based on ID
    if 'ID' in final_general_df.columns:
        final_general_df = final_general_df.sort_values('ID').reset_index(drop=True)

    # Save the dataframe
    final_general_df.to_excel(os.path.join(output_path, result_name), index=False)

    return final_general_df

In [1208]:
# Build the per-week scan table and write 'General_information.xlsx'.
final_general_df = make_general_dataframe()

# Transferring Phase
In this phase, all the new weeklyCTs will be transferred into the chosen final folder. If a weeklyCT with the same patient ID, fraction and week number already exists there, the program skips that folder.

---

In [1029]:
# Semicolon-separated CSV to load.
path = '//zkh/appdata/RTDicom/Projectline_HNC_modelling/Users/Hooman Bahrdo/Models/Deep_Learning/DL_NTCP_Xerostomia/datasets/dataset_old_v2/stratified_sampling_test_manual_94.csv'

# Read it and drop the saved index column 'Unnamed: 0'.
dff = pd.read_csv(path, sep=';').drop(columns=['Unnamed: 0'])

In [1032]:
# Show the distinct values of the xer_12 column.
dff.xer_12.unique()

array([0, 1], dtype=int64)

In [None]:
# Dataset folder path; it is only assigned here and not used in the visible part of the notebook.
path_dataset = '//zkh/appdata/RTDicom/Projectline_HNC_modelling/Users/Hooman Bahrdo/Deep_learning_datasets/Six_month_final df/datasets/dataset_old_v2/0'
