In [132]:
import os
import pydicom as pdcm
import pandas as pd
import numpy as np

In [133]:
import yaml
with open('config.yaml', 'r') as stream:
    config = yaml.safe_load(stream)
    complete_collection_scans = config['complete_collection_of_scans_per_patient']
    endpoints = config['endpoints']
    
df = pd.read_excel(complete_collection_scans, index_col=0)
df.head()


Unnamed: 0_level_0,modality,study_date,series_description,study_description,folder_path,datasource
p_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4879,CT,2018-07-26,Ave-IP(10) 0%_10%_20%_30%_40%_50%_60%_70%_80%_90%,RTHE CT THORAX,\\zkh\appdata\RTDicom\Projectline - Modelling ...,unstructured
4879,CT,2018-07-26,"MM Thorax2,5mm",RTHE CT THORAX,\\zkh\appdata\RTDicom\Projectline - Modelling ...,unstructured
4879,RTSTRUCT,2018-07-26,RS: Unapproved Structure Set,RTHE CT THORAX,\\zkh\appdata\RTDicom\Projectline - Modelling ...,unstructured
4879,CT,2018-07-26,"T=0%,PR=95% -> 3%,AR()=26 -> 57",RTHE CT THORAX,\\zkh\appdata\RTDicom\Projectline - Modelling ...,unstructured
4879,CT,2018-07-26,"T=10%,PR=5% -> 14%,AR()=37 -> 72",RTHE CT THORAX,\\zkh\appdata\RTDicom\Projectline - Modelling ...,unstructured


Checking for empty cells

In [134]:
df.isna().sum()

modality               0
study_date             0
series_description     9
study_description     13
folder_path            0
datasource             0
dtype: int64

In [135]:
df.fillna('name not registered', inplace=True)

We only want to take into account scans that were taken before RT. To do this we need to connect the RT date to the df

In [136]:
# lets remove the excluded patients form this list
ep_df = pd.read_excel(endpoints, index_col=1)
print(ep_df['Flowchart'].unique())
print(f'n patients in ep_df: {len(ep_df)}')

clean_ep_df = ep_df[ep_df['Flowchart'] == 'inlc']
# lets check that theres no more inlc patients
print(ep_df['Flowchart'].unique())
print(f'n patients in ep_df: {len(ep_df)}')




['inlc' 'exclude_dosing' 'niet vinden in epic' 'excluded_in_script'
 'exclude_concur']
n patients in ep_df: 957
['inlc' 'exclude_dosing' 'niet vinden in epic' 'excluded_in_script'
 'exclude_concur']
n patients in ep_df: 957


In [137]:
clean_ep_df.columns[clean_ep_df.columns.str.contains('RT')]
RT_startdate = clean_ep_df['RT_Start_Date']
RT_startdate

UMCGnr
4879      2018-08-06
10539     2015-05-18
13194     2015-09-10
14504     2015-06-22
17420     2015-07-06
             ...    
9935146   2019-07-01
9936840   2015-07-13
9949900   2015-04-13
9970729   2015-05-04
9989943   2017-02-01
Name: RT_Start_Date, Length: 874, dtype: datetime64[ns]

In [138]:
df['study_date']

p_id
4879       2018-07-26
4879       2018-07-26
4879       2018-07-26
4879       2018-07-26
4879       2018-07-26
              ...    
9935146    2019-11-19
9936840    2015-03-12
9936840    2015-03-12
9970729    2015-01-26
9970729    2015-01-26
Name: study_date, Length: 11804, dtype: object

Okay, lets merge this data.

In [139]:
# To compare the dates, first we need to convert the study date to date time.

# df['study_date'] = pd.to_datetime(df['study_date'], format='mixed')
# error: Unable to parse datetime string: --, at position 225
nodate = df[df['study_date'] == '--'] 
# so there's a few RTDOSE and RTSTRUCT files that have no study date.
# I'm going to fill these with the study date of the scan of these patients.
nodate = df.loc[nodate.index].groupby('p_id')['study_date'].unique()
nodate = nodate.apply(lambda x: x[1])
check_id = nodate.index
nodate = zip(nodate.index, nodate.values)

for i, date in nodate:
    df.loc[i, 'study_date'] = date

df.loc[check_id].groupby('p_id')['study_date'].unique()


p_id
2023115    [2014-03-27]
2939330    [2013-02-26]
4629433    [2013-03-29]
6357374    [2013-02-18]
7391514    [2014-01-29]
Name: study_date, dtype: object

In [140]:
df['study_date'] = pd.to_datetime(df['study_date'], format='mixed')
df['study_date']
# 👍

p_id
4879      2018-07-26
4879      2018-07-26
4879      2018-07-26
4879      2018-07-26
4879      2018-07-26
             ...    
9935146   2019-11-19
9936840   2015-03-12
9936840   2015-03-12
9970729   2015-01-26
9970729   2015-01-26
Name: study_date, Length: 11804, dtype: datetime64[ns]

In [161]:
# Lets remove patients that are not in the df from the RT_startdate

# RT_startdate.loc[df.index]
# KeyError: '[210905, 654137, 846557, 849406, 905985, 1092836, 1234013, 2022493, 
# 2022498, 2730427, 3146700, 3307795, 3865508, 4850472, 5124665, 5468590, 
# 5641888, 7391514, 7417192, 7536975, 9368667, 9573899] not in index'

# So theres patients that are in the scandataset but not in RT_startdate. 
# A part of these are excluded patients:
excluded_patients = \
    df.index.unique()[(~df.index.unique().isin(clean_ep_df.index.unique())) &
                    (df.index.unique().isin(ep_df.index.unique()))]
print(f'excluded patients: {excluded_patients.to_list()}')    


# Another part of these patients have no endpoint at all:
no_endpoint = \
    df.index.unique()[(~df.index.unique().isin(clean_ep_df.index.unique())) &
                    (~df.index.unique().isin(ep_df.index.unique()))]

print(f'patients with no endpoint: {no_endpoint.tolist()}')

# lets safe these for later checking.
with open('patient_excultion.txt', 'w') as doc:
    doc.write(f'excluded patients: {excluded_patients.to_list()} \n'
              f'patients with no endpoint: {no_endpoint.tolist()}')


excluded patients: [210905, 654137, 846557, 849406, 905985, 1092836, 3307795, 3865508, 4850472, 5124665, 5641888, 7417192, 7536975, 9368667]
patients with no endpoint: [1234013, 2022493, 2022498, 2730427, 3146700, 5468590, 7391514, 9573899]


In [160]:
# Lets safe these for later checking and remove them from df


KeyError: '[1234013, 2022493, 2022498, 2730427, 3146700, 5468590, 7391514, 9573899] not in index'