In [1]:
import numpy as np
import pandas as pd
import json
import math

In [2]:
# Read in dose data
# Two CSV files are loaded as-is (presumably one per cohort, going by the
# file names); their headers are cleaned up by rename_columns() afterwards.
dose_con = pd.read_csv('data/consolidation_dose.csv')
dose_dis = pd.read_csv('data/discovery_dose.csv')

def rename_columns(df):
    """Fold the first row of ``df`` into its column names, then drop that row.

    'Patient id' is renamed to 'patient'. Every other column becomes
    '<column>_<value found in the first row>', with every '.1' substring
    removed from the resulting name. The row labelled 0 is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first row holds the second half of each column name.

    Returns
    -------
    pandas.DataFrame
        A renamed copy; ``df`` itself is not modified.
    """
    renamed = df.rename(columns={'Patient id': 'patient'})
    first_row = df.iloc[0]

    suffixed = {}
    for col in renamed.columns.drop('patient'):
        suffixed[col] = f'{col}_{first_row[col]}'.replace('.1', '')

    return renamed.rename(columns=suffixed).drop(index=0)

# Clean the headers of both dose tables, then stack them into one frame
# indexed by patient (the 'patient' column is kept alongside the index).
dose_con, dose_dis = rename_columns(dose_con), rename_columns(dose_dis)

dose_all = pd.concat([dose_con, dose_dis]).set_index('patient', drop=False)

In [3]:
# Read in case details
# The row labelled 0 is discarded and the columns used later get short names.
case_detail = (
    pd.read_excel('data/case_detail.xlsx')
    .drop(0)
    .rename(columns={'HEP ID': 'patient', 'cons or disc?': 'cohort', 'Age': 'age'})
)
# Upper-case the cohort labels, then index by patient while keeping the column.
case_detail['cohort'] = case_detail['cohort'].apply(lambda label: label.upper())
case_detail = case_detail.set_index('patient', drop=False)

In [4]:
# Load treatment json
# The file holds a list of dicts; they are merged into a single dict that the
# rest of the notebook indexes by patient ID (treatment[patient][...]).
with open('data/treatment.json') as file:
    treatment_list = json.load(file)
treatment = {}
for t in treatment_list:
    treatment.update(t)
    
# Load toxicity json
# Nested as toxicity_info[patient][key][date] -> dict of measurements.
with open('data/toxicity.json') as file:
    toxicity_info = json.load(file)
    
# Create toxicity dataframe
# data_tox collects one flat record per (patient, key, date); symptoms collects
# the names of graded symptoms so their columns can be cast to float at the end.
data_tox = []
symptoms = set()
for patient in toxicity_info.keys():
                             
    # Loop through patient measurement data
    for key in toxicity_info[patient].keys():
        
        # Get disease and measurement stage
        # Keys have the form '<disease>_<stage>', optionally followed by '_v1'
        # or '_v2'. 'cns' is skipped explicitly; it contains no '_', so the
        # two-value unpacking below would fail on it.
        if key == 'cns':
            continue
        disease, stage = key.rsplit('_', 1)
        if stage in ['v1', 'v2']:
            # Strip the three-character version suffix and split again
            disease, stage = key[:-3].rsplit('_', 1)
        
        # Only consider head and neck
        if disease != 'head_and_neck':
            continue
            
        # Add each measurement date
        for date in toxicity_info[patient][key]:
            record = {
                'patient': patient,
                'disease': disease,
                'stage': stage,
                # Date strings are parsed day-first
                'date': pd.to_datetime(date, dayfirst=True)
            }

            # Add toxicity info
            tox_data = toxicity_info[patient][key][date]
            for tox in tox_data.keys():
                
                # Non-list entries (e.g. mass) are copied straight into the record
                if not isinstance(tox_data[tox], list):
                    record[tox] = tox_data[tox]
                    
                # Symptom reporting system: a list of dicts with 'term' and 'grade'
                else:
                    for symptom in tox_data[tox]:
                        
                        # Get name and grade
                        name = symptom['term']
                        grade = symptom['grade']
                        record[name] = grade
                        # This one term is stored but excluded from the float
                        # conversion below
                        if name != 'mucosa_oral_&_pharyngeal(objective_weight)':
                            symptoms.add(name)

            data_tox.append(record)
            
# One row per record; keys missing from a record become NaN in its row.
df_tox = pd.DataFrame(data_tox)
# Cast every collected symptom grade column to float
for s in symptoms:
    df_tox[s] = df_tox[s].apply(float)

In [5]:
# Create master list of patients
# Each entry is the set of patient IDs present in one data source.
patient_sets = {
    'dose': set(dose_all['patient']),
    'case_detail': set(case_detail['patient']),
    'baseline': set(df_tox[df_tox['stage'] == 'baseline']['patient']),
    'acute': set(df_tox[df_tox['stage'] == 'acute']['patient']),
    'late': set(df_tox[df_tox['stage'] == 'late']['patient'])
}

# Union of every patient seen in any source
patients = set()
for dataset in patient_sets.values():
    patients.update(dataset)

# Build the frame from a sorted list instead of the raw set: set iteration
# order is not guaranteed to be the same between runs, which would make the
# row order of this frame (and of every file derived from it) irreproducible.
# Sorting by str() keeps this safe if an ID is missing (NaN) in one source.
df_patients = pd.DataFrame(sorted(patients, key=str), columns=['patient'])

# One boolean 'has_<source>' flag per data source
for name, dataset in patient_sets.items():
    df_patients['has_' + name] = df_patients['patient'].apply(lambda p: p in dataset)
df_patients['has_treatment'] = df_patients['patient'].apply(lambda p: p in treatment.keys())

# Written before the index is set, so the CSV carries the positional index
df_patients.to_csv('data/patients_meta.csv')
df_patients.set_index('patient', drop=False, inplace=True)

In [6]:
# Assemble dataframe of patients with dose data
# Independent copy, so columns added below do not touch df_patients.
df = df_patients.loc[df_patients['has_dose']].copy()

# Get patient demographics
def get_age(patient):
    """Return the patient's age, preferring the case-detail table.

    For patients absent from the case details, the age is read from the
    treatment record: the first and last characters of its 'age' string are
    stripped and the remainder is converted to float.
    """
    if patient not in patient_sets['case_detail']:
        return float(treatment[patient]['age'][1:-1])
    return case_detail.loc[patient, 'age']

def get_cohort(patient):
    """Return the upper-cased cohort label for a patient.

    The case-detail table is used when it lists the patient; otherwise the
    first element of the treatment record's 'cohort' entry is used. Returns
    None when the patient appears in neither source.
    """
    in_case_detail = patient in patient_sets['case_detail']
    if in_case_detail:
        return case_detail.loc[patient, 'cohort'].upper()
    if df_patients.loc[patient, 'has_treatment']:
        return treatment[patient]['cohort'][0].upper()
    return None
    
def get_fractions(patient):
    """Return the 'n_fraction' entry of the patient's treatment record.

    Returns None for patients without a treatment record.
    """
    if not df_patients.loc[patient, 'has_treatment']:
        return None
    return treatment[patient]['n_fraction']
    
# Demographic columns, looked up patient by patient.
df['age'] = df['patient'].map(get_age)
# A recorded sex of 'U' is replaced by 'M'.
df['sex'] = df['patient'].map(
    lambda p: 'M' if treatment[p]['sex'] == 'U' else treatment[p]['sex'])
df['cohort'] = df['patient'].map(get_cohort)
df['fractions'] = df['patient'].map(get_fractions)

# Get treatment info
def get_case_detail(row, detail):
    """Return the case-detail column ``detail`` for the patient in ``row``.

    ``row`` must provide 'patient' and 'has_case_detail'. NaN is returned
    when the patient has no case-detail entry.
    """
    if not row['has_case_detail']:
        return np.nan
    return case_detail.loc[row['patient'], detail]
    
# Treatment end date: the text before the first '_' of the stored string,
# parsed day-first.
df['treatment_end'] = df['patient'].map(
    lambda p: pd.to_datetime(treatment[p]['treatment_end'].split('_')[0], dayfirst=True)
)
# Columns copied from the case-detail table (NaN where a patient has none)
for detail in ['neck', 'SACT', 'TNM', 'primary site']:
    df[detail] = df.apply(get_case_detail, axis=1, args=(detail,))

# Get baseline scores of interest
# Maps the short outcome label used in the output columns to the name of the
# corresponding column in df_tox.
outcome_names = {
    'CTCAE_xerostomia': 'dry_mouth',
    'CTCAE_oropharyngeal_pain': 'oropharyngeal_pain',
    'CTCAE_sdi': 'salivary_duct_inflammation',
    'CTCAE_dysphagia': 'dysphagia',
    'EORTC_dry_mouth': 'have_you_had_a_dry_mouth?',
    'EORTC_sticky_saliva': 'have_you_had_sticky_saliva?',
    'EORTC_taste': 'have_you_had_problems_with_your_sense_of_taste?'
}
# Baseline and acute measurements, indexed by patient (the 'patient' column is
# kept). A patient with several records has several rows under the same label.
df_tox_baseline = df_tox[df_tox['stage'] == 'baseline'].set_index('patient', drop=False)
df_tox_acute = df_tox[df_tox['stage'] == 'acute'].set_index('patient', drop=False)
def get_baseline_score(row, symptom):
    """Return the baseline value of ``symptom`` for the patient in ``row``.

    Parameters
    ----------
    row : mapping with a 'patient' entry (a row of the patient frame)
    symptom : str
        Column of ``df_tox_baseline`` to read.

    Returns
    -------
    float
        The baseline value, or NaN when the patient has no baseline record.
    """
    patient = row['patient']
    # `x in series` tests the index labels, not the values. Checking the
    # index explicitly says what is meant and does not rely on the 'patient'
    # column happening to share its labels with the index.
    if patient not in df_tox_baseline.index:
        return np.nan
    return float(df_tox_baseline.loc[patient, symptom])

# One 'baseline_<outcome>' column per outcome of interest
for name, init in outcome_names.items():
    df[f'baseline_{name}'] = df.apply(get_baseline_score, axis=1, args=(init,))

# Get other baseline info
# These baseline columns are copied across under their own names.
baseline_to_add = [
    'smoking', 'smoking_age_started', 'smoking_age_stopped', 'smoking_per_day',
    'alcohol', 'alcohol_age_started', 'alcohol_age_stopped', 'alcohol_units_per_week',
    'neck_dissection', 'primary_surgery',
]
for b in baseline_to_add:
    df[b] = df.apply(get_baseline_score, axis=1, args=(b,))
    
# Get mass
def fill_baseline_mass(row):
    """Return the patient's baseline mass, falling back to the first acute mass.

    Parameters
    ----------
    row : mapping with 'patient', 'has_baseline' and 'has_acute' entries.

    Returns
    -------
    float
        The baseline mass when one is recorded; otherwise the mass from the
        earliest-dated acute record that has both a date and a mass;
        otherwise NaN.
    """
    patient = row['patient']

    # Check for baseline measurement
    if row['has_baseline']:
        mass = df_tox_baseline.loc[patient, 'mass']
        if not np.isnan(mass):
            return mass

    # Use earliest acute mass if no baseline
    if row['has_acute']:
        # Selecting with a list of labels ([patient]) always yields a
        # DataFrame. A scalar label returns a Series when the patient has a
        # single acute row, and sort_values('date') then fails.
        masses = df_tox_acute.loc[[patient], ['date', 'mass']].dropna()
        if len(masses):
            return masses.sort_values('date')['mass'].iloc[0]
    return np.nan

# Get mass at treatment start
def fill_acute_mass(row):
    """Return the mass from the patient's earliest-dated acute record.

    Parameters
    ----------
    row : mapping with 'patient' and 'has_acute' entries.

    Returns
    -------
    float
        The mass from the first acute record (by date) that has both a date
        and a mass, or NaN when there is none.
    """
    patient = row['patient']
    if row['has_acute']:
        # Selecting with a list of labels ([patient]) always yields a
        # DataFrame. A scalar label returns a Series when the patient has a
        # single acute row, and sort_values('date') then fails.
        masses = df_tox_acute.loc[[patient], ['date', 'mass']].dropna()
        if len(masses):
            return masses.sort_values('date')['mass'].iloc[0]
    # Explicit NaN instead of an implicit None keeps the column numeric
    return np.nan

# Get baseline quantity
def fill_from_baseline(row, var):
    """Return column ``var`` of the patient's baseline record.

    ``row`` must provide 'patient' and 'has_baseline'. NaN is returned for
    patients without a baseline record.
    """
    if not row['has_baseline']:
        return np.nan
    return df_tox_baseline.loc[row['patient'], var]
    
# Mass (baseline, else first acute), first acute mass, and baseline height
df['mass'] = df.apply(fill_baseline_mass, axis=1)
df['acute_mass'] = df.apply(fill_acute_mass, axis=1)
df['height'] = df.apply(fill_from_baseline, axis=1, args=('height',))

# Calculate weight loss
# Absolute difference, and the same as a percentage of 'mass'.
df['weight_loss'] = df['mass'].sub(df['acute_mass'])
df['weight_loss_pct'] = df['weight_loss'].div(df['mass']).mul(100)

# Correct mistake in baseline mass for one patient
# The acute mass replaces the baseline value and both weight-loss columns
# are blanked for this patient.
df.loc['VT1_H_8C10E2K1', 'mass'] = df.loc['VT1_H_8C10E2K1', 'acute_mass']
df.loc['VT1_H_8C10E2K1', ['weight_loss', 'weight_loss_pct']] = np.nan

# Add dose info
def get_dose(patient, dose):
    """Return the value in column ``dose`` of ``dose_all`` for ``patient``."""
    dose_column = dose_all[dose]
    return dose_column.loc[patient]

# Copy every dose column whose name contains '_planned' into df
doses = [column for column in dose_all.columns if '_planned' in column]
for d in doses:
    df[d] = df['patient'].map(lambda p: get_dose(p, d))
    

In [7]:
# Process data
# Parse TNM staging
def parse_stage(stage):
    """Convert a single T or N stage token to an integer.

    Parameters
    ----------
    stage : str or int
        Token such as '4a', '2b', 'x' or 0.

    Returns
    -------
    int
        0 for the unknown marker 'x' (either case); otherwise the integer
        left after removing the letters 'a', 'b', 'c', 'M' and 'N'.

    Raises
    ------
    ValueError
        If what remains after removing those letters is not an integer.
    """
    text = str(stage).strip()
    # Match the unknown marker case-insensitively, so an upper-case 'X' is
    # treated like 'x' instead of reaching int() and raising.
    if text.lower() == 'x':
        return 0
    for letter in ['a', 'b', 'c', 'M', 'N']:
        text = text.replace(letter, '')
    return int(text)

def parse_tnm(tnm):
    """Split a TNM string into integer (T, N) stages.

    Spaces are removed first. The T token is the text between the first 'T'
    and the following 'N'; the N token is the text between the first 'N' and
    the following 'M'. A missing N part gives N = 0. Values that are not
    strings, or that contain no 'T', give (0, 0), except 'Grade2', which
    gives (2, 0).

    Returns
    -------
    tuple of int
        ``(t_stage, n_stage)`` as produced by ``parse_stage``.
    """
    try:
        tnm = tnm.replace(' ', '')
        t = tnm.split('T')[1].split('N')[0]
    except (AttributeError, IndexError):
        # Not a string, or no 'T' present
        t = 2 if tnm == 'Grade2' else 0
        n = 0
    else:
        n_parts = tnm.split('N')
        n = n_parts[1].split('M')[0] if len(n_parts) > 1 else 0
    return parse_stage(t), parse_stage(n)

# Integer T and N stages, plus flags for T >= 3 and N >= 2
df[['T_stage', 'N_stage']] = df['TNM'].apply(lambda tnm: pd.Series(parse_tnm(tnm)))
df['T_high'] = df['T_stage'].ge(3).apply(int)
df['N_high'] = df['N_stage'].ge(2).apply(int)

# Parse neck info
# Two 0/1 flags; values outside both lists give 0 in both.
df['neck_bi'] = df['neck'].isin(['bilateral', 'BNI']).apply(int)
df['neck_uni'] = df['neck'].isin(['left', 'right', 'UNI']).apply(int)

# Binarise SACT and cisplatin
df['cisplatin'] = df['SACT'].eq('cisplatin').apply(int)
df['SACT_binary'] = df['SACT'].isin(['cisplatin', 'cetuximab']).apply(int)

# Binarise sex
# 'U' is counted as male here as well.
df['male'] = df['sex'].isin(['M', 'U']).apply(int)

# Calculate smoking/drinking metric
# Column means, used as the default units when a patient's own value is missing.
mean_units_smoking = df['smoking_per_day'].mean()
mean_units_alcohol = df['alcohol_units_per_week'].mean()

def get_metric(row, habit, unit, default):
    """Return a cumulative exposure score: years of the habit times units.

    Parameters
    ----------
    row : mapping
        Must provide 'has_baseline', 'age', ``habit``, ``habit + '_age_started'``,
        ``habit + '_age_stopped'`` and ``unit``.
    habit : str
        Name of the habit status column (e.g. 'smoking').
    unit : str
        Name of the column holding the amount per unit of time.
    default : float
        Amount used when the row's own ``unit`` value is NaN.

    Returns
    -------
    float
        NaN without a baseline record or with an unrecognised status; 0 when
        the status is 2; otherwise ``(age_stopped - age_started) * units``.
        For status 1 the recorded stop age is used (the current age if it is
        missing); for status 0 the current age is used.
    """
    if not row['has_baseline']:
        return np.nan

    status = row[habit]
    if status == 2:
        return 0

    if status == 1:
        age_stopped = row[habit + '_age_stopped']
        if np.isnan(age_stopped):
            age_stopped = row['age']
    elif status == 0:
        age_stopped = row['age']
    else:
        # Missing or unrecognised status (e.g. NaN): without this branch
        # age_stopped would be unbound below and raise UnboundLocalError.
        return np.nan

    # A missing start age is replaced by the mean start age across df
    age_started = row[habit + '_age_started']
    if np.isnan(age_started):
        age_started = df[habit + '_age_started'].mean()

    years_of_habit = age_stopped - age_started
    units = row[unit] if not np.isnan(row[unit]) else default
    return years_of_habit * units

# Exposure metrics for both habits, then persist the patient table
df['smoking_metric'] = df.apply(
    get_metric, axis=1, args=('smoking', 'smoking_per_day', mean_units_smoking))
df['alcohol_metric'] = df.apply(
    get_metric, axis=1, args=('alcohol', 'alcohol_units_per_week', mean_units_alcohol))

df.to_csv('data/final/patient_info.csv', index=False)

In [8]:
# Process late toxicity data
df_late = df_tox.loc[df_tox['stage'] == 'late'].copy()
print(len(df_late))

# Entries with no dose data are identified but not removed (the drop is
# disabled).
no_dose = df_late.loc[~df_late['patient'].isin(df['patient'])].index

# Reject entries without treatment data
no_treatment = df_late.loc[~df_late['patient'].isin(treatment.keys())].index
df_late = df_late.drop(no_treatment)

# Replace null entries in ng_tube/peg_tube
for col in ('ng_tube', 'peg_tube'):
    df_late[col] = df_late[col].fillna(0).astype(int)

# Get time difference between measurement and treatment end
# The raw treatment-end string is taken from the treatment record here and
# converted in the next step.
df_late['treatment_end'] = df_late['patient'].map(lambda p: treatment[p]['treatment_end'])
def convert_treatment_time(time):
    """Convert a treatment-end value to a timestamp.

    Parameters
    ----------
    time : str, numpy.datetime64 or pandas.Timestamp
        Either an already-parsed datetime, or a string whose text before the
        first '_' is a day-first date.

    Returns
    -------
    The input unchanged when it is already a datetime; otherwise the parsed
    ``pandas.Timestamp``.
    """
    # pandas hands element-wise callers pd.Timestamp objects, which are not
    # np.datetime64 instances, so both types must be accepted for the
    # pass-through to work on values that have already been parsed.
    if isinstance(time, (np.datetime64, pd.Timestamp)):
        return time
    date = time.split('_')[0]
    return pd.to_datetime(date, dayfirst=True)
df_late['treatment_end'] = df_late['treatment_end'].map(convert_treatment_time)

# Elapsed time from treatment end to each measurement, also in months
df_late['time_diff'] = df_late['date'].sub(df_late['treatment_end'])
days_in_year = 365.2425
df_late['time_diff_months'] = df_late['time_diff'].map(
    lambda td: 12 * td.days / days_in_year)

# Tag each measurement with nearest number of months
# Follow-up timepoints (in months) that measurements are assigned to.
months = [3, 6, 12, 24, 36, 48, 60]
def find_nearest_month(time_diff):
    """Return the entry of ``months`` closest to ``time_diff``.

    When two entries are equally close, the one listed first wins.
    """
    return min(months, key=lambda m: abs(m - time_diff))

df_late['months'] = df_late['time_diff_months'].apply(find_nearest_month)

# Binarise outcomes of interest
# An outcome is coded 1 when its grade is at or above the threshold below.
outcome_thresholds = {
    'CTCAE_xerostomia': 2,
    'CTCAE_sdi': 2,
    'CTCAE_dysphagia': 2,
    'CTCAE_oropharyngeal_pain': 2,
    'EORTC_dry_mouth': 3,
    'EORTC_sticky_saliva': 3,
    'EORTC_taste': 3
}
for name, init in outcome_names.items():
    # NOTE(review): a missing grade (NaN) compares False against the
    # threshold and is therefore coded 0, not left missing. Confirm that is
    # intended.
    df_late[name] = df_late[init].apply(lambda s: int(s >= outcome_thresholds[name]))
    # Keep the raw grade next to the binary flag
    df_late[name + '_grade'] = df_late[init]

# Save data
# reset_index returns a new frame; the result has to be assigned for the
# index to actually be reset before saving.
df_late = df_late.reset_index(drop=True)
df_late.to_csv('data/final/late_all.csv')

1214


In [9]:
# Save a data frame for each timepoint
# For every follow-up timepoint in `months`: keep one late measurement per
# patient, join it to the patient table, and write the selected columns to CSV.
# Only save columns of interest
cols = [
    'patient', 'age', 'male', 'T_high', 'N_high', 'neck_bi', 'neck_uni', 'weight_loss', 'mass',
    'weight_loss_pct', 'cisplatin', 'SACT_binary', 'smoking_metric', 'alcohol_metric', 'primary site',
    'primary_surgery', 'neck_dissection', 'T_stage', 'height', 'ng_tube', 'peg_tube', 'N_stage']
# Binary outcomes, their baseline values, their raw late grades, and the
# planned-dose columns
cols += list(outcome_names.keys())
cols += ['baseline_' + outcome for outcome in outcome_names.keys()]
cols += [outcome + '_grade' for outcome in outcome_names.keys()]
cols += doses

for m in months:
    
    late_to_save = df_late[df_late['months'] == m].copy()
    
    # Check for duplicate patients
    # A patient with several measurements tagged to this timepoint keeps only
    # the one whose time since treatment end is closest to m months.
    print(f'\nProcessing: {m} months')
    patient_counts = late_to_save['patient'].value_counts()
    dups = list(patient_counts[patient_counts > 1].index)
    print(f'Removing {len(dups)} duplicates')
    print(dups)
    for dup in dups:
        entries = late_to_save[late_to_save['patient'] == dup]['time_diff_months']
        diffs = abs(entries - m)
        # Drop every entry except the closest one (rows are dropped by index label)
        entries_to_drop = list(entries.drop(index=diffs.idxmin()).index)
        late_to_save.drop(index=entries_to_drop, inplace=True)
    
    # Join with patient info
    # pd.merge defaults to an inner join, so only patients present in both
    # frames are kept. Columns present in both keep the patient-table version
    # under the plain name; the late version gets a '_late' suffix.
    print(f'Unique patients: {late_to_save["patient"].nunique()}')
    combined = pd.merge(
        df.reset_index(drop=True), late_to_save, 
        left_on='patient', right_on='patient', 
        suffixes=('', '_late'))
    combined = combined[cols]
    combined.rename({'SACT_binary': 'SACT'}, axis=1, inplace=True)
    # NOTE(review): the result of reset_index is discarded, so this line has
    # no effect; the CSV is written with index=False anyway.
    combined.reset_index(drop=True)
    combined.to_csv(f'data/final/late_{m}_months.csv', index=False)
    
    # Update the patient info
    # Flag which patients have a measurement at this timepoint
    df_patients[f'has_{m}_months'] = df_patients['patient'].isin(late_to_save['patient'])
    
df_patients['cohort'] = df_patients['patient'].apply(get_cohort)
    
# Save patient metadata
df_patients.to_csv('data/final/patient_metadata.csv')



Processing: 3 months
Removing 7 duplicates
['VT1_H_585E02K1', 'VT1_H_7478D1K1', 'VT1_H_3D6E3K1L', 'VT1_H_BAF702K1', 'VT1_H_575013K1', 'VT1_H_BA7FF1K1', 'VT1_H_B8ED4K1L']
Unique patients: 193

Processing: 6 months
Removing 4 duplicates
['VT1_H_9BDEC1K1', 'VT1_H_324C1K1L', 'VT1_H_7BD802K1', 'VT1_H_16AEFK1L']
Unique patients: 184

Processing: 12 months
Removing 1 duplicates
['VT1_H_726D13K1']
Unique patients: 189

Processing: 24 months
Removing 1 duplicates
['VT1_H_2D8A6K1L']
Unique patients: 168

Processing: 36 months
Removing 3 duplicates
['VT1_H_A00CA1K1', 'VT1_H_DF341K1L', 'VT1_H_1BEE51K1']
Unique patients: 168

Processing: 48 months
Removing 0 duplicates
[]
Unique patients: 136

Processing: 60 months
Removing 4 duplicates
['VT1_H_D2BED1K1', 'VT1_H_491CE1K1', 'VT1_H_7143EK1L', 'VT1_H_F894CK1L']
Unique patients: 106


In [2]:
# Reload the saved patient table from disk; this rebinds `df`.
# NOTE(review): this cell's execution count is out of sequence with the cells
# above, so it appears to have been run in a separate session; confirm it
# still belongs in the notebook.
df = pd.read_csv("data/final/patient_info.csv")

In [11]:
# Spot-check: show the stored 'primary_surgery' value for one patient.
df[df['patient'] == 'VT1_H_03F693K1']['primary_surgery']

166    0.0
Name: primary_surgery, dtype: float64