In [39]:
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.multimodal import MultiModalPredictor
import seaborn as sns
import numpy as np
import pandas as pd
from skimage import data
from skimage import exposure
from skimage.exposure import match_histograms

In [None]:
# Read the challenge CSV files from the working directory.
train_data = TabularDataset("SNMMI_CHALLENGE_TRAINING.csv")  # training table (provides Outcome_PFS and Event)
test_data = TabularDataset("SNMMI_CHALLENGE_TESTING.csv")  # table the trained model is applied to

In [None]:
# Train the PFS regression model after removing Event = 0 rows with short follow-up.

# Cut-off for Event = 0 rows: mean + 2 standard deviations of Outcome_PFS
# computed over the Event = 1 rows only.
threshold = (
    train_data
    .loc[train_data['Event'] == 1, 'Outcome_PFS']
    .pipe(lambda event_pfs: event_pfs.mean() + 2 * event_pfs.std())
)

# Keep every Event = 1 row plus the Event = 0 rows whose Outcome_PFS reaches the
# cut-off, then remove the columns that must not be used as features.
train_data_PFS = (
    train_data
    .loc[train_data['Event'].eq(1) | train_data['Outcome_PFS'].ge(threshold)]
    .drop(columns=['PatientID', 'Event'])
)

label = 'Outcome_PFS'  # regression target

# Fit the AutoGluon predictor with the 'best_quality' preset.
predictor_PFS = TabularPredictor(label=label)
predictor_PFS = predictor_PFS.fit(train_data_PFS, presets='best_quality')

In [41]:
# function for prediction of PFS probabilities for the first 3 years based on ensemble model 
def calculate_percentage(row):
    """Return the percentage of values in ``row`` falling into each yearly bin.

    The bins are ``value <= 12``, ``12 < value <= 24`` and ``24 < value <= 36``.
    Values above 36 (and NaN values) fall into none of the bins but still count
    towards the denominator, so the three percentages need not sum to 100.

    Parameters
    ----------
    row : array-like of numbers
        One value per ensemble member (e.g. a DataFrame row passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    pandas.Series
        Float percentages indexed by ``'0_to_1'``, ``'1_to_2'`` and ``'2_to_3'``.
        All three are NaN when ``row`` is empty (instead of raising
        ``ZeroDivisionError``).
    """
    values = np.asarray(row, dtype=float)
    total_entries = values.size

    # Nothing to count: report "undefined" rather than dividing by zero.
    if total_entries == 0:
        return pd.Series({'0_to_1': np.nan,
                          '1_to_2': np.nan,
                          '2_to_3': np.nan})

    # Vectorised counts; comparisons with NaN are False, so NaN lands in no bin.
    count_0_to_1 = np.count_nonzero(values <= 12)
    count_1_to_2 = np.count_nonzero((values > 12) & (values <= 24))
    count_2_to_3 = np.count_nonzero((values > 24) & (values <= 36))

    return pd.Series({'0_to_1': (count_0_to_1 / total_entries) * 100,
                      '1_to_2': (count_1_to_2 / total_entries) * 100,
                      '2_to_3': (count_2_to_3 / total_entries) * 100})

In [None]:
# Start the results table from the patient IDs and attach the ensemble's PFS
# prediction for every test row.
results = (
    test_data['PatientID']
    .to_frame()
    .assign(PFS_pred=predictor_PFS.predict(test_data))
)

In [43]:
# Adjust the training PFS distribution before histogram matching
# (assuming a mean follow-up period of 50 months on the test data).

# Arbitrary assumption for the mean Event = 0 follow-up on the test data
# (assuming these patients were included at the end of the training-data study).
target_followup_time_no_event = 50

# Mean follow-up time of the Event = 0 training rows.
mean_followup_time_no_event = train_data.loc[train_data['Event'] == 0, 'Outcome_PFS'].mean()

# Shift applied to the Event = 0 training rows for the subsequent histogram matching.
follow_up_diff = mean_followup_time_no_event - target_followup_time_no_event

# Event = 1 rows keep their Outcome_PFS; Event = 0 rows are shifted by follow_up_diff.
train_PFS_adjusted = (
    train_data['Event'] * train_data['Outcome_PFS']
    + (1 - train_data['Event']) * (train_data['Outcome_PFS'] - follow_up_diff)
)

In [51]:
# Match the distribution of the predicted PFS to the adjusted training PFS.
reference = train_PFS_adjusted.to_numpy()  # distribution to match against
target = results['PFS_pred'].to_numpy()  # raw ensemble predictions

# Store the matched predictions rounded to one decimal place.
results['PFS_pred_histo_match'] = np.round(match_histograms(target, reference), 1)

#results['histo_correction_delta'] = results['PFS_pred_histo_match'] - results['PFS_pred'] # compute degree of correction per sample for adjustment of single model outputs for year 1,2,3 probabilities

In [None]:
# Predict with every individual model of the PFS regression ensemble; the spread
# of these predictions is used for the year 1, 2, 3 PFS probabilities.
reference = train_PFS_adjusted.to_numpy()  # histogram-matching reference for each single model

matched_by_model = {}
for model in predictor_PFS.get_model_names():
    model_prediction = predictor_PFS.predict(test_data, model).to_numpy()
    # Each model's predictions are histogram-matched on their own.
    matched_by_model[model] = match_histograms(model_prediction, reference)

# One column per model, one row per test sample.
PFS_ensemble_pred = pd.DataFrame(matched_by_model)

In [45]:
# Add the year 1, 2, 3 PFS percentages to results.
# For each test row, calculate_percentage reports the share of per-model
# predictions that fall into each yearly bin.
yearly_event_pct = PFS_ensemble_pred.apply(calculate_percentage, axis=1)

# Drop any copies left by an earlier run of this cell before concatenating, so that
# re-running does not create duplicate '0_to_1' / '1_to_2' / '2_to_3' columns
# (duplicates would make results['0_to_1'] a DataFrame and break the later cells).
results = pd.concat(
    [results.drop(columns=yearly_event_pct.columns, errors='ignore'), yearly_event_pct],
    axis=1,
)

In [48]:
# Derive the year 1, 2, 3 survival percentages for the final results:
# start from 100 and subtract the percentage of each successive yearly bin.
remaining_pct = 100
for survival_column, interval_column in (('PFS_1', '0_to_1'),
                                         ('PFS_2', '1_to_2'),
                                         ('PFS_3', '2_to_3')):
    remaining_pct = remaining_pct - results[interval_column]
    results[survival_column] = remaining_pct.round(1)  # stored with one decimal

In [52]:
# Export the final columns to results.csv (the DataFrame index is written as the
# first, unnamed column because to_csv keeps the index by default).
results.to_csv(
    'results.csv',
    columns=['PatientID', 'PFS_pred_histo_match', 'PFS_1', 'PFS_2', 'PFS_3'],
)