In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#from jointmodel import sim
import pandas as pd
import sys
import pystan
import random
random.seed(1234)
import survivalstan
from stancache import stancache
from stancache import config

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


INFO:stancache.seed:Setting seed to 1245502385


# get simulated data

In [2]:
# Simulated inputs; later cells read the 'events', 'covars' and 'biomarker' entries.
# N=100 presumably sets the number of simulated subjects -- TODO confirm.
data = survivalstan.sim.sim_data_jointmodel(N=100)

# confirm that models compile

In [3]:
# Run the event-only Stan program through stanc so that errors in the Stan source surface early.
# NOTE(review): `model` is not referenced again in the visible cells.
model = pystan.stanc(file='jointmodel/jointmodel.stan')

In [4]:
# Same stanc check for the Stan program that also models the biomarker.
# NOTE(review): `model2` is not referenced again in the visible cells.
model2 = pystan.stanc(file='jointmodel/jointmodel_with_biomarker.stan')

# Fit joint model for competing events

## review event-data

Inspect simulated event data

In [5]:
# Event-level records from the simulation, previewed in subject/time order.
df = data['events']
(
    df[['subject_id', 'time', 'event_name', 'event_value']]
    .sort_values(['subject_id', 'time'])
    .head(10)
)

Unnamed: 0,subject_id,time,event_name,event_value
0,0,0.082865,death,1
1,1,0.210949,death,1
2,2,5.5,death,0
3,3,1.2186,death,1
24,4,1.499076,new_lesion,1
4,4,5.5,death,0
5,5,0.169179,death,1
36,6,0.076644,new_lesion,1
37,6,0.333531,new_lesion,1
6,6,0.399064,death,1


Confirm multi-event data for a particular subject

In [6]:
# Count event rows per subject, then keep the ids of subjects with more than one row.
events_per_subject = df.groupby('subject_id')['time'].count()
subjects_with_multiple_events = events_per_subject[events_per_subject > 1].index

In [7]:
# Show every event row for the first subject that has more than one event.
first_multi_event_subject = subjects_with_multiple_events[0]
df.loc[df['subject_id'] == first_multi_event_subject].sort_values(['subject_id', 'time'])

Unnamed: 0,subject_id,time,event_value,event_name
24,4,1.499076,1,new_lesion
4,4,5.5,0,death


Transform data to "long" format

In [8]:
# Long-format event data; the cells below rely on its
# `end_time`, `end_death` and `end_new_lesion` columns.
ldf = survivalstan.prep_data_long_surv(
    df,
    event_col='event_value',
    time_col='time',
    sample_col='subject_id',
    event_name='event_name',
)

Confirm transformed data for same subject inspected above

In [9]:
# Same subject as inspected above: keep rows where an event ends or that fall late in follow-up.
(
    ldf.loc[ldf['subject_id'] == subjects_with_multiple_events[0]]
    .query('(end_death == 1 or end_new_lesion == 1 or end_time > 3.0)')
    .sort_values(['subject_id', 'end_time'])
)

Unnamed: 0,subject_id,end_time,end_death,end_new_lesion
229,4,1.499076,0,1
309,4,3.264619,False,False
315,4,3.313581,False,False
306,4,3.446047,False,False
331,4,3.519103,False,False
307,4,3.761857,False,False
334,4,3.887449,False,False
341,4,3.944624,False,False
328,4,4.133912,False,False
302,4,4.279967,False,False


Merge event-level data with covariate values

In [10]:
# Attach subject-level covariates to every long-format row.
# NOTE(review): this rebinds `ldf`, so re-running the cell merges the covariates a second time.
ldf = ldf.merge(data['covars'], on='subject_id', how='outer')

## prepare inputs for stan model

This part is done manually for now, since `survivalstan` does not yet provide a helper for preparing joint-model inputs.

First we prepare input matrices for terminal event (death).

In [11]:
# Stan inputs for the terminal event: `end_death` is the event column, X1 the covariate.
input_t = survivalstan.SurvivalStanData(
    df=ldf,
    event_col='end_death',
    timepoint_end_col='end_time',
    sample_col='subject_id',
    formula='~ X1',
)

In [12]:
# Summary statistics of the timepoint table built for the death inputs.
input_t.timepoint_df.describe()

Unnamed: 0,timepoint_id,end_time,t_dur
count,116.0,116.0,116.0
mean,58.5,1.464706,0.047414
std,33.630343,1.54872,0.062803
min,1.0,0.001568,0.0003
25%,29.75,0.289993,0.007335
50%,58.5,0.837722,0.021239
75%,87.25,2.261291,0.058398
max,116.0,5.5,0.299062


Next we prepare the input matrices for recurrent event (new_lesion).

*In theory, since timepoint ids are assigned in sorted order, the ids for the two event types should be identical. We will confirm this before moving forward.*

In [13]:
# Stan inputs for the recurrent event: `end_new_lesion` is the event column, X2 the covariate.
input_r = survivalstan.SurvivalStanData(
    df=ldf,
    event_col='end_new_lesion',
    timepoint_end_col='end_time',
    sample_col='subject_id',
    formula='~ X2',
)

In [14]:
# Summary statistics of the timepoint table built for the new_lesion inputs.
input_r.timepoint_df.describe()

Unnamed: 0,timepoint_id,end_time,t_dur
count,116.0,116.0,116.0
mean,58.5,1.464706,0.047414
std,33.630343,1.54872,0.062803
min,1.0,0.001568,0.0003
25%,29.75,0.289993,0.007335
50%,58.5,0.837722,0.021239
75%,87.25,2.261291,0.058398
max,116.0,5.5,0.299062


Confirm that timepoint_ids assigned are identical between the two datasets.

In [15]:
## check whether assigned timepoint ids are indeed identical
timepoints_t = input_t.timepoint_df
timepoints_r = input_r.timepoint_df

# Outer join on timepoint_id: an id present in only one table produces NaN
# on the other side, which then fails the equality check below.
merged_timepoints = pd.merge(timepoints_t, timepoints_r, on='timepoint_id',
                             suffixes=('.t', '.r'), how='outer')
# Vectorised column subtraction instead of a row-wise apply.
merged_timepoints['end_time.diff'] = (
    merged_timepoints['end_time.t'] - merged_timepoints['end_time.r']
)
assert (merged_timepoints['end_time.diff'] == 0).all(), \
    'timepoint_id does not map to the same end_time in input_t and input_r'

In [16]:
# No (subject_id, end_time) pair may occur more than once in the recurrent-event inputs.
assert not input_r.df_nonmiss.duplicated(subset=['subject_id', 'end_time']).any()

## combine data inputs for multiple events

Finally we transform the data into a single dictionary to pass into Stan. 

Review keys prepared for each event type:

In [17]:
# Keys of the Stan data dictionary prepared for the death event.
input_t.data.keys()

dict_keys(['t', 'N', 'T', 's', 'S', 't_dur', 'M', 't_obs', 'event', 'x'])

In [18]:
# Keys of the Stan data dictionary prepared for the new_lesion event.
input_r.data.keys()

dict_keys(['t', 'N', 'T', 's', 'S', 't_dur', 'M', 't_obs', 'event', 'x'])

Confirm that items which should be shared between two event types are indeed shared.

In [19]:
# Items compared with plain equality.
# NOTE(review): 'T' is also copied from input_t into stan_data below but is not compared here.
for key in ('S', 'M', 'N'):
    assert input_t.data[key] == input_r.data[key]
# Items compared element-wise.
for key in ('s', 't', 't_obs', 't_dur'):
    assert (input_t.data[key] == input_r.data[key]).all()

Prepare dictionary to pass into stan.

In [20]:
# Entries shared by both event types are taken from the death inputs.
stan_data = {
    key: input_t.data[key]
    for key in ('S', 'T', 't_obs', 't_dur', 'N', 's', 't')
}
# Event-specific entries carry a `_t` (death) or `_r` (new_lesion) suffix.
stan_data.update(
    M_t=input_t.data['M'],
    M_r=input_r.data['M'],
    event_t=input_t.data['event'],
    event_r=input_r.data['event'],
    x_t=input_t.data['x'],
    x_r=input_r.data['x'],
)

## Fit stan model to event-data only

Let's execute this Stan model on our competing-event data.

We wrap the input data in a `CustomSurvivalData` class so that `fit_stan_survival_model`, as currently written, can consume it.

In [21]:
class CustomSurvivalData:
    """Minimal container for hand-assembled Stan inputs.

    Stores its arguments as attributes so that an instance can be passed as
    ``input_data`` to ``survivalstan.fit_stan_survival_model``.

    Parameters
    ----------
    stan_data :
        Data dictionary handed to Stan; stored as ``data``.
    x_df :
        Stored unchanged as ``x_df``.
    df_nonmiss :
        Stored unchanged as ``df_nonmiss``.
    df : optional
        Stored as ``df``; when omitted, ``df_nonmiss`` is used instead.
    sample_col, sample_id_col, group_id_col, timepoint_id_col, timepoint_end_col : optional
        Stored unchanged under attributes of the same name.
    """

    def __init__(self, stan_data, x_df, df_nonmiss, df=None,
                 sample_col=None, sample_id_col=None,
                 group_id_col=None,
                 timepoint_id_col=None, timepoint_end_col=None):
        self.data = stan_data
        self.x_df = x_df
        self.df_nonmiss = df_nonmiss
        self.group_id_col = group_id_col
        self.sample_col = sample_col
        self.sample_id_col = sample_id_col
        self.timepoint_id_col = timepoint_id_col
        self.timepoint_end_col = timepoint_end_col
        # Always define `df`: previously an explicitly passed `df` was dropped,
        # leaving the instance without a `df` attribute.
        self.df = df_nonmiss if df is None else df

Finally, we fit the stan model to the simulated data.

In [22]:
# Wrap the combined Stan data together with the column names of the death inputs.
jointmodel_input = CustomSurvivalData(
    stan_data=stan_data,
    x_df=input_t.x_df,
    df_nonmiss=input_t.df_nonmiss,
    sample_col='subject_id',
    sample_id_col='sample_id',
    timepoint_id_col='timepoint_id',
    timepoint_end_col='end_time',
)
# Fit the event-only joint model; the fit function is stancache's cached variant.
test = survivalstan.fit_stan_survival_model(
    input_data=jointmodel_input,
    file='jointmodel/jointmodel.stan',
    FIT_FUN=stancache.cached_stan_fit,
    iter=4000,
    chains=4,
)

INFO:stancache.stancache:Step 1: Get compiled model code, possibly from cache
INFO:stancache.stancache:StanModel: cache_filename set to anon_model.cython_0_25_2.model_code_8947126159220755323.pystan_2_12_0_0.stanmodel.pkl
INFO:stancache.stancache:StanModel: Starting execution
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_682ac6962d1df5e1fcc4da538f356a71 NOW.
INFO:stancache.stancache:StanModel: Execution completed (0:01:06.489617 elapsed)
INFO:stancache.stancache:StanModel: Saving results to cache
INFO:stancache.stancache:Step 2: Get posterior draws from model, possibly from cache
INFO:stancache.stancache:sampling: cache_filename set to anon_model.cython_0_25_2.model_code_8947126159220755323.pystan_2_12_0_0.stanfit.chains_4.data_49523170025.iter_4000.seed_1245502385.pkl
INFO:stancache.stancache:sampling: Starting execution
INFO:stancache.stancache:sampling: Execution completed (2:18:39.972615 elapsed)
INFO:stancache.stancache:sampling: Saving results to cache
The relevant Stan

In [None]:
# Summary plot for the event-only fit (`test`).
survivalstan.utils.plot_stan_summary([test])

# Fit joint model with biomarker

## Add biomarker (longitudinal) data to Stan

Next we want to include our biomarker/longitudinal data in the estimation.

First we need to prepare the data matrix to include in our input_data.

Let's review the simulated biomarker data.

In [None]:
# First rows of the simulated biomarker table.
data['biomarker'].head()

Merge this with simulated covariate data:

In [None]:
# Biomarker measurements joined to subject-level covariates (default inner join on subject_id).
biodf = data['biomarker'].merge(data['covars'], on='subject_id')

Prep input fields to pass to SurvivalStan:

In [None]:
# Reuse SurvivalStanData for the longitudinal data: the biomarker value is passed
# as the "event" column and the measurement time as the time column.
biomarker_data = survivalstan.SurvivalStanData(
    df=biodf,
    event_col='biomarker_value',
    formula='~ X1 + X2',
    sample_col='subject_id',
    time_col='biomarker_time',
)


(review keys)

In [None]:
# Keys of the Stan data dictionary prepared for the biomarker inputs.
biomarker_data.data.keys()

Review the non-missing data frame

In [None]:
# First rows of the biomarker frame kept by SurvivalStanData (`df_nonmiss`).
biomarker_data.df_nonmiss.head()

Update `stan_data` dictionary to include biomarker data

In [None]:
# Add the longitudinal (`_l`) entries to the existing Stan data dictionary in place.
stan_data.update(
    N_l=biomarker_data.data['N'],
    M_l=biomarker_data.data['M'],
    subject_l=biomarker_data.df_nonmiss['sample_id'].values,
    time_l=biomarker_data.df_nonmiss['biomarker_time'].values,
    y_l=biomarker_data.data['y'],
    x_l=biomarker_data.data['x'],
)

## Fit stan model on biomarker+event data

Finally we fit a version of our stan model including biomarker data: 

In [None]:
# Fit the joint model that also includes the biomarker.
# The wrapper receives the same column names as the event-only fit because it
# wraps the same `input_t.df_nonmiss` frame; leaving them out would set all of
# them to None on the wrapper.
test2 = survivalstan.fit_stan_survival_model(
    input_data=CustomSurvivalData(
        stan_data=stan_data,
        x_df=input_t.x_df,
        df_nonmiss=input_t.df_nonmiss,
        sample_col='subject_id',
        sample_id_col='sample_id',
        timepoint_id_col='timepoint_id',
        timepoint_end_col='end_time',
    ),
    file='jointmodel/jointmodel_with_biomarker.stan',
    FIT_FUN=stancache.cached_stan_fit,
    iter=2000,
    chains=4,
)