In [56]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#from jointmodel import sim
import pandas as pd
import pystan
import survivalstan

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Prep data inputs

In [57]:
# Simulate a joint-model dataset with N=400 (presumably the number of subjects — confirm).
# The result is indexed later in this notebook by the keys 'events', 'covars' and 'biomarker'.
# NOTE(review): no random seed is set anywhere in this notebook, so the simulated
# values presumably differ between runs — confirm, and seed if reproducibility matters.
data = survivalstan.sim.sim_data_jointmodel(N=400)

In [58]:
# Translate the joint-model Stan source with pystan's stanc.
# `model` is not referenced again in this notebook, so this presumably serves as an
# early syntax check of the .stan file — confirm.
model = pystan.stanc(file='jointmodel/jointmodel.stan')

## Prep event-level data inputs

In [59]:
# Event-level records from the simulated dataset.
df = data['events']

# Preview the first rows, ordered by subject and then by time.
(
    df
    .sort_values(['subject_id', 'time'])
    .loc[:, ['subject_id', 'time', 'event_name', 'event_value']]
    .head(10)
)

Unnamed: 0,subject_id,time,event_name,event_value
0,0,1.016622,new_lesion,1
1,0,2.03416,new_lesion,1
0,0,5.5,death,0
6,1,3.023333,new_lesion,1
1,1,5.5,death,0
2,2,1.787449,death,1
3,3,0.005271,death,1
24,4,2.00753,new_lesion,1
25,4,2.536777,new_lesion,1
4,4,3.632696,death,1


In [60]:
# Show every event record for a single subject (subject_id 1) as a spot check.
df.query('subject_id == 1')

Unnamed: 0,subject_id,time,event_value,event_name
1,1,5.5,0,death
6,1,3.023333,1,new_lesion


In [61]:
# Convert the event records to the long format used below; the displayed result has
# one `end_<event_name>` indicator column per event type alongside `end_time`.
ldf = survivalstan.prep_data_long_surv(
    df,
    sample_col='subject_id',
    time_col='time',
    event_col='event_value',
    event_name='event_name',
)

In [62]:
# Spot-check subject 1: rows where an event occurred, plus the late (end_time > 4.4) rows.
(
    ldf
    .query('subject_id == 1 and (end_death == 1 or end_new_lesion == 1 or end_time > 4.4)')
    .sort_values(['subject_id', 'end_time'])
)

Unnamed: 0,subject_id,end_time,end_death,end_new_lesion
494,1,3.023333,0,1
961,1,4.596482,False,False
908,1,4.605852,False,False
914,1,4.630973,False,False
934,1,4.649867,False,False
945,1,4.711867,False,False
941,1,4.725107,False,False
916,1,4.758143,False,False
959,1,4.846161,False,False
913,1,4.851894,False,False


In [63]:
# Attach the subject-level covariates to the long-format event data.
#
# The merge result is written back to `ldf`, so re-running this cell used to merge the
# covariates a second time; pandas then suffixes the duplicated columns with _x/_y and
# the `~ X1` / `~ X2` formulas used below can no longer find their columns.
# Dropping any covariate columns that are already present makes the cell safe to re-run.
covars = data['covars']
covar_cols = [col for col in covars.columns if col != 'subject_id']
ldf = pd.merge(
    ldf.drop(covar_cols, axis=1, errors='ignore'),
    covars,
    on='subject_id',
    how='outer',
)

## prepare inputs for stan model

This part is done more manually for now, since the corresponding `survivalstan` code hasn't been written yet.

In [64]:
# Stan inputs for the `end_death` event, with X1 as the covariate.
input_t = survivalstan.SurvivalStanData(
    df=ldf,
    sample_col='subject_id',
    timepoint_end_col='end_time',
    event_col='end_death',
    formula='~ X1',
)

In [65]:
# Summary statistics of the timepoint table built for the death-event inputs.
input_t.timepoint_df.describe()

Unnamed: 0,timepoint_id,end_time,t_dur
count,492.0,492.0,492.0
mean,246.5,1.383478,0.011179
std,142.172431,1.487522,0.019219
min,1.0,0.000994,1.1e-05
25%,123.75,0.187764,0.001146
50%,246.5,0.771066,0.004093
75%,369.25,2.258458,0.012469
max,492.0,5.5,0.197529


In [66]:
## need to fuzzy-merge these, or otherwise constrain the timepoint ids to be the same.

## in theory, since ids are assigned in sorted order, they should be identical.

In [67]:
# Stan inputs for the `end_new_lesion` event, with X2 as the covariate.
input_r = survivalstan.SurvivalStanData(
    df=ldf,
    sample_col='subject_id',
    timepoint_end_col='end_time',
    event_col='end_new_lesion',
    formula='~ X2',
)

In [68]:
# Summary statistics of the timepoint table built for the new-lesion inputs;
# compare against the same summary for input_t.
input_r.timepoint_df.describe()

Unnamed: 0,timepoint_id,end_time,t_dur
count,492.0,492.0,492.0
mean,246.5,1.383478,0.011179
std,142.172431,1.487522,0.019219
min,1.0,0.000994,1.1e-05
25%,123.75,0.187764,0.001146
50%,246.5,0.771066,0.004093
75%,369.25,2.258458,0.012469
max,492.0,5.5,0.197529


In [69]:
## check whether assigned timepoint ids are indeed identical
timepoints_t = input_t.timepoint_df
timepoints_r = input_r.timepoint_df

# The outer merge keeps ids that appear in only one of the two tables; their end_time
# difference is NaN, which fails the equality check below.
merged_timepoints = pd.merge(timepoints_t, timepoints_r, on='timepoint_id',
                             suffixes=['.t', '.r'], how='outer')

# Plain column arithmetic instead of a row-by-row `apply`.
merged_timepoints['end_time.diff'] = (
    merged_timepoints['end_time.t'] - merged_timepoints['end_time.r']
)
assert (merged_timepoints['end_time.diff'] == 0).all(), \
    'timepoint ids do not map to the same end_time in input_t and input_r'

In [70]:
# Each (subject_id, end_time) pair must appear at most once in the new-lesion inputs.
assert not input_r.df_nonmiss.duplicated(subset=['subject_id', 'end_time']).any()

## single data inputs for multiple events

In [71]:
# List the entries of the data dictionary prepared for the death-event inputs.
input_t.data.keys()

dict_keys(['N', 'T', 'event', 'S', 'x', 't', 's', 'M', 't_dur', 't_obs'])

In [72]:
# List the entries of the data dictionary prepared for the new-lesion inputs.
input_r.data.keys()

dict_keys(['N', 'T', 'event', 'S', 'x', 't', 's', 'M', 't_dur', 't_obs'])

In [74]:
# Entries compared with a plain `==` must agree between the two inputs ...
for key in ('S', 'M', 'N'):
    assert input_t.data[key] == input_r.data[key]

# ... and entries compared element-wise must agree in every position, so a single
# copy of each can be passed to the joint model.
for key in ('s', 't', 't_obs', 't_dur'):
    assert (input_t.data[key] == input_r.data[key]).all()

In [121]:
t_inputs = input_t.data
r_inputs = input_r.data

# Entries shared by both event types are taken once, from the death-event inputs.
stan_data = {key: t_inputs[key] for key in ('S', 'T', 't_obs', 't_dur', 'N', 's', 't')}

# Event-specific entries get a `_t` (from input_t) or `_r` (from input_r) suffix.
stan_data.update({
    'M_t': t_inputs['M'],
    'M_r': r_inputs['M'],
    'event_t': t_inputs['event'],
    'event_r': r_inputs['event'],
    'x_t': t_inputs['x'],
    'x_r': r_inputs['x'],
})

## try stan model on event-data only

In [None]:
class CustomSurvivalData:
    """Minimal container for hand-assembled Stan inputs.

    Exposes the three attributes set below (`data`, `x_df`, `df_nonmiss`) so an
    instance can be passed as `input_data` to `survivalstan.fit_stan_survival_model`.

    Defined in this cell, ahead of its first use, so the notebook survives
    Restart & Run All; the identical definition in the later cell is then redundant.
    """

    def __init__(self, stan_data, x_df, df_nonmiss):
        self.data = stan_data
        self.x_df = x_df
        self.df_nonmiss = df_nonmiss


# Fit the event-only joint model using the manually combined inputs.
test = survivalstan.fit_stan_survival_model(
    input_data=CustomSurvivalData(
        stan_data=stan_data,
        x_df=input_t.x_df,
        df_nonmiss=input_t.df_nonmiss,
    ),
    file='jointmodel/jointmodel.stan',
)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_223de095e8831af3a0aeb134d95cfead NOW.


NOT reusing model.


## prepare longitudinal data for stan

In [128]:
class CustomSurvivalData:
    """Plain holder for manually prepared model inputs.

    Attributes
    ----------
    data : the `stan_data` argument, stored unchanged.
    x_df : the `x_df` argument, stored unchanged.
    df_nonmiss : the `df_nonmiss` argument, stored unchanged.
    """

    def __init__(self, stan_data, x_df, df_nonmiss):
        # Note the rename: the constructor argument `stan_data` is exposed as `.data`.
        self.data, self.x_df, self.df_nonmiss = stan_data, x_df, df_nonmiss

In [122]:
# Preview the simulated longitudinal biomarker measurements.
data['biomarker'].head()

Unnamed: 0,subject_id,biomarker_time,biomarker_value
0,0,0.2,0.295732
1,0,0.4,2.668375
2,0,0.6,2.292061
3,0,0.8,2.361316
4,0,1.0,2.484984


In [123]:
# Attach the subject-level covariates to each biomarker measurement.
biodf = data['biomarker'].merge(data['covars'], on='subject_id')

In [124]:
# Reuse SurvivalStanData to prepare the biomarker inputs: the biomarker value is
# passed in the `event_col` slot, with X1 and X2 as covariates.
biomarker_data = survivalstan.SurvivalStanData(
    df=biodf,
    sample_col='subject_id',
    time_col='biomarker_time',
    event_col='biomarker_value',
    formula='~ X1 + X2',
)


In [125]:
# List the entries of the data dictionary prepared for the biomarker inputs.
biomarker_data.data.keys()

dict_keys(['N', 'event', 'S', 'x', 'y', 'M', 's'])

In [126]:
# Preview the row-level frame kept by the biomarker inputs; the displayed columns
# include the design-matrix terms and an assigned `sample_id`.
biomarker_data.df_nonmiss.head()

Unnamed: 0,Intercept,X1,X2,biomarker_time,biomarker_value,subject_id,sample_id
0,1.0,0.0,1.0,0.2,0.295732,0,1
1,1.0,0.0,1.0,0.4,2.668375,0,1
2,1.0,0.0,1.0,0.6,2.292061,0,1
3,1.0,0.0,1.0,0.8,2.361316,0,1
4,1.0,0.0,1.0,1.0,2.484984,0,1


In [127]:
# Longitudinal (biomarker) entries, all suffixed `_l`.
# NOTE(review): `sample_id` is assigned by the biomarker SurvivalStanData object;
# presumably it indexes subjects the same way as stan_data['s'] — confirm.
longitudinal_inputs = {
    'N_l': biomarker_data.data['N'],
    'M_l': biomarker_data.data['M'],
    'subject_l': biomarker_data.df_nonmiss['sample_id'].values,
    'time_l': biomarker_data.df_nonmiss['biomarker_time'].values,
    'y_l': biomarker_data.data['y'],
    'x_l': biomarker_data.data['x'],
}

# Added to the existing event inputs in place.
stan_data.update(longitudinal_inputs)

# Try running stan model on biomarker+event data

In [None]:
# Fit the joint model that also includes the biomarker submodel; by this point
# stan_data carries both the event entries and the `_l` entries.
joint_inputs = CustomSurvivalData(
    stan_data=stan_data,
    x_df=input_t.x_df,
    df_nonmiss=input_t.df_nonmiss,
)
test2 = survivalstan.fit_stan_survival_model(
    input_data=joint_inputs,
    file='jointmodel/jointmodel_with_biomarker.stan',
)