In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
from jointmodel import sim
import pandas as pd
import pystan
import survivalstan

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
data = sim.simulate_data(N=400)

In [3]:
model = pystan.stanc(file='jointmodel/jointmodel.stan')

# Prep event-level data inputs

In [4]:
## combine terminal and recurrent event datasets
df_t = data['t_df']
df_t.rename(columns={'event_status': 'event_value',
                     'event_time': 'time',
                     'index': 'subject_id'},
            inplace=True)
df_t['event_name'] = 'death'

df_r = data['r_df']
df_r.rename(columns={'recurrence_time': 'time',
                    'index': 'subject_id'},
           inplace=True)
df_r['event_value'] = 1
df_r['event_name'] = 'new_lesion'

x_df = data['x_df']
x_df.rename(columns={'index': 'subject_id'}, inplace=True)

In [5]:
df = pd.concat([df_t, df_r])
df.sort_values(['subject_id', 'time'])\
    .loc[:,['subject_id','time','event_name','event_value']]\
    .head(20)

Unnamed: 0,subject_id,time,event_name,event_value
0,0,0.720119,death,1
1,1,0.062381,death,1
12,2,0.921687,new_lesion,1
2,2,3.319949,death,1
3,3,1.219176,death,1
24,4,1.504987,new_lesion,1
4,4,3.126221,death,1
30,5,0.671881,new_lesion,1
5,5,5.298116,death,1
6,6,0.408451,death,1


In [13]:
pd.pivot_table(df, index=['subject_id','time'], columns=['event_name'], values=['event_value'], fill_value=0)\
    .head(n=10)

Unnamed: 0_level_0,Unnamed: 1_level_0,event_value,event_value
Unnamed: 0_level_1,event_name,death,new_lesion
subject_id,time,Unnamed: 2_level_2,Unnamed: 3_level_2
0,0.720119,1,0
1,0.062381,1,0
2,0.921687,0,1
2,3.319949,1,0
3,1.219176,1,0
4,1.504987,0,1
4,3.126221,1,0
5,0.671881,0,1
5,5.298116,1,0
6,0.408451,1,0


In [34]:
longdata_t = survivalstan.prep_data_long_surv(df, event_col='event_t', time_col='time',
                                             sample_col='subject_id')
longdata_r = survivalstan.prep_data_long_surv(df, event_col='event_r', time_col='time',
                                             sample_col='subject_id')

In [35]:
longdata_t.rename(columns = {'end_failure': 'end_t'}, inplace=True)
longdata_r.rename(columns = {'end_failure': 'end_r'}, inplace=True)

In [59]:
ldf = pd.merge(longdata_t.loc[:,['subject_id', 'end_time', 'end_t']],
              longdata_r.loc[:, ['subject_id', 'end_time', 'end_r']],
              on=['subject_id','end_time'],
              how='outer')
ldf = pd.merge(ldf, x_df, on='subject_id', how='outer')

In [64]:
ldf.query('subject_id == 2 and (end_t == 1 or end_r == 1)').sort_values(['subject_id','end_time'])

Unnamed: 0,subject_id,end_time,end_t,end_r,X1,X2
872,2,0.539432,0,1,1,1
636,2,1.121959,1,0,1,1


In [28]:
input_t = survivalstan.SurvivalStanData(df=ldf,
                                        event_col='end_t',
                                        timepoint_end_col='end_time',
                                        sample_id_col='subject_id',
                                        formula = '~ X2') 

In [29]:
ldf.head()

Unnamed: 0,subject_id,end_time,end_t,end_r,X1,X2
0,0,0.318088,1,0,0,0
1,0,0.255143,False,False,0,0
2,0,0.179245,False,False,0,0
3,0,0.033488,False,False,0,0
4,0,0.202801,False,False,0,0


In [30]:
input_t.timepoint_df.describe()

Unnamed: 0,timepoint_id,end_time,t_dur
count,471.0,471.0,471.0
mean,236.0,1.300365,0.011677
std,136.110249,1.452798,0.019537
min,1.0,0.000386,1e-06
25%,118.5,0.164054,0.001344
50%,236.0,0.692312,0.004012
75%,353.5,2.040779,0.012342
max,471.0,5.5,0.199702


In [32]:
## need to do a fuzzy-merge these, or somehow constrain the ids to be the same. 

## in theory, since ids are assigned in a sorted order, they should be identical.  

In [33]:
input_r = survivalstan.SurvivalStanData(df=ldf, event_col='end_r',
                                        timepoint_end_col='end_time',
                                        sample_id_col='subject_id', 
                                        formula='~ X2')

In [34]:
input_r.timepoint_df.describe()

Unnamed: 0,timepoint_id,end_time,t_dur
count,471.0,471.0,471.0
mean,236.0,1.300365,0.011677
std,136.110249,1.452798,0.019537
min,1.0,0.000386,1e-06
25%,118.5,0.164054,0.001344
50%,236.0,0.692312,0.004012
75%,353.5,2.040779,0.012342
max,471.0,5.5,0.199702


In [35]:
## check whether assigned timepoint ids are indeed identical
timepoints_t = input_t.timepoint_df
timepoints_r = input_r.timepoint_df

merged_timepoints = pd.merge(timepoints_t, timepoints_r, on='timepoint_id', suffixes=['.t', '.r'], how='outer')
merged_timepoints['end_time.diff'] = merged_timepoints.apply(lambda row: row['end_time.t']-row['end_time.r'], axis=1)
assert(all(merged_timepoints['end_time.diff']==0))

In [41]:
input_r.df_nonmiss[input_r.df_nonmiss.duplicated(subset=['subject_id','end_time'])].head()

Unnamed: 0,Intercept,X2,subject_id,end_r,end_time,timepoint_id
637,1.0,1.0,2,False,0.318088,165
640,1.0,1.0,2,False,0.412779,186
642,1.0,1.0,2,False,0.379931,176
645,1.0,1.0,2,False,0.255143,147
651,1.0,1.0,2,False,0.391109,178


In [50]:
longdata_t.query('subject_id == 2 and abs(end_time-0.318088) < 0.001')\
    .sort_values(['subject_id','end_time'])

Unnamed: 0,event_r,event_t,subject_id,time,key,end_time,end_t
942,0.0,1.0,2,1.121959,1,0.318088,False
188400,1.0,0.0,2,0.539432,1,0.318088,False


In [None]:
class MultiSurvivalStanData(SurvivalStanData):
    '''
        Input data for a Survivalstan model with multiple events
    '''
    