In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#from jointmodel import sim
import pandas as pd
import patsy
import sys
import pystan
import random
random.seed(1234)
import survivalstan
from stancache import stancache
from stancache import config

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


INFO:stancache.seed:Setting seed to 1245502385


In [2]:
data = survivalstan.sim.sim_data_jointmodel(N=100)

In [3]:
data.keys()

dict_keys(['biomarker', 'events', 'params', 'covars'])

In [4]:
data['events'].head()

Unnamed: 0,subject_id,time,event_value,event_name
0,0,5.5,0,death
1,1,0.24068,1,death
2,2,5.5,0,death
3,3,0.515687,1,death
4,4,5.5,0,death


In [5]:
data['covars'].head()

Unnamed: 0,subject_id,X1,X2
0,0,1,0
1,1,1,0
2,2,1,0
3,3,1,1
4,4,1,0


In [6]:
# Keep only the "death" event rows and attach each subject's covariates.
df = (
    data['events']
    .query('event_name == "death"')
    .merge(data['covars'], on='subject_id')
)
df.head()

Unnamed: 0,subject_id,time,event_value,event_name,X1,X2
0,0,5.5,0,death,1,0
1,1,0.24068,1,death,1,0
2,2,5.5,0,death,1,0
3,3,0.515687,1,death,1,1
4,4,5.5,0,death,1,0


In [7]:
# Outcome side lists two terms (time and event_value); on the predictor side
# `0 +` asks patsy to omit the intercept, leaving X1.
formula = '(time + event_value) ~ 0 + X1'
# Build the outcome (y) and predictor (X) design matrices from the merged frame.
y, X = patsy.dmatrices(formula_like=formula, data=df)

In [8]:
md = patsy.ModelDesc.from_formula(formula)

In [9]:
md.lhs_termlist

[Term([EvalFactor('time')]), Term([EvalFactor('event_value')])]

In [10]:
len(md.lhs_termlist)

2

In [11]:
import numpy as np
class Id(object):
    ''' Encode arbitrary values as consecutive integer ids starting at 1.

        Implements the memorize_chunk / memorize_finish / transform methods
        so it can be wrapped with patsy.stateful_transform. Ids are assigned
        in the sorted order of the distinct values seen while memorizing.
    '''

    def __init__(self):
        # distinct values collected so far (a list until memorize_finish runs)
        self.values = []

    def memorize_chunk(self, x):
        ''' Record the distinct values found in one chunk of data. '''
        distinct = np.unique(x)
        self.values.extend(distinct)

    def memorize_finish(self):
        ''' Freeze the set of levels and build the value -> id lookup table. '''
        levels = np.unique(self.values)
        codes = np.arange(1, len(levels) + 1)
        self.values = levels
        self.ids = codes
        self.lookup = {level: code for level, code in zip(levels, codes)}

    def transform(self, x):
        ''' Map each element of x to its id; an unseen value raises KeyError. '''
        return list(map(self.lookup.__getitem__, x))

class SubjectId(Id):
    ''' Id encoder tagged as identifying subjects (`_type == 'subject'`). '''

    def __init__(self):
        # the base initialiser creates the empty `values` list
        super(SubjectId, self).__init__()
        self._type = 'subject'

class TimepointId(Id):
    ''' Id encoder tagged as identifying timepoints (`_type == 'timepoint'`). '''

    def __init__(self):
        # the base initialiser creates the empty `values` list
        super(TimepointId, self).__init__()
        self._type = 'timepoint'

class GroupId(Id):
    ''' Id encoder tagged as identifying groups (`_type == 'group'`). '''

    def __init__(self):
        # the base initialiser creates the empty `values` list
        super(GroupId, self).__init__()
        self._type = 'group'

# Wrap Id with patsy.stateful_transform so it can be called as `as_id(...)`,
# including from inside formula strings.
as_id = patsy.stateful_transform(Id)
# The typed variants below are defined but not registered yet:
#subject = patsy.stateful_transform(SubjectId)
#group = patsy.stateful_transform(GroupId)
#timepoint_end = patsy.stateful_transform(TimepointId)
#timepoint_id = patsy.stateful_transform(TimepointId)
# Quick check by calling the transform directly; expected output: [1, 2, 1, 3]
print(as_id(np.array(['a','b','a','c'])))

[1, 2, 1, 3]


In [12]:
df.head()

Unnamed: 0,subject_id,time,event_value,event_name,X1,X2
0,0,5.5,0,death,1,0
1,1,0.24068,1,death,1,0
2,2,5.5,0,death,1,0
3,3,0.515687,1,death,1,1
4,4,5.5,0,death,1,0


In [13]:
test_formula = 'event_value + as_id(time) + as_id(subject_id) ~ X1 + X2'

In [14]:
y, X = patsy.dmatrices(formula_like=test_formula, data=df)

In [15]:
pd.DataFrame(y).head()

Unnamed: 0,0,1,2
0,0.0,73.0,1.0
1,1.0,35.0,2.0
2,0.0,73.0,3.0
3,1.0,47.0,4.0
4,0.0,73.0,5.0


In [16]:
class Surv(object):
    ''' Stateful-transform skeleton pairing a time column with an event-status column.

        Provides the memorize_chunk / memorize_finish / transform methods used
        by patsy.stateful_transform. Nothing is learned from the data, so both
        memorize hooks are no-ops.
    '''

    def __init__(self):
        # placeholders; not read by any method of this class
        self._tmp = {}
        self._long_format = False

    def memorize_chunk(self, time, event_status):
        ''' Called one or more times to update internal state; nothing to update. '''
        pass

    def memorize_finish(self):
        ''' Called once after memorize_chunk has been called as many times as
            needed (for example after reading data in chunks); nothing to finalise.
        '''
        pass

    def transform(self, time, event_status):
        ''' Combine `time` and `event_status` into one two-column object.

            Called one or more times, on old and new data, with the same
            arguments as memorize_chunk.

            Returns
            -------
            pd.DataFrame with columns 'time' and 'event_status', indexed like
            `time`, when both inputs are pandas objects; otherwise a 2-D numpy
            array whose columns are (time, event_status).
        '''
        # Dispatch on the input types. `pd` is used unconditionally below, so
        # pandas availability is implied. Previously, non-pandas inputs left
        # `dm` unassigned and the function failed with UnboundLocalError.
        pandas_types = (pd.Series, pd.DataFrame)
        if isinstance(time, pandas_types) and isinstance(event_status, pandas_types):
            dm = pd.DataFrame({'time': time, 'event_status': event_status})
            dm.index = time.index
            return dm
        # column_stack accepts 1-D vectors as well as 2-D column blocks;
        # np.append(..., axis=1) raised on 1-D inputs.
        return np.column_stack((time, event_status))

surv1 = patsy.stateful_transform(Surv)

In [93]:
class SurvDataP(pd.DataFrame):
    ''' pd.DataFrame subclass used to tag survival outcome data.

        Construction arguments are passed straight through to pd.DataFrame.
    '''

    def __init__(self, *args, **kwargs):
        super(SurvDataP, self).__init__(*args, **kwargs)

    @property
    def _constructor(self):
        # pandas uses this hook to build the result of DataFrame operations
        # (head, filtering, ...). Without it those results fall back to a
        # plain pd.DataFrame and the subclass tag is lost. type(self) lets
        # subclasses keep their own type as well.
        return type(self)

class LongSurvDataP(SurvDataP):
    ''' SurvDataP (pd.DataFrame) subclass representing long-format survival data.

        As built by _prep_long_surv, frames of this type hold the columns
        timepoint_id, event_status & subject_id.
    '''

def _confirm_id(x):
    ''' Return `x` in "ID" form, encoding it with as_id() when necessary.

        The intended notion of an "ID" variable suitable for use in Stan is one that:
            - can be coerced to an integer
            - is sequentially numbered from 1..X
            - inherits from (or can be coerced to inherit from) the Id class

        Only the last point is inspected here: an Id instance is returned
        untouched, anything else is passed through as_id(). No error is
        raised by this function itself.

        NOTE(review): each call prints which branch was taken; this looks
        like leftover debugging output.
    '''
    if not isinstance(x, Id):
        print('x is not of type Id')
        return as_id(x)
    print('x is of type Id')
    return x

def _prep_wide_surv(time, event_status, use_pandas):
    if use_pandas:
        dm = SurvDataP({'time': time, 'event_status': event_status})
        dm.index = time.index
    else:
        dm = np.append(time, event_status, 1)
    return(dm)
    
def _prep_long_surv(time, event_status, subject, use_pandas):
    ''' Assemble long-format survival outcome data.

        `time` and `subject` are first passed through _confirm_id() to obtain
        their id-encoded forms.

        Parameters
        ----------
        time : timepoint values (encoded to timepoint ids)
        event_status : event indicator for each row
        subject : subject identifiers (encoded to subject ids)
        use_pandas : if truthy, return a LongSurvDataP; otherwise a numpy array

        Returns
        -------
        LongSurvDataP with columns 'timepoint_id', 'event_status' and
        'subject_id', or a 2-D numpy array with those three columns in that order.
    '''
    timepoint_id = _confirm_id(time)
    subject_id = _confirm_id(subject)
    if use_pandas:
        return LongSurvDataP({'timepoint_id': timepoint_id,
                              'event_status': event_status,
                              'subject_id': subject_id
                             })
    # np.append takes (arr, values, axis); the previous four-argument call
    # always raised TypeError. column_stack joins the three vectors as columns.
    return np.column_stack((timepoint_id, event_status, subject_id))
    
def surv(time, event_status, group=None, subject=None, use_pandas=None, long_format=None):
    ''' Prep outcome data for survival modeling.

        Parameters
        ----------
        time, event_status : outcome time and event indicator
        group : accepted but not used by this function
        subject : subject identifiers; supplying them implies long format
            unless `long_format` is given explicitly
        use_pandas : force (True/False) or infer (None) pandas output; inferred
            as True only when pandas is available to patsy and both `time`
            and `event_status` are pandas Series/DataFrames
        long_format : force (True/False) or infer (None) long-format output

        Returns
        -------
        The result of _prep_long_surv() for long format, else of _prep_wide_surv().
    '''
    # -- fill in any settings the caller left unspecified
    if use_pandas is None:
        use_pandas = bool(
            patsy.util.have_pandas
            and isinstance(time, (pd.Series, pd.DataFrame))
            and isinstance(event_status, (pd.Series, pd.DataFrame))
        )
    if long_format is None:
        long_format = subject is not None

    # -- dispatch to the matching builder
    if long_format:
        return _prep_long_surv(time=time, event_status=event_status,
                               subject=subject, use_pandas=use_pandas)
    return _prep_wide_surv(time=time, event_status=event_status, use_pandas=use_pandas)
        

In [94]:
surv(df['time'], df['event_value']).head()

Unnamed: 0,event_status,time
0,0,5.5
1,1,0.24068
2,0,5.5
3,1,0.515687
4,0,5.5


In [95]:
y, X = patsy.dmatrices('surv(time=time, event_status=event_value) ~ X1', data=df)

In [96]:
pd.DataFrame(y).head()

Unnamed: 0,0,1
0,0.0,5.5
1,1.0,0.24068
2,0.0,5.5
3,1.0,0.515687
4,0.0,5.5


In [97]:
y, X = patsy.dmatrices('surv(event_status=event_value, time=time) ~ X1', data=df)

In [98]:
pd.DataFrame(y).head()

Unnamed: 0,0,1
0,0.0,5.5
1,1.0,0.24068
2,0.0,5.5
3,1.0,0.515687
4,0.0,5.5


In [99]:
y, X = patsy.dmatrices('surv(event_status=event_value, time=time, subject=as_id(subject_id)) ~ X1', data=df)

x is not of type Id
x is not of type Id
x is not of type Id
x is not of type Id


In [85]:
isinstance(y, LongSurvDataP)

False

In [77]:
y.design_info

DesignInfo(['surv(event_status=event_value, time=time, subject=as_id(subject_id))[0]',
            'surv(event_status=event_value, time=time, subject=as_id(subject_id))[1]',
            'surv(event_status=event_value, time=time, subject=as_id(subject_id))[2]'],
           factor_infos={EvalFactor('surv(event_status=event_value, time=time, subject=as_id(subject_id))'): FactorInfo(factor=EvalFactor('surv(event_status=event_value, time=time, subject=as_id(subject_id))'),
                                    type='numerical',
                                    state=<factor state>,
                                    num_columns=3)},
           term_codings=OrderedDict([(Term([EvalFactor('surv(event_status=event_value, time=time, subject=as_id(subject_id))')]),
                                      [SubtermInfo(factors=(EvalFactor('surv(event_status=event_value, time=time, subject=as_id(subject_id))'),),
                                                   contrast_matrices={},
             