In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#from jointmodel import sim
import pandas as pd
import patsy
import sys
import pystan
import random
random.seed(1234)
import survivalstan
from stancache import stancache
from stancache import config

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


INFO:stancache.seed:Setting seed to 1245502385


## prepare data

In [3]:
data = survivalstan.sim.sim_data_jointmodel(N=100)

In [4]:
data.keys()

dict_keys(['biomarker', 'events', 'covars', 'params'])

In [5]:
data['events'].head()

Unnamed: 0,subject_id,time,event_value,event_name
0,0,5.5,0,death
1,1,3.642285,1,death
2,2,1.995644,1,death
3,3,0.018313,1,death
4,4,1.28822,1,death


In [6]:
data['covars'].head()

Unnamed: 0,subject_id,X1,X2
0,0,0,1
1,1,0,1
2,2,1,1
3,3,0,1
4,4,1,1


In [7]:
df = pd.merge(data['events'].query('event_name == "death"'), data['covars'], on='subject_id')
df.head()

Unnamed: 0,subject_id,time,event_value,event_name,X1,X2
0,0,5.5,0,death,0,1
1,1,3.642285,1,death,0,1
2,2,1.995644,1,death,1,1
3,3,0.018313,1,death,0,1
4,4,1.28822,1,death,1,1


## standard patsy formula

In [8]:
formula = '(time + event_value) ~ 0 + X1'
y, X = patsy.dmatrices(formula_like=formula, data=df)

In [9]:
md = patsy.ModelDesc.from_formula(formula)

In [10]:
md.lhs_termlist

[Term([EvalFactor('time')]), Term([EvalFactor('event_value')])]

In [11]:
len(md.lhs_termlist)

2

## `as_id` function (stateful)

In [172]:
import numpy as np
class Id(object):
    ''' Stateful patsy transform that recodes values as sequential integer ids.

        Distinct values seen during the memorize pass are sorted (via `np.unique`)
        and numbered 1..K. `transform` then maps every input value to its id.
    '''

    def __init__(self):
        # distinct values accumulated across memorize_chunk calls
        self.values = []

    def memorize_chunk(self, x):
        ''' Record the distinct values found in one chunk of data. '''
        self.values.extend(np.unique(x))

    def memorize_finish(self):
        ''' Build the value -> id lookup once every chunk has been seen. '''
        self.values = np.unique(self.values)
        self.ids = np.arange(len(self.values)) + 1
        self.lookup = dict(zip(self.values, self.ids))

    def transform(self, x):
        ''' Map each element of `x` to its memorized id.

            Returns a pd.Series of ints when pandas is available (a np.ndarray
            otherwise). When `x` is a pd.Series its index is carried over to the
            result, so the ids stay aligned with other columns taken from the
            same data frame (e.g. when the frame does not have a 0..n-1 index).

            Raises KeyError for a value that was not seen while memorizing.
        '''
        ids = [self.lookup[val] for val in x]
        if patsy.util.have_pandas:
            # keep the caller's row labels; fall back to a default index otherwise
            index = x.index if isinstance(x, pd.Series) else None
            return pd.Series(ids, index=index).astype(int)
        else:
            return np.array(ids)

class SubjectId(Id):
    ''' `Id` transform tagged (via `_type`) as encoding subjects. '''

    def __init__(self):
        # delegate state setup to the base class instead of duplicating it
        super().__init__()
        self._type = 'subject'

class TimepointId(Id):
    ''' `Id` transform tagged (via `_type`) as encoding timepoints. '''

    def __init__(self):
        # delegate state setup to the base class instead of duplicating it
        super().__init__()
        self._type = 'timepoint'

class GroupId(Id):
    ''' `Id` transform tagged (via `_type`) as encoding groups. '''

    def __init__(self):
        # delegate state setup to the base class instead of duplicating it
        super().__init__()
        self._type = 'group'

# Wrap `Id` as a patsy stateful transform so `as_id(...)` can be used inside
# formulas (and called directly, as in the smoke test below).
as_id = patsy.stateful_transform(Id)
# Equivalent wrappers for the typed subclasses -- currently disabled:
#subject = patsy.stateful_transform(SubjectId)
#group = patsy.stateful_transform(GroupId)
#timepoint_end = patsy.stateful_transform(TimepointId)
#timepoint_id = patsy.stateful_transform(TimepointId)
# Smoke test: ids follow the sorted order of the distinct values (a->1, b->2, c->3).
print(as_id(np.array(['a','b','a','c'])))

0    1
1    2
2    1
3    3
dtype: int64


In [173]:
df.head()

Unnamed: 0,subject_id,time,event_value,event_name,X1,X2
0,0,5.5,0,death,0,1
1,1,3.642285,1,death,0,1
2,2,1.995644,1,death,1,1
3,3,0.018313,1,death,0,1
4,4,1.28822,1,death,1,1


In [174]:
test_formula = 'event_value + as_id(time) + as_id(subject_id) ~ X1 + X2'

In [175]:
y, X = patsy.dmatrices(formula_like=test_formula, data=df)

In [176]:
pd.DataFrame(y).head()

Unnamed: 0,0,1,2
0,0.0,77.0,1.0
1,1.0,72.0,2.0
2,1.0,63.0,3.0
3,1.0,8.0,4.0
4,1.0,50.0,5.0


## `surv1` function (stateful)

In [177]:
class Surv(object):
    ''' Stateful patsy transform that wraps `surv`.

        Nothing is learned from the data itself: the memorize pass only records
        the class of the object `surv` returns, and `transform` delegates to `surv`.
    '''

    def __init__(self):
        self._tmp = {}
        self._long_format = False
        # class of the object returned by `surv`; stays None until
        # memorize_chunk has run, so the attribute can always be read safely
        self._type = None

    def memorize_chunk(self, time, event_status, **kwargs):
        ''' Update internal state; patsy calls this one or more times. '''
        self._type = surv(time=time, event_status=event_status, **kwargs).__class__

    def memorize_finish(self):
        ''' Called once after memorize_chunk has been called as many times as
            needed (for example, after reading in chunks of data incrementally).
            There is nothing to finalize here.
        '''
        pass

    def transform(self, time, event_status, **kwargs):
        ''' Called one or more times on old and new data; takes the same
            arguments as memorize_chunk and returns whatever `surv` returns.
        '''
        return surv(time=time, event_status=event_status, **kwargs)

surv1 = patsy.stateful_transform(Surv)

## `surv` function (not stateful)

In [197]:
class SurvData(pd.DataFrame):
    ''' pd.DataFrame subclass representing (wide-format) survival outcome data.

        Besides the frame contents, each instance carries two dictionaries:
            - stan_data: extra entries computed alongside the outcome
            - meta_data: lookup tables describing the encoded ids

        Parameters are those of pd.DataFrame, plus the keyword-only
        `stan_data` and `meta_data` (each defaults to a new empty dict).
    '''
    survival_type = 'wide'

    # declare the custom attributes so pandas treats them as instance
    # metadata rather than as attempted column assignments
    _metadata = ['stan_data', 'meta_data']

    def __init__(self, *args, stan_data=None, meta_data=None, **kwargs):
        super().__init__(*args, **kwargs)
        # build fresh dicts per instance: a dict() default in the signature
        # would be one shared object mutated by every instance
        self.stan_data = stan_data if stan_data is not None else {}
        self.meta_data = meta_data if meta_data is not None else {}

class LongSurvData(SurvData):
    ''' SurvData variant for long-format survival data, i.e. rows identified by
        a timepoint id, an event status and a subject id.

        Differs from SurvData only in its `survival_type` tag.
    '''
    survival_type = 'long'

class NotValidId(ValueError):
    ''' Class of errors pertaining to invalid Id variables.

        Subclasses ValueError, so `except ValueError` handlers also catch it.
    '''
        

def _confirm_id(x, name=None):
    ''' an "ID" variable suitable for use in Stan satisfies following conditions:
            - can be coerced to an integer
            - integer values are sequentially numbered from 1..X

        This function checks the above criteria
            & returns the coerced object if they are met
            
        If criteria are not met this will throw an error.
    '''
    if name is None:
        if isinstance(x, pd.Series):
            name = x.name
        else:
            name = 'unknown'
    # check integer type
    try:
        x_int = x.astype('int')
    except:
        import pdb
        pdb.set_trace()
        raise NotValidId('Value ({}) cannot be coerced to an integer.'.format(name))
    # check sequential numbering 
    unique_values = np.unique(x)
    if not np.min(unique_values)==1:
        raise NotValidId('Value ({}) must start with 1 for use with Stan.'.format(name))
    if not np.max(unique_values) == len(unique_values):
        raise NotValidId('Value ({}) must be sequentially numbered for use with Stan.'.format(name))
    return(x_int)

def _prep_wide_surv(time, event_status, use_pandas):
    if use_pandas:
        dm = pd.DataFrame({'time': time, 'event_status': event_status})
        dm.index = time.index
    else:
        dm = np.append(time, event_status, 1)
    return(SurvData(dm))
    
def _prep_long_surv(time, event_status, subject, use_pandas):
    timepoint_id = _confirm_id(time)
    subject_id = _confirm_id(subject)
    unique_timepoints = survivalstan.survivalstan._prep_timepoint_dataframe(
        pd.DataFrame({'timepoint_id': timepoint_id, 'end_time': time}),
        timepoint_id_col='timepoint_id',timepoint_end_col='end_time')
    unique_subjects = pd.DataFrame({'subject_id': subject_id, 'subject': subject})
    unique_subjects.drop_duplicates(inplace=True)
    stan_data = { 
        't_dur': unique_timepoints['t_dur'],
        't_obs': unique_timepoints['end_time'],
        'T': len(unique_timepoints.index),
        'S': len(unique_subjects.index),
        }
    meta_data = {
        'subject_id': unique_subjects,
        'timepoint_id': unique_timepoints,
    }
    if use_pandas:
        dm = pd.DataFrame({'timepoint_id': timepoint_id,
                            'event_status': event_status,
                            'subject_id': subject_id
                           })
        dm.index = time.index
    else:
        dm = np.append(timepoint_id, event_status, subject_id, 1)
    return LongSurvData(dm, stan_data=stan_data, meta_data=meta_data)
    
def surv(time, event_status, group=None, subject=None, use_pandas=None, long_format=None):
    ''' Prep outcome data for survival modeling.

        Settings left as None are inferred from the inputs:
            - use_pandas: True only when pandas is available to patsy and both
              `time` and `event_status` are pandas objects
            - long_format: True when `subject` is given

        `group` is accepted but not used here.

        Returns the result of `_prep_long_surv` (long format) or
        `_prep_wide_surv` (wide format).
    '''
    ## infer scenario settings from combinations of inputs
    if use_pandas is None:
        pandas_types = (pd.Series, pd.DataFrame)
        use_pandas = bool(
            patsy.util.have_pandas
            and isinstance(time, pandas_types)
            and isinstance(event_status, pandas_types)
        )
    if long_format is None:
        # a subject identifier is what marks the data as long format
        long_format = subject is not None
    ## prep data depending on inferred settings
    if not long_format:
        return _prep_wide_surv(time=time, event_status=event_status, use_pandas=use_pandas)
    return _prep_long_surv(time=time, event_status=event_status, subject=subject, use_pandas=use_pandas)
        

In [198]:
pd.DataFrame(surv(df['time'], df['event_value'])).head()

Unnamed: 0,event_status,time
0,0,5.5
1,1,3.642285
2,1,1.995644
3,1,0.018313
4,1,1.28822


In [199]:
df.head()

Unnamed: 0,subject_id,time,event_value,event_name,X1,X2
0,0,5.5,0,death,0,1
1,1,3.642285,1,death,0,1
2,2,1.995644,1,death,1,1
3,3,0.018313,1,death,0,1
4,4,1.28822,1,death,1,1


In [200]:
pd.DataFrame(surv1(as_id(df['time']), df['event_value'], subject=as_id(df['subject_id']))).head()

Unnamed: 0,event_status,subject_id,timepoint_id
0,0,1,77
1,1,2,72
2,1,3,63
3,1,4,8
4,1,5,50


In [201]:
y, X = patsy.dmatrices('surv1(time=time, event_status=event_value) ~ X1', data=df)

In [202]:
pd.DataFrame(y).head()

Unnamed: 0,0,1
0,0.0,5.5
1,1.0,3.642285
2,1.0,1.995644
3,1.0,0.018313
4,1.0,1.28822


In [203]:
y, X = patsy.dmatrices('surv(event_status=event_value, time=time) ~ X1', data=df)

In [204]:
pd.DataFrame(y).head()

Unnamed: 0,0,1
0,0.0,5.5
1,1.0,3.642285
2,1.0,1.995644
3,1.0,0.018313
4,1.0,1.28822


In [206]:
y, X = patsy.dmatrices('surv(event_status=event_value,time=as_id(time), subject=as_id(subject_id)) ~ bs(X1, 3)',
                       data=df)

In [207]:
isinstance(y, LongSurvData)

False

In [208]:
y.shape

(100, 3)

In [209]:
y.design_info.term_names

['surv(event_status=event_value, time=as_id(time), subject=as_id(subject_id))']

In [210]:
X.design_info.factor_infos

{EvalFactor('bs(X1, 3)'): FactorInfo(factor=EvalFactor('bs(X1, 3)'),
            type='numerical',
            state=<factor state>,
            num_columns=3)}

In [211]:
y.design_info

DesignInfo(['surv(event_status=event_value, time=as_id(time), subject=as_id(subject_id))[0]',
            'surv(event_status=event_value, time=as_id(time), subject=as_id(subject_id))[1]',
            'surv(event_status=event_value, time=as_id(time), subject=as_id(subject_id))[2]'],
           factor_infos={EvalFactor('surv(event_status=event_value, time=as_id(time), subject=as_id(subject_id))'): FactorInfo(factor=EvalFactor('surv(event_status=event_value, time=as_id(time), subject=as_id(subject_id))'),
                                    type='numerical',
                                    state=<factor state>,
                                    num_columns=3)},
           term_codings=OrderedDict([(Term([EvalFactor('surv(event_status=event_value, time=as_id(time), subject=as_id(subject_id))')]),
                                      [SubtermInfo(factors=(EvalFactor('surv(event_status=event_value, time=as_id(time), subject=as_id(subject_id))'),),
                                     

In [212]:
y.survival_type

AttributeError: 'DesignMatrix' object has no attribute 'survival_type'

In [213]:
y.design_info

DesignInfo(['surv(event_status=event_value, time=as_id(time), subject=as_id(subject_id))[0]',
            'surv(event_status=event_value, time=as_id(time), subject=as_id(subject_id))[1]',
            'surv(event_status=event_value, time=as_id(time), subject=as_id(subject_id))[2]'],
           factor_infos={EvalFactor('surv(event_status=event_value, time=as_id(time), subject=as_id(subject_id))'): FactorInfo(factor=EvalFactor('surv(event_status=event_value, time=as_id(time), subject=as_id(subject_id))'),
                                    type='numerical',
                                    state=<factor state>,
                                    num_columns=3)},
           term_codings=OrderedDict([(Term([EvalFactor('surv(event_status=event_value, time=as_id(time), subject=as_id(subject_id))')]),
                                      [SubtermInfo(factors=(EvalFactor('surv(event_status=event_value, time=as_id(time), subject=as_id(subject_id))'),),
                                     

## SurvivalFactor class

In [219]:
import logging
logger = logging.getLogger(__name__)
class SurvivalFactor(patsy.EvalFactor):
    ''' A factor object to encode LHS variables
        for Survival Models, including model type.

        After `eval` has run, the factor exposes:
            - _class: class of the evaluated result
            - _type, _meta_data, _stan_data: copied from the result when it is
              a SurvData; they stay None until such a result has been seen
    '''
    _is_survival = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._class = None
        # defined up front so they can be read before (or without) a SurvData
        # result, instead of raising AttributeError
        self._type = None
        self._meta_data = None
        self._stan_data = None

    def eval(self, *args, **kwargs):
        ''' Evaluate the factor as patsy.EvalFactor does, recording details of
            the result on the factor. The result itself is returned unchanged.
        '''
        result = super().eval(*args, **kwargs)
        try:
            self._class = result.__class__
        except Exception:
            logger.warning('Outcome class could not be determined')
        if isinstance(result, SurvData):
            self._type = result.survival_type
            self._meta_data = result.meta_data
            self._stan_data = result.stan_data

        return result
        
    

In [220]:
a = SurvivalFactor(code='surv(time=time, event_status=event_value)')

In [221]:
print(a._class)

None


In [222]:
md = patsy.ModelDesc([patsy.Term([a])],[])

In [223]:
y, X = patsy.dmatrices(md, data=df)

In [224]:
y.shape

(100, 2)

In [225]:
term = y.design_info.terms[0]

In [226]:
term.factors[0]._is_survival

True

In [227]:
term.factors[0]._class

__main__.SurvData

In [228]:
term.factors[0]._type

'wide'

## SurvivalModelDesc class

In [232]:
import re

In [233]:
class SurvivalModelDesc(object):
    ''' Formula-like object whose left-hand side is a single SurvivalFactor.

        The formula is split at the first `~`. Everything before it becomes
        the code of one SurvivalFactor (the only LHS term), and everything
        after it is parsed by patsy as an ordinary right-hand side.
    '''

    def __init__(self, formula):
        ''' Parameters:
                formula: string of the form `<outcome> ~ <predictors>`

            Raises:
                ValueError: if `formula` contains no `~`.
        '''
        self.formula = formula
        parts = re.split(string=formula, pattern='~', maxsplit=1)
        if len(parts) != 2:
            # fail with a clear message rather than a tuple-unpacking error
            raise ValueError(
                "formula must contain '~' separating outcome from predictors: {!r}".format(formula))
        self.lhs, self.rhs = parts
        self.lhs_termlist = [patsy.Term([SurvivalFactor(self.lhs)])]
        self.rhs_termlist = patsy.ModelDesc.from_formula(self.rhs).rhs_termlist

    def __patsy_get_model_desc__(self, eval_env):
        ''' Hook used by patsy to obtain the ModelDesc for this formula-like object. '''
        return patsy.ModelDesc(self.lhs_termlist, self.rhs_termlist)

### confirm we can determine type

In [266]:
my_formula = SurvivalModelDesc('surv(time=time, event_status=event_value) ~ X1')
y, X = patsy.dmatrices(my_formula, data=df)

In [267]:
pd.DataFrame(y).head()

Unnamed: 0,0,1
0,0.0,5.5
1,1.0,3.642285
2,1.0,1.995644
3,1.0,0.018313
4,1.0,1.28822


In [268]:
## should only be one LHS term
assert(len(y.design_info.terms) == 1)

## should only be one LHS factor (within single term)
assert(len(y.design_info.terms[0].factors) == 1)

## LHS factor should be of type "survival"
assert(y.design_info.terms[0].factors[0]._is_survival == True)

In [269]:
# get type of LHS term
y.design_info.terms[0].factors[0]._class

__main__.SurvData

In [270]:
survival_type = y.design_info.terms[0].factors[0]._type
survival_type

'wide'

### confirm we can extract stan & meta-data

In [271]:
my_formula2 = SurvivalModelDesc('surv(time=as_id(time), event_status=event_value, subject=as_id(subject_id)) ~ X1')
y2, X = patsy.dmatrices(my_formula2, data=df)

In [272]:
## should only be one LHS term
assert(len(y2.design_info.terms) == 1)

## should only be one LHS factor (within single term)
assert(len(y2.design_info.terms[0].factors) == 1)

## LHS factor should be of type "survival"
assert(y2.design_info.terms[0].factors[0]._is_survival == True)

In [273]:
y2.design_info.terms[0].factors[0]._class

__main__.LongSurvData

In [274]:
survival_type = y2.design_info.terms[0].factors[0]._type
survival_type

'long'

In [277]:
stan_data = y2.design_info.terms[0].factors[0]._stan_data
stan_data.keys()

dict_keys(['t_dur', 't_obs', 'T', 'S'])

In [279]:
meta_data = y2.design_info.terms[0].factors[0]._meta_data
meta_data.keys()

dict_keys(['subject_id', 'timepoint_id'])

In [280]:
meta_data['subject_id'].head()

Unnamed: 0,subject,subject_id
0,1,1
1,2,2
2,3,3
3,4,4
4,5,5


### test model-matrix design on newdata

In [286]:
y2.design_info

DesignInfo(['surv(time=as_id(time), event_status=event_value, subject=as_id(subject_id))[0]',
            'surv(time=as_id(time), event_status=event_value, subject=as_id(subject_id))[1]',
            'surv(time=as_id(time), event_status=event_value, subject=as_id(subject_id))[2]'],
           factor_infos={SurvivalFactor('surv(time=as_id(time), event_status=event_value, subject=as_id(subject_id))'): FactorInfo(factor=SurvivalFactor('surv(time=as_id(time), event_status=event_value, subject=as_id(subject_id))'),
                                    type='numerical',
                                    state=<factor state>,
                                    num_columns=3)},
           term_codings=OrderedDict([(Term([SurvivalFactor('surv(time=as_id(time), event_status=event_value, subject=as_id(subject_id))')]),
                                      [SubtermInfo(factors=(SurvivalFactor('surv(time=as_id(time), event_status=event_value, subject=as_id(subject_id))'),),
                     

In [285]:
y2.new, X.new = patsy.build_design_matrices(design_infos=[y2.design_info, X.design_info], data=df.tail())

PatsyError: Error evaluating factor: NotValidId: Value (None) must start with 1 for use with Stan.

## SurvivalDesignInfo class

In [247]:
class SurvivalDesignInfo(patsy.design_info.DesignInfo):
    ''' DesignInfo subclass reserved for survival models.

        Currently a pure pass-through: construction is forwarded unchanged
        to patsy's DesignInfo.
    '''

    def __init__(self, *positional, **named):
        # no extra state yet; hand everything to the parent initialiser
        super().__init__(*positional, **named)
        


In [426]:
class SurvivalModelDesc2(object):
    ''' Formula-like object whose left-hand side is a single SurvivalFactor.

        The formula is split at the first `~`. Everything before it becomes
        the code of one SurvivalFactor (the only LHS term), and everything
        after it is parsed by patsy as an ordinary right-hand side.

        NOTE(review): this duplicates `SurvivalModelDesc` defined earlier in
        the notebook; consider keeping only one of the two.
    '''

    def __init__(self, formula):
        ''' Parameters:
                formula: string of the form `<outcome> ~ <predictors>`

            Raises:
                ValueError: if `formula` contains no `~`.
        '''
        self.formula = formula
        parts = re.split(string=formula, pattern='~', maxsplit=1)
        if len(parts) != 2:
            # fail with a clear message rather than a tuple-unpacking error
            raise ValueError(
                "formula must contain '~' separating outcome from predictors: {!r}".format(formula))
        self.lhs, self.rhs = parts
        self.lhs_termlist = [patsy.Term([SurvivalFactor(self.lhs)])]
        self.rhs_termlist = patsy.ModelDesc.from_formula(self.rhs).rhs_termlist

    def __patsy_get_model_desc__(self, eval_env):
        ''' Hook used by patsy to obtain the ModelDesc for this formula-like object. '''
        return patsy.ModelDesc(self.lhs_termlist, self.rhs_termlist)

In [429]:
my_formula = SurvivalModelDesc2('surv1(time=time, event_status=event_value) ~ X1')
#my_formula.lhs_termlist
y, X = patsy.dmatrices(my_formula, data=df)

In [430]:
my_formula.lhs_termlist

[Term([SurvivalFactor('surv1(time=time, event_status=event_value)')])]

In [None]:
# NOTE(review): this cell has no execution count and, unlike the later
# `incr_dbuilder` call, passes an empty formula and no `data_iter_maker`.
# It looks like an abandoned draft -- confirm before relying on `di`.
di = patsy.incr_dbuilder(formula_like='')

In [432]:
def data_iter_maker():
    ''' Return a new iterator that yields the notebook-level `df` as its only chunk. '''
    chunks = [df]
    return iter(chunks)

In [433]:
my_env = patsy.EvalEnvironment.capture()

In [435]:
design_infos = patsy.incr_dbuilder(formula_like='surv1(time=time, event_status=event_value)',
                                  data_iter_maker=data_iter_maker
                                  )

In [436]:
design_infos

DesignInfo(['Intercept',
            'surv1(time=time, event_status=event_value)[0]',
            'surv1(time=time, event_status=event_value)[1]'],
           factor_infos={EvalFactor('surv1(time=time, event_status=event_value)'): FactorInfo(factor=EvalFactor('surv1(time=time, event_status=event_value)'),
                                    type='numerical',
                                    state=<factor state>,
                                    num_columns=2)},
           term_codings=OrderedDict([(Term([]),
                                      [SubtermInfo(factors=(),
                                                   contrast_matrices={},
                                                   num_columns=1)]),
                                     (Term([EvalFactor('surv1(time=time, event_status=event_value)')]),
                                      [SubtermInfo(factors=(EvalFactor('surv1(time=time, event_status=event_value)'),),
                                                   

In [444]:
eval_env = patsy.EvalEnvironment.capture()

In [445]:
NA_action = 'drop'

In [457]:
design_infos2 = patsy.highlevel._try_incr_builders(
    formula_like=my_formula,
    data_iter_maker=data_iter_maker,
    eval_env=eval_env, NA_action=NA_action)

In [458]:
design_infos2

[DesignInfo(['surv1(time=time, event_status=event_value)[0]',
             'surv1(time=time, event_status=event_value)[1]'],
            factor_infos={SurvivalFactor('surv1(time=time, event_status=event_value)'): FactorInfo(factor=SurvivalFactor('surv1(time=time, event_status=event_value)'),
                                     type='numerical',
                                     state=<factor state>,
                                     num_columns=2)},
            term_codings=OrderedDict([(Term([SurvivalFactor('surv1(time=time, event_status=event_value)')]),
                                       [SubtermInfo(factors=(SurvivalFactor('surv1(time=time, event_status=event_value)'),),
                                                    contrast_matrices={},
                                                    num_columns=2)])])),
 DesignInfo(['Intercept', 'X1'],
            factor_infos={EvalFactor('X1'): FactorInfo(factor=EvalFactor('X1'),
                                     type='nu

In [463]:
# NOTE(review): `FactorInfo` is not assigned or imported in any visible cell
# (only `patsy.FactorInfo` appears), so this call depends on hidden kernel
# state. It would presumably fail on a fresh Restart & Run All -- confirm.
evalfactor = patsy.build._eval_factor(FactorInfo, data=df, NA_action=patsy.missing.NAAction())

In [466]:
evalfactor[0].__class__

__main__.SurvData

## `SurvivalFactorInfo` class

In [468]:
# NOTE(review): `factor` is not defined in any visible cell; judging by the
# recorded output it held a SurvivalFactor left over in the kernel -- confirm
# and define it explicitly so the cell runs from a fresh kernel.
patsy.FactorInfo(factor, 'numerical', {}, num_columns=2, categories=None)

FactorInfo(factor=SurvivalFactor('surv1(time=time, event_status=event_value)'),
           type='numerical',
           state=<factor state>,
           num_columns=2)

In [None]:
class SurvivalFactorInfo(patsy.FactorInfo):
    ''' patsy.FactorInfo subclass with an extra `_result_class` slot.

        NOTE(review): this cell was never executed and nothing visible sets
        `_result_class` to anything but None -- looks like work in progress.
    '''
    
    def __init__(self, *args, **kwargs):
        # forward everything to patsy.FactorInfo unchanged
        super().__init__(*args, **kwargs)
        # placeholder; presumably meant to hold the class of the evaluated
        # factor result -- TODO confirm
        self._result_class = None
    