In [1]:
import pandas as pd
import numpy as np
import os
import time
import copy
import pathlib, tempfile

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from graphviz import Digraph
from joblib import Parallel, delayed
from scipy import stats

from survivors import metrics as metr
from survivors import constants as cnt
from survivors import criteria as crit
from numba import njit, jit, int32, float64
from lifelines import KaplanMeierFitter, NelsonAalenFitter

import cProfile
import pstats

%load_ext line_profiler
%load_ext scalene

Scalene extension successfully loaded. Note: Scalene currently only
supports CPU+GPU profiling inside Jupyter notebooks. For full Scalene
profiling, use the command line version.


In [2]:
import survivors.datasets as ds
from survivors.tree import CRAID

# Hyper-parameters for a single CRAID survival tree.
param = {'criterion': 'peto', 'cut': True, 'depth': 10,
         'max_features': 1.0, 'min_samples_leaf': 5,
         'signif': 0.05}

X, y, features, categ, sch_nan = ds.load_pbc_dataset()
param["categ"] = categ

# Fit one tree on the full data set. `pred_cens` is consumed by the gradient
# benchmark cells further down, so it has to exist after a fresh
# "Restart Kernel -> Run All" (it used to live only in commented-out code).
cr = CRAID(**param)
cr.fit(X, y)

pred_cens = cr.predict(X, target="cens")

In [12]:
categ

['trt', 'sex', 'ascites', 'hepato', 'spiders']

In [125]:
# def coxph_negative_gradient(cnp.npy_uint8[:] event,
#                             cnp.npy_double[:] time,
#                             cnp.npy_double[:] y_pred):
#     cdef cnp.npy_double s
#     cdef int i
#     cdef int j
#     cdef cnp.npy_intp n_samples = event.shape[0]

#     cdef cnp.ndarray[cnp.npy_double, ndim=1] gradient = cnp.PyArray_EMPTY(1, &n_samples, cnp.NPY_DOUBLE, 0)
#     cdef cnp.npy_double[:] exp_tsj = cnp.PyArray_ZEROS(1, &n_samples, cnp.NPY_DOUBLE, 0)

#     cdef cnp.npy_double[:] exp_pred = np.exp(y_pred)
#     with nogil:
#         for i in range(n_samples):
#             for j in range(n_samples):
#                 if time[j] >= time[i]:
#                     exp_tsj[i] += exp_pred[j]

#         for i in range(n_samples):
#             s = 0
#             for j in range(n_samples):
#                 if event[j] and time[i] >= time[j]:
#                     s += exp_pred[i] / exp_tsj[j]
#             gradient[i] = event[i] - s

#     return gradient

from numba import njit, jit

@jit  # ('f8(i8[:], f8[:], f8[:])')
def coxph_negative_gradient(event, time, y_pred):
    """Negative gradient of the Cox partial log-likelihood, explicit-loop version.

    Port of the commented-out Cython reference above.

    Parameters
    ----------
    event : 1-D array; a truthy entry marks an observed event.
    time : 1-D array of observed times, same length as ``event``.
    y_pred : 1-D array of predicted log-risks, same length as ``event``.

    Returns
    -------
    1-D float array ``event[k] - sum_m risk[k] / risk_set_sum[m]``, where the
    sum runs over samples ``m`` with an event and ``time[m] <= time[k]``.
    """
    n_samples = event.shape[0]
    risk = np.exp(y_pred)

    # Denominators: summed risk of every sample whose time is >= time[k].
    risk_set_sum = np.zeros(n_samples, dtype=float)
    for k in range(n_samples):
        for m in range(n_samples):
            if time[m] >= time[k]:
                risk_set_sum[k] += risk[m]

    gradient = np.zeros(n_samples, dtype=float)
    for k in range(n_samples):
        expected = 0
        for m in range(n_samples):
            # Only events that happened no later than time[k] contribute.
            if event[m] and time[k] >= time[m]:
                expected += risk[k] / risk_set_sum[m]
        gradient[k] = event[k] - expected

    return gradient

def fast_coxph_negative_gradient(event, time, y_pred):
    """Vectorised negative gradient of the Cox partial log-likelihood.

    Computes the same quantity as ``coxph_negative_gradient`` using NumPy
    broadcasting instead of explicit loops.

    The function is intentionally *not* wrapped in ``numba.jit``: the body is
    pure vectorised NumPy, and constructs such as ``np.repeat(..., axis=...)``
    or a dot product with a boolean matrix are not supported in Numba's
    nopython mode, so the decorator only added dispatch overhead (or failed to
    compile on Numba versions where nopython is the default).

    Parameters
    ----------
    event : array-like, shape (n,); truthy entries mark observed events.
    time : array-like, shape (n,); observed times.
    y_pred : array-like, shape (n,); predicted log-risks.

    Returns
    -------
    ndarray of shape (n,). Needs O(n^2) temporary memory for the comparison
    matrix.
    """
    event = np.asarray(event)
    time = np.asarray(time)
    exp_pred = np.exp(np.asarray(y_pred, dtype=float))

    # at_risk[a, b] == 1 when time[a] >= time[b]; built by broadcasting, so no
    # explicit n x n copy of `time` is materialised first.
    at_risk = (time[:, np.newaxis] >= time[np.newaxis, :]).astype(float)

    # exp_tsj[b]: summed risk of all samples with time >= time[b].
    exp_tsj = exp_pred @ at_risk

    # Per-event contribution shared by every sample whose time is >= that event's time.
    shared_j = event / exp_tsj
    gradient = event - exp_pred * (at_risk @ shared_j)

    return gradient

In [128]:
%timeit coxph_negative_gradient(y["cens"], y["time"], pred_cens)

569 µs ± 4.47 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [129]:
%timeit fast_coxph_negative_gradient(y["cens"], y["time"], pred_cens)

1.98 ms ± 5.39 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [28]:
# `grad` was previously left over from a deleted cell; recompute it here so the
# cell runs on a fresh kernel. Columns: event indicator, tree prediction, gradient.
grad = coxph_negative_gradient(y["cens"], y["time"], pred_cens)
np.vstack([y["cens"], pred_cens, grad]).T

array([[ 1.        ,  1.        ,  0.85628939],
       [ 0.        ,  0.16470588, -0.93795051],
       [ 1.        ,  0.625     ,  0.73621406],
       ...,
       [ 0.        ,  0.16470588, -0.16909675],
       [ 0.        ,  0.16470588, -0.09234032],
       [ 0.        ,  0.01724138, -0.13255543]])

In [29]:
np.nan * 0

nan

In [None]:
from sksurv.ensemble import RandomSurvivalForest

In [281]:
import numpy as np
from survivors.metrics import ibs, iauc, ipa, get_survival_func
from survivors.constants import get_y

from lifelines import KaplanMeierFitter
from lifelines import NelsonAalenFitter

# Time grid (four points) on which the survival function is evaluated.
bins = np.array([1, 10, 100, 1000])

# Toy targets built with get_y; presumably the arguments are (cens, time),
# matching the 'cens' / 'time' fields read below — TODO confirm against get_y.
y_train = get_y(np.array([1, 0]), np.array([100, 100]))
y_test_1 = get_y(np.array([1]), np.array([100]))
y_test_2 = get_y(np.array([0]), np.array([100]))
y_test_3 = get_y(np.array([1]), np.array([50]))

# Reference Kaplan-Meier fit (lifelines) on the toy training sample.
kmf_train = KaplanMeierFitter()
kmf_train.fit(y_train['time'], event_observed=y_train['cens'])
# Survival values at `bins`, with a leading axis added ([np.newaxis, :]) so the
# result is a single-row 2-D array.
sf_train = kmf_train.survival_function_at_times(bins).to_numpy()[np.newaxis, :]

In [282]:
ibs(y_train, y_test_3, sf_train, bins)

0.23648648648648649

In [212]:
class KaplanMeier:
    """Minimal (optionally weighted) Kaplan-Meier estimator built on NumPy.

    After ``fit`` the following attributes are available:

    * ``timeline`` - sorted unique durations;
    * ``survival_function`` - array of length ``len(timeline) + 1`` whose first
      entry is 1.0 (value before the first observed duration);
    * ``hist_cens`` - weighted count of flagged observations per timeline point;
    * ``cumul_hist_dur`` - weighted number of observations with duration >= each
      timeline point.
    """

    def __init__(self):
        self.timeline = None
        self.survival_function = None
        self.confidence_interval_ = None
        # Defined up front so accessing them before fit() does not raise AttributeError.
        self.hist_cens = None
        self.cumul_hist_dur = None
        self.alpha = 0.05

    def fit(self, durations, right_censor, weights=None):
        """Estimate the survival function.

        Parameters
        ----------
        durations : array-like of observed times.
        right_censor : array-like indicator (same length) of the observations
            that make the curve drop at their time. In this file the IPCW
            helper passes ``1 - cens`` here to estimate the censoring
            distribution.
        weights : optional array-like of per-observation weights (default: ones).
        """
        durations = np.asarray(durations)
        right_censor = np.asarray(right_censor)
        if weights is None:
            weights = np.ones(right_censor.shape)
        self.timeline = np.unique(durations)

        # Position of every duration on the timeline.
        dur_ = np.searchsorted(self.timeline, durations)
        hist_dur = np.bincount(dur_, weights=weights)
        self.hist_cens = np.bincount(dur_, weights=right_censor*weights)
        # Reverse cumulative sum: weighted size of the risk set at each point.
        self.cumul_hist_dur = np.cumsum(hist_dur[::-1])[::-1]
        self.survival_function = np.hstack([1.0, np.cumprod(1.0 - self.hist_cens / self.cumul_hist_dur)])
        # Invalidate the cached interval so a refit never returns stale bounds.
        self.confidence_interval_ = None

    def count_confidence_interval(self):
        """Compute the exponential Greenwood interval at level ``1 - alpha``.

        Reference: https://www.math.wustl.edu/~sawyer/handouts/greenwood.pdf

        Stores an array of shape ``(len(timeline) + 1, 2)`` in
        ``confidence_interval_``. Undefined entries (e.g. where the survival
        function equals 1) are replaced by 1.
        """
        # Local import keeps the class self-contained; the original referenced
        # an undefined alias `ss`.
        from scipy import stats

        z = stats.norm.ppf(1 - self.alpha / 2)
        # Divisions by zero / log(0) are expected at the ends of the curve and
        # are cleaned up with nan_to_num, so silence the floating-point warnings.
        with np.errstate(divide="ignore", invalid="ignore", over="ignore"):
            greenwood_terms = self.hist_cens / (self.cumul_hist_dur * (self.cumul_hist_dur - self.hist_cens))
            cumulative_sq_ = np.sqrt(np.hstack([0.0, np.cumsum(greenwood_terms)]))
            np.nan_to_num(cumulative_sq_, copy=False, nan=0)
            v = np.log(self.survival_function)
            np.nan_to_num(v, copy=False, nan=0)
            self.confidence_interval_ = np.vstack([np.exp(v * np.exp(- z * cumulative_sq_ / v)),
                                                   np.exp(v * np.exp(+ z * cumulative_sq_ / v))]).T
        np.nan_to_num(self.confidence_interval_, copy=False, nan=1)

    def get_confidence_interval_(self):
        """Return the confidence interval, computing it on first access."""
        if self.confidence_interval_ is None:
            self.count_confidence_interval()
        return self.confidence_interval_

    def survival_function_at_times(self, times):
        """Evaluate the fitted step function at ``times``.

        ``np.digitize`` maps each time to the number of timeline points that
        are <= it, which is exactly the index into ``survival_function``
        (index 0 holds the value 1.0 before the first duration).
        """
        place_bin = np.digitize(times, self.timeline)
        return self.survival_function[np.clip(place_bin, 0, None)]

In [348]:
# @jit()
# def _brier_score(ti, t, s, g_t, g_ti, d):
#     if (ti <= t) and d == 1:
#         return np.power(s, 2) * 1./g_ti
#     if ti > t:
#         return np.power(1 - s, 2) * 1./g_t
#     return 0.

@jit()
def _brier_score(ti, t, s, g_t, g_ti, d):
    return np.where(ti > t, 
                    np.power(1 - s, 2) * 1./g_t, 
                    np.where(d == 1, 
                             np.power(s, 2) * 1./g_ti, 
                             0.0))

def _inverse_censoring_metric(func):
    """Wrap a point-wise score ``func`` with inverse-censoring weights.

    The returned ``metric(survival_train, survival_test, estimate, times)``
    fits a KaplanMeier on ``1 - cens`` of the training sample, evaluates it at
    ``times`` and at the test times, and fills an array shaped like
    ``estimate`` column by column with
    ``func(test_time, t, estimate[:, i], G(t), G(test_time), test_cens)``.
    """
    def metric(survival_train, survival_test, estimate, times):
        censoring_km = KaplanMeier()
        censoring_km.fit(survival_train["time"], 1 - survival_train["cens"])

        test_time = survival_test["time"]
        test_cens = survival_test["cens"]
        weight_at_grid = censoring_km.survival_function_at_times(times)
        weight_at_obs = censoring_km.survival_function_at_times(test_time)

        scores = np.zeros_like(estimate, dtype=float)
        for col, horizon in enumerate(times):
            scores[:, col] = func(test_time, horizon, estimate[:, col],
                                  weight_at_grid[col], weight_at_obs, test_cens)
        return scores
    return metric

def _integrated_metric(func):
    def metric(survival_train, survival_test, estimate, times):
        scores = func(survival_train, survival_test, estimate, times)
        integral = np.trapz(scores, times)
        return integral / (times[-1] - times[0])
    return metric

In [349]:
ibs__ = _inverse_censoring_metric(_brier_score)

In [350]:
ibs_ = _integrated_metric(ibs__)

In [287]:
ibs_(y_train, y_test_3, sf_train, bins)

[[0.   0.   0.25 0.25]]


array([0.23648649])

In [312]:
from survivors.experiments.grid import generate_sample

# Reload the PBC data set; this rebinds X, y, features, categ and sch_nan.
X, y, features, categ, sch_nan = ds.load_pbc_dataset()
# Only the first item yielded by generate_sample is used
# (third argument 5 is presumably the number of folds — TODO confirm).
gen = generate_sample(X, y, 5)
X_train, y_train, X_test, y_test, bins = next(gen)

# Fit a single CRAID tree with whatever `param` currently holds in the kernel.
cr = CRAID(**param)
cr.fit(X_train, y_train)

# Tree predictions for the test part on the `bins` grid in "surv" mode.
pred_sf = cr.predict_at_times(X_test, bins, mode="surv")

In [356]:
pred_hf = cr.predict_at_times(X_test, bins, mode="hazard")

In [353]:
%timeit ibs_(y_train, y_test, pred_sf, bins)

22.1 ms ± 361 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [354]:
%timeit ibs(y_train, y_test, pred_sf, bins, axis=0)

31 ms ± 623 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [355]:
def loglikelihood(time, cens, sf, cumhf, bins):
    """Total log-likelihood of observed outcomes under predicted curves.

    Parameters
    ----------
    time : 1-D array of observed times.
    cens : 1-D array used as an exponent on the hazard term (1 keeps the
        hazard contribution, 0 drops it).
    sf : 2-D array (samples x len(bins)) of predicted survival values.
    cumhf : 2-D array (samples x len(bins)) of predicted cumulative hazard;
        it is differenced along the time axis to get per-interval hazard.
    bins : 1-D increasing time grid.

    Returns
    -------
    float : ``sum(log(sf_i + 1e-10) + cens_i * log(hf_i + 1e-10))``.
    """
    # digitize(..., right=True) - 1 is -1 for time <= bins[0]; without the
    # lower bound that index would silently wrap around to the LAST grid column.
    index_times = np.maximum(np.digitize(time, bins, right=True) - 1, 0)
    # First column keeps the cumulative value itself, the rest are increments.
    hf = np.diff(cumhf, axis=1, prepend=0.0)
    sf_by_times = np.take_along_axis(sf, index_times[:, np.newaxis], axis=1)[:, 0] + 1e-10
    hf_by_times = (np.take_along_axis(hf, index_times[:, np.newaxis], axis=1)[:, 0] + 1e-10)**cens
    likelihood = np.sum(np.log(sf_by_times) + np.log(hf_by_times))
    return likelihood

def kl(time, cens, sf, cumhf, bins):
    """Placeholder for a Kullback-Leibler style score.

    NOTE(review): despite its name, this function computes exactly the same
    quantity as ``loglikelihood`` (sum of ``log(sf) + cens * log(hf)`` at the
    observed times); no divergence term is implemented here yet.

    Parameters mirror ``loglikelihood``: observed ``time`` and ``cens``,
    predicted survival ``sf`` and cumulative hazard ``cumhf`` on the grid
    ``bins``. Returns a float.
    """
    # Lower bound prevents index -1 (time <= bins[0]) from wrapping to the last column.
    index_times = np.maximum(np.digitize(time, bins, right=True) - 1, 0)
    # Per-interval hazard: first column is the cumulative value, the rest are increments.
    hf = np.diff(cumhf, axis=1, prepend=0.0)
    sf_by_times = np.take_along_axis(sf, index_times[:, np.newaxis], axis=1)[:, 0] + 1e-10
    hf_by_times = (np.take_along_axis(hf, index_times[:, np.newaxis], axis=1)[:, 0] + 1e-10)**cens
    likelihood = np.sum(np.log(sf_by_times) + np.log(hf_by_times))
    return likelihood

In [357]:
loglikelihood(y_test["time"], y_test["cens"], pred_sf, pred_hf, bins)

-1099.563431516333

In [360]:
# Number of test observations per grid interval. `index_times` is computed here
# (same indexing as in loglikelihood) so the cell no longer depends on a
# variable left over from another cell; times <= bins[0] are mapped to interval 0.
index_times = np.maximum(np.digitize(y_test["time"], bins, right=True) - 1, 0)
np.bincount(index_times)

array([ 0,  0,  0, ...,  0,  1, 12], dtype=int64)

In [390]:
# Kullback–Leibler divergence between the predicted per-interval hazard and a
# one-hot indicator of the interval in which each test time falls.

# Lower bound keeps times <= bins[0] in the first interval instead of letting
# index -1 wrap around to the last column.
index_times = np.maximum(np.digitize(y_test["time"], bins, right=True) - 1, 0)
arr = np.zeros_like(pred_hf, dtype=int)
arr[np.arange(arr.shape[0]), index_times] = 1
# Per-interval hazard: first column is the cumulative value, the rest are increments.
hf = np.hstack((pred_hf[:, 0][np.newaxis].T, np.diff(pred_hf)))

# 1e-20 keeps the ratio finite where either term is zero.
np.sum(hf * np.log((hf + 1e-20)/(arr + 1e-20)), axis=1)

array([103.07480247,  87.3916971 ,  36.01652794, 108.36656156,
        86.50062991,   0.        ,   5.05844785,  87.3916971 ,
        86.50062991,  86.50062991,  36.01652794, 103.07480247,
         0.        , 111.14680412,  87.3916971 ,   0.        ,
        86.50062991, 103.07480247,  87.3916971 , 103.07480247,
        36.01652794,  86.50062991,  86.50062991, 108.36656156,
         0.        ,  86.50062991, 103.07480247, 103.07480247,
        72.20241692,  86.50062991,  86.50062991, 108.36656156,
         0.        ,  87.3916971 ,  87.3916971 ,  72.20241692,
       103.07480247,  86.50062991,  36.01652794,  36.01652794,
        86.50062991, 108.36656156,  87.3916971 ,  86.50062991,
        72.20241692, 103.07480247,   0.        , 108.36656156,
        36.01652794,  36.01652794,  87.3916971 ,  87.3916971 ,
         0.        ,   0.        ,   5.05844785,   0.        ,
        72.20241692,  36.01652794,   0.        ,  68.7309792 ,
        68.7309792 ,  36.01652794,   0.        ,   0.  

In [394]:
def arc_x4(weights):
    """Arc-x4 style re-weighting: ``(1 + w**4)`` normalised to sum to one.

    Accepts any array-like (lists included). Normalisation is done along the
    first axis, which for the usual 1-D input means the result sums to 1.
    """
    scaled = 1 + np.asarray(weights) ** 4
    # ndarray.sum instead of the builtin sum: vectorised and explicit about the axis.
    return scaled / scaled.sum(axis=0)

wei = np.array([-0.1, -1.2, -2.1, -3.5, -5.5])
arc_x4(wei)

array([0.00091614, 0.00281556, 0.01873142, 0.13838038, 0.83915649])

645.0

In [188]:
from survivors.ensemble import BoostingCRAID

def loglikelihood_i(time, cens, sf, cumhf, bins):
    """Per-sample log-likelihood terms under predicted survival/hazard curves.

    Parameters
    ----------
    time : 1-D array of observed times.
    cens : 1-D array used as an exponent on the hazard term (1 keeps it, 0 drops it).
    sf : 2-D array (samples x len(bins)) of predicted survival values.
    cumhf : 2-D array (samples x len(bins)) of predicted cumulative hazard.
    bins : 1-D increasing time grid.

    Returns
    -------
    1-D array ``log(sf_i + 1e-10) + cens_i * log(hf_i + 1e-10)``.
    """
    # digitize(..., right=True) - 1 is -1 for time <= bins[0]; clamp it so the
    # lookup does not wrap around to the last grid column.
    index_times = np.maximum(np.digitize(time, bins, right=True) - 1, 0)
    # Per-interval hazard: first column is the cumulative value, the rest are increments.
    hf = np.diff(cumhf, axis=1, prepend=0.0)
    sf_by_times = np.take_along_axis(sf, index_times[:, np.newaxis], axis=1)[:, 0] + 1e-10
    hf_by_times = (np.take_along_axis(hf, index_times[:, np.newaxis], axis=1)[:, 0] + 1e-10)**cens
    return np.log(sf_by_times) + np.log(hf_by_times)

def values_to_freq(values):
    """Map every element to the relative frequency of its value.

    Example: ``[1, 2, 3, 2] -> [0.25, 0.5, 0.25, 0.5]``. The result has one
    entry per input element, so it does not sum to one when values repeat.
    """
    values = np.asarray(values)
    # `inverse` maps each element to its unique value, `counts` holds occurrences.
    _, inverse, counts = np.unique(values, return_inverse=True, return_counts=True)
    # Divide by the element count (the original divided by the shape tuple,
    # which only worked for 1-D input through accidental broadcasting).
    return counts[inverse] / values.size

def arc_x4(weights):
    """Arc-x4 style re-weighting: ``(1 + w**4)`` normalised to sum to one.

    Accepts any array-like (lists included). Normalisation is done along the
    first axis, which for the usual 1-D input means the result sums to 1.
    """
    scaled = 1 + np.asarray(weights) ** 4
    # ndarray.sum instead of the builtin sum: vectorised and explicit about the axis.
    return scaled / scaled.sum(axis=0)

class ProbBoostingCRAID(BoostingCRAID):
    """BoostingCRAID variant whose sample weights come from per-sample likelihood.

    Overrides ``count_model_weights``, ``update_weight`` and ``get_aggreg``.
    NOTE(review): the attributes read here (``all_weight``, ``X_train``,
    ``y_train``, ``bins``, ``weights``, ``bettas``, ``aggreg_func``) are
    presumably provided by BoostingCRAID — confirm against the base class.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.name = "ProbBoostingCRAID"

    def count_model_weights(self, model, X_sub, y_sub):
        """Return ``(wei, betta)`` for a freshly fitted ``model``.

        ``wei`` is a per-sample weight increment ``-exp(likel - log_freq)``,
        where ``likel`` is the per-sample log-likelihood of the model and
        ``log_freq`` the log empirical frequency of the sample's time;
        ``betta`` is the summed log-likelihood. When ``self.all_weight`` is
        set, the whole training set is scored instead of the passed subsample.
        """
        if self.all_weight:
            X_sub = self.X_train
            y_sub = self.y_train

        pred_sf = model.predict_at_times(X_sub, bins=self.bins, mode="surv")
        pred_hf = model.predict_at_times(X_sub, bins=self.bins, mode="hazard")

        lp_ti = np.log(values_to_freq(y_sub["time"]))
        likel = loglikelihood_i(y_sub["time"], y_sub["cens"], pred_sf, pred_hf, self.bins)

        lp_xi = lp_ti - likel
        wei = - np.exp(-lp_xi)
        betta = np.sum(likel)
        return wei, betta

    def update_weight(self, index, wei_i):
        """Add ``wei_i`` to the sample weights and min-max rescale them to [0, 1].

        With ``all_weight`` the increment applies to every sample, otherwise
        only to the positions in ``index``.
        """
        if self.all_weight:
            self.weights = self.weights + wei_i
        else:
            self.weights[index] = (self.weights[index] + wei_i)

        lowest = self.weights.min()
        spread = self.weights.max() - lowest
        if spread == 0:
            # All weights are equal: min-max scaling would be 0/0 and turn every
            # weight into NaN. Fall back to uniform weights instead.
            # NOTE(review): uniform ones assumed equivalent for the sampler — confirm.
            self.weights = np.ones_like(self.weights, dtype=float)
        else:
            self.weights = (self.weights - lowest) / spread

    def get_aggreg(self, x):
        """Aggregate stacked per-model predictions ``x`` along axis 0.

        ``'median'`` -> median, ``'wei'`` -> average weighted by normalised
        ``-1 / betta`` of each model, anything else -> plain mean.
        """
        if self.aggreg_func == 'median':
            return np.median(x, axis=0)
        elif self.aggreg_func == 'wei':
            inv_wei = -1/np.array(self.bettas)
            wei = inv_wei/np.sum(inv_wei)
            # Transposes let the per-model weights broadcast over the first axis.
            return np.sum((x.T*wei).T, axis=0)
        return np.mean(x, axis=0)

In [189]:
from survivors.experiments.grid import generate_sample

# Keyword arguments for ProbBoostingCRAID (rebinds the earlier single-tree `param`).
# "aggreg_func": 'wei' selects the betta-weighted branch of get_aggreg, and
# "all_weight": False makes count_model_weights score only the passed subsample.
param = {
    "aggreg_func": 'wei',
    "criterion": "weights",
    "depth": 5, 
    "ens_metric_name": "roc",
    "max_features": "sqrt",
    "min_samples_leaf": 1,
    "n_estimators": 15,
    "size_sample": 0.5,
    "all_weight": False,
    "leaf_model": "base_fast"
}

# Reload the PBC data and take only the first item yielded by generate_sample
# (third argument 5 is presumably the number of folds — TODO confirm).
X, y, features, categ, sch_nan = ds.load_pbc_dataset()
gen = generate_sample(X, y, 5)
X_train, y_train, X_test, y_test, bins = next(gen)

# Fit the boosted ensemble on the training part.
prb = ProbBoostingCRAID(**param)
prb.fit(X_train, y_train)

# Ensemble predictions for the test part on the `bins` grid.
pred_sf = prb.predict_at_times(X_test, bins=bins, mode="surv")
pred_hf = prb.predict_at_times(X_test, bins=bins, mode="hazard")

# Cell output: summed per-sample log-likelihood terms on the test part.
np.sum(loglikelihood_i(y_test["time"], y_test["cens"], pred_sf, pred_hf, bins))

[0.6505 0.6986 0.7356 0.7133 0.7316 0.7475 0.7796 0.7731 0.7762 0.7822
 0.7836 0.7909 0.7978 0.7956 0.8008]
fitted: 15 models.


-815.7007316182924

In [146]:
prb.bettas

[843.677177487109,
 1128.4243320246874,
 976.7412198817369,
 1124.4976005335884,
 773.0267519903804,
 1062.5846902799788,
 884.7021519288851,
 726.4325516435139,
 825.9051592548383,
 799.8801451723468,
 1014.1193201824256,
 932.0453037013289,
 1036.669474339849,
 1095.0846413858062,
 1080.4951369594746]

In [139]:
inv_wei = -1/np.array(prb.bettas)
wei = inv_wei/sum(inv_wei)
wei

array([0.0738521 , 0.05521622, 0.06379103, 0.05540904, 0.08060177,
       0.05863752, 0.07042746, 0.08577166, 0.07544126, 0.07789583,
       0.06143984, 0.06685011, 0.06010337, 0.05689727, 0.05766553])

In [119]:
pred_sf

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [78]:
a = -1/np.array(prb.bettas)
a / np.sum(a)

array([0.0999838 , 0.06793358, 0.06298866, 0.06171429, 0.06999208,
       0.06817173, 0.06718876, 0.06187132, 0.05586825, 0.06575848,
       0.06172019, 0.06332199, 0.06719463, 0.05969275, 0.06659949])

In [89]:
y["time"]

unq, idx = np.unique(y["time"], return_inverse=True)
# calculate the weighted frequencies of these indices
freqs_idx = np.bincount(idx)
# reconstruct the array of frequencies of the elements
freqs_idx[idx] / y["time"].shape

array([0.00239234, 0.00239234, 0.00239234, 0.00478469, 0.00239234,
       0.00239234, 0.00239234, 0.00239234, 0.00239234, 0.00239234,
       0.00239234, 0.00239234, 0.00239234, 0.00239234, 0.00239234,
       0.00239234, 0.00239234, 0.00239234, 0.00239234, 0.00239234,
       0.00478469, 0.00239234, 0.00478469, 0.00239234, 0.00239234,
       0.00239234, 0.00239234, 0.00239234, 0.00239234, 0.00239234,
       0.00239234, 0.00239234, 0.00239234, 0.00239234, 0.00239234,
       0.00239234, 0.00239234, 0.00239234, 0.00239234, 0.00239234,
       0.00239234, 0.00239234, 0.00239234, 0.00239234, 0.00239234,
       0.00239234, 0.00239234, 0.00239234, 0.00239234, 0.00239234,
       0.00239234, 0.00239234, 0.00239234, 0.00478469, 0.00239234,
       0.00239234, 0.00239234, 0.00239234, 0.00478469, 0.00239234,
       0.00239234, 0.00239234, 0.00239234, 0.00239234, 0.00239234,
       0.00239234, 0.00239234, 0.00239234, 0.00478469, 0.00239234,
       0.00239234, 0.00239234, 0.00239234, 0.00239234, 0.00478

In [93]:
sum(freqs_idx[idx] / y["time"].shape)

1.0909090909090877

In [95]:
values_to_freq(np.array([1, 2, 3, 2]))

array([0.25, 0.5 , 0.25, 0.5 ])