In [3]:
import pandas as pd
from pysurvival.models.simulations import SimulationModel
import numpy as np

In [4]:
# Output directory for the generated CSV files.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
path = '/Users/JUSC/Documents/xgbsurv_benchmarking/implementation_testing/simulation_data'

# Simulation model: exponential survival distribution with a linear risk type.
sim = SimulationModel(
    survival_distribution='exponential',
    risk_type='linear',
    censored_parameter=30.0,
    alpha=0.01,
    beta=5.0,
)

# Sample sizes to simulate; one dataset CSV and one prediction CSV is written per size.
N = [1000, 10000, 100000]

for size in N:
    dataset = sim.generate_data(num_samples=size, num_features=5)
    preds = sim.predict_risk(
        dataset[['x_1', 'x_2', 'x_3', 'x_4', 'x_5']].to_numpy()
    )
    dataset.to_csv(f'{path}/survival_simulation_{size}.csv', index=False)
    pd.DataFrame(preds).to_csv(
        f'{path}/survival_simulation_preds_{size}.csv', index=False
    )

# After the loop `dataset` holds the frame of the last size in N; show its first rows.
dataset.head(2)


Number of data-points: 1000 - Number of events: 370.0
Number of data-points: 10000 - Number of events: 3744.0
Number of data-points: 100000 - Number of events: 37187.0


Unnamed: 0,x_1,x_2,x_3,x_4,x_5,time,event
0,0.029421,9.164799,0.167285,9.176764,0.623511,0.904514,1.0
1,0.015608,1.56328,0.151669,4.111674,8.107149,30.051213,0.0


In [5]:
# Shape of `dataset`, i.e. of the frame left over from the last iteration of the generation loop.
dataset.shape

(100000, 7)

In [6]:
# NOTE: a cell only displays its last expression, so the event count computed on
# the next line is discarded; only `dataset.columns` is shown in the output.
dataset.event.sum()
dataset.columns

Index(['x_1', 'x_2', 'x_3', 'x_4', 'x_5', 'time', 'event'], dtype='object')

In [7]:
## Hazard/risk predictions for the five feature columns of the current `dataset`.
# NOTE(review): the same expression is already evaluated inside the generation loop,
# so this presumably reproduces the `preds` of its final iteration -- confirm.
preds = sim.predict_risk(dataset[['x_1', 'x_2', 'x_3', 'x_4', 'x_5']].to_numpy())



In [8]:
# Feature columns x_1..x_5 as a NumPy array (the same input that is passed to `sim.predict_risk`).
dataset[['x_1', 'x_2', 'x_3', 'x_4', 'x_5']].to_numpy()

array([[ 2.94206573e-02,  9.16479927e+00,  1.67285442e-01,
         9.17676429e+00,  6.23510841e-01],
       [ 1.56083557e-02,  1.56327964e+00,  1.51668656e-01,
         4.11167381e+00,  8.10714935e+00],
       [ 2.25516826e-04,  5.04035402e+00,  3.10149674e-01,
         5.09245186e-01,  7.20077121e+00],
       ...,
       [ 2.55548868e-03,  1.56655712e+00,  1.62347039e-01,
         2.75710502e+00,  7.40889998e+00],
       [ 6.30006473e-03,  3.99165080e+00,  5.42294614e-02,
         5.40842785e-01, -1.01667542e+00],
       [ 2.42703557e-03,  7.72906582e+00,  3.51312246e-02,
         3.07589965e+00,  9.60297938e+00]])

In [9]:

# Gradient: non-vectorized (nv), loop-based reference version

def cox_ph_gradient_nv(log_partial_hazard, time, event):
    """Loop-based gradient of the negative Cox partial log-likelihood.

    Plain-Python comparison implementation (like the one in sksurv, but
    without cython); ties are handled by letting every sample with
    ``time >= time[j]`` belong to the risk set of sample ``j``.

    Parameters
    ----------
    log_partial_hazard : array-like of float
        Predicted log partial hazard per sample.
    time : array-like
        Observed time per sample.
    event : numpy.ndarray
        Event indicator per sample (truthy = event observed); its length
        defines the number of samples processed.

    Returns
    -------
    numpy.ndarray
        One gradient entry per sample.
    """
    n_samples = event.shape[0]
    hazard = np.exp(log_partial_hazard)

    # risk_total[i]: summed hazard of all samples whose time is >= time[i].
    risk_total = np.zeros(n_samples)
    for i in range(n_samples):
        running = risk_total[i]
        for j in range(n_samples):
            if time[i] <= time[j]:
                running = running + hazard[j]
        risk_total[i] = running

    gradient = np.empty(n_samples)
    for i in range(n_samples):
        share = 0
        for j in range(n_samples):
            if not event[j]:
                continue
            # Only events that happened at or before time[i] contribute.
            if time[i] >= time[j]:
                share += hazard[i] / risk_total[j]
        gradient[i] = event[i] - share

    # Sign flip: the value above is the log-likelihood gradient, callers want the loss.
    return -gradient

# Gradient: vectorized (v) version

def cox_ph_denominator(log_partial_hazard, risk_matrix):
    """Row-wise sum of ``risk_matrix`` weighted by ``exp(log_partial_hazard)``.

    Parameters
    ----------
    log_partial_hazard : array-like of float, shape (n,)
        Predicted log partial hazard per sample.
    risk_matrix : array-like, shape (m, n)
        Weights (typically 0/1 risk-set indicators); column ``k`` is
        multiplied by ``exp(log_partial_hazard[k])``.

    Returns
    -------
    numpy.ndarray, shape (m,)
        ``sum_k risk_matrix[j, k] * exp(log_partial_hazard[k])`` for each row ``j``.
    """
    # Broadcasting the hazard vector across the rows yields the same products
    # as stacking n copies of it, without materializing the n x n stack.
    hazard = np.exp(log_partial_hazard)
    return np.sum(risk_matrix * hazard, axis=1)

def cox_ph_gradient(log_partial_hazard, time,event):
    """Vectorized gradient of the negative Cox partial log-likelihood.

    Array-based counterpart of ``cox_ph_gradient_nv``. For every sample ``i``
    it evaluates

        event[i] - sum_j event[j] * R[j, i] * exp(lph[i]) / denominator[j]

    where ``R = get_risk_matrix(time)`` and ``denominator`` comes from
    ``cox_ph_denominator``, and returns the negated result.

    Parameters
    ----------
    log_partial_hazard : numpy.ndarray of float, shape (n,)
        Predicted log partial hazard per sample.
    time : numpy.ndarray, shape (n,)
        Observed time per sample.
    event : numpy.ndarray, shape (n,)
        Event indicator per sample (1 = event observed).

    Returns
    -------
    numpy.ndarray, shape (n,)
        One gradient entry per sample.
    """
    # The risk matrix is built once and reused for both the denominator and
    # the event weighting below.
    risk_matrix = get_risk_matrix(time)
    denominator = cox_ph_denominator(log_partial_hazard, risk_matrix)
    numerator = np.exp(log_partial_hazard)

    # weights[i, j] = event[j] * R[j, i] * exp(lph[i]) / denominator[j];
    # broadcasting replaces the explicit n x n repeat/reshape copies.
    weights = (
        event[None, :] * risk_matrix.T * numerator[:, None]
    ) / denominator[None, :]

    gradient = event - np.sum(weights, axis=1)
    return -gradient




# Hessian: non-vectorized (nv), loop-based reference version

def cox_ph_denominator_hess_nv(log_partial_hazard, time):
    """Loop-based squared risk-set hazard sums.

    Parameters
    ----------
    log_partial_hazard : array-like of float
        Predicted log partial hazard per sample.
    time : numpy.ndarray, shape (n,)
        Observed time per sample.

    Returns
    -------
    numpy.ndarray, shape (n,)
        ``(sum_k 1[time[k] >= time[j]] * exp(log_partial_hazard[k])) ** 2``
        for each sample ``j``.
    """
    n_samples = time.shape[0]
    # Loop-invariant: exponentiate once instead of once per (j, k) pair.
    hazard = np.exp(log_partial_hazard)

    denominator = np.zeros(n_samples)
    for j in range(n_samples):
        for k in range(n_samples):
            denominator[j] += (time[k] >= time[j]) * hazard[k]
    return np.square(denominator)

def cox_ph_numerator_hess_nv(log_partial_hazard, time):
    """Loop-based numerator term used by ``cox_ph_hessian_nv``.

    Parameters
    ----------
    log_partial_hazard : array-like of float
        Predicted log partial hazard per sample.
    time : numpy.ndarray, shape (n,)
        Observed time per sample.

    Returns
    -------
    numpy.ndarray, shape (n,)
        ``exp(lph[j]) * sum_k 1[time[k] >= time[j]] * (exp(lph[k]) - exp(lph[k])**2)``
        for each sample ``j``.
    """
    n_samples = time.shape[0]
    # Loop-invariant: exponentiate and square once instead of per (j, k) pair.
    hazard = np.exp(log_partial_hazard)
    hazard_sq = np.square(hazard)

    numerator = np.zeros(n_samples)
    for j in range(n_samples):
        for k in range(n_samples):
            at_risk = time[k] >= time[j]
            numerator[j] += at_risk * hazard[k] - at_risk * hazard_sq[k]
    return hazard * numerator

def cox_ph_hessian_nv(log_partial_hazard, time, event):
    """Loop-based Hessian term, the reference for the vectorized ``cox_ph_hessian``.

    For each sample ``i`` the ratio ``numerator[i] / denominator[i]`` (from
    ``cox_ph_numerator_hess_nv`` / ``cox_ph_denominator_hess_nv``) is
    subtracted once for every sample ``j`` with ``event[j]`` set and
    ``time[i] >= time[j]``.

    NOTE(review): the ratio is indexed by ``i`` rather than by the event
    index ``j``; this does not look like the textbook diagonal of the Cox
    partial-likelihood Hessian -- confirm the intended formula.

    Parameters
    ----------
    log_partial_hazard : array-like of float
        Predicted log partial hazard per sample.
    time : numpy.ndarray, shape (n,)
        Observed time per sample.
    event : array-like, shape (n,)
        Event indicator per sample (1 = event observed).

    Returns
    -------
    numpy.ndarray, shape (n,)
        One Hessian entry per sample.
    """
    n_samples = time.shape[0]
    numerator = cox_ph_numerator_hess_nv(log_partial_hazard, time)
    denominator = cox_ph_denominator_hess_nv(log_partial_hazard, time)

    hess = np.zeros(n_samples)
    for i in range(n_samples):
        acc = hess[i]
        for j in range(n_samples):
            not_before = time[i] >= time[j]
            acc = acc - event[j] * not_before * numerator[i] / denominator[i]
        hess[i] = acc
    return hess




# Hessian: vectorized (v) version

def get_risk_matrix(time):
    """Build the 0/1 risk-set indicator matrix for a vector of times.

    Parameters
    ----------
    time : array-like, shape (n,)
        One-dimensional observed times.

    Returns
    -------
    numpy.ndarray of int, shape (n, n)
        ``R[a, b] == 1`` exactly when ``time[b] >= time[a]``, i.e. row ``a``
        marks every sample still at risk at ``time[a]`` (ties included).
    """
    time = np.asarray(time)
    # Compare the times directly. Comparing products (t_a * t_b >= t_a ** 2)
    # flips the inequality for negative times (e.g. log-transformed ones) and
    # can overflow for large integer times.
    return (time[None, :] >= time[:, None]).astype(int)

def cox_ph_numerator_hess(log_partial_hazard, time):
    """Vectorized numerator term used by ``cox_ph_hessian``.

    With ``R = get_risk_matrix(time)`` and ``h = exp(log_partial_hazard)``,
    returns ``h * (sum_k R[:, k] * h[k] - sum_k R[:, k] * h[k] ** 2)``.

    Parameters
    ----------
    log_partial_hazard : array-like of float, shape (n,)
        Predicted log partial hazard per sample.
    time : array-like, shape (n,)
        Observed time per sample.

    Returns
    -------
    numpy.ndarray, shape (n,)
        One numerator value per sample.
    """
    at_risk = get_risk_matrix(time)
    hazard = np.exp(log_partial_hazard)

    hazard_total = np.sum(at_risk * hazard, axis=1)
    squared_total = np.sum(at_risk * np.square(hazard), axis=1)
    return hazard * (hazard_total - squared_total)

def cox_ph_denominator_hess(log_partial_hazard, time):
    """Vectorized squared risk-set hazard sums used by ``cox_ph_hessian``.

    Parameters
    ----------
    log_partial_hazard : array-like of float, shape (n,)
        Predicted log partial hazard per sample.
    time : array-like, shape (n,)
        Observed time per sample.

    Returns
    -------
    numpy.ndarray, shape (n,)
        Square of the row sums of ``get_risk_matrix(time) * exp(log_partial_hazard)``.
    """
    hazard = np.exp(log_partial_hazard)
    weighted = get_risk_matrix(time) * hazard
    risk_totals = np.sum(weighted, axis=1)
    return risk_totals * risk_totals

def cox_ph_hessian(log_partial_hazard, time, event):
    """Vectorized Hessian term, cross-checked against ``cox_ph_hessian_nv``.

    With ``R = get_risk_matrix(time)`` and
    ``ratio = cox_ph_numerator_hess(...) / cox_ph_denominator_hess(...)``,
    entry ``b`` of the result is ``-ratio[b] * sum_a event[a] * R[a, b]``.

    NOTE(review): the ratio is taken per sample ``b`` rather than per event
    index ``a``; this does not look like the textbook diagonal of the Cox
    partial-likelihood Hessian -- confirm the intended formula.

    Parameters
    ----------
    log_partial_hazard : array-like of float, shape (n,)
        Predicted log partial hazard per sample.
    time : array-like, shape (n,)
        Observed time per sample.
    event : numpy.ndarray, shape (n,)
        Event indicator per sample (1 = event observed).

    Returns
    -------
    numpy.ndarray, shape (n,)
        One Hessian entry per sample.
    """
    risk_matrix = get_risk_matrix(time)
    numerator = cox_ph_numerator_hess(log_partial_hazard, time)
    denominator = cox_ph_denominator_hess(log_partial_hazard, time)
    # event[:, None] selects the rows of samples with an observed event; the
    # per-sample ratio broadcasts across the columns before the column sum.
    hess = -np.sum(event[:, None] * risk_matrix * (numerator / denominator), axis=0)
    return hess


# Toy data with tied times, used to cross-check the loop-based (nv) and
# vectorized (v) implementations against each other.
time = np.array([1, 2, 3, 4, 4, 4, 4, 4, 5, 5, 5], dtype=int)
event = np.array([0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1], dtype=int)
log_hazard = np.full(11, 0.5, dtype=float)

# Gradient: both versions must agree.
print('grad nv')
a = cox_ph_gradient_nv(log_hazard, time, event)
print('grad v')
b = cox_ph_gradient(log_hazard, time, event)
np.testing.assert_allclose(a, b)

# Hessian: both versions must agree.
print('hess nv')
c = cox_ph_hessian_nv(log_hazard, time, event)
print('hess v')
d = cox_ph_hessian(log_hazard, time, event)
np.testing.assert_allclose(c, d)

print(a, b)
print(c, d)


grad nv
grad v
hess nv
hess v
[-0.   -0.9   0.1   0.35  0.35  0.35 -0.65 -0.65  0.35  0.35  0.35] [-0.   -0.9   0.1   0.35  0.35  0.35 -0.65 -0.65  0.35  0.35  0.35]
[0.         0.06487213 0.07208014 0.24327048 0.24327048 0.24327048
 0.24327048 0.24327048 1.29744254 1.29744254 1.29744254] [-0.          0.06487213  0.07208014  0.24327048  0.24327048  0.24327048
  0.24327048  0.24327048  1.29744254  1.29744254  1.29744254]
