In [7]:
import numpy as np
import pandas as pd
import math
import numpy as np
from numba import jit
#import numba_special
#import numba_scipy
#from numba_scipy.special import erf as erfvec
from scipy.special import erf as erfvec
import time as t
from math import log
from math import exp, sqrt, pi
from math import erf as erfmath
from numba import njit
import os
current_path = os.getcwd() 
one_level_up = os.path.abspath(os.path.join(current_path,  ".."))
two_levels_up = os.path.abspath(os.path.join(current_path,  "..",".."))

In [4]:
## Load Simulation Data

In [5]:
hazard = log_hazard = np.random.normal(0, 1, 1000)
df = pd.read_csv(current_path+'/simulation_data/survival_simulation_1000.csv')
#df.event = 1
df.sort_values(by='time', inplace=True)
df.head(2)

Unnamed: 0,x_1,x_2,x_3,x_4,x_5,time,event
104,10.782665,6.321241,0.154621,19.20857,17.674405,0.00033,1.0
638,4.839507,9.959547,0.141726,19.185224,12.58512,0.000471,1.0


In [4]:
time = np.array([1.0,1.0,1.0,3.0,5.0,5.0,5.0,8.0,9.0, 9.0])
event = np.array([1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0, 1.0])

In [5]:
def risk_matrix_loop(time):
    n_samples = time.shape[0]
    risk_sum = np.copy(n_samples)
    risk_set = np.zeros_like(np.unique(time))
    idx=0
    previous_time = time[0]
    set_count = 0
    for k in range(n_samples):
        current_time = time[k]
        if current_time > previous_time:

            risk_set[idx] = risk_sum
            risk_sum -= set_count
            set_count = 0
            idx+=1
        set_count += 1
        previous_time = current_time
    risk_set[idx] = set_count
    return risk_set


In [6]:
risk_matrix_loop(time)

array([10.,  7.,  6.,  3.,  2.])

In [38]:
def risk_matrix_vectorized(time):
    return np.sum(np.unique((np.outer(time,time)>=np.square(time)).astype(int).T,axis=0),axis=1)[::-1]

In [43]:
# def risk_matrix_vectorized2(time):
#     unique_time, counts = np.unique(time,return_counts=True)
#     risk_set = time.shape[0]
#     risk_set_final = risk_set - np.cumsum(counts)
#     return risk_set_final[::-1]

In [44]:
#risk_matrix_vectorized2(time)

In [45]:
risk_matrix_vectorized(time)

array([10,  7,  6,  3,  2])

In [42]:
@jit(nopython=True, cache=True)
def risk_matrix_loop_numba(time):
    n_samples = time.shape[0]
    risk_sum = n_samples
    risk_set = np.zeros_like(np.unique(time))
    idx=0
    previous_time = time[0]
    set_count = 0
    for k in range(n_samples):
        current_time = time[k]
        if current_time > previous_time:

            risk_set[idx] = risk_sum
            risk_sum -= set_count
            set_count = 0
            idx+=1
        set_count += 1
        previous_time = current_time
    risk_set[idx] = set_count
    return risk_set

In [10]:
risk_matrix_loop_numba(time)

array([10.,  7.,  6.,  3.,  2.])

## Comparison

## Compare times

In [11]:
def function1(time):
    return risk_matrix_loop(time)
    

def function2(time):
    return risk_matrix_loop_numba(time)

def function3(time):
    return risk_matrix_vectorized(time)

path = current_path+'/simulation_data'
def comparison(num_runs = 10, size=10000):

    hazard = log_hazard = np.random.normal(0, 1, size)
    df = pd.read_csv(path+'/survival_simulation_'+str(size)+'.csv')
    n_samples = df.shape[0]

    df.sort_values(by='time', inplace=True)
    time = df.time.to_numpy()
    event = df.event.to_numpy().astype('bool')
    # Empty list to store the execution times
    function1_times = []
    function2_times = []
    function3_times = []

    # Loop to run each function and record the execution times
    for i in range(num_runs):
        start_time = t.time()
        function1(time)
        end_time = t.time()
        function1_times.append(end_time - start_time)

        start_time = t.time()
        function2(time)
        end_time = t.time()
        function2_times.append(end_time - start_time)

        start_time = t.time()
        function3(time)
        end_time = t.time()
        function3_times.append(end_time - start_time)

    # Calculate the mean and standard deviation of the execution times for each function
    function1_mean = sum(function1_times) / len(function1_times)
    function1_std = pd.Series(function1_times).std()
    function2_mean = sum(function2_times) / len(function2_times)
    function2_std = pd.Series(function2_times).std()
    function3_mean = sum(function3_times) / len(function3_times)
    function3_std = pd.Series(function2_times).std()

    # Create a Pandas dataframe to display the results
    df = pd.DataFrame({
        'Function': ['Difference Loop', 'Difference Loop Numba', 'Difference Vectorized'],
        'Mean': [function1_mean, function2_mean, function3_mean],
        'Standard Deviation': [function1_std, function2_std,function3_std],
        'Sample Size': [size, size, size],
        'Number Repetitions': [num_runs, num_runs, num_runs]
    })
    return df

df_1000 = comparison(num_runs = 50, size=1000)
print(df_1000.to_latex(index=False))
df_1000.to_csv(one_level_up+'/implementation_testing/results/risk_matrix_difference_comparison_1000.csv', index=False)
df_1000

\begin{tabular}{lrrrr}
\toprule
Function & Mean & Standard Deviation & Sample Size & Number Repetitions \\
\midrule
Difference Loop & 0.000749 & 0.000035 & 1000 & 50 \\
Difference Loop Numba & 0.000024 & 0.000005 & 1000 & 50 \\
Difference Vectorized & 0.095366 & 0.000005 & 1000 & 50 \\
\bottomrule
\end{tabular}



Unnamed: 0,Function,Mean,Standard Deviation,Sample Size,Number Repetitions
0,Difference Loop,0.000749,3.5e-05,1000,50
1,Difference Loop Numba,2.4e-05,5e-06,1000,50
2,Difference Vectorized,0.095366,5e-06,1000,50


In [12]:
df_10000 = comparison(num_runs = 50, size=10000)
print(df_10000.to_latex(index=False))
df_10000.to_csv(one_level_up+'/implementation_testing/results/risk_matrix_difference_comparison_10000.csv', index=False)
df_10000

\begin{tabular}{lrrrr}
\toprule
Function & Mean & Standard Deviation & Sample Size & Number Repetitions \\
\midrule
Difference Loop & 0.007300 & 0.000090 & 10000 & 50 \\
Difference Loop Numba & 0.000139 & 0.000027 & 10000 & 50 \\
Difference Vectorized & 17.158938 & 0.000027 & 10000 & 50 \\
\bottomrule
\end{tabular}



Unnamed: 0,Function,Mean,Standard Deviation,Sample Size,Number Repetitions
0,Difference Loop,0.0073,9e-05,10000,50
1,Difference Loop Numba,0.000139,2.7e-05,10000,50
2,Difference Vectorized,17.158938,2.7e-05,10000,50
