In [56]:
import pandas as pd

In [57]:
df_tied = pd.read_csv('../data/ovarian.csv')
df_tied.loc[:, 'Ascites'] = df_tied.Ascites.replace({'present':1, 'absent':0})
df_tied.replace({"unknown":pd.NA}, inplace=True)
df_tied.dropna(subset=['tyears','d','Karn','Ascites'], inplace=True)
df_tied.loc[:, 'Ascites'] = df_tied.loc[:, 'Ascites'].astype(int)
df_tied

Unnamed: 0,tyears,d,Karn,Broders,FIGO,Ascites,Diam,id
0,0.967124,1,9,,IV,1,1-2cm,1
1,2.665753,1,10,4,IV,1,<1cm,2
2,1.054794,1,10,3,III,1,1-2cm,3
3,0.682193,1,9,2,III,1,>5cm,4
5,0.419180,1,8,3,III,1,>5cm,6
...,...,...,...,...,...,...,...,...
352,4.224656,0,10,2,III,0,micr.,353
353,0.131509,1,9,3,III,1,>5cm,354
355,4.183561,0,9,2,IV,1,>5cm,356
356,4.164384,0,10,4,III,1,>5cm,357


In [73]:
from sklearn.base import BaseEstimator
import torch
from torch import nn
from torch import optim
import numpy as np


class TorchCox(BaseEstimator):
    """Fit a Cox model
    """

    def __init__(self, lr=1, random_state=None):
        self.random_state = random_state
        self.lr = lr
        
    def _padToMatch2d(self, inputtens, targetshape):
        target = torch.full(targetshape, fill_value=-1e3)#torch.zeros(*targetshape)
        target[:inputtens.shape[0], :inputtens.shape[1]] = inputtens
        return target
        
    def get_loss(self, tensor, event_tens, num_tied, beta):
        loss_event = torch.einsum('ik,k->i', event_tens, beta)
                        
        XB = torch.norm(torch.einsum('ijk,kk->ij', tensor, torch.diag(beta)), p='nuc', keepdim=True)
        loss_atrisk = -num_tied*torch.logsumexp(XB, dim=1)
        
        loss = torch.sum(loss_event + loss_atrisk)

        return -loss

    # the arguments are ignored anyway, so we make them optional
    def fit(self, df, Xnames=None, tname=None, dname=None, basehaz=True):
    
        self.Xnames = Xnames
        self.tname = tname
        self.dname = dname
        
        #self.random_state_ = check_random_state(self.random_state)
        beta = nn.Parameter(torch.zeros(len(self.Xnames))).float()
        
        optimizer = optim.LBFGS([beta], lr=self.lr)

        inputdf = df[[self.tname,self.dname,*self.Xnames]].sort_values([self.dname,self.tname], ascending=[False,True])

        tiecountdf = inputdf.loc[inputdf[self.dname]==1,:].groupby([self.tname]).size().reset_index(name='tiecount')
        num_tied = torch.from_numpy(tiecountdf.tiecount.values).int()

        tensin = torch.from_numpy(inputdf[[self.tname,self.dname,*self.Xnames]].values)

        #Get unique event times
        tensin_events = torch.unique(tensin[tensin[:,1]==1, 0])

        #For each unique event stack another matrix with event at the top, and all at risk entries below
        tensor = torch.stack([self._padToMatch2d(tensin[tensin[:,0] >= eventtime, :], tensin.shape) for eventtime in tensin_events])

        assert all(tensor[:,0,1] == 1)

        #One actually has to sum over the covariates which have a tied event time in the Breslow correction method!
        #See page 33 here: https://www.math.ucsd.edu/~rxu/math284/slect5.pdf
        event_tens = torch.stack([tensor[i, :num_tied[i], 2:].sum(dim=0) for i in range(tensor.shape[0])])

        #Drop time and status columns as no longer required
        tensor = tensor[:,:,2:]

        def closure():
            optimizer.zero_grad()
            loss = self.get_loss(tensor, event_tens, num_tied, beta)
            #print(loss)
            loss.backward()
            return loss

        optimizer.step(closure)

        self.beta = beta
        print(self.beta.detach().numpy())        
        
        
        #Compute baseline hazard during fit() to avoid having to save dataset to memory in TorchCox() objects, so it
        #  can then later be calculated if basehaz() is called.
        if basehaz:
            t, _ = torch.sort(torch.from_numpy(inputdf[self.tname].values))
            t_uniq = torch.unique(t)

            h0 = []
            for time in t_uniq:
                value = 1/torch.sum(torch.exp(torch.einsum('ij,j->i', torch.from_numpy(inputdf.loc[inputdf[self.tname] >= time.numpy(), self.Xnames].values).float(), self.beta)))
                h0.append({'time':time.numpy(), 'h0':value.detach().numpy()})

            h0df = pd.DataFrame(h0)
            h0df['H0'] = h0df.h0.cumsum()

            self.basehaz = h0df

        
    def predict_proba(self, testdf, Xnames=None, tname=None):
        
        betas = self.beta.detach().numpy()
        H0 = np.asarray([self.basehaz.loc[self.basehaz.time<=t, 'H0'].iloc[-1] for t in testdf[tname].values])

        S = np.exp(np.multiply(-np.exp(np.dot(testdf[Xnames].values, betas)), H0))
        
        assert all(S>=0)
        assert all(S<=1)
        
        #F = 1 - S
        #assert all(F>=0)
        #assert all(F<=1)
        
        return S

In [76]:
%%time

coxmod = TorchCox()

tname = 'tyears'
Xnames = ['Karn', 'Ascites']
dname = 'd'

coxmod.fit(df_tied, Xnames=Xnames, tname=tname, dname=dname)

[1.6259581e+21 2.2198941e+21]
CPU times: user 1.37 s, sys: 7 ms, total: 1.38 s
Wall time: 750 ms


In [62]:
coxmod.basehaz

Unnamed: 0,time,h0,H0
0,0.024657240259862743,0.0,0.0
1,0.027397616762659092,0.0,0
2,0.03561695837826295,0.0,0
3,0.04109387045962334,0.0,0
4,0.08219248033739741,0.0,0
...,...,...,...
301,7.0602742018236055,0.0,0
302,7.112329818483311,0.0,0
303,7.120547803885543,0.0,0
304,7.290409766965798,0.0,0


In [63]:
df_tied['pred'] = coxmod.predict_proba(df_tied, Xnames=Xnames, tname=tname)
df_tied

  S = np.exp(np.multiply(-np.exp(np.dot(testdf[Xnames].values, betas)), H0))
  S = np.exp(np.multiply(-np.exp(np.dot(testdf[Xnames].values, betas)), H0))


AssertionError: 

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt

fpr, tpr, thresholds = metrics.roc_curve(df_tied[dname].values, df_tied['pred'].values, pos_label=1)

plt.plot(fpr, tpr)

In [None]:
metrics.auc(fpr, tpr)

In [None]:
#See section 5.1 of this paper for possible choices of performance metric to use:
# https://arxiv.org/pdf/2012.04284.pdf