# Survival Analysis & Censored Data

### Loading Libraries

In [None]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots

# ISLP
from ISLP import load_data
from ISLP.survival import sim_time
from ISLP.models import ModelSpec as MS

In [None]:
from lifelines import (KaplanMeierFitter, CoxPHFitter)

from lifelines.statistics import (logrank_test, multivariate_logrank_test)

### Brain Cancer Data

In [None]:
# Load the 'BrainCancer' data set through ISLP's `load_data` helper.
BrainCancer = load_data('BrainCancer')

# Display the column labels of the loaded object.
BrainCancer.columns

In [None]:
# Count how often each distinct value of the 'sex' column occurs.
BrainCancer['sex'].value_counts()

In [None]:
# Count how often each distinct value of the 'diagnosis' column occurs.
BrainCancer['diagnosis'].value_counts()

In [None]:
# Count how often each distinct value of the 'status' column occurs
# ('status' is the event column passed to the fitters in the cells below).
BrainCancer['status'].value_counts()

In [None]:
# Kaplan-Meier estimate fitted to every row of BrainCancer.
fig, ax = subplots(figsize=(8, 8))

# `km` stays in scope on purpose: the stratified plots in later cells refit
# this same fitter object group by group.
km = KaplanMeierFitter()
km_brain = km.fit(durations=BrainCancer['time'],
                  event_observed=BrainCancer['status'])
km_brain.plot(ax=ax, label='Kaplan Meier estimate')

plt.grid(True)
plt.show()

In [None]:
# Kaplan-Meier curves stratified by 'sex', overlaid on one set of axes.
fig, ax = subplots(figsize=(8, 8))

# Keep each group's rows; a later cell looks them up by group label.
by_sex = {}

# Pass `observed` explicitly: for a categorical grouper the pandas default is
# deprecated and differs between versions. With `observed=True` only levels
# that actually occur are iterated. (No effect if 'sex' is not categorical.)
for sex, df in BrainCancer.groupby('sex', observed=True):
    by_sex[sex] = df
    # The shared `km` fitter is refit on every pass, so each curve is drawn
    # before the next group is fitted.
    km_sex = km.fit(df['time'], df['status'])
    km_sex.plot(label='Sex =%s' % sex, ax=ax)

plt.grid(True)
plt.show()

In [None]:
# Log-rank test between the 'Male' and 'Female' groups collected in `by_sex`.
logrank_test(durations_A=by_sex['Male']['time'],
             durations_B=by_sex['Female']['time'],
             event_observed_A=by_sex['Male']['status'],
             event_observed_B=by_sex['Female']['status'])

In [None]:
# `coxph` aliases the CoxPHFitter class itself (not an instance); every
# `coxph()` call below therefore constructs a fresh fitter.
coxph = CoxPHFitter

# Cox model with 'sex' as the only predictor.
sex_df = BrainCancer[['time', 'status', 'sex']]

sex_spec = MS(['time', 'status', 'sex'], intercept=False)
model_df = sex_spec.fit_transform(sex_df)

cox_fit = coxph().fit(model_df, duration_col='time', event_col='status')

# Show only the coefficient, its standard error and the p-value.
cox_fit.summary[['coef', 'se(coef)', 'p']]

In [None]:
# Run the fitter's log-likelihood ratio test on the single-predictor Cox fit.
cox_fit.log_likelihood_ratio_test()

In [None]:
# Cox model on every column of BrainCancer, after dropping rows that
# contain missing values.
cleaned = BrainCancer.dropna()

# `all_MS` is kept as a fitted spec: a later cell reuses it via `.transform`.
all_MS = MS(cleaned.columns, intercept=False)
all_df = all_MS.fit_transform(cleaned)

fit_all = coxph().fit(all_df, duration_col='time', event_col='status')

fit_all.summary[['coef', 'se(coef)', 'p']]

In [None]:
# Distinct values of 'diagnosis' among the rows without missing data.
levels = cleaned['diagnosis'].unique()

def representative(series):
    """Summarise one column by a "typical" value.

    A column whose dtype exposes ``categories`` is summarised by its
    mode(s); ``Series.mode`` returns a Series because ties are possible.
    Any other column is summarised by its mean.
    """
    is_categorical = hasattr(series.dtype, 'categories')
    return series.mode() if is_categorical else series.mean()

# Apply `representative` to each column (axis 0) of the cleaned data.
modal_data = cleaned.apply(representative, axis =0)

In [None]:
# One copy of the first row of `modal_data` per diagnosis level; the
# copies then differ only in the 'diagnosis' column.
modal_row = modal_data.iloc[0]
modal_df = pd.DataFrame([modal_row for _ in levels])

modal_df['diagnosis'] = levels

modal_df

In [None]:
# Push the representative rows through the already-fitted `all_MS` spec
# (`transform`, not `fit_transform`).
modal_X = all_MS.transform(modal_df)
# Label each row by its diagnosis level.
modal_X.index = levels

modal_X

In [None]:
# Survival functions predicted by the full Cox fit for the rows of `modal_X`.
predicted_survival = fit_all.predict_survival_function(modal_X)

predicted_survival

In [None]:
# Draw the predicted survival curves on a fresh 8x8 figure.
fig, ax = subplots(figsize=(8, 8))

predicted_survival.plot(ax=ax)

plt.grid(True)
plt.show()

### Publication Data

In [None]:
# Kaplan-Meier curves for the Publication data, one per value of 'posres'.
Publication = load_data('Publication')

fig, ax = subplots(figsize=(8, 8))
by_result = {}

for result, df in Publication.groupby('posres'):
    by_result[result] = df
    # Same `km` fitter as before, refit on each group and drawn immediately.
    km_result = km.fit(durations=df['time'], event_observed=df['status'])
    km_result.plot(ax=ax, label='Result =%d' % result)

plt.grid(True)
plt.show()

In [None]:
# Cox model for the Publication data with 'posres' as the only predictor.
posres_spec = MS(['posres', 'time', 'status'], intercept=False)
posres_df = posres_spec.fit_transform(Publication)

posres_fit = coxph().fit(posres_df, duration_col='time', event_col='status')

posres_fit.summary[['coef', 'se(coef)', 'p']]

In [None]:
# Cox model on every Publication column except 'mech'.
model = MS(Publication.columns.drop('mech'), intercept=False)
publication_X = model.fit_transform(Publication)

publication_fit = coxph().fit(publication_X,
                              duration_col='time',
                              event_col='status')

publication_fit.summary[['coef', 'se(coef)', 'p']]

### Call Center Data

In [None]:
# Simulated call-center covariates. The generator is seeded, so the draws
# are reproducible.
N = 2000
rng = np.random.default_rng(10)

# NOTE: the three draws consume `rng` in exactly this order; reordering
# them would change the simulated data.
Operators = rng.choice(np.arange(5, 16), size=N, replace=True)
Center = rng.choice(['A', 'B', 'C'], size=N, replace=True)
Time = rng.choice(['Morn.', 'After.', 'Even.'], size=N, replace=True)

# 'Center' and 'Time' are stored as pandas Categoricals.
D = pd.DataFrame({
    'Operators': Operators,
    'Center': pd.Categorical(Center),
    'Time': pd.Categorical(Time),
})

In [None]:
# Design matrix (no intercept column) built from the three simulated
# covariates.
covariate_names = ['Operators', 'Center', 'Time']
model = MS(covariate_names, intercept=False)

X = model.fit_transform(D)

In [None]:
# Peek at the first five rows of the design matrix.
X[:5]

In [None]:
# Coefficients used to generate the data
# (presumably one per column of `X` -- confirm against the design matrix).
true_beta = np.array([0.04, -0.3, 0, 0.2, -0.2])
# Linear predictor for every simulated row: X times true_beta.
true_linpred = X.dot(true_beta)

def hazard(t):
    """Return the hazard at time ``t``, which grows linearly: ``1e-5 * t``.

    Nothing else in the visible source calls this function.
    """
    return 1e-5 * t

In [None]:
def cum_hazard(t):
    """Return the cumulative hazard at time ``t``: ``1e-5 * t**2 / 2``.

    This is the integral from 0 to ``t`` of the linear hazard ``1e-5 * s``.
    """
    return 1e-5 * t**2 / 2

In [None]:
# Draw one simulated time per row from ISLP's `sim_time`, passing that row's
# linear predictor, the cumulative hazard and the shared generator.
W = np.array([
    sim_time(linpred, cum_hazard, rng)
    for linpred in true_linpred
])

# Store the times limited to the range [0, 1000].
D['Wait time'] = np.clip(W, 0, 1000)

In [None]:
# Random indicator column: 1 with probability 0.9, 0 with probability 0.1.
# Later cells pass 'Failed' to the fitters as the event column.
D['Failed'] = rng.choice([1, 0], size=N, p=[0.9, 0.1])

D[:5]

In [None]:
# Fraction of rows where 'Failed' is 1 (it was drawn with p = 0.9 for 1).
D['Failed'].mean()

In [None]:
# Kaplan-Meier curves of the simulated wait times, one per call center.
fig, ax = subplots(figsize=(8, 8))
by_center = {}

# 'Center' is a pandas Categorical. For categorical groupers the `groupby`
# default for `observed` is deprecated and differs between pandas versions,
# so pass it explicitly: only centers that actually occur are iterated.
for center, df in D.groupby('Center', observed=True):
    by_center[center] = df
    # The shared `km` fitter is refit on every pass, so each curve is drawn
    # before the next group is fitted.
    km_center = km.fit(df['Wait time'], df['Failed'])
    km_center.plot(label='Center =%s' % center, ax=ax)

# The title does not depend on the group, so it is set once after the loop.
ax.set_title(" Probability of Still Being on Hold")

plt.grid(True)
plt.show()

In [None]:
# Kaplan-Meier curves of the simulated wait times, one per time of day.
fig, ax = subplots(figsize=(8, 8))
by_time = {}

# 'Time' is a pandas Categorical. For categorical groupers the `groupby`
# default for `observed` is deprecated and differs between pandas versions,
# so pass it explicitly: only levels that actually occur are iterated.
for time, df in D.groupby('Time', observed=True):
    by_time[time] = df
    # The shared `km` fitter is refit on every pass, so each curve is drawn
    # before the next group is fitted.
    km_time = km.fit(df['Wait time'], df['Failed'])
    km_time.plot(label='Time =%s' % time, ax=ax)

# The title does not depend on the group, so it is set once after the loop.
ax.set_title("Probability of Still Being on Hold")

plt.grid(True)
plt.show()

In [None]:
# Multi-group log-rank test of the wait times across the 'Center' groups.
multivariate_logrank_test(event_durations=D['Wait time'],
                          groups=D['Center'],
                          event_observed=D['Failed'])

In [None]:
# Multi-group log-rank test of the wait times across the 'Time' groups.
multivariate_logrank_test(event_durations=D['Wait time'],
                          groups=D['Time'],
                          event_observed=D['Failed'])

In [None]:
# Cox model of wait time on 'Center' alone, followed by its log-likelihood
# ratio test. NOTE: this rebinds `X`, replacing the earlier design matrix.
center_spec = MS(['Wait time', 'Failed', 'Center'], intercept=False)
X = center_spec.fit_transform(D)

F = coxph().fit(X, duration_col='Wait time', event_col='Failed')
F.log_likelihood_ratio_test()

In [None]:
# Cox model of wait time on 'Time' alone, followed by its log-likelihood
# ratio test. NOTE: this rebinds both `X` and `F`.
time_spec = MS(['Wait time', 'Failed', 'Time'], intercept=False)
X = time_spec.fit_transform(D)

F = coxph().fit(X, duration_col='Wait time', event_col='Failed')
F.log_likelihood_ratio_test()

In [None]:
# Cox model of wait time on every column of `D`.
full_spec = MS(D.columns, intercept=False)
X = full_spec.fit_transform(D)

fit_queuing = coxph().fit(X, duration_col='Wait time', event_col='Failed')

# Show only the coefficient, its standard error and the p-value.
fit_queuing.summary[['coef', 'se(coef)', 'p']]