In [1]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import re
import pickle

import os
path_dir = os.path.dirname(os.getcwd())

import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_white"

%load_ext autoreload
%autoreload 2

In [2]:
cd ../src/

/Users/linafaik/Documents/survival_analysis/src


In [3]:
from train import *
from train_survival_ml import *
from train_survival_deep import *

In [28]:
df = pd.read_csv(os.path.join(path_dir, "outputs/customer_subscription_clean.csv"))

In [29]:
# Parameters

scaler_name = "StandardScaler" #MinMaxScaler
random_state = 123

# 1. Train / test split

In [30]:
# covariate columns (used when possible)

cols_x = [
    'price', 'billing_cycle', 'age',
    'product=prd_1', 'gender=female', 'channel=email', 'reason=support',
    'nb_cases', 'time_since_signup', 
    'date_month_cos', 'date_month_sin',
    'date_weekday_cos', 'date_weekday_sin', 'date_hour_cos',
    'date_hour_sin'
]

col_target = "duration"

In [31]:
Xy_train, Xy_test, y_train, y_test = split_train_test(
    df, cols_x, col_target, test_size=0.15, col_stratify= "censored", random_state=random_state)

Xy_train, Xy_val, y_train, y_val = split_train_test(
    Xy_train, cols_x, col_target, test_size=0.2,  col_stratify= "censored", random_state=random_state)

n_train, n_test, n_val = Xy_train.shape[0], Xy_test.shape[0], Xy_val.shape[0]
n_tot =  n_train + n_test + n_val

print("Train: {}%, Test: {}%, Val: {}%".format(
    round(n_train/n_tot *100),
    round(n_test/n_tot *100),
    round(n_val/n_tot *100)
))

Train: 68%, Test: 15%, Val: 17%


In [32]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# rescale
scaler = eval(scaler_name)()

Xy_train[cols_x] = scaler.fit_transform(Xy_train[cols_x])
Xy_test[cols_x] = scaler.transform(Xy_test[cols_x])

# 2. Kaplan-Meier estimator

In [33]:
from sksurv.nonparametric import kaplan_meier_estimator

time, probas = kaplan_meier_estimator(Xy_train["censored"].astype(bool), Xy_train[col_target])

fig = px.line(x=time, y=probas, width=800, height = 400)
fig.update_layout(dict(xaxis={'title' : 'Time (# days)'}, yaxis={'title' : 'Survival probability'}))

In [39]:
from sksurv.nonparametric import kaplan_meier_estimator

for i, age_bin in enumerate(df.age_bin.unique()):
    
    Xy_train_filter = df[df.age_bin == age_bin]

    time, probas = kaplan_meier_estimator(Xy_train_filter["censored"].astype(bool), Xy_train_filter[col_target])
    probas = pd.DataFrame(
        {'time': time, 'age_bin' : age_bin, 
         'proba_readm': probas}
    )
    
    preds = probas if i ==0 else pd.concat([probas, preds], axis=0)

preds.head()

Unnamed: 0,time,age_bin,proba_readm
0,1.0,"[20,30[",0.998524
1,2.0,"[20,30[",0.997785
2,3.0,"[20,30[",0.997046
3,4.0,"[20,30[",0.996307
4,6.0,"[20,30[",0.996307


In [46]:
fig = px.line(preds, x="time", y="proba_readm", color="age_bin", width=800, height = 400)

fig = fig.add_traces([
    go.Scatter(x=[365*i, 365*i], y=[0, 1], name=f"year {i}", line_color = 'lightgrey') for i in range(1, 5)
])


fig.update_layout(dict(xaxis={'title' : 'nb days'}, yaxis={'title' : 'proba'}))

# 3. Cox PH estimator

## 3.1 Model training & analysis

### Training

In [47]:
from sksurv.linear_model import CoxPHSurvivalAnalysis

# train an estimator
estimator = CoxPHSurvivalAnalysis(alpha=0.5)
estimator = estimator.fit(Xy_train[cols_x], y_train)

### Cumulative hazard functions

In [48]:
# predict cumulative hazard function
chf_funcs = estimator.predict_cumulative_hazard_function(Xy_test[cols_x].iloc[:3])

data = [go.Scatter(x=fn.x,y= fn(fn.x), name=i) for i, fn in enumerate(chf_funcs)]
fig = go.Figure(data, layout=dict(width=800, height=400))
fig.update_layout({"yaxis":{"range": [0,1]}})

### Survival functions

In [49]:
# predict survival function
surv_funcs = estimator.predict_survival_function(Xy_test[cols_x].iloc[:3])

# plot results
data = [go.Scatter(x=fn.x,y= fn(fn.x), name=i) for i, fn in enumerate(surv_funcs)]
go.Figure(data, layout=dict(width=800, height=400))

### Feature importance

In [50]:
feat_importance, fig = plot_feat_imp(cols_x, estimator.coef_)
fig

## 3.2. Model evaluation

### C-index

In [52]:
from sksurv.metrics import concordance_index_censored

prediction = estimator.predict(Xy_test[cols_x])
result = concordance_index_censored(list(Xy_test.censored.astype(bool)), Xy_test[col_target], prediction)
result
# c-index, concordant,  discordant, tied_risk, tied_time

(0.6818926526547785, 603626993, 281595328, 2179, 352836)

### Time-dependant AUC

In [53]:
from sksurv.metrics import cumulative_dynamic_auc

times = np.percentile(df[col_target], np.linspace(5, 81, 15))
risk_score = estimator.predict(Xy_test[cols_x]) 

# Possible because the Cox PH is not time-dependant
auc, mean_auc = cumulative_dynamic_auc(y_train, y_test, risk_score, times)
mean_auc

0.7568436739464653

In [59]:
fig = px.line(x=times, y= auc, width=800, height=400)

fig = fig.add_traces([
    go.Scatter(x=[365*i, 365*i], y=[0, 1], name=f"year {i}", line_color = 'lightgrey') for i in range(1, int(max(times)/365)+1)
])


fig.update_layout({
    "xaxis": dict(title = "Time (#days)"),
    "yaxis": dict(title = "Time-dependent AUC")
})

### Bier score

In [60]:
from sksurv.metrics import brier_score, integrated_brier_score

In [61]:
survs = estimator.predict_survival_function(Xy_test[cols_x])

In [62]:
T = 364/2
preds = [fn(T) for fn in survs]
times, score = brier_score(y_train, y_test, preds, T)
score

array([0.13207416])

In [63]:
times = np.arange(364/2, 365)

preds = np.asarray([[fn(t) for t in times] for fn in survs])
score = integrated_brier_score(y_train, y_test, preds, times)
print(score)

0.1755871996391113


## 3.3. Model fine-tuning

In [64]:
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, random_state=random_state, shuffle=True)

In [65]:
grid_params = {
    "alpha": [0.5, 1]
}

estimator_cox, results = grid_search(
    grid_params, Xy_train, cv, CoxPHSurvivalAnalysis, cols_x,  col_target, verbose = True)

4 total scenario to run
1/4: params: {'alpha': 0.3}
Fold 0: 0.683
Fold 1: 0.681
Fold 2: 0.684
Fold 3: 0.682
Fold 4: 0.682
2/4: params: {'alpha': 0.6}
Fold 0: 0.683
Fold 1: 0.681
Fold 2: 0.684
Fold 3: 0.682
Fold 4: 0.682
3/4: params: {'alpha': 1}
Fold 0: 0.683
Fold 1: 0.681
Fold 2: 0.684
Fold 3: 0.682
Fold 4: 0.682
4/4: params: {'alpha': 1.2}
Fold 0: 0.683
Fold 1: 0.681
Fold 2: 0.684
Fold 3: 0.682
Fold 4: 0.682


In [66]:
results

Unnamed: 0,alpha,fold_0,fold_1,fold_2,fold_3,fold_4,mean,std
0,0.3,0.68335,0.680502,0.683561,0.682121,0.682122,0.682331,0.001094
1,0.6,0.68335,0.680502,0.683561,0.682121,0.682122,0.682331,0.001094
2,1.0,0.68335,0.680502,0.683561,0.682121,0.682122,0.682331,0.001094
3,1.2,0.68335,0.680502,0.683561,0.682121,0.682122,0.682331,0.001094


In [69]:
estimator_cox.score(Xy_val[cols_x], y_val)

0.6830535761833626

In [67]:
with open(os.path.join(path_dir, "outputs/cox_ph.pkl"), "wb") as f:
    pickle.dump(estimator_cox, f)

# 4. Gradient Boosting Survival Analysis

In [None]:
from sksurv.ensemble import GradientBoostingSurvivalAnalysis

grid_params = {
    "n_estimators": [10],
    "max_depth": [3],
    "min_samples_leaf": [2, 3],
    "learning_rate": [0.5],
    "random_state": [random_state],
    "verbose":[1]}

estimator_gb, results = grid_search(
    grid_params, Xy_train, cv, GradientBoostingSurvivalAnalysis, cols_x, col_target, verbose = True)

results

In [None]:
estimator_gb.score(Xy_val[cols_x], y_val)

In [None]:
feat_importance_gb, fig = plot_feat_imp(cols_x, estimator_gb.feature_importances_)
fig

In [None]:
with open(os.path.join(path_dir, "outputs/gradient_boosting.pkl"), "wb") as f:
    pickle.dump(estimator_gb, f)

## 5. Survival Support Vector Machine

In [None]:
from sksurv.svm import FastSurvivalSVM 

In [None]:
from sksurv.svm import FastSurvivalSVM 

grid_params = {
    "alpha": [1,2, 5, 10],
    "rank_ratio": [0],
    "max_iter": [1000],
    "tol": [1e-5],
    "random_state": [random_state],
    "verbose":[0]}

estimator_svm, results = grid_search(grid_params, df, cv, FastSurvivalSVM, cols_x, col_target, verbose = True)

results

In [None]:
with open(os.path.join(path_dir, "outputs/svm.pkl"), "wb") as f:
    pickle.dump(estimator_svm, f)

In [None]:
from sksurv.svm import FastKernelSurvivalSVM 

grid_params = {
    "kernel": ["linear","poly","rbf","sigmoid","cosine"],
    "alpha": [2],
    "rank_ratio": [0],
    "max_iter": [1000],
    "tol": [1e-5],
    "random_state": [random_state]
}

estimator_ksvm, results = grid_search(grid_params, df, cv, FastKernelSurvivalSVM, cols_x, col_target, verbose = True)

results

In [None]:
with open(os.path.join(path_dir, "outputs/ksvm.pkl"), "wb") as f:
    pickle.dump(estimator_ksvm, f)