# Neural networks (DeepSurv)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pysurvival.models.semi_parametric import NonLinearCoxPHModel
from pysurvival.utils.metrics import concordance_index
from sklearn.model_selection import train_test_split
from pysurvival.utils.display import display_loss_values
import torch

In [None]:
def model_data(path=r'C:\Users\grina\Desktop\VGTU\final_data.csv'):
    """Load the match data CSV and prepare it for survival modelling.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the original hard-coded
        location so existing ``model_data()`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        The loaded frame where:
        * the CSV's first column (read as the index) is moved back into a
          regular column by ``reset_index``,
        * missing values are replaced with 0,
        * the ``name`` and ``Parachute`` columns are removed,
        * ``is_in_blue_zone`` / ``is_in_red_zone`` hold 1/0 instead of
          True/False,
        * an ``event`` column equal to 1 for every row is added.
    """
    train_df = pd.read_csv(path, index_col=0)
    train_df.reset_index(inplace=True)
    # Replace empty values with 0.
    train_df.fillna(value=0, inplace=True)
    train_df.drop(columns=['name', 'Parachute'], inplace=True)
    # Encode the boolean zone flags as 1/0.
    for zone_col in ('is_in_blue_zone', 'is_in_red_zone'):
        train_df[zone_col] = train_df[zone_col].replace({True: 1, False: 0})
    # Every row is marked as an observed event (no censoring indicator is read).
    train_df['event'] = 1
    return train_df

### Optimal Parameters: {u'learning_rate': -3.12828125, u'num_nodes': 57.658203125, u'num_layers': 3.1380859375, u'dropout': 0.17845703125, u'lr_decay': 0.0006430859375, u'momentum': 0.857017578125, u'L2_reg': 2.620712890625}

### Optimal Parameters: {u'learning_rate': -3.3835156250000002, u'num_nodes': 126.7687109375, u'num_layers': 2.87705078125, u'dropout': 0.133017578125, u'lr_decay': 0.00025443359375, u'momentum': 0.8799306640625, u'L2_reg': 0.12260644531249987}

### Optimal Parameters (used for the model fitted below): {u'learning_rate': -3.4492578125, u'num_nodes': 129.03681640625, u'num_layers': 2.131650390625, u'dropout': 0.4689794921875, u'lr_decay': 7.364257812500002e-05, u'momentum': 0.85455224609375, u'L2_reg': 4.69289697265625}

In [None]:
%%time
# Load the cleaned data and one-hot encode the playing type.
df_pysurvival_nn = model_data()
playing_type_dummies = pd.get_dummies(df_pysurvival_nn['playing_type'], prefix='playing_type')
df_pysurvival_nn = pd.concat([df_pysurvival_nn, playing_type_dummies], axis=1)
# Drop the duo dummy (playing_type_2) to avoid multicollinearity.
df_pysurvival_nn = df_pysurvival_nn.drop(columns=['playing_type_2'])
# Convert the death time to whole minutes.
df_pysurvival_nn['death_time'] = (df_pysurvival_nn['death_time'] / 60).round(0)

# Columns that are not used as model inputs.
df_pysurvival_nn = df_pysurvival_nn.drop(
    columns=['distance_sum', 'index', 'playing_type', 'assist',
             'item_stack_count', 'damage', 'dist_on_freefall', 'rank']
)

# Reproducible 80/20 split on row positions.
index_train, index_test = train_test_split(
    range(df_pysurvival_nn.shape[0]), test_size=0.2, random_state=20
)
data_train = df_pysurvival_nn.loc[index_train].reset_index(drop=True)
data_test = df_pysurvival_nn.loc[index_test].reset_index(drop=True)

# Features (X), durations (T) and event indicators (E) for both splits.
target_columns = ['death_time', 'event']
X_train = data_train.drop(columns=target_columns)
X_test = data_test.drop(columns=target_columns)
T_train = data_train['death_time'].values
T_test = data_test['death_time'].values
E_train = data_train['event'].values
E_test = data_test['event'].values

# Two hidden SELU layers of 129 units each.
structure = [{'activation': 'SELU', 'num_units': 129} for _ in range(2)]
nn_model = NonLinearCoxPHModel(structure=structure)
nn_model.fit(
    X_train, T_train, E_train,
    lr=1e-3,
    init_method='glorot_uniform',
    num_epochs=2000,
    dropout=0.4689794921875,
    l2_reg=4.69289697265625,
)
display_loss_values(nn_model)

In [None]:
from pysurvival.utils.metrics import brier_score
def brier_score_plot(model, X, T, E, figure_size):
    """Plot the Brier score over time and return its time-averaged value.

    Parameters
    ----------
    model : fitted survival model, passed through to ``brier_score``.
    X, T, E : features, durations and event indicators, passed through to
        ``brier_score``; ``T`` also sets the x-axis range and the
        normalisation constant.
    figure_size : tuple
        Figure size handed to ``plt.subplots``.

    Returns
    -------
    float
        Trapezoidal integral of the Brier score curve divided by ``max(T)``.
    """
    times, brier_scores = brier_score(model, X, T, E)
    # Anchor the curve at the origin without mutating the returned sequences.
    times = [0] + list(times)
    brier_scores = [0] + list(brier_scores)
    ibs_value = np.trapz(brier_scores, times)/max(T)

    fig, ax = plt.subplots(figsize=figure_size)
    title = 'Brier įvertinimų vidurkis = {:.2f}'.format(ibs_value)
    # Dotted red reference line at 0.25 (drawn once; the original drew it twice).
    ax.axhline(y=0.25, ls='dotted', color='red')
    ax.plot(times, brier_scores, color='blue', lw=2)
    ax.set_xlim(0, max(T))
    ax.set_ylim(0)
    ax.set_xlabel('Laikas (min)', fontsize=13)
    ax.set_ylabel('Brier įvertinimas BS(t)', fontsize=13)
    ax.set_title(title, fontsize=18)
    plt.show()
    return ibs_value

In [None]:
# Plot the Brier score curve for the test split and keep the value the
# function returns (integral of the curve divided by max(T)).
ibs = brier_score_plot(nn_model, X_test, T_test, E_test, figure_size=(15, 6))

In [None]:
# Model performance: concordance index (C-index) on the test split.
c_index = concordance_index(nn_model, X_test, T_test, E_test)
print('C-index:', format(c_index, '.2f'))

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import median_absolute_error
from pysurvival import utils
from pysurvival.models.non_parametric import KaplanMeierModel
from pysurvival import utils
from pysurvival.utils import metrics

def act_to_est(model, X, T, E, figure_size, times=None, metrics=('rmse', 'mean', 'median')):
    """Plot actual vs. predicted event counts over time and score the fit.

    A Kaplan-Meier model is fitted on ``(T, E)`` to produce the "actual"
    curve; the "predicted" curve is the column-wise sum of
    ``model.predict_density(X)``, read at the model time bucket whose start
    is closest to each requested time.

    Parameters
    ----------
    model : fitted survival model exposing ``predict_density`` and
        ``time_buckets``.
    X : feature matrix passed to ``model.predict_density``.
    T : array of durations (``T.shape[0]`` is used as the sample count).
    E : array of event indicators.
    figure_size : tuple
        Figure size handed to ``plt.subplots``.
    times : iterable, optional
        Time points to evaluate. Defaults to the Kaplan-Meier model's times.
    metrics : str, list, tuple or None
        Error measures to compute: names containing ``'rmse'``/``'root'``,
        ``'median'`` or ``'mean'``. ``None`` skips the computation.
        NOTE: this parameter shadows the ``metrics`` module imported at file
        level; the name is kept for backward compatibility.

    Returns
    -------
    None, float or dict
        ``None`` when ``metrics`` is ``None`` (or of an unsupported type),
        a single float when ``metrics`` is a string, otherwise a dict keyed
        by ``'rmse'`` / ``'median'`` / ``'mean'``.

    Raises
    ------
    NotImplementedError
        If none of the requested metric names is recognised.
    """
    kmf = KaplanMeierModel()
    kmf.fit(T, E)
    N = T.shape[0]
    if times is None:
        times = kmf.times
    actual = []
    predicted = []

    model_pred = np.sum(model.predict_density(X), 0)
    # Bucket starts do not depend on t, so extract them once.
    bucket_starts = np.asarray([start for (start, _end) in model.time_buckets])
    for t in times:
        index = np.argmin(np.abs(bucket_starts - t))
        # NOTE(review): the Kaplan-Meier density is called with (X, t) here;
        # confirm this matches KaplanMeierModel.predict_density's signature.
        actual.append(N*kmf.predict_density(X,t))
        predicted.append(model_pred[index])

    results = None
    title = 'Realus ir prognozuojamas ivykių skaičius'
    if metrics is not None:
        rmse = np.sqrt(mean_squared_error(actual, predicted))
        med_ae = median_absolute_error(actual, predicted)
        mae = mean_absolute_error(actual, predicted)

        # A single metric was requested.
        if isinstance(metrics, str):
            requested = metrics.lower()
            if 'rmse' in requested or 'root' in requested:
                results = rmse
                title += "\n Šaknis iš vidutinės kvadratinės paklaidos = {:.3f}".format(rmse)
            elif 'median' in requested:
                results = med_ae
                title += "\n Absoliutinės paklaidos mediana = {:.3f}".format(med_ae)
            elif 'mean' in requested:
                results = mae
                title += "\n Vidutinė absoliutinė paklaida = {:.3f}".format(mae)
            else:
                raise NotImplementedError('{} nėra tokio įvertinimo'.format(metrics))

        # A collection of metrics was requested (tuples are accepted too).
        elif isinstance(metrics, (list, tuple)):
            requested = [m.lower() for m in metrics]
            results = {}
            if any('rmse' in m or 'root' in m for m in requested):
                results['rmse'] = rmse
                title += "\n Šaknis iš vidutinės kvadratinės paklaidos = {:.3f}".format(rmse)
            if any('median' in m for m in requested):
                results['median'] = med_ae
                title += "\n Absoliutinių paklaidų mediana = {:.3f}".format(med_ae)
            if any('mean' in m for m in requested):
                results['mean'] = mae
                title += "\n Vidutinė absoliutinė paklaida = {:.3f}".format(mae)
            if not results:
                error = 'Nurodyti vertinimai nerasti'
                raise NotImplementedError(error)

    fig, ax = plt.subplots(figsize=figure_size)
    ax.plot(times, actual, color='red', label='Realus', alpha=0.8, lw=3)
    ax.plot(times, predicted, color='blue', label='Prognozuojamas', alpha=0.8, lw=3)
    ax.set_xlim(0, max(T))
    ax.set_ylim(0)
    ax.set_xlabel('Laikas (min)', fontsize=13)
    ax.set_ylabel('Įvykių skaičius', fontsize=13)
    ax.set_title(title, fontsize=15)
    ax.legend(fontsize=15)
    plt.show()

    return results

In [None]:
# Actual vs. predicted event counts on the test split; only the RMSE and the
# mean absolute error are requested, so the result is a dict with those keys.
results_end = act_to_est(nn_model, X_test, T_test, E_test, figure_size=(15, 6), metrics=['rmse', 'mean'])

In [None]:
# Stack (event, death_time) side by side for each split.
y_trn = np.column_stack((E_train, T_train))
y_tst = np.column_stack((E_test, T_test))

# Two-column frames with a boolean event flag and the duration.
ytrn = pd.DataFrame({'event': y_trn[:, 0].astype('bool'), 'death_time': y_trn[:, 1]})
ytst = pd.DataFrame({'event': y_tst[:, 0].astype('bool'), 'death_time': y_tst[:, 1]})
s = ytrn.dtypes
s1 = ytst.dtypes

# Structured arrays with named fields ('event', 'death_time'), one record per row.
y_train = np.array(
    list(ytrn.itertuples(index=False, name=None)),
    dtype=list(zip(s.index, s)),
)
y_test = np.array(
    list(ytst.itertuples(index=False, name=None)),
    dtype=list(zip(s1.index, s1)),
)

In [None]:
# Observed duration range in each split.
train_min = y_train['death_time'].min()
train_max = y_train['death_time'].max()
test_min = y_test['death_time'].min()
test_max = y_test['death_time'].max()

In [None]:
from sksurv.metrics import (concordance_index_censored,concordance_index_ipcw,cumulative_dynamic_auc)
# Evaluation grid: 1..30. death_time is stored in whole minutes (it is divided
# by 60 and rounded during preprocessing), so these are minutes, not seconds.
times = np.arange(1, 31, 1)

# Risk scores of the fitted network on the test split.
prediction = nn_model.predict_risk(X_test)
va_auc, va_mean_auc = cumulative_dynamic_auc(y_train, y_test, prediction, times)

plt.plot(times, va_auc, marker="o")
# Dashed line marks the mean AUC returned alongside the per-time values.
plt.axhline(va_mean_auc, linestyle="--")
# Axis label fixed: the time axis is in minutes.
plt.xlabel("Time played (minutes)")
plt.ylabel("time-dependent AUC")
plt.grid(True)
# Last expression of the cell: displays the mean AUC in the notebook.
va_mean_auc

In [None]:
# Load previously saved risk scores from a headerless CSV.
# NOTE(review): presumably one score per test-set row, in X_test order —
# later cells use the row positions to index model predictions; confirm.
ds_riskscores = pd.read_csv(r'C:\Users\grina\Desktop\VGTU\nncox_risk2.csv', header=None)
# Shift the scores by a constant so they move from the negative to the
# positive range; the scale is arbitrary, so the end result is unaffected.
risks = ds_riskscores+21488.166688

In [None]:
from scipy import stats
import seaborn as sns
# Shared bin edges so that all three groups are drawn on the same grid.
counts, bins = np.histogram(risks, bins=30)
# Risk intervals were chosen visually. The interval boundaries are slightly
# offset from each other because otherwise the bins lie on top of each other.
low_mask = risks <= 12180
medium_mask = (risks >= 12200) & (risks <= 17870)
high_mask = risks >= 17910
low = risks[low_mask]
medium = risks[medium_mask]
high = risks[high_mask]

fig, ax = plt.subplots(figsize=(12, 6))
for _group, _color, _group_label in (
    (low, 'blue', 'Žemas rizikingumas'),
    (medium, 'green', 'Vidutinis rizikingumas'),
    (high, 'red', 'Aukštas rizikingumas'),
):
    sns.distplot(_group, bins=bins, color=_color, kde=False, label=_group_label)
ax.set_xlabel('Rizikingumas')
ax.set_ylabel('Kiekis')
ax.set_title('Žaidėjų rizikingumo histograma')
ax.legend()
sns.despine()

In [None]:
# Predicted survival curves for every test row. Computed here because this
# cell is the first one to use `survivals`; without it a top-to-bottom run
# fails with a NameError.
survivals = nn_model.predict_survival(X_test)

# Row positions of each risk group (same thresholds as the risk histogram)
# and the matching survival curves.
low_ind = np.where(risks <= 12180)[0]
surv_l = survivals[low_ind]

medium_ind = np.where((risks >= 12200) & (risks <= 17870))[0]
surv_m = survivals[medium_ind]

high_ind = np.where(risks >= 17910)[0]
surv_h = survivals[high_ind]

In [None]:
# Point-wise median survival curve of each risk group, each stored as the
# single element of a list (despite the "avg" name, the median is used).
avg_h = [np.median(surv_h, axis=0)]
avg_m = [np.median(surv_m, axis=0)]
avg_l = [np.median(surv_l, axis=0)]

In [None]:
# Plot the median survival curve of each risk group.
fig, ax = plt.subplots(figsize=(12, 6))
colors = ['red', 'green', 'blue']
risk_types = ['aukštas', 'vidutinis', 'žemas']
# Defined here so the cell does not depend on a `title` left over from a
# later cell (a top-to-bottom run would otherwise raise a NameError).
title = 'Skirtingo rizikingumo žaidėjų išlikimo tikimybių kreivės'
group_curves = [avg_h[0], avg_m[0], avg_l[0]]
for name, color, curve in zip(risk_types, colors, group_curves):
    _label = '{} rizikingumas'.format(name)
    plt.plot(nn_model.times, curve, color=color, label=_label, lw=2)
plt.legend(fontsize=12)
plt.title(title, fontsize=15)
plt.show()

In [None]:
# Plot, for each risk label, one randomly picked player's survival curve
# (dotted) together with the group median curve (solid), and mark the time at
# which each group median curve is closest to S(t) = 0.5.
fig, ax = plt.subplots(figsize=(12, 6))
ymin, ymax = plt.ylim()  # not used further in this cell
# Structured array so rows can be ordered by walking distance first and by
# vehicle distance second.
a = np.empty(X_test.shape[0], dtype=[("dist_on_foot", float), ("dist_on_vehicle", float)])

a["dist_on_foot"] = X_test['dist_on_foot'].to_numpy()
a["dist_on_vehicle"] = X_test['dist_on_vehicle'].to_numpy()
sort_idx = np.argsort(a, order=['dist_on_foot', 'dist_on_vehicle'])
# Pick one random row position from each third of the sorted order.
# The draw is not seeded, so the picked players change between runs.
list_ind = []
k = 0
arr_l = len(sort_idx)
for i in [int(arr_l*(1/3)),int(arr_l*(2/3)),int(arr_l)]:
    ind = np.random.choice(sort_idx[k:i], 1)[0]
    list_ind.append(ind)
    k = i
# Labels are paired with the thirds in order: the lowest-distance third is
# labelled "aukštas" (high risk).
# NOTE(review): assumes less distance travelled means higher risk — confirm.
risk_types = ['aukštas', 'vidutinis', 'žemas']
colors = ['red', 'green', 'blue']
#calculating all survivals because when inserting separate indexes - calculation is incorrect(?)
survivals = nn_model.predict_survival(X_test)
c = 0
for name, (i, avg) in zip(risk_types, zip(list_ind, [avg_h[0], avg_m[0], avg_l[0]])):
    survival = survivals[i]
    _label = '{} rizikingumas'.format(name)
    # Individual player's curve (dotted, carries the legend label).
    plt.plot(nn_model.times, survival, color=colors[c], label=_label, lw=3, ls='dotted')
    # Pair each group-median value with its time, then find the position
    # where the median curve is closest to 0.5.
    temp = np.column_stack((avg, nn_model.times))
    test = min(range(len(avg)), key=lambda x: abs(avg[x]-0.5))
    # Vertical marker and text label at that time.
    plt.axvline(x=temp[test][1], color=colors[c], ls ='--', ymax=0.47)
    plt.text(temp[test][1], 0.25, str(temp[test][1]), ha='center', va='center',rotation='horizontal', backgroundcolor='white')
    # Group median curve (solid, no legend entry).
    plt.plot(nn_model.times, avg, color=colors[c], lw=2)
    c += 1
# Horizontal 0.5 line; `temp` and `test` here are the values left over from
# the LAST loop iteration, so the line ends at that group's marked time.
# NOTE(review): 34.5 is presumably the x-axis width used to turn the time
# into an axes fraction — confirm.
plt.axhline(y=0.5, color='black', ls ='--', xmax=(temp[test][1])/34.5)
title = 'Skirtingo rizikingumo žaidėjų išlikimo tikimybių kreivės'
plt.xlabel('Laikas (min)')
plt.ylabel('Išlikimo tikimybė S(t)')
ax.set_ylim(bottom=0)
ax.set_xlim(0)
plt.legend(fontsize=12)
plt.title(title, fontsize=15)
plt.show()

In [None]:
# NumPy array form of the shifted risk scores.
risks_nd = risks.to_numpy()

In [None]:
# Survival curves of three randomly picked players (one per third of the
# distance ordering), each with a marker at the time where the curve is
# closest to S(t) = 0.5.
fig, ax = plt.subplots(figsize=(12, 6))
ymin, ymax = plt.ylim()

# Order the test rows by walking distance, then by vehicle distance.
a = np.empty(X_test.shape[0], dtype=[("dist_on_foot", float), ("dist_on_vehicle", float)])
a["dist_on_foot"] = X_test['dist_on_foot'].to_numpy()
a["dist_on_vehicle"] = X_test['dist_on_vehicle'].to_numpy()
sort_idx = np.argsort(a, order=['dist_on_foot', 'dist_on_vehicle'])

# One random row position out of each consecutive third of the ordering.
arr_l = len(sort_idx)
upper_bounds = [int(arr_l*(1/3)), int(arr_l*(2/3)), int(arr_l)]
lower_bounds = [0] + upper_bounds[:-1]
list_ind = [
    np.random.choice(sort_idx[lo:hi], 1)[0]
    for lo, hi in zip(lower_bounds, upper_bounds)
]

risk_types = ['aukštas', 'vidutinis', 'žemas']
colors = ['red', 'green', 'blue']
# All survival curves are computed at once because predicting for separate
# indexes appeared to give incorrect results(?).
survivals = nn_model.predict_survival(X_test)

for (i, risk), color in zip(zip(list_ind, risk_types), colors):
    survival = survivals[i]
    plt.plot(nn_model.times, survival, color=color,
             label='{} rizikingumas'.format(risk), lw=2)
    # Pair each survival value with its time and locate the one nearest 0.5.
    temp = np.column_stack((survival, nn_model.times))
    test = min(range(len(survival)), key=lambda x: abs(survival[x]-0.5))
    marked_time = temp[test][1]
    plt.axvline(x=marked_time, color=color, ls='--', ymax=0.47)
    ax.annotate(str(marked_time), xy=(marked_time, 0.02), fontsize=14)

# Horizontal 0.5 line reaching the marked time of the last plotted player.
plt.axhline(y=0.5, color='black', ls='--', xmax=marked_time/34.5)

title = 'Skirtingo rizikingumo žaidėjų išlikimo tikimybių kreivės'
ax.set_xlabel('Laikas (min)')
ax.set_ylabel('Išlikimo tikimybė S(t)')
ax.set_ylim(bottom=0)
ax.set_xlim(0)
ax.legend(fontsize=12)
ax.set_title(title, fontsize=15)
plt.show()


In [None]:
# Histogram (30 bins, no KDE overlay) of all shifted risk scores.
# NOTE(review): distplot is deprecated in newer seaborn releases — confirm
# the installed version before re-running.
sns.distplot(risks, kde=False, bins=30)

In [None]:
%%time
# Permutation-style feature importance: for every input feature, shuffle that
# single column in the training data, refit the network and record the
# C-index on the untouched test split.
c_list = {}

# The data preparation does not depend on the feature being permuted, so it
# is done once instead of reloading the CSV on every iteration.
df_pysurvival_nn = model_data()
df_pysurvival_nn = pd.concat([df_pysurvival_nn, pd.get_dummies(df_pysurvival_nn['playing_type'], prefix='playing_type')], axis=1)
df_pysurvival_nn.rename(columns={"playing_type_1":"solo", "playing_type_2":"duo", "playing_type_3":"squad"}, inplace=True)
# Drop the duo dummy to avoid multicollinearity.
df_pysurvival_nn.drop(columns=['duo'], inplace=True)
# Convert the death time to whole minutes.
df_pysurvival_nn['death_time'] = df_pysurvival_nn['death_time'].div(60).round(0)

df_pysurvival_nn.drop(columns=['distance_sum', 'index', 'playing_type', 'assist', 'item_stack_count', 'damage', 'dist_on_freefall', 'rank'], inplace=True)
index_train, index_test = train_test_split(range(df_pysurvival_nn.shape[0]), test_size = 0.2, random_state=20)
data_train = df_pysurvival_nn.loc[index_train].reset_index(drop = True)
data_test  = df_pysurvival_nn.loc[index_test].reset_index(drop = True)
X_train, X_test = data_train.drop(columns=['death_time', 'event']), data_test.drop(columns=['death_time', 'event'])
T_train, T_test = data_train['death_time'].values, data_test['death_time'].values
E_train, E_test = data_train['event'].values, data_test['event'].values

structure = [{'activation': 'SELU', 'num_units': 129}, {'activation': 'SELU', 'num_units': 129}]

# Iterate over the actual model inputs only. The previous list came from the
# full frame, so it contained the targets ('death_time', 'event') and
# pre-rename dummy names, which are not columns of X_train (KeyError).
full_list = X_train.columns.tolist()
for feature in full_list:
    # Permute one column on a copy so every refit starts from clean data.
    X_permuted = X_train.copy()
    X_permuted[feature] = np.random.permutation(X_permuted[feature])
    # Separate names keep the baseline `nn_model` and `c_index` intact; the
    # baseline C-index is needed afterwards to compute the differences.
    permuted_model = NonLinearCoxPHModel(structure=structure)
    permuted_model.fit(X_permuted, T_train, E_train, lr=1e-3, init_method='glorot_uniform', num_epochs=2000, dropout=0.4689794921875, l2_reg=4.69289697265625)
    c_list[feature] = concordance_index(permuted_model, X_test, T_test, E_test)
print(c_list)

In [None]:
# Table of C-index drops per permuted feature, largest drop first.
# Dict views are materialised as lists: they are not sequences, and building
# both columns in one constructor call keeps them aligned.
c_ind_final = pd.DataFrame({
    'name': list(c_list.keys()),
    'diff': list(c_list.values()),
})
# NOTE(review): `c_index` must hold the baseline (unpermuted) C-index here;
# verify it has not been overwritten by an earlier cell.
c_ind_final['diff'] = c_index - c_ind_final['diff']
c_ind_final.sort_values(by='diff', ascending=False)