# Dynamic-DeepHit Tutorial

### by Changhee Lee

In [1]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [2]:
cd '/content/drive/MyDrive/Dynamic_Deep_Hit/'

/content/drive/MyDrive/Dynamic_Deep_Hit


In [2]:
%tensorflow_version 1.x

UsageError: Line magic function `%tensorflow_version` not found.


In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

In [None]:
!jupyter nbconvert --to python tutorial_mimic_5Day.ipynb

[NbConvertApp] Converting notebook tutorial_mimic_5Day.ipynb to python
[NbConvertApp] Writing 18596 bytes to tutorial_mimic_5Day.py


In [300]:
_EPSILON = 1e-08

import numpy as np
import pandas as pd
import tensorflow as tf
import random
import os

from sklearn.model_selection import train_test_split

import import_mimic as impt


from class_DeepLongitudinal import Model_Longitudinal_Attention

from utils_eval             import c_index, brier_score
from utils_log              import save_logging, load_logging
from utils_helper           import f_get_minibatch, f_get_boosted_trainset

In [301]:
#int(1.2*6.1) = 7 

In [302]:
def _f_get_pred(sess, model, data, data_mi, pred_horizon):
    '''
        predictions based on the prediction time.
        create new_data and new_mask2 that are available previous or equal to the prediction time (no future measurements are used)
    '''
    new_data    = np.zeros(np.shape(data))
    new_data_mi = np.zeros(np.shape(data_mi))

    meas_time = np.concatenate([np.zeros([np.shape(data)[0], 1]), np.cumsum(data[:, :, 0], axis=1)[:, :-1]], axis=1)

    for i in range(np.shape(data)[0]):
        last_meas = np.sum(meas_time[i, :] <= pred_horizon)

        new_data[i, :last_meas, :]    = data[i, :last_meas, :]
        new_data_mi[i, :last_meas, :] = data_mi[i, :last_meas, :]

    return model.predict(new_data, new_data_mi)


def f_get_risk_predictions(sess, model, data_, data_mi_, pred_time, eval_time):
    
    pred = _f_get_pred(sess, model, data_[[0]], data_mi_[[0]], 0)
    _, num_Event, num_Category = np.shape(pred)
       
    risk_all = {}
    for k in range(num_Event):
        risk_all[k] = np.zeros([np.shape(data_)[0], len(pred_time), len(eval_time)])
            
    for p, p_time in enumerate(pred_time):
        ### PREDICTION
        pred_horizon = int(p_time)
        pred = _f_get_pred(sess, model, data_, data_mi_, pred_horizon)
        #print(pred.shape)
        print("Pred ==============================================")
        print(pred)


        for t, t_time in enumerate(eval_time):
            eval_horizon = int(t_time) + pred_horizon #if eval_horizon >= num_Category, output the maximum...

            # calculate F(t | x, Y, t >= t_M) = \sum_{t_M <= \tau < t} P(\tau | x, Y, \tau > t_M)
            risk = np.sum(pred[:,:,pred_horizon:(eval_horizon+1)], axis=2) #risk score until eval_time
            risk = risk / (np.sum(np.sum(pred[:,:,pred_horizon:], axis=2), axis=1, keepdims=True) +_EPSILON) #conditioniong on t > t_pred
            
            for k in range(num_Event):
                risk_all[k][:, p, t] = risk[:, k]
                
    return pred, risk_all

### 1. Import Dataset
#####      - Users must prepare dataset in csv format and modify 'import_data.py' following our examplar 'PBC2'

In [303]:
data_mode                   = 'PBC2' 
seed                        = 1234

##### IMPORT DATASET
'''
    num_Category            = max event/censoring time * 1.2
    num_Event               = number of evetns i.e. len(np.unique(label))-1
    max_length              = maximum number of measurements
    x_dim                   = data dimension including delta (1 + num_features)
    x_dim_cont              = dim of continuous features
    x_dim_bin               = dim of binary features
    mask1, mask2, mask3     = used for cause-specific network (FCNet structure)
'''

if data_mode == 'PBC2':
    (x_dim, x_dim_cont, x_dim_bin), (data, time, label), (mask1, mask2, mask3), (data_mi) = impt.import_dataset(norm_mode = 'standard')
    
    # This must be changed depending on the datasets, prediction/evaliation times of interest
    #pred_time = [52, 3*52, 5*52] # prediction time (in months)
    #pred_time = [300]
    #eval_time = [12, 36, 60, 120] # months evaluation time (for C-index and Brier-Score)
    #eval_time = [6]
    
    # pred_time = [5 * 24]
    # eval_time = [1, 2, 3, 4, 5, 6]
    pred_time = [30, 60, 90]
    eval_time = [30, 60, 90]
else:
    print ('ERROR:  DATA_MODE NOT FOUND !!!')

_, num_Event, num_Category  = np.shape(mask1)  # dim of mask3: [subj, Num_Event, Num_Category]
max_length                  = np.shape(data)[1]


file_path = '{}'.format(datetime.now().strftime("%Y%m%d_%H%M%S"))

if not os.path.exists(file_path):
    os.makedirs(file_path)

### 2. Set Hyper-Parameters
##### - Play with your own hyper-parameters!

In [304]:
burn_in_mode                = 'ON' #{'ON', 'OFF'}
boost_mode                  = 'OFF' #'ON' #{'ON', 'OFF'}

##### HYPER-PARAMETERS
# TODO: Change to 32
new_parser = {'mb_size': 4,

             'iteration_burn_in': 3000,
             #'iteration': 25000,
              'iteration': 10000,

             'keep_prob': 0.6,
             'lr_train': 1e-4,

             'h_dim_RNN': 100,
             'h_dim_FC' : 100,
             'num_layers_RNN':2,
             'num_layers_ATT':2,
             'num_layers_CS' :2,

             'RNN_type':'LSTM', #{'LSTM', 'GRU'}

             'FC_active_fn' : tf.nn.relu,
             'RNN_active_fn': tf.nn.tanh,

            'reg_W'         : 1e-5,
            'reg_W_out'     : 0.,

             'alpha' :1.0,
             'beta'  :0.1,
             'gamma' :1.0
}


# INPUT DIMENSIONS
input_dims                  = { 'x_dim'         : x_dim,
                                'x_dim_cont'    : x_dim_cont,
                                'x_dim_bin'     : x_dim_bin,
                                'num_Event'     : num_Event,
                                'num_Category'  : num_Category,
                                'max_length'    : max_length }
print(input_dims)

# NETWORK HYPER-PARMETERS
network_settings            = { 'h_dim_RNN'         : new_parser['h_dim_RNN'],
                                'h_dim_FC'          : new_parser['h_dim_FC'],
                                'num_layers_RNN'    : new_parser['num_layers_RNN'],
                                'num_layers_ATT'    : new_parser['num_layers_ATT'],
                                'num_layers_CS'     : new_parser['num_layers_CS'],
                                'RNN_type'          : new_parser['RNN_type'],
                                'FC_active_fn'      : new_parser['FC_active_fn'],
                                'RNN_active_fn'     : new_parser['RNN_active_fn'],
                                'initial_W'         : tf.contrib.layers.xavier_initializer(),

                                'reg_W'             : new_parser['reg_W'],
                                'reg_W_out'         : new_parser['reg_W_out']
                                 }


mb_size           = new_parser['mb_size']
iteration         = new_parser['iteration']
iteration_burn_in = new_parser['iteration_burn_in']

keep_prob         = new_parser['keep_prob']
lr_train          = new_parser['lr_train']

alpha             = new_parser['alpha']
beta              = new_parser['beta']
gamma             = new_parser['gamma']

# SAVE HYPERPARAMETERS
log_name = file_path + '/hyperparameters_log.txt'
save_logging(new_parser, log_name)

{'x_dim': 18, 'x_dim_cont': 16, 'x_dim_bin': 1, 'num_Event': 1, 'num_Category': 120, 'max_length': 445}


### 3. Split Dataset into Train/Valid/Test Sets

In [308]:
### TRAINING-TESTING SPLIT
# TODO: could do stratified k-fold
(tr_data,te_data, tr_data_mi, te_data_mi, tr_time,te_time, tr_label,te_label, 
 tr_mask1,te_mask1, tr_mask2,te_mask2, tr_mask3,te_mask3) = train_test_split(data, data_mi, time, label, mask1, mask2, mask3, test_size=0.2, random_state=seed) 

(tr_data,va_data, tr_data_mi, va_data_mi, tr_time,va_time, tr_label,va_label, 
 tr_mask1,va_mask1, tr_mask2,va_mask2, tr_mask3,va_mask3) = train_test_split(tr_data, tr_data_mi, tr_time, tr_label, tr_mask1, tr_mask2, tr_mask3, test_size=0.125, random_state=seed) 

# (tr_data,te_data, tr_data_mi, te_data_mi, tr_time,te_time, tr_label,te_label, 
#  tr_mask1,te_mask1, tr_mask2,te_mask2, tr_mask3,te_mask3) = train_test_split(data, data_mi, time, label, mask1, mask2, mask3, test_size=0.2, random_state=seed) 

# (tr_data,va_data, tr_data_mi, va_data_mi, tr_time,va_time, tr_label,va_label, 
#  tr_mask1,va_mask1, tr_mask2,va_mask2, tr_mask3,va_mask3) = train_test_split(tr_data, tr_data_mi, tr_time, tr_label, tr_mask1, tr_mask2, tr_mask3, test_size=0.2, random_state=seed) 





# if boost_mode == 'ON':
#     tr_data, tr_data_mi, tr_time, tr_label, tr_mask1, tr_mask2, tr_mask3 = f_get_boosted_trainset(tr_data, tr_data_mi, tr_time, tr_label, tr_mask1, tr_mask2, tr_mask3)

In [317]:
tr_data.shape, va_data.shape, te_data.shape

((4416, 445, 18), (631, 445, 18), (1262, 445, 18))

In [320]:
tr_label.sum(), va_label.sum(), te_label.sum()

(283.0, 48.0, 74.0)

In [316]:
data.shape

(6309, 445, 18)

In [309]:
te_time.shape, va_time.shape

((1262, 1), (631, 1))

In [311]:
va_label.mean()

0.07606973058637084

In [314]:
te_label

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [1.]])

In [None]:
tr_data.shape, va_data.shape, te_data.shape

((4416, 445, 18), (631, 445, 18), (1262, 445, 18))

In [168]:
va_label.sum() / 631, te_label.sum()/te_data.shape[0],  tr_label.sum()/tr_data.shape[0]

(0.07606973058637084, 0.058637083993660855, 0.06408514492753623)

In [13]:
va_data.shape

(1010, 445, 18)

In [14]:
te_data.shape

(1262, 445, 18)

In [15]:
tr_data.shape

(4037, 445, 18)

In [17]:
import time

### 4. Train the Networ

In [19]:
##### CREATE DYNAMIC-DEEPFHT NETWORK
tf.reset_default_graph()

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

model = Model_Longitudinal_Attention(sess, "Dyanmic-DeepHit", input_dims, network_settings)
saver = tf.train.Saver()

sess.run(tf.global_variables_initializer())

INFO:tensorflow:Scale of 0 disables regularizer.


In [151]:
tr_data.shape

(4037, 445, 18)

20220420_085927
20220427_091421
4_flagging.py
AUC_ROC_FINAL_Trey.ipynb
boxhed_MIMIC_Full_TREWScore_CV_Sliding_Window.ipynb
class_DeepLongitudinal.py
data
DD_pred_val_5day_Tmax120.npy
DD_risk_all_val_5day_Tmax120.npy
get_main_CF.py
import_data.py
import_mimic.py
main.py
notes.txt
old
OLD-ddh-packages.txt
old_requirements.txt
PBC2
PBC22022-04-17T19:34:03.619733
PBC22022-04-17T20:06:51.445516
PBC22022-04-17T20:46:42.447952
pt_class_DeepLongitudinal.py
__pycache__
README.md
sample.py
sandbox.ipynb
todo.txt
tutorial.ipynb
tutorial_mimic_5Day.ipynb
tutorial.py
utils_eval.py
utils_helper.py
utils_log.py
utils_network.py
_utils.py


In [None]:
PRINT_ITER = 1000
start = time.time()
### TRAINING - BURN-IN
if burn_in_mode == 'ON':
    print( "BURN-IN TRAINING ...")
    for itr in range(iteration_burn_in):
        x_mb, x_mi_mb, k_mb, t_mb, m1_mb, m2_mb, m3_mb = f_get_minibatch(mb_size, tr_data, tr_data_mi, tr_label, tr_time, tr_mask1, tr_mask2, tr_mask3)
        DATA = (x_mb, k_mb, t_mb)
        MISSING = (x_mi_mb)

        _, loss_curr = model.train_burn_in(DATA, MISSING, keep_prob, lr_train)

        if (itr+1)%PRINT_ITER == 0:
            print('itr: {:04d} | loss: {:.4f}'.format(itr+1, loss_curr))


### TRAINING - MAIN
print( "MAIN TRAINING ...")
min_valid = 0.5


for itr in range(iteration):
    x_mb, x_mi_mb, k_mb, t_mb, m1_mb, m2_mb, m3_mb = f_get_minibatch(mb_size, tr_data, tr_data_mi, tr_label, tr_time, tr_mask1, tr_mask2, tr_mask3)
    DATA = (x_mb, k_mb, t_mb)
    MASK = (m1_mb, m2_mb, m3_mb)
    MISSING = (x_mi_mb)
    PARAMETERS = (alpha, beta, gamma)

    _, loss_curr = model.train(DATA, MASK, MISSING, PARAMETERS, keep_prob, lr_train)

    if (itr+1)%PRINT_ITER == 0:
        print('itr: {:04d} | loss: {:.4f}'.format(itr+1, loss_curr))

    ### VALIDATION  (based on average C-index of our interest)
    if (itr+1)%PRINT_ITER == 0:        
        pred, risk_all = f_get_risk_predictions(sess, model, va_data, va_data_mi, pred_time, eval_time)
        
        for p, p_time in enumerate(pred_time):
            pred_horizon = int(p_time)
            val_result1 = np.zeros([num_Event, len(eval_time)])
            
            for t, t_time in enumerate(eval_time):                
                eval_horizon = int(t_time) + pred_horizon
                for k in range(num_Event):
                    val_result1[k, t] = c_index(risk_all[k][:, p, t], va_time, (va_label[:,0] == k+1).astype(int), eval_horizon) #-1 for no event (not comparable)
            
            if p == 0:
                val_final1 = val_result1
            else:
                val_final1 = np.append(val_final1, val_result1, axis=0)

        tmp_valid = np.mean(val_final1)
        print('val_loss: {:.4f}'.format(tmp_valid))

        if tmp_valid >  min_valid:
            # min_valid = tmp_valid
            saver.save(sess, file_path + '/model')
            print( 'updated.... average c-index = ' + str('%.4f' %(tmp_valid)))
        
end = time.time()
print('Elapsed Time= ', end - start)

print("================================")
print("         END OF TRAIN           ")
print("================================")

INFO:tensorflow:Scale of 0 disables regularizer.
BURN-IN TRAINING ...


OMP: Info #250: KMP_AFFINITY: pid 813035 tid 816612 thread 1 bound to OS proc set 1
OMP: Info #250: KMP_AFFINITY: pid 813035 tid 817159 thread 2 bound to OS proc set 2
OMP: Info #250: KMP_AFFINITY: pid 813035 tid 817160 thread 3 bound to OS proc set 3
OMP: Info #250: KMP_AFFINITY: pid 813035 tid 817161 thread 4 bound to OS proc set 4
OMP: Info #250: KMP_AFFINITY: pid 813035 tid 817162 thread 5 bound to OS proc set 5
OMP: Info #250: KMP_AFFINITY: pid 813035 tid 817163 thread 6 bound to OS proc set 6
OMP: Info #250: KMP_AFFINITY: pid 813035 tid 817164 thread 7 bound to OS proc set 7
OMP: Info #250: KMP_AFFINITY: pid 813035 tid 817165 thread 8 bound to OS proc set 8
OMP: Info #250: KMP_AFFINITY: pid 813035 tid 817166 thread 9 bound to OS proc set 9
OMP: Info #250: KMP_AFFINITY: pid 813035 tid 817167 thread 10 bound to OS proc set 10
OMP: Info #250: KMP_AFFINITY: pid 813035 tid 817168 thread 11 bound to OS proc set 11
OMP: Info #250: KMP_AFFINITY: pid 813035 tid 817169 thread 12 bound to O

In [173]:
end = 5.5
start = 0.12
"time: " + str(end-start)

'time: 5.38'

In [93]:
from datetime import date, datetime
datetime.now().strftime("%Y%m%d_%H%M%S")

'20220420_084940'

In [14]:
with open('DD_pred_val_5day_Tmax120.npy', 'wb') as f:
    np.save(f, pred)

In [15]:
with open('DD_risk_all_val_5day_Tmax120.npy', 'wb') as f:
    np.save(f, risk_all)

In [17]:
with open('DD_pred_val_5day_Tmax120.npy', 'rb') as f:
    temp1 = np.load(f)

In [None]:
# We have pred and risk_all
temp1.shape

### 5. Test the Trained Network

In [174]:
file_path

'20220427_091421'

In [175]:
file_path = 'PBC2'

In [176]:

pred_time = [5 * 24]
eval_time = [1, 2, 3, 4, 5, 6]
saver.restore(sess, file_path + '/model')


pred_test, risk_all = f_get_risk_predictions(sess, model, te_data, te_data_mi, pred_time, eval_time)

for p, p_time in enumerate(pred_time):
    pred_horizon = int(p_time)
    result1, result2 = np.zeros([num_Event, len(eval_time)]), np.zeros([num_Event, len(eval_time)])

    for t, t_time in enumerate(eval_time):                
        eval_horizon = int(t_time) + pred_horizon
        for k in range(num_Event):
            result1[k, t] = c_index(risk_all[k][:, p, t], te_time, (te_label[:,0] == k+1).astype(int), eval_horizon) #-1 for no event (not comparable)
            result2[k, t] = brier_score(risk_all[k][:, p, t], te_time, (te_label[:,0] == k+1).astype(int), eval_horizon) #-1 for no event (not comparable)
    
    if p == 0:
        final1, final2 = result1, result2
    else:
        final1, final2 = np.append(final1, result1, axis=0), np.append(final2, result2, axis=0)
        
        
row_header = []
for p_time in pred_time:
    for t in range(num_Event):
        row_header.append('pred_time {}: event_{}'.format(p_time,k+1))
            
col_header = []
for t_time in eval_time:
    col_header.append('eval_time {}'.format(t_time))

# c-index result
df1 = pd.DataFrame(final1, index = row_header, columns=col_header)

# brier-score result
df2 = pd.DataFrame(final2, index = row_header, columns=col_header)

### PRINT RESULTS
print('========================================================')
print('--------------------------------------------------------')
print('- C-INDEX: ')
print(df1)
print('--------------------------------------------------------')

print('- BRIER-SCORE: ')
print(df2)
print('========================================================')

INFO:tensorflow:Restoring parameters from PBC2/model
[[[0.01563262 0.01541429 0.01378932 ... 0.00513091 0.00712188 0.00735835]]

 [[0.02222462 0.0223603  0.01797179 ... 0.00337099 0.00593554 0.0058005 ]]

 [[0.03116917 0.03214696 0.02267849 ... 0.00186631 0.00436422 0.00399641]]

 ...

 [[0.01243486 0.0121342  0.01152251 ... 0.00629858 0.00765254 0.00815008]]

 [[0.0153534  0.01512579 0.01360126 ... 0.00522416 0.00717543 0.00742598]]

 [[0.04741859 0.05115076 0.02853565 ... 0.00053141 0.00207579 0.00169005]]]
--------------------------------------------------------
- C-INDEX: 
                        eval_time 1  eval_time 2  eval_time 3  eval_time 4  \
pred_time 120: event_1          0.0          0.0          0.0          0.0   

                        eval_time 5  eval_time 6  
pred_time 120: event_1          0.0          0.0  
--------------------------------------------------------
- BRIER-SCORE: 
                        eval_time 1  eval_time 2  eval_time 3  eval_time 4  \
pred_t

In [179]:
tr_data.shape, tr_data_mi.shape

((4037, 445, 18), (4037, 445, 18))

In [29]:
te_data.shape, te_data_mi.shape

((1262, 445, 18), (1262, 445, 18))

## Prediction

In [182]:
preds_train = model.predict(tr_data, tr_data_mi)

In [183]:
preds_train.shape

(4037, 1, 120)

In [184]:
# preds_val = 
preds_val = model.predict(va_data, va_data_mi)
preds_test = model.predict(te_data, te_data_mi)

preds_val.shape, preds_test.shape

((1010, 1, 120), (1262, 1, 120))

## Build flag dataset

* ep_ids: Length of 5000, but should only have 50 unique values for each patient.
* deltas: Length of 5000, but should have less than 50 values of 1 bc not every patient will have experienced the event.
* Y: 5000 all 1's
* hzrds: length of 5000

In [119]:
arr = []
for_flagging = pd.DataFrame()

In [115]:
preds_test = preds_test.reshape(1262, 120)
preds_test.shape

(1262, 120)

In [284]:
preds_test[-1]

array([[4.74185869e-02, 5.11507578e-02, 2.85356529e-02, 4.94656302e-02,
        1.18266642e-02, 5.62251285e-02, 7.60919154e-02, 4.70779873e-02,
        2.83700787e-02, 2.02239342e-02, 3.84919643e-02, 5.90230003e-02,
        3.49877514e-02, 1.63850021e-02, 9.40092839e-03, 1.89928189e-02,
        1.24453669e-02, 2.80708224e-02, 1.41971111e-02, 1.75902918e-02,
        5.74528845e-03, 1.64338909e-02, 1.72734875e-02, 8.49860720e-03,
        6.59849914e-03, 5.72214182e-03, 1.18650254e-02, 4.21600509e-03,
        7.65543664e-03, 5.63673628e-03, 3.07314238e-03, 1.13037638e-02,
        3.96570656e-03, 2.92197522e-03, 2.60574231e-03, 6.45308057e-04,
        9.24878102e-03, 1.97491003e-03, 1.11353013e-03, 1.45348930e-03,
        1.95527100e-03, 9.19480901e-03, 1.71024425e-04, 6.02528127e-03,
        1.22825950e-02, 4.65025101e-03, 8.28730501e-03, 1.19715706e-02,
        1.24806678e-03, 5.64308325e-03, 1.32221393e-02, 1.80227426e-03,
        1.97636167e-04, 9.06530360e-04, 9.90373373e-05, 1.269429

In [283]:
for_flagging

Unnamed: 0,id,hazard,label,Y
0,0.0,0.015633,0.0,1
1,0.0,0.015414,0.0,1
2,0.0,0.013789,0.0,1
3,0.0,0.015937,0.0,1
4,0.0,0.010537,0.0,1
...,...,...,...,...
151435,1261.0,0.002706,0.0,1
151436,1261.0,0.000769,0.0,1
151437,1261.0,0.000531,0.0,1
151438,1261.0,0.002076,0.0,1


In [124]:
np.repeat(2, 120)

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [125]:
arr = np.array([])
for i in range(1262):
    new = np.repeat(i, 120)
    arr = np.concatenate((arr, new), axis=None)

In [133]:
te_label.shape

(1262, 1)

In [131]:
for_flagging['id'] = arr
for_flagging['hazard'] = preds_test.reshape(1262*120)
for_flagging

Unnamed: 0,id,hazard
0,0.0,0.015633
1,0.0,0.015414
2,0.0,0.013789
3,0.0,0.015937
4,0.0,0.010537
...,...,...
151435,1261.0,0.002706
151436,1261.0,0.000769
151437,1261.0,0.000531
151438,1261.0,0.002076


In [136]:
# for i in te_label:
#     print(i)
#     return

arr = np.array([])
for i in range(1262):
    new = np.repeat(i, 120)
    arr = np.concatenate((arr, new), axis=None)

(1262, 1)

In [206]:
te_label.shape

(1262, 1)

In [205]:
te_label.reshape(1262).shape

(1262,)

In [137]:
l_arr = np.array([])
for i in te_label:
    new = np.repeat(i, 120)
    l_arr = np.concatenate((l_arr, new), axis=None)
l_arr.shape

(151440,)

In [143]:
for_flagging['label'] = l_arr
for_flagging

Unnamed: 0,id,hazard,label
0,0.0,0.015633,0.0
1,0.0,0.015414,0.0
2,0.0,0.013789,0.0
3,0.0,0.015937,0.0
4,0.0,0.010537,0.0
...,...,...,...
151435,1261.0,0.002706,1.0
151436,1261.0,0.000769,1.0
151437,1261.0,0.000531,1.0
151438,1261.0,0.002076,1.0


In [211]:
te_time

array([[120.      ],
       [ 47.436667],
       [ 47.033611],
       ...,
       [ 82.418611],
       [ 53.641667],
       [ 76.731944]])

In [214]:
te_label

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [1.]])

In [232]:
te_time_ceil = np.ceil(te_time.reshape(te_time.shape[0])).astype(int)

In [233]:
te_time_ceil

array([120,  48,  48, ...,  83,  54,  77])

In [235]:
curr = np.repeat(0, 120)
curr[te_time_ceil[0]] = 1
curr

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [258]:
l_array = np.array([]).astype(int)
l_array = np.concatenate([l_array, np.repeat(0, 120)])
l_array

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [260]:
te_label

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [1.]])

In [289]:
te_time_ceil

array([120,  48,  48, ...,  83,  54,  77])

In [296]:
tr_label.sum(), tr_label.shape

(255.0, (4037, 1))

In [299]:
tr_label.mean() * 100, va_label.mean() * 100, te_label.mean() * 100

(6.31657171166708, 7.524752475247524, 5.863708399366086)

In [294]:
te_label.sum(), te_label.shape

(74.0, (1262, 1))

In [295]:
va_label.sum(), va_label.shape

(76.0, (1010, 1))

In [261]:
l_array = np.array([]) 
for i, label in enumerate(te_label.reshape(te_label.shape[0])):
    print(i, label)
    if label == 0:
        l_array = np.concatenate([l_array, np.repeat(0, 120)])
    else:
        curr = np.repeat(0, 120)
        curr[te_time_ceil[i]] = 1
        l_array = np.concatenate([l_array, curr])

0 0.0
1 0.0
2 0.0
3 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0
9 0.0
10 0.0
11 0.0
12 0.0
13 0.0
14 0.0
15 0.0
16 0.0
17 0.0
18 0.0
19 0.0
20 0.0
21 0.0
22 0.0
23 1.0
24 0.0
25 0.0
26 0.0
27 0.0
28 1.0
29 0.0
30 0.0
31 0.0
32 0.0
33 0.0
34 0.0
35 0.0
36 0.0
37 0.0
38 0.0
39 0.0
40 0.0
41 0.0
42 0.0
43 0.0
44 0.0
45 0.0
46 0.0
47 0.0
48 0.0
49 0.0
50 0.0
51 0.0
52 0.0
53 0.0
54 0.0
55 0.0
56 0.0
57 0.0
58 0.0
59 0.0
60 0.0
61 0.0
62 0.0
63 0.0
64 0.0
65 0.0
66 0.0
67 0.0
68 0.0
69 0.0
70 0.0
71 0.0
72 0.0
73 0.0
74 0.0
75 0.0
76 0.0
77 0.0
78 0.0
79 0.0
80 0.0
81 0.0
82 0.0
83 1.0
84 0.0
85 0.0
86 0.0
87 0.0
88 0.0
89 0.0
90 0.0
91 0.0
92 0.0
93 0.0
94 0.0
95 0.0
96 0.0
97 0.0
98 0.0
99 0.0
100 0.0
101 0.0
102 0.0
103 0.0
104 0.0
105 0.0
106 0.0
107 0.0
108 0.0
109 0.0
110 0.0
111 0.0
112 0.0
113 0.0
114 1.0
115 0.0
116 0.0
117 0.0
118 0.0
119 0.0
120 0.0
121 0.0
122 0.0
123 0.0
124 0.0
125 0.0
126 0.0
127 0.0
128 0.0
129 1.0
130 1.0
131 0.0
132 0.0
133 0.0
134 0.0
135 0.0
136 0.0
137 0.0
138 0.

In [270]:
for_flagging['label'] = l_array
for_flagging

Unnamed: 0,id,hazard,label
0,0.0,0.015633,0.0
1,0.0,0.015414,0.0
2,0.0,0.013789,0.0
3,0.0,0.015937,0.0
4,0.0,0.010537,0.0
...,...,...,...
151435,1261.0,0.002706,0.0
151436,1261.0,0.000769,0.0
151437,1261.0,0.000531,0.0
151438,1261.0,0.002076,0.0


In [274]:
for_flagging['Y'] = np.repeat(1, for_flagging.shape[0])
for_flagging

Unnamed: 0,id,hazard,label,Y
0,0.0,0.015633,0.0,1
1,0.0,0.015414,0.0,1
2,0.0,0.013789,0.0,1
3,0.0,0.015937,0.0,1
4,0.0,0.010537,0.0,1
...,...,...,...,...
151435,1261.0,0.002706,0.0,1
151436,1261.0,0.000769,0.0,1
151437,1261.0,0.000531,0.0,1
151438,1261.0,0.002076,0.0,1


In [281]:
for_flagging.to_csv('for_flagging.csv', index=False)

In [279]:
for_flagging.id.values

array([   0.,    0.,    0., ..., 1261., 1261., 1261.])

In [280]:
from flagging import flag

flag(for_flagging['id'].values, for_flagging['label'].values, for_flagging['Y'].values, for_flagging['hazard'].values, 2)

 84%|███████████████████████████████████████████████████████████████████▎            | 531/631 [00:00<00:00, 816.91it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 631/631 [00:00<00:00, 850.74it/s]


TypeError: unsupported operand type(s) for |: 'dict' and 'dict'

In [120]:
# for i, row in enumerate(preds_test):
#     for j, cell in enumerate(row):
#         for_flagging.iloc[i*j + j, 0] = i

IndexError: single positional indexer is out-of-bounds

In [199]:
for_flagging

Unnamed: 0,id,hazard,label
0,0.0,0.015633,0.0
1,0.0,0.015414,0.0
2,0.0,0.013789,0.0
3,0.0,0.015937,0.0
4,0.0,0.010537,0.0
...,...,...,...
151435,1261.0,0.002706,1.0
151436,1261.0,0.000769,1.0
151437,1261.0,0.000531,1.0
151438,1261.0,0.002076,1.0


## Prediction - Other

In [83]:
preds_total = model.predict(data, data_mi)

In [85]:
preds_total.shape

(6309, 1, 120)

In [84]:
preds_total

array([[[0.00959601, 0.00926501, 0.00934851, ..., 0.00758365,
         0.00802149, 0.0088137 ]],

       [[0.01679892, 0.01664792, 0.01456643, ..., 0.00475689,
         0.00689238, 0.00705252]],

       [[0.01589489, 0.01569101, 0.01396801, ..., 0.00504519,
         0.00707251, 0.00728838]],

       ...,

       [[0.02102844, 0.02108788, 0.01724927, ..., 0.00363602,
         0.00614285, 0.00606709]],

       [[0.01385891, 0.01358891, 0.01255394, ..., 0.00574734,
         0.0074249 , 0.00779846]],

       [[0.0115911 , 0.0112757 , 0.01089281, ..., 0.00665276,
         0.00777734, 0.00835884]]], dtype=float32)

In [46]:
# Zhale's older stuff below

In [23]:
pred_test.shape

(6309, 1, 120)

In [27]:
pred_test_reshaped = pred_test.reshape(6309, 120)

In [29]:
pred_test_df = pd.DataFrame(pred_test_reshaped)

In [30]:
pred_test_df.to_csv('DD_pred_test_5day_Tmax120.csv', index = False)

In [26]:
with open('DD_pred_test_5day.npy', 'wb') as f:
    np.save(f, pred_test)

In [31]:
with open('DD_risk_all_test_5day_Tmax120.npy', 'wb') as f:
    np.save(f, risk_all)

In [28]:
risk_all[0].shape

(6309, 1, 6)

In [29]:
pred_test.shape

(6309, 1, 144)

In [18]:
pred_test[:10]

array([[[7.2067390e-17, 1.4920398e-12, 1.0000000e+00, 1.1679825e-10,
         2.1693908e-19, 4.0052980e-32]],

       [[7.5126031e-06, 9.9999213e-01, 4.0980569e-07, 7.4903677e-12,
         1.4688242e-11, 1.8345603e-09]],

       [[7.5126031e-06, 9.9999213e-01, 4.0980569e-07, 7.4903677e-12,
         1.4688242e-11, 1.8345603e-09]],

       [[7.5126031e-06, 9.9999213e-01, 4.0980569e-07, 7.4903677e-12,
         1.4688242e-11, 1.8345603e-09]],

       [[7.5126031e-06, 9.9999213e-01, 4.0980569e-07, 7.4903677e-12,
         1.4688242e-11, 1.8345603e-09]],

       [[7.5126031e-06, 9.9999213e-01, 4.0980569e-07, 7.4903677e-12,
         1.4688242e-11, 1.8345603e-09]],

       [[7.2067390e-17, 1.4920398e-12, 1.0000000e+00, 1.1679825e-10,
         2.1693908e-19, 4.0052980e-32]],

       [[7.2067390e-17, 1.4920398e-12, 1.0000000e+00, 1.1679825e-10,
         2.1693908e-19, 4.0052980e-32]],

       [[7.2067390e-17, 1.4920398e-12, 1.0000000e+00, 1.1679825e-10,
         2.1693908e-19, 4.0052980e-32]],

 

In [23]:
data[21]

array([[1., 0., 1.],
       [1., 0., 1.],
       [1., 0., 1.],
       [1., 0., 1.],
       [0., 0., 1.]])

In [17]:
pred_test_modified = pred_test.reshape(1000,6)
first = []
second = []
third = []
forth = []
fifth = []
sixth = []
for i in range(1000):
  first.append(pred_test_modified[i][0])
  second.append(pred_test_modified[i][1])
  third.append(pred_test_modified[i][2])
  forth.append(pred_test_modified[i][3])
  fifth.append(pred_test_modified[i][4])
  sixth.append(pred_test_modified[i][5])

In [142]:
#pred_time = 5

In [143]:
predictions_test5 = pd.DataFrame()
predictions_test5['o1'] = first
predictions_test5['o2'] = second
predictions_test5['o3'] = third
predictions_test5['o4'] = forth
predictions_test5['o5'] = fifth
predictions_test5['o6'] = sixth

In [144]:
predictions_test5[:10]

Unnamed: 0,o1,o2,o3,o4,o5,o6
0,3.93638e-09,1.638231e-09,4.353853e-16,1.531789e-10,4.775619e-09,1.0
1,6.469569e-05,0.999935,5.842321e-09,2.826791e-08,1.687874e-08,2.8066e-07
2,6.469569e-05,0.999935,5.842321e-09,2.826791e-08,1.687874e-08,2.8066e-07
3,6.469569e-05,0.999935,5.842321e-09,2.826791e-08,1.687874e-08,2.8066e-07
4,6.469569e-05,0.999935,5.842321e-09,2.826791e-08,1.687874e-08,2.8066e-07
5,6.469569e-05,0.999935,5.842321e-09,2.826791e-08,1.687874e-08,2.8066e-07
6,2.908279e-09,1.170994e-09,2.541257e-16,1.089041e-10,3.576643e-09,1.0
7,1.398696e-07,9.107065e-08,2.343834e-13,8.22621e-09,1.389398e-07,0.9999996
8,1.398696e-07,9.107065e-08,2.343834e-13,8.22621e-09,1.389398e-07,0.9999996
9,6.469569e-05,0.999935,5.842321e-09,2.826791e-08,1.687874e-08,2.8066e-07


In [None]:
#pred_time = 1

In [18]:
predictions_test1 = pd.DataFrame()
predictions_test1['o1'] = first
predictions_test1['o2'] = second
predictions_test1['o3'] = third
predictions_test1['o4'] = forth
predictions_test1['o5'] = fifth
predictions_test1['o6'] = sixth

In [21]:
predictions_test1[:10]

Unnamed: 0,o1,o2,o3,o4,o5,o6
0,2.42021e-10,6.096923e-09,1.375406e-14,2.363927e-09,5.230796e-08,1.0
1,8.552134e-05,0.9999127,6.242942e-08,1.772607e-07,2.220673e-07,1e-06
2,8.552134e-05,0.9999127,6.242942e-08,1.772607e-07,2.220673e-07,1e-06
3,8.552134e-05,0.9999127,6.242942e-08,1.772607e-07,2.220673e-07,1e-06
4,8.552134e-05,0.9999127,6.242942e-08,1.772607e-07,2.220673e-07,1e-06
5,8.552134e-05,0.9999127,6.242942e-08,1.772607e-07,2.220673e-07,1e-06
6,2.42021e-10,6.096923e-09,1.375406e-14,2.363927e-09,5.230796e-08,1.0
7,2.42021e-10,6.096923e-09,1.375406e-14,2.363927e-09,5.230796e-08,1.0
8,2.42021e-10,6.096923e-09,1.375406e-14,2.363927e-09,5.230796e-08,1.0
9,8.552134e-05,0.9999127,6.242942e-08,1.772607e-07,2.220673e-07,1e-06


In [None]:
#pred_time = 4

In [105]:
predictions_test4 = pd.DataFrame()
predictions_test4['o1'] = first
predictions_test4['o2'] = second
predictions_test4['o3'] = third
predictions_test4['o4'] = forth
predictions_test4['o5'] = fifth
predictions_test4['o6'] = sixth

In [106]:
predictions_test4[:10]

Unnamed: 0,o1,o2,o3,o4,o5,o6
0,1.056608e-10,1.203512e-10,8.280706e-18,2.640959e-11,3.371833e-10,1.0
1,2.194388e-05,0.9999778,7.436449e-09,3.90444e-08,1.947373e-08,2.34094e-07
2,2.194388e-05,0.9999778,7.436449e-09,3.90444e-08,1.947373e-08,2.34094e-07
3,2.194388e-05,0.9999778,7.436449e-09,3.90444e-08,1.947373e-08,2.34094e-07
4,2.194388e-05,0.9999778,7.436449e-09,3.90444e-08,1.947373e-08,2.34094e-07
5,2.194388e-05,0.9999778,7.436449e-09,3.90444e-08,1.947373e-08,2.34094e-07
6,7.785443e-11,8.852336e-11,4.885612e-18,1.907497e-11,2.516654e-10,1.0
7,1.040355e-08,1.626743e-08,1.754808e-14,3.003139e-09,2.260344e-08,1.0
8,1.040355e-08,1.626743e-08,1.754808e-14,3.003139e-09,2.260344e-08,1.0
9,2.194388e-05,0.9999778,7.436449e-09,3.90444e-08,1.947373e-08,2.34094e-07


#pred_time = 3

In [79]:
pred_test_modified = pred_test.reshape(1000,6)
first = []
second = []
third = []
forth = []
fifth = []
sixth = []
for i in range(1000):
  first.append(pred_test_modified[i][0])
  second.append(pred_test_modified[i][1])
  third.append(pred_test_modified[i][2])
  forth.append(pred_test_modified[i][3])
  fifth.append(pred_test_modified[i][4])
  sixth.append(pred_test_modified[i][5])

In [80]:
predictions_test3 = pd.DataFrame()
predictions_test3['o1'] = first
predictions_test3['o2'] = second
predictions_test3['o3'] = third
predictions_test3['o4'] = forth
predictions_test3['o5'] = fifth
predictions_test3['o6'] = sixth

In [81]:
predictions_test3[:10]

Unnamed: 0,o1,o2,o3,o4,o5,o6
0,2.62269e-12,3.533205e-11,1.174773e-17,1.144177e-11,2.364255e-10,1.0
1,3.940062e-05,0.9999602,1.167588e-08,3.348477e-08,3.626331e-08,3.447938e-07
2,3.940062e-05,0.9999602,1.167588e-08,3.348477e-08,3.626331e-08,3.447938e-07
3,3.940062e-05,0.9999602,1.167588e-08,3.348477e-08,3.626331e-08,3.447938e-07
4,3.940062e-05,0.9999602,1.167588e-08,3.348477e-08,3.626331e-08,3.447938e-07
5,3.940062e-05,0.9999602,1.167588e-08,3.348477e-08,3.626331e-08,3.447938e-07
6,1.672833e-12,2.336268e-11,6.133169e-18,7.513793e-12,1.631514e-10,1.0
7,4.193434e-10,4.239696e-09,1.538802e-14,1.156924e-09,1.388105e-08,1.0
8,4.193434e-10,4.239696e-09,1.538802e-14,1.156924e-09,1.388105e-08,1.0
9,3.940062e-05,0.9999602,1.167588e-08,3.348477e-08,3.626331e-08,3.447938e-07


In [87]:
risk_all[0][0:10]

array([[[1.14417703e-11, 2.47867227e-10, 2.47867227e-10, 1.00000000e+00]],

       [[7.88727254e-02, 1.64290249e-01, 1.64290249e-01, 9.76445198e-01]],

       [[7.88727254e-02, 1.64290249e-01, 1.64290249e-01, 9.76445198e-01]],

       [[7.88727254e-02, 1.64290249e-01, 1.64290249e-01, 9.76445198e-01]],

       [[7.88727254e-02, 1.64290249e-01, 1.64290249e-01, 9.76445198e-01]],

       [[7.88727254e-02, 1.64290249e-01, 1.64290249e-01, 9.76445198e-01]],

       [[7.51379341e-12, 1.70665176e-10, 1.70665176e-10, 1.00000000e+00]],

       [[1.15692378e-09, 1.50379762e-08, 1.50379762e-08, 1.00000000e+00]],

       [[1.15692378e-09, 1.50379762e-08, 1.50379762e-08, 1.00000000e+00]],

       [[7.88727254e-02, 1.64290249e-01, 1.64290249e-01, 9.76445198e-01]]])

In [60]:
pred_test_modified = pred_test.reshape(1000,6)
first = []
second = []
third = []
forth = []
fifth = []
sixth = []
for i in range(1000):
  first.append(pred_test_modified[i][0])
  second.append(pred_test_modified[i][1])
  third.append(pred_test_modified[i][2])
  forth.append(pred_test_modified[i][3])
  fifth.append(pred_test_modified[i][4])
  sixth.append(pred_test_modified[i][5])

In [61]:
predictions_test = pd.DataFrame()
predictions_test['o1'] = first
predictions_test['o2'] = second
predictions_test['o3'] = third
predictions_test['o4'] = forth
predictions_test['o5'] = fifth
predictions_test['o6'] = sixth

In [63]:
predictions_test[:10]

Unnamed: 0,o1,o2,o3,o4,o5,o6
0,2.226777e-09,1.008492e-10,5.137436e-16,4.549167e-11,1.450606e-10,1.0
1,1.236637e-05,0.9999872,9.061327e-09,4.120514e-08,5.832531e-08,3.639243e-07
2,1.236637e-05,0.9999872,9.061327e-09,4.120514e-08,5.832531e-08,3.639243e-07
3,1.236637e-05,0.9999872,9.061327e-09,4.120514e-08,5.832531e-08,3.639243e-07
4,1.236637e-05,0.9999872,9.061327e-09,4.120514e-08,5.832531e-08,3.639243e-07
5,1.236637e-05,0.9999872,9.061327e-09,4.120514e-08,5.832531e-08,3.639243e-07
6,2.226777e-09,1.008492e-10,5.137436e-16,4.549167e-11,1.450606e-10,1.0
7,7.6586e-08,7.984121e-09,2.158505e-13,2.685965e-09,7.130636e-09,0.9999999
8,7.6586e-08,7.984121e-09,2.158505e-13,2.685965e-09,7.130636e-09,0.9999999
9,1.236637e-05,0.9999872,9.061327e-09,4.120514e-08,5.832531e-08,3.639243e-07
