# Library

In [2]:
import numpy as np, os
import pandas as pd
import tensorflow as tf
from tqdm import tqdm

# 결과 확인을 용이하게 하기 위한 코드
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# 700개로 pre-test (시간 체크용)

In [12]:
# 1) DATA
import random

# random seed 설정
seed_num = 42
random.seed(seed_num)

x = np.load('/project/LSH/x_(7727,10,4068).npy')
y = np.load('/project/LSH/y_(7727,1).npy')

# random seed에 따른 shuffle 
idx = list(range(len(x)))
random.shuffle(idx)
idx = idx[:700]

i = round(700*0.8)
X_train, y_train = x[idx[:i],:,:], y[idx[:i]]
X_test, y_test = x[idx[i:],:,:], y[idx[i:]]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((560, 10, 4068), (560,), (140, 10, 4068), (140,))

In [87]:
# 2) MODEL
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import *
from tensorflow.keras.callbacks import EarlyStopping

# ---------------------
seed_num = 42
# ---------------------
tf.random.set_seed(seed_num)

lstm2 = Sequential()
lstm2.add(InputLayer(input_shape=(X_train.shape[1],X_train.shape[2])))
lstm2.add(LSTM(units=128, activation='hard_sigmoid', return_sequences=True))
lstm2.add(LSTM(units=64, activation='hard_sigmoid', return_sequences=True))
lstm2.add(Dropout(0.2))
lstm2.add(LSTM(units=64, activation='hard_sigmoid', return_sequences=True))
lstm2.add(LSTM(units=32, activation='hard_sigmoid', return_sequences=False))
lstm2.add(Dropout(0.2))
lstm2.add(Dense(units=1, activation='sigmoid'))

early_stop = EarlyStopping(monitor='val_acc', patience=30, verbose=1, restore_best_weights=True)
lstm2.compile(optimizer= "adam", loss=tf.keras.losses.BinaryCrossentropy(), metrics=['acc'])
lstm2.fit(X_train, y_train, validation_split=0.25, batch_size=128, epochs=100, callbacks=[early_stop], shuffle=False)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Restoring model weights from the end of the best epoch.
Epoch 00031: early stopping


<tensorflow.python.keras.callbacks.History at 0x7fc2c98cb340>

In [88]:
# 3)
# FEATURES
a = pd.read_csv('total_data_7727.csv')
features = list(a['ITEMID'].sort_values().unique()) # 4068개 ITEMID 

results = []
preds = lstm2.predict(X_test)

# COMPUTE BASELINE
from tensorflow.keras.losses import BinaryCrossentropy
bce = BinaryCrossentropy()
baseline_bce = bce(y_test, preds).numpy()
results.append({'feature':'BASELINE','baseline_bce':baseline_bce}) 

for k in tqdm(range(len(features))):
    
    # REVERSE ALL FEATURE K
    save_col = X_test[:,:,k].copy()
    X_test[:,:,k] = np.where(X_test[:,:,k]==1, 2, X_test[:,:,k])
    X_test[:,:,k] = np.where(X_test[:,:,k]==0, 1, X_test[:,:,k])
    X_test[:,:,k] = np.where(X_test[:,:,k]==2, 0, X_test[:,:,k])

    # COMPUTE BCE WITH FEATURE K REVERSED
    pred2 = lstm2.predict(X_test)
    loss_bce = bce(y_test, pred2).numpy()

    results.append({'feature':features[k],'bce':loss_bce})
    X_test[:,:,k] = save_col

100%|██████████| 4068/4068 [04:06<00:00, 16.51it/s]


In [89]:
# 4) DISPLAY LSTM FEATURE IMPORTANCE
df = pd.DataFrame(results)
df1 = df.copy()
df1['baseline_bce'] = float(df['baseline_bce'].dropna().unique())
df1['diff'] = df1['bce']-df1['baseline_bce']
df1 = df1.sort_values(by='diff')
df1

Unnamed: 0,feature,baseline_bce,bce,diff
1550,93519501,0.61192,0.611918,-0.000002
2511,703841104,0.61192,0.611918,-0.000002
536,6057143,0.61192,0.611918,-0.000002
3526,57665000202,0.61192,0.611918,-0.000002
3778,62584075001,0.61192,0.611918,-0.000002
...,...,...,...,...
4025,68084028401,0.61192,0.611922,0.000002
1367,78014923,0.61192,0.611922,0.000002
1139,71374066,0.61192,0.611922,0.000002
1063,65041435,0.61192,0.611922,0.000002


# All reversed (7727)

In [20]:
# 1) DATA 
import random    
# ---------------------
seed_num = 42
# ---------------------
random.seed(seed_num)

x = np.load('/project/LSH/x_(7727,10,4068).npy')
y = np.load('/project/LSH/y_(7727,1).npy')

idx = list(range(len(x)))
random.shuffle(idx)

i = round(x.shape[0]*0.8)
X_train, y_train = x[idx[:i],:,:], y[idx[:i]]
X_test, y_test = x[idx[i:],:,:], y[idx[i:]]

In [21]:
# 2) MODEL
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import *
from tensorflow.keras.callbacks import EarlyStopping

# ---------------------
seed_num = 42
# ---------------------
tf.random.set_seed(seed_num)

lstm2 = Sequential()
lstm2.add(InputLayer(input_shape=(X_train.shape[1],X_train.shape[2])))
lstm2.add(LSTM(units=128, activation='hard_sigmoid', return_sequences=True))
lstm2.add(LSTM(units=64, activation='hard_sigmoid', return_sequences=True))
lstm2.add(Dropout(0.2))
lstm2.add(LSTM(units=64, activation='hard_sigmoid', return_sequences=True))
lstm2.add(LSTM(units=32, activation='hard_sigmoid', return_sequences=False))
lstm2.add(Dropout(0.2))
lstm2.add(Dense(units=1, activation='sigmoid'))

early_stop = EarlyStopping(monitor='val_acc', patience=30, verbose=1, restore_best_weights=True)
lstm2.compile(optimizer= "adam", loss=tf.keras.losses.BinaryCrossentropy(), metrics=['acc'])
lstm2.fit(X_train, y_train, validation_split=0.25, batch_size=128, epochs=100, callbacks=[early_stop], shuffle=False)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Restoring model weights from the end of the best epoch.
Epoch 00035: early stopping


<tensorflow.python.keras.callbacks.History at 0x7faa206e5fa0>

In [15]:
# 3)
# FEATURES
a = pd.read_csv('total_data_7727.csv')
features = list(a['ITEMID'].sort_values().unique()) # 4068개 ITEMID 

results = []
preds = lstm2.predict(X_test)

# COMPUTE BASELINE
from tensorflow.keras.losses import BinaryCrossentropy
bce = BinaryCrossentropy()
baseline_bce = bce(y_test, preds).numpy()
results.append({'feature':'BASELINE','baseline_bce':baseline_bce}) 

for k in tqdm(range(len(features))):

    # REVERSE ALL FEATURE K
    save_col = X_test[:,:,k].copy()
    X_test[:,:,k] = np.where(X_test[:,:,k]==1, 2, X_test[:,:,k])
    X_test[:,:,k] = np.where(X_test[:,:,k]==0, 1, X_test[:,:,k])
    X_test[:,:,k] = np.where(X_test[:,:,k]==2, 0, X_test[:,:,k])

    # COMPUTE BCE WITH FEATURE K REVERSED
    preds = lstm2.predict(X_test)
    loss_bce = bce(y_test, preds).numpy()

    results.append({'feature':features[k],'bce':loss_bce})
    X_test[:,:,k] = save_col

100%|██████████| 4068/4068 [33:42<00:00,  2.01it/s]


In [46]:
# DISPLAY LSTM FEATURE IMPORTANCE
df = pd.DataFrame(results)
all_df = df.copy()
all_df['baseline_bce'] = float(df['baseline_bce'].dropna().unique())
all_df['diff'] = all_df['bce']-all_df['baseline_bce']
all_df = all_df.sort_values(by='diff')
all_df = all_df.drop(index=[0])
all_df

all_df.to_csv('all_df.csv',index = False)

Unnamed: 0,feature,baseline_bce,bce,diff
122,51006,1.014362,1.001019,-0.013343
3833,63323026201,1.014362,1.003800,-0.010562
207,51277,1.014362,1.005001,-0.009361
3457,55390000401,1.014362,1.010189,-0.004173
57,50912,1.014362,1.010785,-0.003577
...,...,...,...,...
2745,10019055302,1.014362,1.035443,0.021081
983,54817525,1.014362,1.036517,0.022155
1082,67434504,1.014362,1.039256,0.024894
1277,74606211,1.014362,1.042943,0.028581


# 1~5 reversed (7727)
- 1) DATA, 2) MODEL 생략

In [19]:
# 3)
# FEATURES
a = pd.read_csv('total_data_7727.csv')
features = list(a['ITEMID'].sort_values().unique()) # 4068개 ITEMID 

results = []
preds = lstm2.predict(X_test)

# COMPUTE BASELINE
from tensorflow.keras.losses import BinaryCrossentropy
bce = BinaryCrossentropy()
baseline_bce = bce(y_test, preds).numpy()
results.append({'feature':'BASELINE','baseline_bce':baseline_bce}) 

for k in tqdm(range(len(features))):

    # REVERSE ALL FEATURE K
    save_col = X_test[:,:,k].copy()
    X_test[:,5:,k] = np.where(X_test[:,5:,k]==1, 2, X_test[:,5:,k])
    X_test[:,5:,k] = np.where(X_test[:,5:,k]==0, 1, X_test[:,5:,k])
    X_test[:,5:,k] = np.where(X_test[:,5:,k]==2, 0, X_test[:,5:,k])

    # COMPUTE BCE WITH FEATURE K REVERSED
    preds = lstm2.predict(X_test)
    loss_bce = bce(y_test, preds).numpy()

    results.append({'feature':features[k],'bce':loss_bce})
    X_test[:,:,k] = save_col

100%|██████████| 4068/4068 [32:43<00:00,  2.07it/s]


In [49]:
# 4) DISPLAY LSTM FEATURE IMPORTANCE
df2 = pd.DataFrame(results)

df_1to5 = df2.copy()
base_value = df2.iloc[0,1]
df_1to5['baseline_bce'] = base_value
df_1to5['diff'] = df_1to5['bce']-df_1to5['baseline_bce']
df_1to5 = df_1to5.drop(index=[0])
df_1to5 = df_1to5.sort_values(by='diff')
df_1to5

df_1to5.to_csv('df_d1d5.csv',index = False)

Unnamed: 0,feature,baseline_bce,bce,diff
122,51006,1.014362,1.001019,-0.013343
3833,63323026201,1.014362,1.003800,-0.010562
207,51277,1.014362,1.005001,-0.009361
3457,55390000401,1.014362,1.010189,-0.004173
57,50912,1.014362,1.010785,-0.003577
...,...,...,...,...
2745,10019055302,1.014362,1.035443,0.021081
983,54817525,1.014362,1.036517,0.022155
1082,67434504,1.014362,1.039256,0.024894
1277,74606211,1.014362,1.042943,0.028581


# 6~10 reversed (7727)
- 1) DATA, 2) MODEL 생략

In [22]:
# 3)
# FEATURES
a = pd.read_csv('total_data_7727.csv')
features = list(a['ITEMID'].sort_values().unique()) # 4068개 ITEMID 

results = []
preds = lstm2.predict(X_test)

# COMPUTE BASELINE
from tensorflow.keras.losses import BinaryCrossentropy
bce = BinaryCrossentropy()
baseline_bce = bce(y_test, preds).numpy()
results.append({'feature':'BASELINE','baseline_bce':baseline_bce}) 

for k in tqdm(range(len(features))):

    # REVERSE ALL FEATURE K
    save_col = X_test[:,:,k].copy()
    X_test[:,0:5,k] = np.where(X_test[:,0:5,k]==1, 2, X_test[:,0:5,k])
    X_test[:,0:5,k] = np.where(X_test[:,0:5,k]==0, 1, X_test[:,0:5,k])
    X_test[:,0:5,k] = np.where(X_test[:,0:5,k]==2, 0, X_test[:,0:5,k])

    # COMPUTE BCE WITH FEATURE K REVERSED
    preds = lstm2.predict(X_test)
    loss_bce = bce(y_test, preds).numpy()

    results.append({'feature':features[k],'bce':loss_bce})
    X_test[:,:,k] = save_col

100%|██████████| 4068/4068 [32:41<00:00,  2.07it/s]


In [28]:
# 4) DISPLAY LSTM FEATURE IMPORTANCE
df1 = pd.DataFrame(results)
df_6to10 = df1.copy()
df_6to10['baseline_bce'] = float(df1['baseline_bce'].dropna().unique())
df_6to10['diff'] = df_6to10['bce']-df_6to10['baseline_bce']
df_6to10 = df_6to10.sort_values(by='diff')
df_6to10 = df_6to10.drop(index=[0])
df_6to10

# df_6to10.to_csv('df_d6d10_best.csv',index = False)

Unnamed: 0,feature,baseline_bce,bce
0,BASELINE,1.014362,
1,0,,1.011114
2,50803,,1.0151
3,50804,,1.017607
4,50805,,1.014638
5,50806,,1.019179
6,50808,,1.016358
7,50809,,1.01741
8,50811,,1.018671
9,50813,,1.024103


# with best model

## All reversed

In [30]:
# 1) DATA 
import random    
# ---------------------
seed_num = 42
# ---------------------
random.seed(seed_num)

x = np.load('/project/LSH/x_(7727,10,4068).npy')
y = np.load('/project/LSH/y_(7727,1).npy')

idx = list(range(len(x)))
random.shuffle(idx)

i = round(x.shape[0]*0.8)
X_train, y_train = x[idx[:i],:,:], y[idx[:i]]
X_test, y_test = x[idx[i:],:,:], y[idx[i:]]

In [31]:
# 2) MODEL
from keras.models import load_model
lstm2 = load_model('./model/(LSTM_best_4068)seed42-05-0.5029.hdf5')

In [32]:
# 3)
# FEATURES
a = pd.read_csv('total_data_7727.csv')
features = list(a['ITEMID'].sort_values().unique()) # 4068개 ITEMID 

results = []
preds = lstm2.predict(X_test)

# COMPUTE BASELINE
from tensorflow.keras.losses import BinaryCrossentropy
bce = BinaryCrossentropy()
baseline_bce = bce(y_test, preds).numpy()
results.append({'feature':'BASELINE','baseline_bce':baseline_bce}) 

for k in tqdm(range(len(features))):

    # REVERSE ALL FEATURE K
    save_col = X_test[:,:,k].copy()
    X_test[:,:,k] = np.where(X_test[:,:,k]==1, 2, X_test[:,:,k])
    X_test[:,:,k] = np.where(X_test[:,:,k]==0, 1, X_test[:,:,k])
    X_test[:,:,k] = np.where(X_test[:,:,k]==2, 0, X_test[:,:,k])

    # COMPUTE BCE WITH FEATURE K REVERSED
    preds = lstm2.predict(X_test)
    loss_bce = bce(y_test, preds).numpy()

    results.append({'feature':features[k],'bce':loss_bce})
    X_test[:,:,k] = save_col

100%|██████████| 4068/4068 [48:31<00:00,  1.40it/s] 


In [33]:
df1 = pd.DataFrame(results)

df_all_best = df1.copy()
base_value = df1.iloc[0,1]
df_all_best['baseline_bce'] = base_value
df_all_best['diff'] = df_all_best['bce']-df_all_best['baseline_bce']
df_all_best = df_all_best.drop(index=[0])
df_all_best = df_all_best.sort_values(by='diff')
df_all_best

df_all_best.to_csv('df_all_best.csv',index = False)

Unnamed: 0,feature,baseline_bce,bce,diff
122,51006,1.014362,1.001220,-0.013142
3833,63323026201,1.014362,1.006389,-0.007973
207,51277,1.014362,1.006995,-0.007367
1,0,1.014362,1.009931,-0.004431
3457,55390000401,1.014362,1.010688,-0.003673
...,...,...,...,...
2745,10019055302,1.014362,1.041859,0.027498
983,54817525,1.014362,1.043492,0.029130
1082,67434504,1.014362,1.046829,0.032467
1277,74606211,1.014362,1.051030,0.036668


## 1~5 reversed

In [24]:
# 1) DATA 
import random    
# ---------------------
seed_num = 42
# ---------------------
random.seed(seed_num)

x = np.load('/project/LSH/x_(7727,10,4068).npy')
y = np.load('/project/LSH/y_(7727,1).npy')

idx = list(range(len(x)))
random.shuffle(idx)

i = round(x.shape[0]*0.8)
X_train, y_train = x[idx[:i],:,:], y[idx[:i]]
X_test, y_test = x[idx[i:],:,:], y[idx[i:]]

In [25]:
# 2) MODEL
from keras.models import load_model
lstm2 = load_model('./model/(LSTM_best_4068)seed42-05-0.5029.hdf5')

In [26]:
# 3)
# FEATURES
a = pd.read_csv('total_data_7727.csv')
features = list(a['ITEMID'].sort_values().unique()) # 4068개 ITEMID 

results = []
preds = lstm2.predict(X_test)

# COMPUTE BASELINE
from tensorflow.keras.losses import BinaryCrossentropy
bce = BinaryCrossentropy()
baseline_bce = bce(y_test, preds).numpy()
results.append({'feature':'BASELINE','baseline_bce':baseline_bce}) 

for k in tqdm(range(len(features))):

    # REVERSE ALL FEATURE K
    save_col = X_test[:,:,k].copy()
    X_test[:,5:,k] = np.where(X_test[:,5:,k]==1, 2, X_test[:,5:,k])
    X_test[:,5:,k] = np.where(X_test[:,5:,k]==0, 1, X_test[:,5:,k])
    X_test[:,5:,k] = np.where(X_test[:,5:,k]==2, 0, X_test[:,5:,k])

    # COMPUTE BCE WITH FEATURE K REVERSED
    preds = lstm2.predict(X_test)
    loss_bce = bce(y_test, preds).numpy()

    results.append({'feature':features[k],'bce':loss_bce})
    X_test[:,:,k] = save_col

  0%|          | 14/4068 [00:07<35:02,  1.93it/s]


KeyboardInterrupt: 

In [None]:
# 4) DISPLAY LSTM FEATURE IMPORTANCE
df3 = pd.DataFrame(results)

df_d1d5_best = df3.copy()
base_value = df3.iloc[0,1]
df_d1d5_best['baseline_bce'] = base_value
df_d1d5_best['diff'] = df_d1d5_best['bce']-df_d1d5_best['baseline_bce']
df_d1d5_best = df_d1d5_best.drop(index=[0])
df_d1d5_best = df_d1d5_best.sort_values(by='diff')
df_d1d5_best

df_d1d5_best.to_csv('df_d1d5_best.csv',index = False)

## 6~10 reversed

In [3]:
# 1) DATA 
import random    
# ---------------------
seed_num = 42
# ---------------------
random.seed(seed_num)

x = np.load('/project/LSH/x_(7727,10,4068).npy')
y = np.load('/project/LSH/y_(7727,1).npy')

idx = list(range(len(x)))
random.shuffle(idx)

i = round(x.shape[0]*0.8)
X_train, y_train = x[idx[:i],:,:], y[idx[:i]]
X_test, y_test = x[idx[i:],:,:], y[idx[i:]]

In [52]:
# 2) MODEL
from keras.models import load_model
lstm2 = load_model('./model/(LSTM_best_4068)seed42-05-0.5029.hdf5')

In [53]:
# 3)
# FEATURES
a = pd.read_csv('total_data_7727.csv')
features = list(a['ITEMID'].sort_values().unique()) # 4068개 ITEMID 

results = []
preds = lstm2.predict(X_test)

# COMPUTE BASELINE
from tensorflow.keras.losses import BinaryCrossentropy
bce = BinaryCrossentropy()
baseline_bce = bce(y_test, preds).numpy()
results.append({'feature':'BASELINE','baseline_bce':baseline_bce}) 

for k in tqdm(range(len(features))):

    # REVERSE ALL FEATURE K
    save_col = X_test[:,:,k].copy()
    X_test[:,0:5,k] = np.where(X_test[:,0:5,k]==1, 2, X_test[:,0:5,k])
    X_test[:,0:5,k] = np.where(X_test[:,0:5,k]==0, 1, X_test[:,0:5,k])
    X_test[:,0:5,k] = np.where(X_test[:,0:5,k]==2, 0, X_test[:,0:5,k])

    # COMPUTE BCE WITH FEATURE K REVERSED
    preds = lstm2.predict(X_test)
    loss_bce = bce(y_test, preds).numpy()

    results.append({'feature':features[k],'bce':loss_bce})
    X_test[:,:,k] = save_col

100%|██████████| 4068/4068 [32:52<00:00,  2.06it/s]


In [57]:
# 4) DISPLAY LSTM FEATURE IMPORTANCE
df3 = pd.DataFrame(results)

df_d6d10_best = df3.copy()
base_value = df3.iloc[0,1]
df_d6d10_best['baseline_bce'] = base_value
df_d6d10_best['diff'] = df_d6d10_best['bce']-df_d6d10_best['baseline_bce']
df_d6d10_best = df_d6d10_best.drop(index=[0])
df_d6d10_best = df_d6d10_best.sort_values(by='diff')
df_d6d10_best

df_d6d10_best.to_csv('df_d6d10_best.csv',index = False)

Unnamed: 0,feature,baseline_bce,bce,diff
207,51277,1.014362,1.010716,-0.003645
1,0,1.014362,1.012197,-0.002165
122,51006,1.014362,1.012354,-0.002008
2196,409606211,1.014362,1.012364,-0.001998
1277,74606211,1.014362,1.012500,-0.001862
...,...,...,...,...
2313,517293025,1.014362,1.016243,0.001881
1816,182138167,1.014362,1.016294,0.001932
163,51200,1.014362,1.016299,0.001937
38,50889,1.014362,1.016344,0.001982


# LSTM - x all fit

## All reversed

In [6]:
# 1) DATA 
import random    
seed_num = 42
random.seed(seed_num)

x = np.load('/project/LSH/x_(7727,10,4068).npy')
y = np.load('/project/LSH/y_(7727,1).npy')

# --------------------------------------------------------------

# 2) MODEL
from keras.models import load_model
lstm2 = load_model('./model/(LSTM_best_4068)seed42-05-0.5029.hdf5')

# --------------------------------------------------------------

# 3)
# FEATURES
a = pd.read_csv('total_data_7727.csv')
features = list(a['ITEMID'].sort_values().unique()) # 4068개 ITEMID 

results = []
preds = lstm2.predict(x)

# COMPUTE BASELINE
from tensorflow.keras.losses import BinaryCrossentropy
bce = BinaryCrossentropy()
baseline_bce = bce(y, preds).numpy()
results.append({'feature':'BASELINE','baseline_bce':baseline_bce}) 

gpu_strategy = tf.distribute.get_strategy()
with gpu_strategy.scope():
    for k in tqdm(range(len(features))):

        # REVERSE ALL FEATURE K
        save_col = x[:,:,k].copy()
        x[:,:,k] = np.where(x[:,:,k]==1, 2, x[:,:,k])
        x[:,:,k] = np.where(x[:,:,k]==0, 1, x[:,:,k])
        x[:,:,k] = np.where(x[:,:,k]==2, 0, x[:,:,k])

        # COMPUTE BCE WITH FEATURE K REVERSED
        pred2 = lstm2.predict(x)
        loss_bce = bce(y, pred2).numpy()

        results.append({'feature':features[k],'bce':loss_bce})
        x[:,:,k] = save_col



100%|██████████| 4068/4068 [2:38:59<00:00,  2.34s/it]  


In [7]:
df1 = pd.DataFrame(results)

df_all_best = df1.copy()
base_value = df1.iloc[0,1]
df_all_best['baseline_bce'] = base_value
df_all_best['diff'] = df_all_best['bce']-df_all_best['baseline_bce']
df_all_best = df_all_best.drop(index=[0])
df_all_best = df_all_best.sort_values(by='diff')
df_all_best

df_all_best.to_csv('df_all_allfit.csv',index = False)

Unnamed: 0,feature,baseline_bce,bce,diff
122,51006,1.024412,1.009241,-0.015170
207,51277,1.024412,1.010720,-0.013692
3833,63323026201,1.024412,1.016048,-0.008364
1,0,1.024412,1.017825,-0.006587
57,50912,1.024412,1.018578,-0.005833
...,...,...,...,...
1816,182138167,1.024412,1.042575,0.018163
38,50889,1.024412,1.043742,0.019330
470,4003822,1.024412,1.045095,0.020683
384,227194,1.024412,1.049693,0.025281


## 1~5 reversed

In [3]:
# 1) DATA 
import random    
seed_num = 42
random.seed(seed_num)

x = np.load('/project/LSH/x_(7727,10,4068).npy')
y = np.load('/project/LSH/y_(7727,1).npy')

# --------------------------------------------------------------

# 2) MODEL
from keras.models import load_model
lstm2 = load_model('./model/(LSTM_best_4068)seed42-05-0.5029.hdf5')

# --------------------------------------------------------------

# 3)
# FEATURES
a = pd.read_csv('total_data_7727.csv')
features = list(a['ITEMID'].sort_values().unique()) # 4068개 ITEMID 

results = []
preds = lstm2.predict(x)

# COMPUTE BASELINE
from tensorflow.keras.losses import BinaryCrossentropy
bce = BinaryCrossentropy()
baseline_bce = bce(y, preds).numpy()
results.append({'feature':'BASELINE','baseline_bce':baseline_bce}) 

gpu_strategy = tf.distribute.get_strategy()
with gpu_strategy.scope():
    for k in tqdm(range(len(features))):

        # REVERSE ALL FEATURE K
        save_col = x[:,:,k].copy()
        x[:,5:,k] = np.where(x[:,5:,k]==1, 2, x[:,5:,k])
        x[:,5:,k] = np.where(x[:,5:,k]==0, 1, x[:,5:,k])
        x[:,5:,k] = np.where(x[:,5:,k]==2, 0, x[:,5:,k])

        # COMPUTE BCE WITH FEATURE K REVERSED
        pred2 = lstm2.predict(x)
        loss_bce = bce(y, pred2).numpy()

        results.append({'feature':features[k],'bce':loss_bce})
        x[:,:,k] = save_col

Using TensorFlow backend.




100%|██████████| 4068/4068 [2:37:04<00:00,  2.32s/it]  


In [4]:
# 4) DISPLAY LSTM FEATURE IMPORTANCE
df3 = pd.DataFrame(results)

df_d1d5_best = df3.copy()
base_value = df3.iloc[0,1]
df_d1d5_best['baseline_bce'] = base_value
df_d1d5_best['diff'] = df_d1d5_best['bce']-df_d1d5_best['baseline_bce']
df_d1d5_best = df_d1d5_best.drop(index=[0])
df_d1d5_best = df_d1d5_best.sort_values(by='diff')
df_d1d5_best

df_d1d5_best.to_csv('df_d1d5_allfit.csv',index = False)

Unnamed: 0,feature,baseline_bce,bce,diff
122,51006,1.024412,1.010175,-0.014237
207,51277,1.024412,1.010897,-0.013515
3833,63323026201,1.024412,1.013842,-0.010570
57,50912,1.024412,1.019033,-0.005379
1,0,1.024412,1.019348,-0.005064
...,...,...,...,...
470,4003822,1.024412,1.035309,0.010898
1082,67434504,1.024412,1.035496,0.011085
384,227194,1.024412,1.037392,0.012981
1277,74606211,1.024412,1.037929,0.013517


## 6~10 reversed

In [5]:
# 1) DATA 
import random    
seed_num = 42
random.seed(seed_num)

x = np.load('/project/LSH/x_(7727,10,4068).npy')
y = np.load('/project/LSH/y_(7727,1).npy')

# --------------------------------------------------------------

# 2) MODEL
from keras.models import load_model
lstm2 = load_model('./model/(LSTM_best_4068)seed42-05-0.5029.hdf5')

# --------------------------------------------------------------

# 3)
# FEATURES
a = pd.read_csv('total_data_7727.csv')
features = list(a['ITEMID'].sort_values().unique()) # 4068개 ITEMID 

results = []
preds = lstm2.predict(x)

# COMPUTE BASELINE
from tensorflow.keras.losses import BinaryCrossentropy
bce = BinaryCrossentropy()
baseline_bce = bce(y, preds).numpy()
results.append({'feature':'BASELINE','baseline_bce':baseline_bce}) 

gpu_strategy = tf.distribute.get_strategy()
with gpu_strategy.scope():
    for k in tqdm(range(len(features))):

        # REVERSE ALL FEATURE K
        save_col = x[:,:,k].copy()
        x[:,:5,k] = np.where(x[:,:5,k]==1, 2, x[:,:5,k])
        x[:,:5,k] = np.where(x[:,:5,k]==0, 1, x[:,:5,k])
        x[:,:5,k] = np.where(x[:,:5,k]==2, 0, x[:,:5,k])

        # COMPUTE BCE WITH FEATURE K REVERSED
        pred2 = lstm2.predict(x)
        loss_bce = bce(y, pred2).numpy()

        results.append({'feature':features[k],'bce':loss_bce})
        x[:,:,k] = save_col



100%|██████████| 4068/4068 [2:35:43<00:00,  2.30s/it]  


In [6]:
# 4) DISPLAY LSTM FEATURE IMPORTANCE
df3 = pd.DataFrame(results)

df_d6d10_best = df3.copy()
base_value = df3.iloc[0,1]
df_d6d10_best['baseline_bce'] = base_value
df_d6d10_best['diff'] = df_d6d10_best['bce']-df_d6d10_best['baseline_bce']
df_d6d10_best = df_d6d10_best.drop(index=[0])
df_d6d10_best = df_d6d10_best.sort_values(by='diff')
df_d6d10_best

df_d6d10_best.to_csv('df_d6d10_allfit.csv',index = False)

Unnamed: 0,feature,baseline_bce,bce,diff
2196,409606211,1.024412,1.018415,-0.005997
207,51277,1.024412,1.019376,-0.005036
1277,74606211,1.024412,1.019757,-0.004655
1082,67434504,1.024412,1.019939,-0.004473
983,54817525,1.024412,1.020280,-0.004132
...,...,...,...,...
163,51200,1.024412,1.027590,0.003179
1816,182138167,1.024412,1.027707,0.003295
470,4003822,1.024412,1.027831,0.003420
38,50889,1.024412,1.027921,0.003509
