# Library

In [2]:
import numpy as np, os
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

# 결과 확인을 용이하게 하기 위한 코드
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

#한글설정
import matplotlib.font_manager as fm

font_dirs = ['/usr/share/fonts/truetype/nanum', ]
font_files = fm.findSystemFonts(fontpaths=font_dirs)

for font_file in font_files:
    fm.fontManager.addfont(font_file)
    
# 한글 출력을 위해서 폰트 옵션을 설정합니다.
# "axes.unicode_minus" : 마이너스가 깨질 것을 방지

sns.set(font="NanumBarunGothic", 
        rc={"axes.unicode_minus":False},
        style='darkgrid')

# Data Load

In [3]:
m1 = pd.read_csv('method1_df.csv')
m2 = pd.read_csv('method2_df.csv')

# Method 1 : 상위 10개 feature
- 절대값 취하고 절대값 기준으로 내림차순 후 상위 10개 뽑기

In [4]:
m1['abs'] = abs(m1['mul'])
m1 = m1.sort_values(by='abs', ascending=False)
m1_top10 = m1['feature'].values[:10]
m1_top10

array([  409606211,    74606211,    67434504,    54817525, 10019055302,
            227194,    54858516,    74706811,    45006701,     4003822])

## (65.1) RF

In [54]:
# m1_top10만을 가진 x_(7727,10)_m1 만들기
item_list = list(total_data['ITEMID'].sort_values().unique())

m1_top10_index = []
for i in m1_top10:
    m1_top10_index.append(item_list.index(i))

print('m1_top10의 index : ',m1_top10_index)

x = np.load('x_(7727,4068).npy')
x_2d = x[:,m1_top10_index]
x_2d.shape

# np.save('x_(7727,10)_m1.npy',x_2d)

m1_top10의 index :  [2195, 1276, 1081, 982, 2744, 383, 1011, 1306, 845, 469]


(7727, 10)

In [53]:
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

model = RandomForestClassifier()

data={}
for seed in range(42, 52):
    random.seed(seed)
    
    x = np.load('./x_(7727,10)_m1.npy')
    y = np.load('./y_(7727,1).npy')

    idx = list(range(len(x)))
    random.shuffle(idx)

    i = round(x.shape[0]*0.8)
    X_train, y_train = x[idx[:i],:], y[idx[:i]]
    X_test, y_test = x[idx[i:],:], y[idx[i:]]
    
    _ = model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred_test)
    data[seed]=acc
    print(f'정확도 : {acc}, seed_num = {seed}')

df = pd.DataFrame.from_dict(data, orient='index')
print(f'정확도 df 만들고 평균 확인 : {df.mean().values}')

정확도 : 0.6355987055016181, seed_num = 42
정확도 : 0.6330097087378641, seed_num = 43
정확도 : 0.6349514563106796, seed_num = 44
정확도 : 0.6576051779935275, seed_num = 45
정확도 : 0.6466019417475728, seed_num = 46
정확도 : 0.6634304207119741, seed_num = 47
정확도 : 0.6750809061488673, seed_num = 48
정확도 : 0.654368932038835, seed_num = 49
정확도 : 0.6640776699029126, seed_num = 50
정확도 : 0.6511326860841424, seed_num = 51
정확도 df 만들고 평균 확인 : [0.65158576]


## (64.9) LSTM

In [58]:
# m1_top10만을 가진 x_(7727,10,10)_m1 만들기
item_list = list(total_data['ITEMID'].sort_values().unique())

m1_top10_index = []
for i in m1_top10:
    m1_top10_index.append(item_list.index(i))

print('m1_top10의 index : ',m1_top10_index)

x = np.load('x_(7727,10,4068).npy')
x_3d = x[:,:,m1_top10_index]
x_3d.shape
x_3d.sum()

# np.save('x_(7727,10,10)_m1.npy',x_3d)

m1_top10의 index :  [2195, 1276, 1081, 982, 2744, 383, 1011, 1306, 845, 469]


(7727, 10, 10)

6187.0

In [60]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, InputLayer
from tensorflow.keras.callbacks import EarlyStopping
from sklearn import metrics 
from tensorflow import keras
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

import random
# ----------------------
seed_num = 42
# ----------------------
random.seed(seed_num)

x = np.load('x_(7727,10,10)_m1.npy')
y = np.load('y_(7727,1).npy')

idx = list(range(len(x)))
random.shuffle(idx)

i = round(x.shape[0]*0.8)
X_train, y_train = x[idx[:i],:,:], y[idx[:i]]
X_test, y_test = x[idx[i:],:,:], y[idx[i:]]

# ---------------------
seed_num = 42 
# ---------------------
tf.random.set_seed(seed_num)

lstm = Sequential()
lstm.add(InputLayer(input_shape=(X_train.shape[1],X_train.shape[2])))
lstm.add(LSTM(units=128, activation='hard_sigmoid', return_sequences=True))
lstm.add(LSTM(units=64, activation='hard_sigmoid', return_sequences=True))
lstm.add(Dropout(0.2))
lstm.add(LSTM(units=64, activation='hard_sigmoid', return_sequences=True))
lstm.add(LSTM(units=32, activation='hard_sigmoid', return_sequences=False))
lstm.add(Dropout(0.2))
lstm.add(Dense(units=1, activation='sigmoid'))

from tensorflow.keras.callbacks import ModelCheckpoint
import os

MODEL_SAVE_FOLDER_PATH = './model/'
if not os.path.exists(MODEL_SAVE_FOLDER_PATH):
    os.mkdir(MODEL_SAVE_FOLDER_PATH)

model_path = MODEL_SAVE_FOLDER_PATH + 'm1_top10_seed42-{epoch:02d}-{val_loss:.4f}.hdf5'

cb_checkpoint = ModelCheckpoint(filepath=model_path, monitor='val_loss',
                                verbose=1, save_best_only=True)

early_stop = EarlyStopping(monitor='val_acc', patience=50, verbose=1, restore_best_weights=True)
lstm.compile(optimizer= keras.optimizers.Adam(learning_rate = 0.001), loss = "binary_crossentropy", metrics=['acc'])
lstm.fit(X_train, y_train, validation_split=0.25, batch_size=128, epochs=500,  callbacks=[early_stop,cb_checkpoint], shuffle=False)

Epoch 1/500

Epoch 00001: val_loss improved from inf to 0.66603, saving model to ./model/10개 feature/seed42-01-0.6660.hdf5
Epoch 2/500

Epoch 00002: val_loss did not improve from 0.66603
Epoch 3/500

Epoch 00003: val_loss improved from 0.66603 to 0.66551, saving model to ./model/10개 feature/seed42-03-0.6655.hdf5
Epoch 4/500

Epoch 00004: val_loss improved from 0.66551 to 0.66526, saving model to ./model/10개 feature/seed42-04-0.6653.hdf5
Epoch 5/500

Epoch 00005: val_loss improved from 0.66526 to 0.66517, saving model to ./model/10개 feature/seed42-05-0.6652.hdf5
Epoch 6/500

Epoch 00006: val_loss improved from 0.66517 to 0.66515, saving model to ./model/10개 feature/seed42-06-0.6652.hdf5
Epoch 7/500

Epoch 00007: val_loss did not improve from 0.66515
Epoch 8/500

Epoch 00008: val_loss did not improve from 0.66515
Epoch 9/500

Epoch 00009: val_loss did not improve from 0.66515
Epoch 10/500

Epoch 00010: val_loss did not improve from 0.66515
Epoch 11/500

Epoch 00011: val_loss improved fro


Epoch 00039: val_loss improved from 0.61935 to 0.61702, saving model to ./model/10개 feature/seed42-39-0.6170.hdf5
Epoch 40/500

Epoch 00040: val_loss improved from 0.61702 to 0.61572, saving model to ./model/10개 feature/seed42-40-0.6157.hdf5
Epoch 41/500

Epoch 00041: val_loss improved from 0.61572 to 0.61446, saving model to ./model/10개 feature/seed42-41-0.6145.hdf5
Epoch 42/500

Epoch 00042: val_loss improved from 0.61446 to 0.61336, saving model to ./model/10개 feature/seed42-42-0.6134.hdf5
Epoch 43/500

Epoch 00043: val_loss improved from 0.61336 to 0.61330, saving model to ./model/10개 feature/seed42-43-0.6133.hdf5
Epoch 44/500

Epoch 00044: val_loss did not improve from 0.61330
Epoch 45/500

Epoch 00045: val_loss did not improve from 0.61330
Epoch 46/500

Epoch 00046: val_loss did not improve from 0.61330
Epoch 47/500

Epoch 00047: val_loss did not improve from 0.61330
Epoch 48/500

Epoch 00048: val_loss did not improve from 0.61330
Epoch 49/500

Epoch 00049: val_loss did not impr


Epoch 00081: val_loss did not improve from 0.61251
Epoch 82/500

Epoch 00082: val_loss did not improve from 0.61251
Epoch 83/500

Epoch 00083: val_loss did not improve from 0.61251
Epoch 84/500

Epoch 00084: val_loss improved from 0.61251 to 0.61229, saving model to ./model/10개 feature/seed42-84-0.6123.hdf5
Epoch 85/500

Epoch 00085: val_loss did not improve from 0.61229
Epoch 86/500

Epoch 00086: val_loss did not improve from 0.61229
Epoch 87/500

Epoch 00087: val_loss did not improve from 0.61229
Epoch 88/500

Epoch 00088: val_loss did not improve from 0.61229
Epoch 89/500

Epoch 00089: val_loss did not improve from 0.61229
Epoch 90/500

Epoch 00090: val_loss did not improve from 0.61229
Epoch 91/500

Epoch 00091: val_loss did not improve from 0.61229
Epoch 92/500

Epoch 00092: val_loss did not improve from 0.61229
Epoch 93/500

Epoch 00093: val_loss did not improve from 0.61229
Epoch 94/500

Epoch 00094: val_loss did not improve from 0.61229
Epoch 95/500

Epoch 00095: val_loss did 

<tensorflow.python.keras.callbacks.History at 0x7f293bc47970>

In [63]:
from keras.models import load_model
best_model = load_model('./model/m1_top10_seed42-84-0.6123.hdf5') 

dic_42={}
for seed in range(0, 50):
    random.seed(seed)

    x = np.load('x_(7727,10,10)_m1.npy')
    y = np.load('y_(7727,1).npy')

    idx = list(range(len(x)))
    random.shuffle(idx)

    i = round(x.shape[0]*0.8)
    X_train, y_train = x[idx[:i],:,:], y[idx[:i]]
    X_test, y_test = x[idx[i:],:,:], y[idx[i:]]
    
    pred = best_model.predict(X_test)
    pred[pred>0.5]=1
    pred[pred<=0.5]=0
    acc = metrics.accuracy_score(y_test, pred)
    dic_42[seed]=acc
    print(f'정확도 :{metrics.accuracy_score(y_test, pred)}, seed_num = {seed}')

정확도 :0.6466019417475728, seed_num = 0
정확도 :0.6427184466019418, seed_num = 1
정확도 :0.6498381877022654, seed_num = 2
정확도 :0.6375404530744336, seed_num = 3
정확도 :0.654368932038835, seed_num = 4
정확도 :0.655663430420712, seed_num = 5
정확도 :0.6537216828478964, seed_num = 6
정확도 :0.629126213592233, seed_num = 7
정확도 :0.6614886731391586, seed_num = 8
정확도 :0.6498381877022654, seed_num = 9
정확도 :0.627831715210356, seed_num = 10
정확도 :0.6711974110032363, seed_num = 11
정확도 :0.6472491909385113, seed_num = 12
정확도 :0.6414239482200648, seed_num = 13
정확도 :0.6466019417475728, seed_num = 14
정확도 :0.656957928802589, seed_num = 15
정확도 :0.6692556634304208, seed_num = 16
정확도 :0.6453074433656958, seed_num = 17
정확도 :0.6459546925566343, seed_num = 18
정확도 :0.6414239482200648, seed_num = 19
정확도 :0.6601941747572816, seed_num = 20
정확도 :0.6595469255663431, seed_num = 21
정확도 :0.6375404530744336, seed_num = 22
정확도 :0.6550161812297735, seed_num = 23
정확도 :0.6530744336569579, seed_num = 24
정확도 :0.6491909385113268, seed_num = 25
정

In [64]:
# seed = 42의 정확도 df 만들고 평균 확인 => 64.9
df_42 = pd.DataFrame.from_dict(dic_42, orient='index')
df_42.mean()

0    0.649204
dtype: float64

## (64.9) not saved LSTM

In [82]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, InputLayer
from tensorflow.keras.callbacks import EarlyStopping
from sklearn import metrics 
from tensorflow import keras
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

import random
# ----------------------
seed_num = 42
# ----------------------
random.seed(seed_num)

x = np.load('x_(7727,10,10)_m1.npy')
y = np.load('y_(7727,1).npy')

idx = list(range(len(x)))
random.shuffle(idx)

i = round(x.shape[0]*0.8)
X_train, y_train = x[idx[:i],:,:], y[idx[:i]]
X_test, y_test = x[idx[i:],:,:], y[idx[i:]]

# ---------------------
seed_num = 42 
# ---------------------
tf.random.set_seed(seed_num)

lstm = Sequential()
lstm.add(InputLayer(input_shape=(X_train.shape[1],X_train.shape[2])))
lstm.add(LSTM(units=128, activation='hard_sigmoid', return_sequences=True))
lstm.add(LSTM(units=64, activation='hard_sigmoid', return_sequences=True))
lstm.add(Dropout(0.2))
lstm.add(LSTM(units=64, activation='hard_sigmoid', return_sequences=True))
lstm.add(LSTM(units=32, activation='hard_sigmoid', return_sequences=False))
lstm.add(Dropout(0.2))
lstm.add(Dense(units=1, activation='sigmoid'))

early_stop = EarlyStopping(monitor='val_acc', patience=50, verbose=1, restore_best_weights=False)
lstm.compile(optimizer= keras.optimizers.Adam(learning_rate = 0.001), loss = "binary_crossentropy", metrics=['acc'])
lstm.fit(X_train, y_train, validation_split=0.25, batch_size=128, epochs=500,  callbacks=[early_stop], shuffle=False)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500


Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 00103: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f273c1051f0>

In [83]:
dic_42={}
for seed in range(0, 50):
    random.seed(seed)

    x = np.load('x_(7727,10,10)_m1.npy')
    y = np.load('y_(7727,1).npy')

    idx = list(range(len(x)))
    random.shuffle(idx)

    i = round(x.shape[0]*0.8)
    X_train, y_train = x[idx[:i],:,:], y[idx[:i]]
    X_test, y_test = x[idx[i:],:,:], y[idx[i:]]
    
    pred = lstm.predict(X_test)
    pred[pred>0.5]=1
    pred[pred<=0.5]=0
    acc = metrics.accuracy_score(y_test, pred)
    dic_42[seed]=acc
    print(f'정확도 :{metrics.accuracy_score(y_test, pred)}, seed_num = {seed}')

정확도 :0.6459546925566343, seed_num = 0
정확도 :0.6427184466019418, seed_num = 1
정확도 :0.6498381877022654, seed_num = 2
정확도 :0.6375404530744336, seed_num = 3
정확도 :0.654368932038835, seed_num = 4
정확도 :0.655663430420712, seed_num = 5
정확도 :0.6537216828478964, seed_num = 6
정확도 :0.629126213592233, seed_num = 7
정확도 :0.6614886731391586, seed_num = 8
정확도 :0.6491909385113268, seed_num = 9
정확도 :0.627831715210356, seed_num = 10
정확도 :0.6711974110032363, seed_num = 11
정확도 :0.6472491909385113, seed_num = 12
정확도 :0.6414239482200648, seed_num = 13
정확도 :0.6466019417475728, seed_num = 14
정확도 :0.6563106796116505, seed_num = 15
정확도 :0.6692556634304208, seed_num = 16
정확도 :0.6446601941747573, seed_num = 17
정확도 :0.6453074433656958, seed_num = 18
정확도 :0.6414239482200648, seed_num = 19
정확도 :0.6601941747572816, seed_num = 20
정확도 :0.6588996763754046, seed_num = 21
정확도 :0.6375404530744336, seed_num = 22
정확도 :0.6550161812297735, seed_num = 23
정확도 :0.6530744336569579, seed_num = 24
정확도 :0.6491909385113268, seed_num = 25


In [84]:
# seed = 42의 정확도 df 만들고 평균 확인 => 64.9
df_42 = pd.DataFrame.from_dict(dic_42, orient='index')
df_42.mean()

0    0.64901
dtype: float64

# RF : 상위 10개 feature 
- x_(7727,10)_rf.npy 저장

In [47]:
# x_(7727,4068).npy 저장
x1 = np.load('x_(7727,10,4068).npy')
x1 = x1.sum(axis=1)
x1.sum(), x1.shape

x1_ = x1.copy()
x1_[x1_>1] = 1
x1_.sum()

# np.save('x_(7727,4068).npy', x1_)

(2042722.0, (7727, 4068))

510420.0

In [48]:
# RF 기준으로 상위 10개 feature 선별
total_data = pd.read_csv('total_data_7727.csv')
features = list(total_data['ITEMID'].sort_values().unique())

import random
random.seed(42)
x = np.load('x_(7727,4068).npy')
y = np.load('y_(7727,1).npy')

In [49]:
# RF에 내장된 feature importance 
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics 

model = RandomForestClassifier()
model.fit(x, y)

importances = model.feature_importances_

result = []
for f in range(x.shape[1]):
    result.append({'feature' : features[f], 'importance' : importances[f]})

RandomForestClassifier()

In [50]:
# rf_top10 itemid 골라내기 
df = pd.DataFrame(result)
df = df.sort_values(by='importance',ascending=False)
rf_top10 = df['feature'].values[:10]
rf_top10

array([      51277,       51006,       50912,       50862,       50983,
       63323026201,       50882,   904224461,   409606211,       51003])

In [51]:
# rf_top10만을 가진 x_(7727,10)_rf 만들기
item_list = list(total_data['ITEMID'].sort_values().unique())

rf_top10_index = []
for i in rf_top10:
    rf_top10_index.append(item_list.index(i))

print('rf_top10의 index : ',rf_top10_index)

x_2d = x[:,rf_top10_index]
x_2d.shape

# np.save('x_(7727,10)_rf.npy',x_2d)

rf_top10의 index :  [206, 121, 56, 20, 103, 3832, 34, 2611, 2195, 119]


(7727, 10)

## (67.3) RF

In [45]:
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

model = RandomForestClassifier()

data={}
for seed in range(42, 52):
    random.seed(seed)
    
    x = np.load('./x_(7727,10)_rf.npy')
    y = np.load('./y_(7727,1).npy')

    idx = list(range(len(x)))
    random.shuffle(idx)

    i = round(x.shape[0]*0.8)
    X_train, y_train = x[idx[:i],:], y[idx[:i]]
    X_test, y_test = x[idx[i:],:], y[idx[i:]]
    
    _ = model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred_test)
    data[seed]=acc
    print(f'정확도 : {acc}, seed_num = {seed}')

df = pd.DataFrame.from_dict(data, orient='index')
print(f'정확도 df 만들고 평균 확인 : {df.mean().values}')

정확도 : 0.6763754045307443, seed_num = 42
정확도 : 0.6692556634304208, seed_num = 43
정확도 : 0.6550161812297735, seed_num = 44
정확도 : 0.6653721682847896, seed_num = 45
정확도 : 0.6899676375404531, seed_num = 46
정확도 : 0.6673139158576051, seed_num = 47
정확도 : 0.6957928802588996, seed_num = 48
정확도 : 0.6802588996763754, seed_num = 49
정확도 : 0.658252427184466, seed_num = 50
정확도 : 0.6724919093851133, seed_num = 51
정확도 df 만들고 평균 확인 : [0.67300971]


## (72.0) LSTM

In [65]:
# rf_top10만을 가진 x_(7727,10,10)_rf 만들기
item_list = list(total_data['ITEMID'].sort_values().unique())

rf_top10_index = []
for i in rf_top10:
    rf_top10_index.append(item_list.index(i))

print('rf_top10의 index : ',rf_top10_index)

x = np.load('x_(7727,10,4068).npy')
x_3d = x[:,:,rf_top10_index]
x_3d.shape
x_3d.sum()

# np.save('x_(7727,10,10)_rf.npy',x_3d)

rf_top10의 index :  [206, 121, 56, 20, 103, 3832, 34, 2611, 2195, 119]


(7727, 10, 10)

148506.0

In [67]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, InputLayer
from tensorflow.keras.callbacks import EarlyStopping
from sklearn import metrics 
from tensorflow import keras
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

import random
# ----------------------
seed_num = 42
# ----------------------
random.seed(seed_num)

x = np.load('x_(7727,10,10)_rf.npy')
y = np.load('y_(7727,1).npy')

idx = list(range(len(x)))
random.shuffle(idx)

i = round(x.shape[0]*0.8)
X_train, y_train = x[idx[:i],:,:], y[idx[:i]]
X_test, y_test = x[idx[i:],:,:], y[idx[i:]]

# ---------------------
seed_num = 42 
# ---------------------
tf.random.set_seed(seed_num)

lstm = Sequential()
lstm.add(InputLayer(input_shape=(X_train.shape[1],X_train.shape[2])))
lstm.add(LSTM(units=128, activation='hard_sigmoid', return_sequences=True))
lstm.add(LSTM(units=64, activation='hard_sigmoid', return_sequences=True))
lstm.add(Dropout(0.2))
lstm.add(LSTM(units=64, activation='hard_sigmoid', return_sequences=True))
lstm.add(LSTM(units=32, activation='hard_sigmoid', return_sequences=False))
lstm.add(Dropout(0.2))
lstm.add(Dense(units=1, activation='sigmoid'))

from tensorflow.keras.callbacks import ModelCheckpoint
import os

MODEL_SAVE_FOLDER_PATH = './model/'
if not os.path.exists(MODEL_SAVE_FOLDER_PATH):
    os.mkdir(MODEL_SAVE_FOLDER_PATH)

model_path = MODEL_SAVE_FOLDER_PATH + 'rf_top10_seed42-{epoch:02d}-{val_loss:.4f}.hdf5'

cb_checkpoint = ModelCheckpoint(filepath=model_path, monitor='val_loss',
                                verbose=1, save_best_only=True)

early_stop = EarlyStopping(monitor='val_acc', patience=50, verbose=1, restore_best_weights=True)
lstm.compile(optimizer= keras.optimizers.Adam(learning_rate = 0.001), loss = "binary_crossentropy", metrics=['acc'])
lstm.fit(X_train, y_train, validation_split=0.25, batch_size=128, epochs=500,  callbacks=[early_stop,cb_checkpoint], shuffle=False)

Epoch 1/500

Epoch 00001: val_loss improved from inf to 0.66596, saving model to ./model/rf_top10_seed42-01-0.6660.hdf5
Epoch 2/500

Epoch 00002: val_loss did not improve from 0.66596
Epoch 3/500

Epoch 00003: val_loss improved from 0.66596 to 0.66323, saving model to ./model/rf_top10_seed42-03-0.6632.hdf5
Epoch 4/500

Epoch 00004: val_loss improved from 0.66323 to 0.62068, saving model to ./model/rf_top10_seed42-04-0.6207.hdf5
Epoch 5/500

Epoch 00005: val_loss improved from 0.62068 to 0.58725, saving model to ./model/rf_top10_seed42-05-0.5873.hdf5
Epoch 6/500

Epoch 00006: val_loss improved from 0.58725 to 0.58069, saving model to ./model/rf_top10_seed42-06-0.5807.hdf5
Epoch 7/500

Epoch 00007: val_loss improved from 0.58069 to 0.57124, saving model to ./model/rf_top10_seed42-07-0.5712.hdf5
Epoch 8/500

Epoch 00008: val_loss improved from 0.57124 to 0.56950, saving model to ./model/rf_top10_seed42-08-0.5695.hdf5
Epoch 9/500

Epoch 00009: val_loss improved from 0.56950 to 0.56909, sav

Epoch 39/500

Epoch 00039: val_loss improved from 0.55857 to 0.55695, saving model to ./model/rf_top10_seed42-39-0.5570.hdf5
Epoch 40/500

Epoch 00040: val_loss improved from 0.55695 to 0.55672, saving model to ./model/rf_top10_seed42-40-0.5567.hdf5
Epoch 41/500

Epoch 00041: val_loss did not improve from 0.55672
Epoch 42/500

Epoch 00042: val_loss did not improve from 0.55672
Epoch 43/500

Epoch 00043: val_loss did not improve from 0.55672
Epoch 44/500

Epoch 00044: val_loss improved from 0.55672 to 0.55472, saving model to ./model/rf_top10_seed42-44-0.5547.hdf5
Epoch 45/500

Epoch 00045: val_loss did not improve from 0.55472
Epoch 46/500

Epoch 00046: val_loss did not improve from 0.55472
Epoch 47/500

Epoch 00047: val_loss did not improve from 0.55472
Epoch 48/500

Epoch 00048: val_loss did not improve from 0.55472
Epoch 49/500

Epoch 00049: val_loss improved from 0.55472 to 0.55466, saving model to ./model/rf_top10_seed42-49-0.5547.hdf5
Epoch 50/500

Epoch 00050: val_loss did not i

<tensorflow.python.keras.callbacks.History at 0x7f27dc6188b0>

In [68]:
from keras.models import load_model
best_model = load_model('./model/rf_top10_seed42-64-0.5546.hdf5') 

dic_42={}
for seed in range(0, 50):
    random.seed(seed)

    x = np.load('x_(7727,10,10)_rf.npy')
    y = np.load('y_(7727,1).npy')

    idx = list(range(len(x)))
    random.shuffle(idx)

    i = round(x.shape[0]*0.8)
    X_train, y_train = x[idx[:i],:,:], y[idx[:i]]
    X_test, y_test = x[idx[i:],:,:], y[idx[i:]]
    
    pred = best_model.predict(X_test)
    pred[pred>0.5]=1
    pred[pred<=0.5]=0
    acc = metrics.accuracy_score(y_test, pred)
    dic_42[seed]=acc
    print(f'정확도 :{metrics.accuracy_score(y_test, pred)}, seed_num = {seed}')

정확도 :0.7165048543689321, seed_num = 0
정확도 :0.7249190938511327, seed_num = 1
정확도 :0.7171521035598706, seed_num = 2
정확도 :0.7242718446601941, seed_num = 3
정확도 :0.7242718446601941, seed_num = 4
정확도 :0.7165048543689321, seed_num = 5
정확도 :0.7190938511326861, seed_num = 6
정확도 :0.7216828478964401, seed_num = 7
정확도 :0.7313915857605178, seed_num = 8
정확도 :0.7216828478964401, seed_num = 9
정확도 :0.7249190938511327, seed_num = 10
정확도 :0.7385113268608414, seed_num = 11
정확도 :0.7365695792880259, seed_num = 12
정확도 :0.7190938511326861, seed_num = 13
정확도 :0.7288025889967638, seed_num = 14
정확도 :0.7190938511326861, seed_num = 15
정확도 :0.7190938511326861, seed_num = 16
정확도 :0.7158576051779936, seed_num = 17
정확도 :0.7268608414239482, seed_num = 18
정확도 :0.7216828478964401, seed_num = 19
정확도 :0.7126213592233009, seed_num = 20
정확도 :0.6996763754045308, seed_num = 21
정확도 :0.7223300970873786, seed_num = 22
정확도 :0.713915857605178, seed_num = 23
정확도 :0.7003236245954693, seed_num = 24
정확도 :0.7307443365695793, seed_num = 

In [69]:
# seed = 42의 정확도 df 만들고 평균 확인 => 72.0
df_42 = pd.DataFrame.from_dict(dic_42, orient='index')
df_42.mean()

0    0.720673
dtype: float64

# Method 2 : 상위 10개 feature
- 절대값 취하고 절대값 기준으로 내림차순 후 상위 10개 뽑기

In [71]:
m2['abs'] = abs(m2['diff2'])
m2 = m2.sort_values(by='abs', ascending=False)
m2_top10 = m2['feature'].values[:10]
m2_top10

array([  409606211,    74606211,    67434504,    54817525, 10019055302,
          54858516,    74706811,    45006701,      228125, 63323016501])

## (60.9) RF

In [72]:
# m2_top10만을 가진 x_(7727,10)_m2 만들기
item_list = list(total_data['ITEMID'].sort_values().unique())

m2_top10_index = []
for i in m2_top10:
    m2_top10_index.append(item_list.index(i))

print('m2_top10의 index : ',m2_top10_index)

x = np.load('x_(7727,4068).npy')
x_2d = x[:,m2_top10_index]
x_2d.shape

# np.save('x_(7727,10)_m2.npy',x_2d)

m2_top10의 index :  [2195, 1276, 1081, 982, 2744, 1011, 1306, 845, 392, 3820]


(7727, 10)

In [73]:
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

model = RandomForestClassifier()

data={}
for seed in range(42, 52):
    random.seed(seed)
    
    x = np.load('./x_(7727,10)_m2.npy')
    y = np.load('./y_(7727,1).npy')

    idx = list(range(len(x)))
    random.shuffle(idx)

    i = round(x.shape[0]*0.8)
    X_train, y_train = x[idx[:i],:], y[idx[:i]]
    X_test, y_test = x[idx[i:],:], y[idx[i:]]
    
    _ = model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred_test)
    data[seed]=acc
    print(f'정확도 : {acc}, seed_num = {seed}')

df = pd.DataFrame.from_dict(data, orient='index')
print(f'정확도 df 만들고 평균 확인 : {df.mean().values}')

정확도 : 0.6006472491909385, seed_num = 42
정확도 : 0.5844660194174758, seed_num = 43
정확도 : 0.594822006472492, seed_num = 44
정확도 : 0.6148867313915858, seed_num = 45
정확도 : 0.6116504854368932, seed_num = 46
정확도 : 0.626537216828479, seed_num = 47
정확도 : 0.627831715210356, seed_num = 48
정확도 : 0.5980582524271845, seed_num = 49
정확도 : 0.625242718446602, seed_num = 50
정확도 : 0.6122977346278318, seed_num = 51
정확도 df 만들고 평균 확인 : [0.60964401]


## (60.8) LSTM

In [74]:
# m2_top10만을 가진 x_(7727,10,10)_m2 만들기
item_list = list(total_data['ITEMID'].sort_values().unique())

m2_top10_index = []
for i in m2_top10:
    m2_top10_index.append(item_list.index(i))

print('m2_top10의 index : ',m2_top10_index)

x = np.load('x_(7727,10,4068).npy')
x_3d = x[:,:,m2_top10_index]
x_3d.shape
x_3d.sum()

# np.save('x_(7727,10,10)_m2.npy',x_3d)

m2_top10의 index :  [2195, 1276, 1081, 982, 2744, 1011, 1306, 845, 392, 3820]


(7727, 10, 10)

5833.0

In [75]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, InputLayer
from tensorflow.keras.callbacks import EarlyStopping
from sklearn import metrics 
from tensorflow import keras
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

import random
# ----------------------
seed_num = 42
# ----------------------
random.seed(seed_num)

x = np.load('x_(7727,10,10)_m2.npy')
y = np.load('y_(7727,1).npy')

idx = list(range(len(x)))
random.shuffle(idx)

i = round(x.shape[0]*0.8)
X_train, y_train = x[idx[:i],:,:], y[idx[:i]]
X_test, y_test = x[idx[i:],:,:], y[idx[i:]]

# ---------------------
seed_num = 42 
# ---------------------
tf.random.set_seed(seed_num)

lstm = Sequential()
lstm.add(InputLayer(input_shape=(X_train.shape[1],X_train.shape[2])))
lstm.add(LSTM(units=128, activation='hard_sigmoid', return_sequences=True))
lstm.add(LSTM(units=64, activation='hard_sigmoid', return_sequences=True))
lstm.add(Dropout(0.2))
lstm.add(LSTM(units=64, activation='hard_sigmoid', return_sequences=True))
lstm.add(LSTM(units=32, activation='hard_sigmoid', return_sequences=False))
lstm.add(Dropout(0.2))
lstm.add(Dense(units=1, activation='sigmoid'))

from tensorflow.keras.callbacks import ModelCheckpoint
import os

MODEL_SAVE_FOLDER_PATH = './model/'
if not os.path.exists(MODEL_SAVE_FOLDER_PATH):
    os.mkdir(MODEL_SAVE_FOLDER_PATH)

model_path = MODEL_SAVE_FOLDER_PATH + 'm2_top10_seed42-{epoch:02d}-{val_loss:.4f}.hdf5'

cb_checkpoint = ModelCheckpoint(filepath=model_path, monitor='val_loss',
                                verbose=1, save_best_only=True)

early_stop = EarlyStopping(monitor='val_acc', patience=50, verbose=1, restore_best_weights=True)
lstm.compile(optimizer= keras.optimizers.Adam(learning_rate = 0.001), loss = "binary_crossentropy", metrics=['acc'])
lstm.fit(X_train, y_train, validation_split=0.25, batch_size=128, epochs=500,  callbacks=[early_stop,cb_checkpoint], shuffle=False)

Epoch 1/500

Epoch 00001: val_loss improved from inf to 0.66603, saving model to ./model/m2_top10_seed42-01-0.6660.hdf5
Epoch 2/500

Epoch 00002: val_loss did not improve from 0.66603
Epoch 3/500

Epoch 00003: val_loss improved from 0.66603 to 0.66551, saving model to ./model/m2_top10_seed42-03-0.6655.hdf5
Epoch 4/500

Epoch 00004: val_loss improved from 0.66551 to 0.66526, saving model to ./model/m2_top10_seed42-04-0.6653.hdf5
Epoch 5/500

Epoch 00005: val_loss improved from 0.66526 to 0.66517, saving model to ./model/m2_top10_seed42-05-0.6652.hdf5
Epoch 6/500

Epoch 00006: val_loss improved from 0.66517 to 0.66515, saving model to ./model/m2_top10_seed42-06-0.6652.hdf5
Epoch 7/500

Epoch 00007: val_loss did not improve from 0.66515
Epoch 8/500

Epoch 00008: val_loss did not improve from 0.66515
Epoch 9/500

Epoch 00009: val_loss did not improve from 0.66515
Epoch 10/500

Epoch 00010: val_loss did not improve from 0.66515
Epoch 11/500

Epoch 00011: val_loss improved from 0.66515 to 0.


Epoch 00037: val_loss did not improve from 0.62398
Epoch 38/500

Epoch 00038: val_loss improved from 0.62398 to 0.62382, saving model to ./model/m2_top10_seed42-38-0.6238.hdf5
Epoch 39/500

Epoch 00039: val_loss improved from 0.62382 to 0.62352, saving model to ./model/m2_top10_seed42-39-0.6235.hdf5
Epoch 40/500

Epoch 00040: val_loss did not improve from 0.62352
Epoch 41/500

Epoch 00041: val_loss did not improve from 0.62352
Epoch 42/500

Epoch 00042: val_loss did not improve from 0.62352
Epoch 43/500

Epoch 00043: val_loss did not improve from 0.62352
Epoch 44/500

Epoch 00044: val_loss improved from 0.62352 to 0.62337, saving model to ./model/m2_top10_seed42-44-0.6234.hdf5
Epoch 45/500

Epoch 00045: val_loss improved from 0.62337 to 0.62285, saving model to ./model/m2_top10_seed42-45-0.6228.hdf5
Epoch 46/500

Epoch 00046: val_loss improved from 0.62285 to 0.62259, saving model to ./model/m2_top10_seed42-46-0.6226.hdf5
Epoch 47/500

Epoch 00047: val_loss did not improve from 0.6225

<tensorflow.python.keras.callbacks.History at 0x7f279c6853a0>

In [76]:
from keras.models import load_model
best_model = load_model('./model/m2_top10_seed42-46-0.6226.hdf5') 

dic_42={}
for seed in range(0, 50):
    random.seed(seed)

    x = np.load('x_(7727,10,10)_m2.npy')
    y = np.load('y_(7727,1).npy')

    idx = list(range(len(x)))
    random.shuffle(idx)

    i = round(x.shape[0]*0.8)
    X_train, y_train = x[idx[:i],:,:], y[idx[:i]]
    X_test, y_test = x[idx[i:],:,:], y[idx[i:]]
    
    pred = best_model.predict(X_test)
    pred[pred>0.5]=1
    pred[pred<=0.5]=0
    acc = metrics.accuracy_score(y_test, pred)
    dic_42[seed]=acc
    print(f'정확도 :{metrics.accuracy_score(y_test, pred)}, seed_num = {seed}')

정확도 :0.6045307443365696, seed_num = 0
정확도 :0.6051779935275081, seed_num = 1
정확도 :0.6110032362459547, seed_num = 2
정확도 :0.602588996763754, seed_num = 3
정확도 :0.6097087378640776, seed_num = 4
정확도 :0.6110032362459547, seed_num = 5
정확도 :0.6110032362459547, seed_num = 6
정확도 :0.5928802588996763, seed_num = 7
정확도 :0.6226537216828479, seed_num = 8
정확도 :0.6129449838187703, seed_num = 9
정확도 :0.5909385113268608, seed_num = 10
정확도 :0.6297734627831715, seed_num = 11
정확도 :0.601294498381877, seed_num = 12
정확도 :0.598705501618123, seed_num = 13
정확도 :0.6071197411003236, seed_num = 14
정확도 :0.6135922330097088, seed_num = 15
정확도 :0.6330097087378641, seed_num = 16
정확도 :0.598705501618123, seed_num = 17
정확도 :0.6045307443365696, seed_num = 18
정확도 :0.596116504854369, seed_num = 19
정확도 :0.6187702265372168, seed_num = 20
정확도 :0.6135922330097088, seed_num = 21
정확도 :0.6, seed_num = 22
정확도 :0.6032362459546926, seed_num = 23
정확도 :0.6032362459546926, seed_num = 24
정확도 :0.6071197411003236, seed_num = 25
정확도 :0.595469255

In [77]:
# seed = 42의 정확도 df 만들고 평균 확인 => 60.8
df_42 = pd.DataFrame.from_dict(dic_42, orient='index')
df_42.mean()

0    0.608298
dtype: float64