# 모듈

In [6]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from platform import python_version

# random 고정시 필요한 모듈
import os
import random

# 모델 형성시 필요한 모듈
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation,BatchNormalization
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras import metrics

# local적인 모델 해석 
import lime
import lime.lime_tabular

# random 고정
os.environ['PYTHONHASHSEED'] = str(42)
np.random.seed(42)
random.seed(42)

In [7]:
print(f'python version: {python_version()}')
print(f'numpy version : {np.__version__}')
print(f'pandas version : {pd.__version__}')
print(f'matplotlib version : {matplotlib.__version__}')
print(f'sklearn version : {sklearn.__version__}')
print(f'tensorflow version : {tf.__version__}')

python version: 3.7.4
numpy version : 1.18.5
pandas version : 1.1.1
matplotlib version : 3.3.1
sklearn version : 0.23.2
tensorflow version : 2.3.0


# 데이터 가공

In [8]:
# 파일을 저장한 위치를 써 주세요.
directory = os.getcwd()

In [9]:
# 앞에서의 eda에 의해 형성된 xlsx 파일을 불러옵니다.
X_df=pd.read_excel(directory+'/X_for_train.xlsx') 

In [10]:
X = X_df.copy() ; X

Unnamed: 0.1,Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,평균방송분,...,PrimeTime,남성상품,여성상품,무이자,일시불,유명기업/브랜드,타 채널 시청자 수 평균,가전제품,농수축소분류,어류손질여부
0,0,2019-01-01 06:00:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,2099000,10.0,...,프라임아님,1,0,0,0,0,1520.0,가전제품 아님,분류에없음,해당없음
1,1,2019-01-01 06:00:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,4371000,10.0,...,프라임아님,0,1,0,0,0,1520.0,가전제품 아님,분류에없음,해당없음
2,2,2019-01-01 06:20:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,3262000,10.0,...,프라임아님,1,0,0,0,0,1520.0,가전제품 아님,분류에없음,해당없음
3,3,2019-01-01 06:20:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,6955000,10.0,...,프라임아님,0,1,0,0,0,1520.0,가전제품 아님,분류에없음,해당없음
4,4,2019-01-01 06:40:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,6672000,10.0,...,프라임아님,1,0,0,0,0,1520.0,가전제품 아님,분류에없음,해당없음
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35374,35374,2019-12-31 23:40:00,20.0,100448,201391,일시불쿠첸압력밥솥 6인용,주방,148000,10157000,5.0,...,프라임아님,0,0,0,1,1,1335.9,가전제품 아님,분류에없음,해당없음
35375,35375,2020-01-01 00:00:00,20.0,100448,201383,무이자쿠첸압력밥솥 10인용,주방,178000,50929000,5.0,...,프라임아님,0,0,1,0,1,1520.0,가전제품 아님,분류에없음,해당없음
35376,35376,2020-01-01 00:00:00,20.0,100448,201390,일시불쿠첸압력밥솥 10인용,주방,168000,104392000,5.0,...,프라임아님,0,0,0,1,1,1520.0,가전제품 아님,분류에없음,해당없음
35377,35377,2020-01-01 00:00:00,20.0,100448,201384,무이자쿠첸압력밥솥 6인용,주방,158000,13765000,5.0,...,프라임아님,0,0,1,0,1,1520.0,가전제품 아님,분류에없음,해당없음


In [11]:
# target data 를 빼냅니다.
y=X['취급액']

In [12]:
# 예측에 쓰지 않는 열 제거
X.drop(columns = ['Unnamed: 0','취급액','방송일시','마더코드','상품코드','상품명','년','일','시분','월일','시간열','정수노출(분)','요일/시간','날짜','시각'],inplace = True)

In [13]:
# 더미화
X = pd.get_dummies(X,columns=['월','요일','분기','상품군','PrimeTime','어류손질여부','가전제품','농수축소분류'])

In [14]:
# X_col 의 이름 저장
X_features=X.columns

In [15]:
# scaling 과정
# 큰 skewness 를 가지는 값들은 대부분 애초에 분포가 너무 틀어져있어 (노출 분거의 다 20,10분의 값을 가진다.) log/squre 등의 변환을 해도 똑같을거라 판단하고, scaling 만 하기로 하였다.
# 그리고, categorical 데이터의 경우에도 scaling 을 해 주어서 평균 0 / 분산 1 을 맞추어주는게 나중에 DNN 이 학습을 더 잘할 거라 판단하였다.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
X

array([[-0.10967928, -0.55549003, -0.20858117, ..., -0.06437268,
        -0.05297285, -0.08298992],
       [-0.10967928, -0.55549003, -0.20858117, ..., -0.06437268,
        -0.05297285, -0.08298992],
       [-0.10967928, -0.55549003, -0.20858117, ..., -0.06437268,
        -0.05297285, -0.08298992],
       ...,
       [-0.10967928, -0.35098812, -0.92228108, ..., -0.06437268,
        -0.05297285, -0.08298992],
       [-0.10967928, -0.36695236, -0.92228108, ..., -0.06437268,
        -0.05297285, -0.08298992],
       [-0.10967928, -0.3829166 , -0.92228108, ..., -0.06437268,
        -0.05297285, -0.08298992]])

In [16]:
# dataset train/test set 으로 나누기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [17]:
# dataset 의 train 을 valid / train 으로 다시 나누기
X_train_, X_valid_, y_train_, y_valid_ = train_test_split(X_train, y_train, test_size=0.20, random_state=0)

In [18]:
# MAPE 정의
def MAPE(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# DNN model

In [19]:
# random 고정
tf.random.set_seed(42)
os.environ['PYTHONHASHSEED'] = str(42)
np.random.seed(42)
random.seed(42)

session_conf = tf.compat.v1.ConfigProto(
    intra_op_parallelism_threads=1, 
    inter_op_parallelism_threads=1)

sess = tf.compat.v1.Session(
    graph=tf.compat.v1.get_default_graph(), 
    config=session_conf)

tf.compat.v1.keras.backend.set_session(sess)

# 모델 훈련
model1 = keras.models.Sequential()
model1 = Sequential([
    Dense(128, kernel_initializer='normal', activation = "relu", input_shape=X_train.shape[1:]), 
    Dropout(0.2),
    Dense(256, kernel_initializer='normal', activation = "relu"),
    Dropout(0.2),
    Dense(512, kernel_initializer='normal', activation = "relu"),
    Dropout(0.2),
    Dense(1024, kernel_initializer='normal', activation = "relu"),
    Dropout(0.2),
    Dense(512, kernel_initializer='normal', activation = "relu"),
    Dropout(0.2),
    Dense(256, kernel_initializer='normal', activation = "relu"),
    Dropout(0.2),
    Dense(32, kernel_initializer='normal', activation = "relu"),
    Dense(1, kernel_initializer='normal'), ])

model1.compile(loss="mape",  # 평가기준이 mape 이니까 이걸로 하자.
              optimizer=keras.optimizers.Adam())
checkpoint = keras.callbacks.ModelCheckpoint(filepath="model1.h5", #저장할 모델 이름
                                             monitor = 'val_loss', #monitoring 할 기준
                                             save_best_only=True ) # 
early_stopping = keras.callbacks.EarlyStopping(patience=16, #2 만 줘보자.
                                             restore_best_weights=True)
history1 = model1.fit(X_train_, y_train_, 
                        epochs=160,
                        validation_data=(X_valid_, y_valid_),
                        callbacks=[checkpoint,early_stopping])
model1 = keras.models.load_model(filepath = "model1.h5")
evaluation1 = model1.evaluate(X_test, y_test) 
# validation 에 대해 41.0262 의 값이 나왔다. 

Epoch 1/160
Epoch 2/160
Epoch 3/160
Epoch 4/160
Epoch 5/160
Epoch 6/160
Epoch 7/160
Epoch 8/160
Epoch 9/160
Epoch 10/160
Epoch 11/160
Epoch 12/160
Epoch 13/160
Epoch 14/160
Epoch 15/160
Epoch 16/160
Epoch 17/160
Epoch 18/160
Epoch 19/160
Epoch 20/160
Epoch 21/160
Epoch 22/160
Epoch 23/160
Epoch 24/160
Epoch 25/160
Epoch 26/160
Epoch 27/160
Epoch 28/160
Epoch 29/160
Epoch 30/160
Epoch 31/160
Epoch 32/160
Epoch 33/160
Epoch 34/160
Epoch 35/160
Epoch 36/160
Epoch 37/160
Epoch 38/160
Epoch 39/160
Epoch 40/160
Epoch 41/160
Epoch 42/160
Epoch 43/160
Epoch 44/160
Epoch 45/160
Epoch 46/160
Epoch 47/160
Epoch 48/160
Epoch 49/160
Epoch 50/160
Epoch 51/160
Epoch 52/160
Epoch 53/160
Epoch 54/160
Epoch 55/160
Epoch 56/160
Epoch 57/160
Epoch 58/160
Epoch 59/160
Epoch 60/160
Epoch 61/160
Epoch 62/160
Epoch 63/160
Epoch 64/160
Epoch 65/160
Epoch 66/160
Epoch 67/160
Epoch 68/160
Epoch 69/160
Epoch 70/160
Epoch 71/160
Epoch 72/160
Epoch 73/160
Epoch 74/160
Epoch 75/160


In [20]:
# random 고정
tf.random.set_seed(42)
os.environ['PYTHONHASHSEED'] = str(42)
np.random.seed(42)
random.seed(42)

session_conf = tf.compat.v1.ConfigProto(
    intra_op_parallelism_threads=1, 
    inter_op_parallelism_threads=1)

sess = tf.compat.v1.Session(
    graph=tf.compat.v1.get_default_graph(), 
    config=session_conf)

tf.compat.v1.keras.backend.set_session(sess)

# 모델 훈련
model2 = keras.models.Sequential()
model2.add(keras.layers.Dense(200, activation="selu",kernel_initializer="lecun_normal",input_shape=X_train.shape[1:]))
model2.add(Dropout(0.2))
for layer in range(5):
    model2.add(keras.layers.Dense(200, activation="selu",kernel_initializer="lecun_normal"))
    model2.add(Dropout(0.2))
model2.add(keras.layers.Dense(100, activation="selu",kernel_initializer="lecun_normal"))
model2.add(Dropout(0.2))
model2.add(keras.layers.Dense(10))
model2.add(keras.layers.Dense(1))
model2.compile(loss="mape",  # 평가기준이 mape 이니까 이걸로 하자.
              optimizer=keras.optimizers.Adam())
checkpoint = keras.callbacks.ModelCheckpoint(filepath="model2.h5", #저장할 모델 이름
                                             monitor = 'val_loss', #monitoring 할 기준
                                             save_best_only=True ) # 
early_stopping = keras.callbacks.EarlyStopping(patience=24, #2 만 줘보자.
                                             restore_best_weights=True)

history2 = model2.fit(X_train_, y_train_, 
                        epochs=160,
                        validation_data=(X_valid_, y_valid_),
                        callbacks=[checkpoint,early_stopping])
model2 = keras.models.load_model(filepath = "model2.h5")
evaluation2 = model2.evaluate(X_test, y_test) 
# validation set 에 대해 mape 가 41.4471 이 나왔다.

Epoch 1/160
Epoch 2/160
Epoch 3/160
Epoch 4/160
Epoch 5/160
Epoch 6/160
Epoch 7/160
Epoch 8/160
Epoch 9/160
Epoch 10/160
Epoch 11/160
Epoch 12/160
Epoch 13/160
Epoch 14/160
Epoch 15/160
Epoch 16/160
Epoch 17/160
Epoch 18/160
Epoch 19/160
Epoch 20/160
Epoch 21/160
Epoch 22/160
Epoch 23/160
Epoch 24/160
Epoch 25/160
Epoch 26/160
Epoch 27/160
Epoch 28/160
Epoch 29/160
Epoch 30/160
Epoch 31/160
Epoch 32/160
Epoch 33/160
Epoch 34/160
Epoch 35/160
Epoch 36/160
Epoch 37/160
Epoch 38/160
Epoch 39/160
Epoch 40/160
Epoch 41/160
Epoch 42/160
Epoch 43/160
Epoch 44/160
Epoch 45/160
Epoch 46/160
Epoch 47/160
Epoch 48/160
Epoch 49/160
Epoch 50/160
Epoch 51/160
Epoch 52/160
Epoch 53/160
Epoch 54/160
Epoch 55/160
Epoch 56/160
Epoch 57/160
Epoch 58/160
Epoch 59/160
Epoch 60/160
Epoch 61/160
Epoch 62/160


In [21]:
# random 고정
tf.random.set_seed(42)
os.environ['PYTHONHASHSEED'] = str(42)
np.random.seed(42)
random.seed(42)

session_conf = tf.compat.v1.ConfigProto(
    intra_op_parallelism_threads=1, 
    inter_op_parallelism_threads=1)

sess = tf.compat.v1.Session(
    graph=tf.compat.v1.get_default_graph(), 
    config=session_conf)

tf.compat.v1.keras.backend.set_session(sess)

model3 = keras.models.Sequential()
model3 = Sequential([
    Dense(128, kernel_initializer='he_normal', activation = "elu", input_shape=X_train.shape[1:]),
    Dropout(0.2),
    Dense(256, kernel_initializer='he_normal', activation = "elu"),
    Dropout(0.2),
    Dense(512, kernel_initializer='he_normal', activation = "elu"),
    Dropout(0.2),
    Dense(512, kernel_initializer='he_normal', activation = "elu"),
    Dropout(0.2),
    Dense(512, kernel_initializer='he_normal', activation = "elu"),
    Dropout(0.2),
    Dense(512, kernel_initializer='he_normal', activation = "elu"),
    Dropout(0.2),
    Dense(512, kernel_initializer='he_normal', activation = "elu"),
    Dropout(0.2),
    Dense(256, kernel_initializer='he_normal', activation = "elu"),
    Dropout(0.2),
    Dense(32, kernel_initializer='he_normal', activation = "elu"),
    Dropout(0.2),
    Dense(1, kernel_initializer='he_normal'), 
])

model3.compile(loss="mape",  # 평가기준이 mape 이니까 이걸로 하자.
              optimizer=keras.optimizers.Adam())
checkpoint = keras.callbacks.ModelCheckpoint(filepath="model3.h5", #저장할 모델 이름
                                             monitor = 'val_loss', #monitoring 할 기준
                                             save_best_only=True ) # 
early_stopping = keras.callbacks.EarlyStopping(patience=16, #2 만 줘보자.
                                             restore_best_weights=True)

history3 = model3.fit(X_train_, y_train_, 
                        epochs=160,
                        validation_data=(X_valid_, y_valid_),
                        callbacks=[checkpoint,early_stopping])
model3 = keras.models.load_model(filepath = "model3.h5")
evaluation3 = model3.evaluate(X_test, y_test)
# validation set 에 대해 mape 가 40.6775 가 나왔다.

Epoch 1/160
Epoch 2/160
Epoch 3/160
Epoch 4/160
Epoch 5/160
Epoch 6/160
Epoch 7/160
Epoch 8/160
Epoch 9/160
Epoch 10/160
Epoch 11/160
Epoch 12/160
Epoch 13/160
Epoch 14/160
Epoch 15/160
Epoch 16/160
Epoch 17/160
Epoch 18/160
Epoch 19/160
Epoch 20/160
Epoch 21/160
Epoch 22/160
Epoch 23/160
Epoch 24/160
Epoch 25/160
Epoch 26/160
Epoch 27/160
Epoch 28/160
Epoch 29/160
Epoch 30/160
Epoch 31/160
Epoch 32/160
Epoch 33/160
Epoch 34/160
Epoch 35/160
Epoch 36/160
Epoch 37/160
Epoch 38/160
Epoch 39/160
Epoch 40/160
Epoch 41/160
Epoch 42/160
Epoch 43/160
Epoch 44/160
Epoch 45/160
Epoch 46/160
Epoch 47/160
Epoch 48/160
Epoch 49/160
Epoch 50/160
Epoch 51/160
Epoch 52/160
Epoch 53/160
Epoch 54/160
Epoch 55/160
Epoch 56/160
Epoch 57/160
Epoch 58/160
Epoch 59/160
Epoch 60/160
Epoch 61/160
Epoch 62/160
Epoch 63/160
Epoch 64/160
Epoch 65/160
Epoch 66/160
Epoch 67/160
Epoch 68/160
Epoch 69/160
Epoch 70/160
Epoch 71/160


In [47]:
# 3개의 모델을 각 가중치를 주어서, 합쳤습니다.
# 40.1136 의 mape 가 나왔음.
y_pred = 0.3*model1.predict(X_test)+ 0.35*model2.predict(X_test)+ 0.35*model3.predict(X_test)
MAPE(y_test,y_pred.reshape(-1))

40.11364716950374