In [1]:
# 필수 라이브러리
import pandas as pd
import numpy as np
import random
import tensorflow as tf

# 랜덤 시드 고정
SEED=12
random.seed(SEED)
tf.random.set_seed(SEED)
np.random.seed(SEED)

print("시드 고정 :",SEED)


시드 고정 : 12


# 데이터 전처리


In [2]:
train=pd.read_csv('data/train.csv')
test=pd.read_csv('data/test.csv')

submission=pd.read_csv('data/sample_submission.csv')

print(train.shape,test.shape,submission.shape)

(5497, 14) (1000, 13) (1000, 2)


In [3]:
train.head()

Unnamed: 0,index,quality,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,0,5,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,white
1,1,5,8.8,0.61,0.14,2.4,0.067,10.0,42.0,0.9969,3.19,0.59,9.5,red
2,2,5,7.9,0.21,0.39,2.0,0.057,21.0,138.0,0.99176,3.05,0.52,10.9,white
3,3,6,7.0,0.21,0.31,6.0,0.046,29.0,108.0,0.9939,3.26,0.5,10.8,white
4,4,6,7.8,0.4,0.26,9.5,0.059,32.0,178.0,0.9955,3.04,0.43,10.9,white


* target 변수 = 와인 품질을 나타내는 quality. 따라서, 딥러닝 다중 분류 문제.

In [4]:
submission.head()

Unnamed: 0,index,quality
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [5]:
# 범주형 데이터의 숫자형 데이터로 변환

train['type'].value_counts()

white    4159
red      1338
Name: type, dtype: int64

In [6]:
train['type']=np.where(train['type']=='white',1,0).astype(int)
test['type']=np.where(test['type']=='white',1,0).astype(int)
train['type'].value_counts()

1    4159
0    1338
Name: type, dtype: int64

* 모델 학습에 입력하려면 숫자형 데이터로 변환해야한다.
* 머신러닝 파트에서 다룬 LabelEncoder을 사용하는 레이블 인코딩 방법도 있다.

In [8]:
# 목표 변수 quality 열의 데이터 개수 확인.

train['quality'].value_counts()

6    2416
5    1788
7     924
4     186
8     152
3      26
9       5
Name: quality, dtype: int64

* 목표 변수는 연속형 숫자 데이터가 아니라, 와인 등급을 나타내는 범주형 데이터이다.
* keras to_categorical 함수를 이용하여 목표 변수를 원 핫 인코딩 변환한다.

In [10]:
# 3~9 범위 값으로 원핫 인코딩을 하면 숫자 0부터 최대값인 9까지 10개의 클래스로 인식한다.
# 따라서, 와인 등급을 0~6 범위로 바꾼다.

from tensorflow.keras.utils import to_categorical

y_train=to_categorical(train.loc[:,'quality']-3)
y_train

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [11]:
# 모델 학습에 사용할 피처를 선택하고, MinMax 스케일링으로 모든 피처 변수의 데이터를 0 ~ 1범위로 정규화 변환한다.

# 피처 선택(target 제외 피처변수들)

x_train=train.loc[:,'fixed acidity':]
x_test=test.loc[:,'fixed acidity':]

# 피처 스케일링
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(x_train)
x_train_scaled=scaler.fit_transform(x_train)
x_test_scaled=scaler.fit_transform(x_test)

print(x_train_scaled.shape,y_train.shape)
print(x_test_scaled.shape)

(5497, 12) (5497, 7)
(1000, 12)


In [12]:
x_train_scaled.shape[1]

12

In [14]:
y_train.shape[1]

7

# 모델 설계 : 드랍아웃 활용

In [15]:
# 심층 신경망 모델
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

def build_model(train_data,train_target):
    model=Sequential()
    model.add(Dense(128,activation='tanh',input_dim=train_data.shape[1]))
    model.add(Dropout(0.2))
    model.add(Dense(64,activation='tanh'))
    model.add(Dropout(0.2))
    model.add(Dense(32,activation='tanh'))
    model.add(Dense(train_target.shape[1],activation='softmax'))
    
    model.compile(optimizer='RMSProp',loss='categorical_crossentropy',metrics=['acc','mae'])
    
    return model

In [17]:
model=build_model(x_train_scaled,y_train)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               1664      
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_3 (Dense)              (None, 7)                 231       
Total params: 12,231
Trainable params: 12,231
Non-trainable params: 0
____________________________________________________

# 4-3. 콜백 함수 : Early Stopping 기법

In [19]:
# Early stopping 기법 : 과대 적합 방지, 검증 데이터의 손실함수가 감소하지 않기 시작할때, 모델 학습을 중단.

from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

x_tr,x_val,y_tr,y_val=train_test_split(x_train_scaled,y_train,test_size=0.15,
                                      shuffle=True,random_state=SEED)

early_stopping=EarlyStopping(monitor='val_loss', patience=10) # patience가 epoch 10회 반복되는 동안 손실함수가 감소하지 않으면 학습을 멈춘다.
history=model.fit(x_tr,y_tr,batch_size=64,epochs=200,
                 validation_data=(x_val,y_val),
                 callbacks=[early_stopping],
                 verbose=2)

Epoch 1/200
73/73 - 1s - loss: 1.3005 - acc: 0.4574 - mae: 0.1930 - val_loss: 1.1688 - val_acc: 0.5055 - val_mae: 0.1808
Epoch 2/200
73/73 - 0s - loss: 1.1752 - acc: 0.5015 - mae: 0.1779 - val_loss: 1.1034 - val_acc: 0.5430 - val_mae: 0.1732
Epoch 3/200
73/73 - 0s - loss: 1.1449 - acc: 0.5169 - mae: 0.1742 - val_loss: 1.0814 - val_acc: 0.5442 - val_mae: 0.1681
Epoch 4/200
73/73 - 0s - loss: 1.1285 - acc: 0.5210 - mae: 0.1719 - val_loss: 1.1267 - val_acc: 0.5042 - val_mae: 0.1698
Epoch 5/200
73/73 - 0s - loss: 1.1177 - acc: 0.5330 - mae: 0.1709 - val_loss: 1.0708 - val_acc: 0.5564 - val_mae: 0.1680
Epoch 6/200
73/73 - 0s - loss: 1.1098 - acc: 0.5240 - mae: 0.1712 - val_loss: 1.0613 - val_acc: 0.5539 - val_mae: 0.1667
Epoch 7/200
73/73 - 0s - loss: 1.1028 - acc: 0.5291 - mae: 0.1704 - val_loss: 1.0571 - val_acc: 0.5503 - val_mae: 0.1658
Epoch 8/200
73/73 - 0s - loss: 1.0996 - acc: 0.5216 - mae: 0.1700 - val_loss: 1.0612 - val_acc: 0.5576 - val_mae: 0.1655
Epoch 9/200
73/73 - 0s - loss: 1

In [20]:
# val_loss, val_acc, val_mae 값 반환
model.evaluate(x_val,y_val)



[1.026451826095581, 0.5575757622718811, 0.16143734753131866]

In [21]:
# test 데이터에 대한 예측값 정리
y_pred_proba=model.predict(x_test)
y_pred_proba[:5]

array([[0.14640701, 0.02422137, 0.10052606, 0.06059574, 0.12745844,
        0.49287596, 0.04791538],
       [0.07843959, 0.00263109, 0.01299645, 0.02802715, 0.12971562,
        0.51961535, 0.22857478],
       [0.1974192 , 0.07493019, 0.26500374, 0.14723289, 0.06576081,
        0.21707143, 0.03258163],
       [0.17468645, 0.08192883, 0.27029353, 0.1755149 , 0.07622779,
        0.19794759, 0.02340088],
       [0.11724108, 0.01547755, 0.06363828, 0.06544021, 0.08472515,
        0.5647171 , 0.08876064]], dtype=float32)

In [22]:
y_pred_proba.shape

(1000, 7)

In [23]:
y_pred_label=np.argmax(y_pred_proba,axis=-1)+3
y_pred_label[:5]

array([8, 8, 5, 5, 8], dtype=int64)

In [24]:
# 제출 양식에 맞게 정리
submission['quality']=y_pred_label.astype(int)
submission.head()

Unnamed: 0,index,quality
0,0,8
1,1,8
2,2,5
3,3,5
4,4,8


In [25]:
submission.to_csv('wine_dnn_001.csv',index=False)