In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 라이브러리

In [35]:
# 라이브러리리
import pandas as pd
import numpy as np
import random
import tensorflow as tf
# 랜덤시드 고정
SEED =12
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
print("시드 고정:", SEED)

시드 고정: 12


## 데이터 로드

In [133]:
drive_path = "/content/drive/MyDrive/SKT_FLY_AI/Day6/wine_data/"
train = pd.read_csv(drive_path + "train.csv")
test = pd.read_csv(drive_path + "test.csv")
sample_submission = pd.read_csv(drive_path + 'sample_submission.csv')

In [134]:
train.head()

Unnamed: 0,index,quality,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,0,5,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,white
1,1,5,8.8,0.61,0.14,2.4,0.067,10.0,42.0,0.9969,3.19,0.59,9.5,red
2,2,5,7.9,0.21,0.39,2.0,0.057,21.0,138.0,0.99176,3.05,0.52,10.9,white
3,3,6,7.0,0.21,0.31,6.0,0.046,29.0,108.0,0.9939,3.26,0.5,10.8,white
4,4,6,7.8,0.4,0.26,9.5,0.059,32.0,178.0,0.9955,3.04,0.43,10.9,white


In [135]:
test.head()

Unnamed: 0,index,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,0,9.0,0.31,0.48,6.6,0.043,11.0,73.0,0.9938,2.9,0.38,11.6,white
1,1,13.3,0.43,0.58,1.9,0.07,15.0,40.0,1.0004,3.06,0.49,9.0,red
2,2,6.5,0.28,0.27,5.2,0.04,44.0,179.0,0.9948,3.19,0.69,9.4,white
3,3,7.2,0.15,0.39,1.8,0.043,21.0,159.0,0.9948,3.52,0.47,10.0,white
4,4,6.8,0.26,0.26,2.0,0.019,23.5,72.0,0.99041,3.16,0.47,11.8,white


In [136]:
sample_submission.head()

Unnamed: 0,index,quality
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


### type column 인코딩

In [137]:
train['type'].value_counts()

white    4159
red      1338
Name: type, dtype: int64

In [138]:
train['type'] = np.where(train['type'] == 'white', 1, 0).astype(int)
test['type'] = np.where(test['type'] == 'white', 1, 0).astype(int)
train['type'].value_counts()

1    4159
0    1338
Name: type, dtype: int64

## quality column OHE

In [139]:
train['quality'].value_counts()

6    2416
5    1788
7     924
4     186
8     152
3      26
9       5
Name: quality, dtype: int64

In [140]:
from tensorflow.keras.utils import to_categorical

y_train = to_categorical(train.loc[:, 'quality']-3)
y_train

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [141]:
X_train = train.loc[:, 'fixed acidity':]
X_test = test.loc[:, 'fixed acidity':]

In [142]:
X_train = train.loc[:, ['alcohol', 'density', 'volatile acidity', 'chlorides', 'type']]
X_test = test.loc[:, ['alcohol', 'density', 'volatile acidity', 'chlorides', 'type']]

In [143]:
# 피처 스케일링
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

print(X_train_scaled.shape, y_train.shape)
print(X_test_scaled.shape)

(5497, 5) (5497, 7)
(1000, 5)


In [144]:
train.corr()['quality'].abs().sort_values(ascending=False)

quality                 1.000000
alcohol                 0.439615
density                 0.299831
volatile acidity        0.261557
chlorides               0.198148
type                    0.118280
citric acid             0.079157
fixed acidity           0.076506
free sulfur dioxide     0.055574
sulphates               0.042068
total sulfur dioxide    0.039732
residual sugar          0.032848
pH                      0.017931
index                   0.008045
Name: quality, dtype: float64

In [145]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

def build_model(train_data, train_target):
  model = Sequential()

  model.add(Dense(128, activation='relu', input_dim=train_data.shape[1]))
  model.add(Dropout(0.2))

  model.add(Dense(64, activation='relu'))
  model.add(Dropout(0.2))

  model.add(Dense(32, activation='relu'))
  model.add(Dropout(0.2))

  model.add(Dense(16, activation='relu'))

  model.add(Dense(train_target.shape[1], activation='softmax'))

  model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc', 'mae'])
  return model

model = build_model(X_train_scaled, y_train)
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_24 (Dense)            (None, 128)               768       
                                                                 
 dropout_14 (Dropout)        (None, 128)               0         
                                                                 
 dense_25 (Dense)            (None, 64)                8256      
                                                                 
 dropout_15 (Dropout)        (None, 64)                0         
                                                                 
 dense_26 (Dense)            (None, 32)                2080      
                                                                 
 dropout_16 (Dropout)        (None, 32)                0         
                                                                 
 dense_27 (Dense)            (None, 16)               

In [146]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

x_tr, x_val, y_tr, y_val = train_test_split(X_train_scaled, y_train, test_size=0.15, shuffle=True, random_state=SEED)
early_stopping = EarlyStopping(monitor='val_loss', patience=10)
history = model.fit(x_tr, y_tr, batch_size=64, epochs=200, validation_data=(x_val, y_val), callbacks=[early_stopping], verbose=2)

Epoch 1/200
73/73 - 1s - loss: 1.4417 - acc: 0.4133 - mae: 0.2060 - val_loss: 1.2064 - val_acc: 0.4958 - val_mae: 0.1849 - 970ms/epoch - 13ms/step
Epoch 2/200
73/73 - 0s - loss: 1.2322 - acc: 0.4690 - mae: 0.1853 - val_loss: 1.1335 - val_acc: 0.5139 - val_mae: 0.1800 - 211ms/epoch - 3ms/step
Epoch 3/200
73/73 - 0s - loss: 1.1821 - acc: 0.4953 - mae: 0.1793 - val_loss: 1.0969 - val_acc: 0.5479 - val_mae: 0.1717 - 206ms/epoch - 3ms/step
Epoch 4/200
73/73 - 0s - loss: 1.1599 - acc: 0.4961 - mae: 0.1764 - val_loss: 1.1032 - val_acc: 0.5430 - val_mae: 0.1763 - 212ms/epoch - 3ms/step
Epoch 5/200
73/73 - 0s - loss: 1.1549 - acc: 0.5092 - mae: 0.1755 - val_loss: 1.1078 - val_acc: 0.5285 - val_mae: 0.1752 - 204ms/epoch - 3ms/step
Epoch 6/200
73/73 - 0s - loss: 1.1487 - acc: 0.5131 - mae: 0.1757 - val_loss: 1.0863 - val_acc: 0.5442 - val_mae: 0.1721 - 240ms/epoch - 3ms/step
Epoch 7/200
73/73 - 0s - loss: 1.1332 - acc: 0.5193 - mae: 0.1740 - val_loss: 1.0760 - val_acc: 0.5467 - val_mae: 0.1670 - 

In [147]:
model.evaluate(x_val, y_val)



[1.0623955726623535, 0.5478788018226624, 0.1664457768201828]

In [148]:
y_pred_proba = model.predict(X_test)
y_pred_proba[:5]



array([[0.0000000e+00, 0.0000000e+00, 5.9265021e-30, 4.1559982e-08,
        9.9999994e-01, 4.3356700e-09, 2.6007895e-36],
       [0.0000000e+00, 3.3223110e-31, 8.0640038e-23, 2.8046904e-06,
        9.9999660e-01, 4.1844456e-07, 9.2505160e-28],
       [0.0000000e+00, 2.9140913e-33, 2.3131043e-24, 1.1396329e-06,
        9.9999863e-01, 1.5653889e-07, 1.3571470e-29],
       [0.0000000e+00, 5.5271660e-36, 3.2397522e-26, 3.8550505e-07,
        9.9999958e-01, 4.4479460e-08, 5.7957914e-32],
       [0.0000000e+00, 0.0000000e+00, 1.2089389e-30, 2.7799844e-08,
        9.9999994e-01, 2.7502167e-09, 3.5321064e-37]], dtype=float32)

In [149]:
y_pred_label = np.argmax(y_pred_proba, axis=-1) + 3
y_pred_label[:5]

array([7, 7, 7, 7, 7])

In [150]:
sample_submission['quality'] = y_pred_label.astype(int)
sample_submission.head()

Unnamed: 0,index,quality
0,0,7
1,1,7
2,2,7
3,3,7
4,4,7


In [151]:
# 제출 파일
sample_submission.to_csv('win_dnn.csv', index=False)