In [1]:
import pandas as pd
import numpy as np
import random
import tensorflow as tf

In [2]:
data = pd.read_csv('wine_train.csv')

In [3]:
data.shape

(5497, 14)

In [4]:
data[:3]

Unnamed: 0,index,quality,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,0,5,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,white
1,1,5,8.8,0.61,0.14,2.4,0.067,10.0,42.0,0.9969,3.19,0.59,9.5,red
2,2,5,7.9,0.21,0.39,2.0,0.057,21.0,138.0,0.99176,3.05,0.52,10.9,white


In [5]:
data['quality'].value_counts()

6    2416
5    1788
7     924
4     186
8     152
3      26
9       5
Name: quality, dtype: int64

In [6]:
data.type

0       white
1         red
2       white
3       white
4       white
        ...  
5492    white
5493    white
5494    white
5495    white
5496    white
Name: type, Length: 5497, dtype: object

In [7]:
data.type.value_counts()

white    4159
red      1338
Name: type, dtype: int64

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5497 entries, 0 to 5496
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 5497 non-null   int64  
 1   quality               5497 non-null   int64  
 2   fixed acidity         5497 non-null   float64
 3   volatile acidity      5497 non-null   float64
 4   citric acid           5497 non-null   float64
 5   residual sugar        5497 non-null   float64
 6   chlorides             5497 non-null   float64
 7   free sulfur dioxide   5497 non-null   float64
 8   total sulfur dioxide  5497 non-null   float64
 9   density               5497 non-null   float64
 10  pH                    5497 non-null   float64
 11  sulphates             5497 non-null   float64
 12  alcohol               5497 non-null   float64
 13  type                  5497 non-null   object 
dtypes: float64(11), int64(2), object(1)
memory usage: 601.4+ KB


In [9]:
data['type']=np.where(data['type']=='white', 1, 0).astype('int')

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5497 entries, 0 to 5496
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 5497 non-null   int64  
 1   quality               5497 non-null   int64  
 2   fixed acidity         5497 non-null   float64
 3   volatile acidity      5497 non-null   float64
 4   citric acid           5497 non-null   float64
 5   residual sugar        5497 non-null   float64
 6   chlorides             5497 non-null   float64
 7   free sulfur dioxide   5497 non-null   float64
 8   total sulfur dioxide  5497 non-null   float64
 9   density               5497 non-null   float64
 10  pH                    5497 non-null   float64
 11  sulphates             5497 non-null   float64
 12  alcohol               5497 non-null   float64
 13  type                  5497 non-null   int32  
dtypes: float64(11), int32(1), int64(2)
memory usage: 579.9 KB


In [11]:
from tensorflow.keras.utils import to_categorical

In [12]:
# 딥러닝 기본에서는 target이 명목형인 경우, dummy로 만들어 주어아 한다. (one-hot 인코딩)

In [13]:
y_data=to_categorical(data['quality']-3) 
# 그냥 둬도 작동은 하지만, -3을 함으로써 0부터 시작. 메모리 낭비를 최소화(그대로 둬도 0부터 시작하므로)

In [14]:
y_data[:3]

array([[0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0.]], dtype=float32)

In [15]:
X_data=data.loc[:,'fixed acidity':]

In [16]:
X_data[:3]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,1
1,8.8,0.61,0.14,2.4,0.067,10.0,42.0,0.9969,3.19,0.59,9.5,0
2,7.9,0.21,0.39,2.0,0.057,21.0,138.0,0.99176,3.05,0.52,10.9,1


In [17]:
X_data.shape, y_data.shape

((5497, 12), (5497, 7))

In [18]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [19]:
scaler = MinMaxScaler()
X_data_scaled = scaler.fit_transform(X_data)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_data_scaled, y_data, 
                                                    test_size=0.2, 
                                                    shuffle=True, 
                                                    random_state=11)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(4397, 12) (4397, 7)
(1100, 12) (1100, 7)


In [22]:
# 모델생성
# 심층 신경망 모델
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

In [23]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=12))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(7, activation='softmax'))

In [24]:
# 이 모델로 설정을 하겠음(compile)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc','mae'])

In [25]:
# 설정한 레이어들을 요약
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               1664      
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 10)                330       
                                                                 
 dense_4 (Dense)             (None, 7)                 77        
                                                                 
Total params: 12,407
Trainable params: 12,407
Non-trainable params: 0
_________________________________________________________________


In [29]:
history=model.fit(X_train, y_train, batch_size=50, epochs=200, verbose=2)

Epoch 1/200
88/88 - 0s - loss: 0.9303 - acc: 0.6070 - mae: 0.1501 - 73ms/epoch - 830us/step
Epoch 2/200
88/88 - 0s - loss: 0.9307 - acc: 0.6047 - mae: 0.1498 - 85ms/epoch - 968us/step
Epoch 3/200
88/88 - 0s - loss: 0.9267 - acc: 0.6086 - mae: 0.1498 - 102ms/epoch - 1ms/step
Epoch 4/200
88/88 - 0s - loss: 0.9230 - acc: 0.6056 - mae: 0.1486 - 65ms/epoch - 743us/step
Epoch 5/200
88/88 - 0s - loss: 0.9243 - acc: 0.6079 - mae: 0.1492 - 85ms/epoch - 962us/step
Epoch 6/200
88/88 - 0s - loss: 0.9190 - acc: 0.6111 - mae: 0.1487 - 90ms/epoch - 1ms/step
Epoch 7/200
88/88 - 0s - loss: 0.9150 - acc: 0.6179 - mae: 0.1478 - 78ms/epoch - 883us/step
Epoch 8/200
88/88 - 0s - loss: 0.9168 - acc: 0.6086 - mae: 0.1480 - 71ms/epoch - 805us/step
Epoch 9/200
88/88 - 0s - loss: 0.9247 - acc: 0.6143 - mae: 0.1485 - 73ms/epoch - 827us/step
Epoch 10/200
88/88 - 0s - loss: 0.9187 - acc: 0.6125 - mae: 0.1482 - 70ms/epoch - 790us/step
Epoch 11/200
88/88 - 0s - loss: 0.9172 - acc: 0.6147 - mae: 0.1483 - 72ms/epoch - 

Epoch 90/200
88/88 - 0s - loss: 0.8209 - acc: 0.6641 - mae: 0.1337 - 68ms/epoch - 774us/step
Epoch 91/200
88/88 - 0s - loss: 0.8139 - acc: 0.6636 - mae: 0.1324 - 60ms/epoch - 680us/step
Epoch 92/200
88/88 - 0s - loss: 0.8119 - acc: 0.6664 - mae: 0.1321 - 58ms/epoch - 654us/step
Epoch 93/200
88/88 - 0s - loss: 0.8231 - acc: 0.6550 - mae: 0.1330 - 59ms/epoch - 665us/step
Epoch 94/200
88/88 - 0s - loss: 0.8141 - acc: 0.6650 - mae: 0.1321 - 64ms/epoch - 725us/step
Epoch 95/200
88/88 - 0s - loss: 0.8094 - acc: 0.6641 - mae: 0.1319 - 62ms/epoch - 703us/step
Epoch 96/200
88/88 - 0s - loss: 0.8085 - acc: 0.6636 - mae: 0.1315 - 62ms/epoch - 699us/step
Epoch 97/200
88/88 - 0s - loss: 0.8097 - acc: 0.6707 - mae: 0.1312 - 48ms/epoch - 544us/step
Epoch 98/200
88/88 - 0s - loss: 0.8138 - acc: 0.6659 - mae: 0.1317 - 66ms/epoch - 748us/step
Epoch 99/200
88/88 - 0s - loss: 0.8104 - acc: 0.6602 - mae: 0.1315 - 73ms/epoch - 827us/step
Epoch 100/200
88/88 - 0s - loss: 0.8125 - acc: 0.6559 - mae: 0.1316 - 

Epoch 178/200
88/88 - 0s - loss: 0.7195 - acc: 0.7046 - mae: 0.1174 - 69ms/epoch - 788us/step
Epoch 179/200
88/88 - 0s - loss: 0.7146 - acc: 0.7082 - mae: 0.1176 - 68ms/epoch - 771us/step
Epoch 180/200
88/88 - 0s - loss: 0.7086 - acc: 0.7084 - mae: 0.1164 - 56ms/epoch - 635us/step
Epoch 181/200
88/88 - 0s - loss: 0.7024 - acc: 0.7132 - mae: 0.1160 - 53ms/epoch - 601us/step
Epoch 182/200
88/88 - 0s - loss: 0.7131 - acc: 0.7028 - mae: 0.1169 - 72ms/epoch - 816us/step
Epoch 183/200
88/88 - 0s - loss: 0.7081 - acc: 0.7062 - mae: 0.1157 - 73ms/epoch - 827us/step
Epoch 184/200
88/88 - 0s - loss: 0.7034 - acc: 0.7066 - mae: 0.1156 - 72ms/epoch - 816us/step
Epoch 185/200
88/88 - 0s - loss: 0.7112 - acc: 0.7075 - mae: 0.1161 - 72ms/epoch - 816us/step
Epoch 186/200
88/88 - 0s - loss: 0.7100 - acc: 0.7009 - mae: 0.1166 - 70ms/epoch - 793us/step
Epoch 187/200
88/88 - 0s - loss: 0.7093 - acc: 0.7021 - mae: 0.1162 - 77ms/epoch - 873us/step
Epoch 188/200
88/88 - 0s - loss: 0.7071 - acc: 0.7050 - mae:

In [30]:
history

<keras.callbacks.History at 0x2033d358748>

In [32]:
# epoch를 학습시킨 후(train data의 학습 결과)
# loss: 0.6898 - acc: 0.7159 - mae: 0.1128

In [34]:
# 테스트 데이터의 학습 결과를 비교
# acc: train < test
test_result=model.evaluate(X_test, y_test)



In [35]:
test_result

[1.1199395656585693, 0.5745454430580139, 0.14004941284656525]

In [36]:
# overfitting 되었음
# train - loss: 0.6898 - acc: 0.7159 - mae: 0.1128
# test  - loss: 1.1199 - acc: 0.5745 - mae: 0.1400

# (테스트 데이터가)
# loss(적을수록 좋음):train < test
# acc(클수록 좋음): train > test
# mae(적을수록 좋음): train < test

In [37]:
# 다중 분류에서는 predict를 할 때 각 항목이 될 가능성을 계산해준다.
y_pred=model.predict(X_test)

In [38]:
y_pred[0]

array([6.8777872e-06, 2.1097001e-03, 2.1587351e-02, 3.8487586e-01,
       5.7676655e-01, 1.4628595e-02, 2.5024761e-05], dtype=float32)

In [40]:
y_pred0_index=np.argmax(y_pred[0])

In [41]:
y_pred0_index # predict하여 예측한 품질의 각 확률 중 최대값

4

In [42]:
y_pred0_index+3 #최종 예측 품질

7

In [46]:
y_test[0] # 실제 품질은 6

array([0., 0., 0., 1., 0., 0., 0.], dtype=float32)

In [44]:
len(y_pred)

1100

In [47]:
y_pred.shape

(1100, 7)

In [None]:
# 예측한 모든 품질을 한번에 계산하기

In [48]:
y_pred_test=np.argmax(y_pred, axis=-1) + 3

In [49]:
y_pred_test

array([7, 6, 6, ..., 6, 6, 6], dtype=int64)

In [54]:
# 실제 품질

In [55]:
y_test2=np.argmax(y_test, axis=-1) + 3

In [56]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

In [57]:
print(accuracy_score(y_test2, y_pred_test))

0.5745454545454546


In [58]:
print(confusion_matrix(y_test2, y_pred_test))

[[  0   0   2   1   0   0   0]
 [  0   6  22   4   1   0   0]
 [  5   8 237  82  17   0   0]
 [  0   5 122 301  64   5   0]
 [  0   0   7  87  86   9   0]
 [  0   0   0  17   9   2   0]
 [  0   0   0   0   0   1   0]]


In [59]:
# 시각화

In [63]:
import matplotlib.pyplot as plt

ImportError: cannot import name 'plot_confusion_matrix' from 'sklearn.metrics' (C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\__init__.py)

In [61]:
label=['3', '4', '5', '6', '7', '8', '9'] # 라벨 설정
plot = plot_confusion_matrix(clf, # 분류 모델
                             y_pred_test, y_test2, # 예측 데이터와 예측값의 정답(y_true)
                             display_labels=label, # 표에 표시할 labels
                             cmap=plt.cm.Blue, # 컬러맵(plt.cm.Reds, plt.cm.rainbow 등이 있음)
                             normalize=None) # 'true', 'pred', 'all' 중에서 지정 가능. default=None

NameError: name 'plot_confusion_matrix' is not defined

In [None]:
p