In [1]:
import tensorflow as tf
import numpy as np
from tensorflow import keras
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# data load
df = pd.read_csv('train.csv')

In [3]:
df['label'].value_counts()

1    6742
7    6265
3    6131
2    5958
9    5949
0    5923
6    5918
8    5851
4    5842
5    5421
Name: label, dtype: int64

In [4]:
# data preprocessing
df.drop(['index'], axis=1, inplace=True)

In [5]:
# train, test split
X = df.drop(['label'], axis = 1)
y = df['label']

X_train , X_test , y_train , y_test = train_test_split(X, y,
                                                       test_size=0.2,  random_state=156)

In [6]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(48000, 784)
(12000, 784)
(48000,)
(12000,)


In [7]:
y_train = np.asarray(y_train[:48000])
y_test = np.asarray(y_test[:12000])
X_train = np.asarray(X_train[:48000] / 255.0)
X_test = np.asarray(X_test[:12000] / 255.0)

In [8]:
# 모델 구축
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256,input_shape = (784,),activation = 'relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(256,activation = 'relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(256,activation = 'relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(10,activation = 'softmax')])

In [9]:
# 모델 컴파일
model.compile(tf.keras.optimizers.Adam(lr=0.0001),
             loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])

In [11]:
# 모델 훈련
model.fit(X_train, y_train,
          batch_size=128,
          epochs=30,
          validation_data=(X_test, y_test))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f9c1841c2b0>

In [12]:
# 모델 평가
loc, acc = model.evaluate(X_test, y_test, verbose=2)
print('정확도:{:5.2f}%'.format(acc*100))

375/375 - 0s - loss: 0.0715 - accuracy: 0.9789
정확도:97.89%


In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 256)               200960    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                2

In [65]:
# test, submission file load
test = pd.read_csv('test.csv')
sub = pd.read_csv('sample_submission.csv')

In [66]:
test.head()

Unnamed: 0,index,px1,px2,px3,px4,px5,px6,px7,px8,px9,...,px775,px776,px777,px778,px779,px780,px781,px782,px783,px784
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
test.drop(['index'], axis=1, inplace=True)

In [68]:
# 실 데이터 예측
actual_pred = model.predict_classes(test)
actual_pred



array([7, 2, 1, ..., 4, 5, 6])

In [69]:
sub['label'] = actual_pred
sub['label'].value_counts()

3    1158
1    1134
2    1086
7    1026
9     990
0     977
6     928
4     927
5     892
8     882
Name: label, dtype: int64

In [70]:
sub.to_csv("sub.csv", index=False)