# Kaggle: Digit Recognizer

링크: https://www.kaggle.com/c/digit-recognizer

## IMPORT

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

## PREP (데이터 전처리), CDNN (모델) 클래스 생성

In [2]:
class PREP:
    def __init__(self, train, batch_size, ratio=None):
        self.train = train
        self.ratio = ratio
        self.batch_size = batch_size
        
    def butcher(self):
        for i in range(0, self.train.shape[0], self.batch_size):
            yield self.train.iloc[i:i+self.batch_size]
            
    def batches(self):
        if self.ratio is None:
            print("You need ratio to use this function")
            return
        label = pd.DataFrame(self.train["label"])
        label = pd.get_dummies(label["label"])
        limit = int(self.train.shape[0]*self.ratio)+1
        inv_limit = int(self.train.shape[0]*(1-self.ratio))+1
        train_data = self.train.drop("label", axis=1).iloc[0:limit]
        train_label = label.iloc[0:limit]
        test_data = self.train.drop("label", axis=1).iloc[limit:]
        test_label = label.iloc[limit:]
        x = (train_data.iloc[i:i+self.batch_size] for i in range(0, train_data.shape[0], self.batch_size))
        y = (train_label.iloc[i:i+self.batch_size] for i in range(0, train_label.shape[0], self.batch_size))
        j = (test_data.iloc[i:i+self.batch_size] for i in range(0, test_data.shape[0], self.batch_size))
        k = (test_label.iloc[i:i+self.batch_size] for i in range(0, test_label.shape[0], self.batch_size))
        nitro = int(limit/self.batch_size)
        nidran = int(inv_limit/self.batch_size)
        return x,y,j,k,nitro,nidran
    
    def for_predict(self):
        try:
            train = self.train.drop("label", axis=1)
        except KeyError:
            train = self.train
        
        neet = int(train.shape[0]/self.batch_size)
        z = (train.iloc[i:i+self.batch_size] for i in range(0, train.shape[0], self.batch_size))
        return z, neet

In [3]:
class CDNN:
    def __init__(self, train_df, batch_size, ratio, epoch, learn_rate, keep, name, test):
        self.tdf = train_df
        self.interval = batch_size
        self.ratio = ratio
        self.epoch = epoch
        self.rate = learn_rate
        self.keep = keep
        self.name = name
        self.test = test
        print("\n{} training has begun..".format(self.name))
        print("batch_size: {}, ratio: {}, epoch: {}".format(self.interval, self.ratio, self.epoch))
        print("learn_rate: {}, keep: {}".format(self.rate, self.keep))
        
    def flow(self):
        
        tf.reset_default_graph()
        print("Tensor graph has been reset")

        X = tf.placeholder(shape=[None,784], dtype=tf.float32)
        Y = tf.placeholder(shape=[None,10], dtype=tf.float32)
        keep = tf.placeholder(dtype=tf.float32)
        X_img = tf.reshape(X, shape=[-1,28,28,1])
        
        #########################################
        L1 = tf.layers.conv2d(inputs=X_img, filters=32, kernel_size=[3,3], 
                              padding="SAME", strides=1, activation=tf.nn.relu)
        L1 = tf.layers.max_pooling2d(inputs=L1, pool_size=[2,2], 
                                     padding="SAME", strides=2)
        L1 = tf.layers.dropout(inputs=L1, rate=0.3)
        
        
        L2 = tf.layers.conv2d(inputs=L1, filters=64, kernel_size=[3,3], 
                              padding="SAME", strides=1, activation=tf.nn.relu)
        L2 = tf.layers.max_pooling2d(inputs=L2, pool_size=[2,2], 
                                     padding="SAME", strides=2)
        L2 = tf.layers.dropout(inputs=L2, rate=0.3)
        
        
        L3 = tf.layers.conv2d(inputs=L2, filters=128, kernel_size=[3,3], 
                              padding="SAME", strides=1, activation=tf.nn.relu)
        L3 = tf.layers.max_pooling2d(inputs=L3, pool_size=[2,2], 
                                     padding="SAME", strides=2)
        L3 = tf.layers.dropout(inputs=L3, rate=0.3)
                                
        L3 = tf.reshape(L3, shape=[-1,4*4*128])
        
        #########################################
        
        dense1 = tf.layers.dense(inputs=L3, 
                                 units=128, 
                                 activation=tf.nn.relu)
        dense1 = tf.layers.dropout(inputs=dense1, 
                                   rate=keep)

        dense2 = tf.layers.dense(inputs=dense1, units=256, activation=tf.nn.relu)
        dense2 = tf.layers.dropout(inputs=dense2, rate=keep)

        dense3 = tf.layers.dense(inputs=dense2, units=128, activation=tf.nn.relu)
        dense3 = tf.layers.dropout(inputs=dense3, rate=keep)

        dense4 = tf.layers.dense(inputs=dense3, units=512, activation=tf.nn.relu)
        dense4 = tf.layers.dropout(inputs=dense4, rate=keep)

        dense5 = tf.layers.dense(inputs=dense4, units=1024, activation=tf.nn.relu)
        dense5 = tf.layers.dropout(inputs=dense5, rate=keep)
        
        #########################################
        
        H = tf.layers.dense(inputs=dense5, units=10)
        cost = tf.losses.softmax_cross_entropy(Y, H)
        trainer = tf.train.AdamOptimizer(learning_rate=self.rate).minimize(cost)
        
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        
        tensors = sess, trainer, cost, X, Y, keep, H
        
        return tensors
        
    def learn(self):
        sess, trainer, cost, X, Y, keep, H = self.flow()
        print("Engaging the flow")
        
        predict = tf.argmax(H,1)
        correct = tf.equal(predict, tf.argmax(Y,1))
        accuracy = tf.reduce_sum(tf.cast(correct,dtype=tf.float32))
        
        for step in range(self.epoch):
            x,y,j,k,nitro,nidran = PREP(self.tdf, self.interval, self.ratio).batches()
            for n in range(nitro):
                _, cost_val = sess.run([trainer, cost], feed_dict={X:next(x), Y:next(y), keep:self.keep})
            result_sum = 0;
            for n in range(nidran):
                correct_num = sess.run(accuracy, feed_dict={X:next(j), Y:next(k), keep:1})
                result_sum += correct_num
            percy = result_sum/(self.tdf.shape[0]*(1-self.ratio))
            if step%2==0:
                print("cost: {}".format(cost_val))
                print("정확도: {}".format(percy))
        
        print("Training Complete!")
        
        enter = []
        z,neet = PREP(self.tdf, self.interval).for_predict()
        for n in range(neet):
            result = sess.run(H, feed_dict={X:next(z), keep:1})
            for r in result:
                enter.append(r)
        pd.DataFrame(enter).to_csv("./models/{}_test.csv".format(self.name), sep=",", index=False)
        print("{}_test.csv has been saved.".format(self.name))
        
        enter = []
        z,neet = PREP(self.test, self.interval).for_predict()
        for n in range(neet):
            result = sess.run(H, feed_dict={X:next(z), keep:1})
            for r in result:
                enter.append(r)
        pd.DataFrame(enter).to_csv("./models/{}_prediction.csv".format(self.name), sep=",", index=False)
        print("{}_prediction.csv has been saved.".format(self.name))

        return sess, H, X, Y, keep, self.name
        


## 클래스(PREP, CDNN) 구현

In [4]:
train_file = pd.read_csv("./data/digits/train.csv")
train = pd.DataFrame(train_file)

test_file = pd.read_csv("./data/digits/test.csv")
test_df = pd.DataFrame(test_file)

test_label = pd.DataFrame(train["label"])
test_label = test_label.values.reshape(-1)

In [5]:
train.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
%%time

test_summed = pd.DataFrame()
for n in range(25):
    model = CDNN(train, 100, 7/10, 25, 0.0001, 0.5,"model_{}".format(n), test_df).learn()
    if n == 0:
        test_summed = test_summed.append(pd.read_csv("./models/model_{}_test.csv".format(n)))
    else:
        test_summed = test_summed + pd.read_csv("./models/model_{}_test.csv".format(n))
    
print("\nTrained all models successfully.")
print("now commencing ensemble:")
with tf.Session() as test_sess:
    test_maxed = tf.argmax(test_summed, axis=1)
    correct = tf.equal(test_label, test_maxed)
    predict = tf.reduce_sum(tf.cast(correct, dtype=tf.float32))
    accuracy = (test_sess.run(predict)/test_label.shape[0])
    print("ensemble 정확도: {}".format(accuracy))


model_0 training has begun..
batch_size: 100, ratio: 0.7, epoch: 25
learn_rate: 0.0001, keep: 0.5
Tensor graph has been reset
Engaging the flow
cost: 0.11166945099830627
정확도: 0.9578571428571427
cost: 0.026284752413630486
정확도: 0.9757936507936507
cost: 0.0015420368872582912
정확도: 0.9793650793650792
cost: 0.0013552221935242414
정확도: 0.9787301587301586
cost: 0.00010014755389420316
정확도: 0.9846031746031745
cost: 5.7839861256070435e-05
정확도: 0.9857936507936507
cost: 0.00020432980090845376
정확도: 0.9857142857142855
cost: 0.00046699921949766576
정확도: 0.9798412698412697
cost: 7.861454832891468e-06
정확도: 0.9842857142857141
cost: 3.38785866915714e-06
정확도: 0.9835714285714284
cost: 1.1460485438874457e-05
정확도: 0.9877777777777776
cost: 2.217196197307203e-06
정확도: 0.9849999999999999
cost: 3.2647469197399914e-05
정확도: 0.9869841269841269
Training Complete!
model_0_test.csv has been saved.
model_0_prediction.csv has been saved.

model_1 training has begun..
batch_size: 100, ratio: 0.7, epoch: 25
learn_rate: 0.000

Engaging the flow
cost: 0.08907097578048706
정확도: 0.9642857142857142
cost: 0.008750050328671932
정확도: 0.977063492063492
cost: 0.03176242113113403
정확도: 0.9795238095238094
cost: 0.004876079969108105
정확도: 0.9811904761904761
cost: 0.0021812228951603174
정확도: 0.9828571428571428
cost: 0.00012935916311107576
정확도: 0.9828571428571428
cost: 0.0007257473189383745
정확도: 0.9715079365079363
cost: 0.00048104007146321237
정확도: 0.982142857142857
cost: 0.00010993843898177147
정확도: 0.9867460317460316
cost: 6.798483082093298e-05
정확도: 0.984126984126984
cost: 0.010644487105309963
정확도: 0.9797619047619046
cost: 0.000631948874797672
정확도: 0.9832539682539682
cost: 0.0001453511940781027
정확도: 0.9873809523809522
Training Complete!
model_9_test.csv has been saved.
model_9_prediction.csv has been saved.

model_10 training has begun..
batch_size: 100, ratio: 0.7, epoch: 25
learn_rate: 0.0001, keep: 0.5
Tensor graph has been reset
Engaging the flow
cost: 0.09359478950500488
정확도: 0.9512698412698412
cost: 0.007195981219410896


cost: 0.08083142340183258
정확도: 0.9557936507936506
cost: 0.02150394208729267
정확도: 0.9722222222222221
cost: 0.01217598281800747
정확도: 0.9714285714285713
cost: 0.005966701079159975
정확도: 0.9813492063492062
cost: 0.0006060560117475688
정확도: 0.9797619047619046
cost: 0.0014029279118403792
정확도: 0.9826190476190475
cost: 0.00039031653432175517
정확도: 0.9826984126984125
cost: 0.0003803115396294743
정확도: 0.9874603174603174
cost: 0.026122868061065674
정확도: 0.9842857142857141
cost: 0.0018396642990410328
정확도: 0.9852380952380951
cost: 0.00010863409261219203
정확도: 0.9853174603174601
cost: 3.465189001872204e-05
정확도: 0.9864285714285713
cost: 1.1492473277030513e-05
정확도: 0.9876190476190475
Training Complete!
model_18_test.csv has been saved.
model_18_prediction.csv has been saved.

model_19 training has begun..
batch_size: 100, ratio: 0.7, epoch: 25
learn_rate: 0.0001, keep: 0.5
Tensor graph has been reset
Engaging the flow
cost: 0.06644792854785919
정확도: 0.9576984126984126
cost: 0.0044594695791602135
정확도: 0.97809

## 예측 및 예측값 저장

In [10]:
extract_0 = pd.read_csv("./models/model_0_prediction.csv")

for n in range(1,25):
    amalg = extract_0.add(pd.read_csv("./models/model_{}_prediction.csv".format(n)))

final = amalg.idxmax(axis=1)
display(amalg.head(), final.head())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-7.645569,-3.583387,55.473415,-0.341673,-2.585356,-14.679031,-10.980712,-5.670987,-9.255359,-13.532999
1,46.852884,-10.677723,-9.045976,-14.972256,-9.082279,-0.427008,-3.804859,2.46217,-9.69608,1.312167
2,-11.987805,-7.457409,-1.028674,-9.333082,6.820988,-7.527546,-17.844129,-4.645116,9.079844,40.325674
3,17.480016,-11.841895,-2.144212,-3.045258,-3.328139,-5.07356,-7.973568,-2.113376,-2.552117,17.727955
4,-15.433798,-3.114662,8.223172,35.171421,-9.490192,-2.747416,-13.545908,-7.985195,10.109681,-1.298504


0    2
1    0
2    9
3    9
4    3
dtype: object

In [11]:
submission = pd.DataFrame([], columns=["ImageId","Label"])
submission["Label"] = final
imgId = pd.DataFrame(list(range(1,28001)))
submission["ImageId"] = imgId
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28000 entries, 0 to 27999
Data columns (total 2 columns):
ImageId    28000 non-null int64
Label      28000 non-null object
dtypes: int64(1), object(1)
memory usage: 437.6+ KB


In [12]:
submission.head()

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,3


In [13]:
submission.to_csv("./data/digits_pred_ens_10_20mdl_25epc.csv", sep=',', index=False)