## Data Load

In [90]:
import pandas as pd
import numpy as np

In [91]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [92]:
train.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 785 entries, label to pixel783
dtypes: int64(785)
memory usage: 251.5 MB


In [94]:
x = train.drop("label", axis = 1)
y = train["label"]

In [95]:
def preprocess(x, y):
    digits = []
    labels = []
    for i in range(0,len(x)):
        # TAKING ONE DIGIT OUT OF THE DATAFRAME
        digit = np.array(x.iloc[i:i+1,:])
        # RESHAPING TO MAKE AN IMAGE OF 28 X 28 PIXELS
        digit = digit.reshape(28,28)
        # EXTRACTING THE LABELS 
        label = y[i]
        
        digits.append(digit)
        labels.append(label)

    digits = np.array(digits)
    labels = np.array(labels)
    return digits,labels

In [96]:
digits,labels = preprocess(x, y)

In [97]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

In [98]:
# CNN 모델 정의
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(10, activation='softmax'))

In [100]:
# 모델 컴파일 및 학습
model.compile(loss="sparse_categorical_crossentropy",
        optimizer="nadam",
        metrics=["accuracy"])
model.fit(digits, labels, batch_size=32, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x30b5360d0>

In [101]:
ypreds = model.predict(digits[:20000])
# TAKING MAXIMUM PROBABILITIES TO MAKE OUR PREDICTIONS
preds=[]
for i in range(0,len(ypreds)):
    preds.append(np.argmax(ypreds[i]))

# MAKING A DATAFRAME TO ACCES TRUE LABELS AND PREDICTED LABELS WITH EASE
df = pd.DataFrame({"TRUE":labels[:20000],"PREDS":preds})
df



Unnamed: 0,TRUE,PREDS
0,1,1
1,0,0
2,1,1
3,4,4
4,0,0
...,...,...
19995,9,9
19996,9,9
19997,6,6
19998,8,8


In [102]:
from sklearn.metrics import classification_report
rep = pd.DataFrame(classification_report(df["TRUE"],df["PREDS"],output_dict=True))
rep

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,accuracy,macro avg,weighted avg
precision,0.998964,0.994278,0.995035,0.988124,0.995937,0.999441,0.995468,0.998045,0.993243,0.998473,0.9956,0.995701,0.995617
recall,0.99793,0.996911,0.997015,1.0,0.994926,0.996656,0.997477,0.986473,0.997911,0.990909,0.9956,0.995621,0.9956
f1-score,0.998446,0.995593,0.996024,0.994026,0.995431,0.998046,0.996472,0.992225,0.995572,0.994677,0.9956,0.995651,0.995599
support,1932.0,2266.0,2010.0,2080.0,1971.0,1794.0,1982.0,2070.0,1915.0,1980.0,0.9956,20000.0,20000.0


In [103]:
def preprocess_test(df):
    digits = []
    for i in range(0,len(df)):
        digit = np.array(df.iloc[i:i+1,:])
        digit = digit.reshape(28,28)
        digits.append(digit)

    digits = np.array(digits)
    return digits

test_digits = preprocess_test(test)

In [104]:
pred = model.predict(test_digits)
test_preds=[]
for i in range(0,len(pred)):
    test_preds.append(np.argmax(pred[i]))



In [105]:
submission = pd.read_csv('sample_submission.csv')
submission['Label'] =test_preds
submission.head()

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,3


In [106]:
submission.to_csv("submission.csv", index = False)