In [1]:
import tensorflow as tf
import datetime
import keras
from keras import layers, Input
from keras.models import Model
import numpy as np
import pandas as pd
from keras import optimizers
#from tensorflow.keras import datasets, layers, models, optimizers
%load_ext tensorboard



Using TensorFlow backend.


In [2]:
from keras_radam import RAdam
from tensorflow.python.keras.optimizer_v2.optimizer_v2 import OptimizerV2
from tensorflow.python import ops, math_ops, state_ops, control_flow_ops
from tensorflow.python.keras import backend as K

__all__ = ['RAdam']


class RAdam(OptimizerV2):
    """RAdam optimizer.
    According to the paper
    [On The Variance Of The Adaptive Learning Rate And Beyond](https://arxiv.org/pdf/1908.03265v1.pdf).
    """

    def __init__(self,
                 learning_rate=0.001,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-7,
                 weight_decay=0.,
                 amsgrad=False,
                 total_steps=0,
                 warmup_proportion=0.1,
                 min_lr=0.,
                 name='RAdam',
                 **kwargs):
        r"""Construct a new Adam optimizer.
        Args:
            learning_rate: A Tensor or a floating point value.    The learning rate.
            beta_1: A float value or a constant float tensor. The exponential decay
                rate for the 1st moment estimates.
            beta_2: A float value or a constant float tensor. The exponential decay
                rate for the 2nd moment estimates.
            epsilon: A small constant for numerical stability. This epsilon is
                "epsilon hat" in the Kingma and Ba paper (in the formula just before
                Section 2.1), not the epsilon in Algorithm 1 of the paper.
            weight_decay: A floating point value. Weight decay for each param.
            amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm from
                the paper "On the Convergence of Adam and beyond".
            total_steps: An integer. Total number of training steps.
                Enable warmup by setting a positive value.
            warmup_proportion: A floating point value. The proportion of increasing steps.
            min_lr: A floating point value. Minimum learning rate after warmup.
            name: Optional name for the operations created when applying gradients.
                Defaults to "Adam".    @compatibility(eager) When eager execution is
                enabled, `learning_rate`, `beta_1`, `beta_2`, and `epsilon` can each be
                a callable that takes no arguments and returns the actual value to use.
                This can be useful for changing these values across different
                invocations of optimizer functions. @end_compatibility
            **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
                `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
                gradients by value, `decay` is included for backward compatibility to
                allow time inverse decay of learning rate. `lr` is included for backward
                compatibility, recommended to use `learning_rate` instead.
        """

        super(RAdam, self).__init__(name, **kwargs)
        self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
        self._set_hyper('beta_1', beta_1)
        self._set_hyper('beta_2', beta_2)
        self._set_hyper('decay', self._initial_decay)
        self._set_hyper('weight_decay', weight_decay)
        self._set_hyper('total_steps', float(total_steps))
        self._set_hyper('warmup_proportion', warmup_proportion)
        self._set_hyper('min_lr', min_lr)
        self.epsilon = epsilon or K.epsilon()
        self.amsgrad = amsgrad
        self._initial_weight_decay = weight_decay
        self._initial_total_steps = total_steps

    def _create_slots(self, var_list):
        for var in var_list:
            self.add_slot(var, 'm')
        for var in var_list:
            self.add_slot(var, 'v')
        if self.amsgrad:
            for var in var_list:
                self.add_slot(var, 'vhat')

    def set_weights(self, weights):
        params = self.weights
        num_vars = int((len(params) - 1) / 2)
        if len(weights) == 3 * num_vars + 1:
            weights = weights[:len(params)]
        super(RAdam, self).set_weights(weights)

    def _resource_apply_dense(self, grad, var):
        var_dtype = var.dtype.base_dtype
        lr_t = self._decayed_lr(var_dtype)
        m = self.get_slot(var, 'm')
        v = self.get_slot(var, 'v')
        beta_1_t = self._get_hyper('beta_1', var_dtype)
        beta_2_t = self._get_hyper('beta_2', var_dtype)
        epsilon_t = ops.convert_to_tensor(self.epsilon, var_dtype)
        local_step = math_ops.cast(self.iterations + 1, var_dtype)
        beta_1_power = math_ops.pow(beta_1_t, local_step)
        beta_2_power = math_ops.pow(beta_2_t, local_step)

        if self._initial_total_steps > 0:
            total_steps = self._get_hyper('total_steps', var_dtype)
            warmup_steps = total_steps * self._get_hyper('warmup_proportion', var_dtype)
            min_lr = self._get_hyper('min_lr', var_dtype)
            decay_steps = K.maximum(total_steps - warmup_steps, 1)
            decay_rate = (min_lr - lr_t) / decay_steps
            lr_t = tf.where(
                local_step <= warmup_steps,
                lr_t * (local_step / warmup_steps),
                lr_t + decay_rate * K.minimum(local_step - warmup_steps, decay_steps),
            )

        sma_inf = 2.0 / (1.0 - beta_2_t) - 1.0
        sma_t = sma_inf - 2.0 * local_step * beta_2_power / (1.0 - beta_2_power)

        m_t = state_ops.assign(m,
                               beta_1_t * m + (1.0 - beta_1_t) * grad,
                               use_locking=self._use_locking)
        m_corr_t = m_t / (1.0 - beta_1_power)

        v_t = state_ops.assign(v,
                               beta_2_t * v + (1.0 - beta_2_t) * math_ops.square(grad),
                               use_locking=self._use_locking)
        if self.amsgrad:
            vhat = self.get_slot(var, 'vhat')
            vhat_t = state_ops.assign(vhat,
                                      math_ops.maximum(vhat, v_t),
                                      use_locking=self._use_locking)
            v_corr_t = math_ops.sqrt(vhat_t / (1.0 - beta_2_power))
        else:
            vhat_t = None
            v_corr_t = math_ops.sqrt(v_t / (1.0 - beta_2_power))

        r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) *
                            (sma_t - 2.0) / (sma_inf - 2.0) *
                            sma_inf / sma_t)

        var_t = tf.where(sma_t >= 5.0, r_t * m_corr_t / (v_corr_t + epsilon_t), m_corr_t)

        if self._initial_weight_decay > 0.0:
            var_t += self._get_hyper('weight_decay', var_dtype) * var

        var_update = state_ops.assign_sub(var,
                                          lr_t * var_t,
                                          use_locking=self._use_locking)

        updates = [var_update, m_t, v_t]
        if self.amsgrad:
            updates.append(vhat_t)
        return control_flow_ops.group(*updates)

    def _resource_apply_sparse(self, grad, var, indices):
        var_dtype = var.dtype.base_dtype
        lr_t = self._decayed_lr(var_dtype)
        beta_1_t = self._get_hyper('beta_1', var_dtype)
        beta_2_t = self._get_hyper('beta_2', var_dtype)
        epsilon_t = ops.convert_to_tensor(self.epsilon, var_dtype)
        local_step = math_ops.cast(self.iterations + 1, var_dtype)
        beta_1_power = math_ops.pow(beta_1_t, local_step)
        beta_2_power = math_ops.pow(beta_2_t, local_step)

        if self._initial_total_steps > 0:
            total_steps = self._get_hyper('total_steps', var_dtype)
            warmup_steps = total_steps * self._get_hyper('warmup_proportion', var_dtype)
            min_lr = self._get_hyper('min_lr', var_dtype)
            decay_steps = K.maximum(total_steps - warmup_steps, 1)
            decay_rate = (min_lr - lr_t) / decay_steps
            lr_t = tf.where(
                local_step <= warmup_steps,
                lr_t * (local_step / warmup_steps),
                lr_t + decay_rate * K.minimum(local_step - warmup_steps, decay_steps),
            )

        sma_inf = 2.0 / (1.0 - beta_2_t) - 1.0
        sma_t = sma_inf - 2.0 * local_step * beta_2_power / (1.0 - beta_2_power)

        m = self.get_slot(var, 'm')
        m_scaled_g_values = grad * (1 - beta_1_t)
        m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking)
        with ops.control_dependencies([m_t]):
            m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)
        m_corr_t = m_t / (1.0 - beta_1_power)

        v = self.get_slot(var, 'v')
        v_scaled_g_values = (grad * grad) * (1 - beta_2_t)
        v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)
        with ops.control_dependencies([v_t]):
            v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)

        if self.amsgrad:
            vhat = self.get_slot(var, 'vhat')
            vhat_t = state_ops.assign(vhat,
                                      math_ops.maximum(vhat, v_t),
                                      use_locking=self._use_locking)
            v_corr_t = math_ops.sqrt(vhat_t / (1.0 - beta_2_power))
        else:
            vhat_t = None
            v_corr_t = math_ops.sqrt(v_t / (1.0 - beta_2_power))

        r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) *
                            (sma_t - 2.0) / (sma_inf - 2.0) *
                            sma_inf / sma_t)

        var_t = tf.where(sma_t >= 5.0, r_t * m_corr_t / (v_corr_t + epsilon_t), m_corr_t)

        if self._initial_weight_decay > 0.0:
            var_t += self._get_hyper('weight_decay', var_dtype) * var

        var_update = self._resource_scatter_add(var, indices, tf.gather(-lr_t * var_t, indices))

        updates = [var_update, m_t, v_t]
        if self.amsgrad:
            updates.append(vhat_t)
        return control_flow_ops.group(*updates)

    def get_config(self):
        config = super(RAdam, self).get_config()
        config.update({
            'learning_rate': self._serialize_hyperparameter('learning_rate'),
            'beta_1': self._serialize_hyperparameter('beta_1'),
            'beta_2': self._serialize_hyperparameter('beta_2'),
            'decay': self._serialize_hyperparameter('decay'),
            'weight_decay': self._serialize_hyperparameter('weight_decay'),
            'epsilon': self.epsilon,
            'amsgrad': self.amsgrad,
            'total_steps': self._serialize_hyperparameter('total_steps'),
            'warmup_proportion': self._serialize_hyperparameter('warmup_proportion'),
            'min_lr': self._serialize_hyperparameter('min_lr'),
        })
        return config

In [3]:
train = pd.read_csv('data/train.csv').set_index('id')
test = pd.read_csv('data/test.csv').set_index('id')

In [4]:
train

Unnamed: 0_level_0,digit,letter,0,1,2,3,4,5,6,7,...,774,775,776,777,778,779,780,781,782,783
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,L,1,1,1,4,3,0,0,4,...,2,1,0,1,2,4,4,4,3,4
2,0,B,0,4,0,0,4,1,1,1,...,0,3,0,1,4,1,4,2,1,2
3,4,L,1,1,2,2,1,1,1,0,...,3,3,3,0,2,0,3,0,2,2
4,9,D,1,2,0,2,0,4,0,3,...,3,3,2,0,1,4,0,0,1,1
5,6,A,3,0,2,4,0,3,0,4,...,4,4,3,2,1,3,4,3,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2044,6,V,2,4,3,4,2,4,4,1,...,0,2,2,0,0,1,3,1,4,0
2045,1,L,3,2,2,1,1,4,0,1,...,2,3,4,2,1,2,3,4,1,1
2046,9,A,4,0,4,0,2,4,4,4,...,2,3,1,1,3,4,2,2,0,0
2047,0,Z,2,3,3,0,3,0,4,3,...,2,3,1,1,0,4,1,4,3,1


In [5]:
aa = train.copy()
aa

Unnamed: 0_level_0,digit,letter,0,1,2,3,4,5,6,7,...,774,775,776,777,778,779,780,781,782,783
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,L,1,1,1,4,3,0,0,4,...,2,1,0,1,2,4,4,4,3,4
2,0,B,0,4,0,0,4,1,1,1,...,0,3,0,1,4,1,4,2,1,2
3,4,L,1,1,2,2,1,1,1,0,...,3,3,3,0,2,0,3,0,2,2
4,9,D,1,2,0,2,0,4,0,3,...,3,3,2,0,1,4,0,0,1,1
5,6,A,3,0,2,4,0,3,0,4,...,4,4,3,2,1,3,4,3,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2044,6,V,2,4,3,4,2,4,4,1,...,0,2,2,0,0,1,3,1,4,0
2045,1,L,3,2,2,1,1,4,0,1,...,2,3,4,2,1,2,3,4,1,1
2046,9,A,4,0,4,0,2,4,4,4,...,2,3,1,1,3,4,2,2,0,0
2047,0,Z,2,3,3,0,3,0,4,3,...,2,3,1,1,0,4,1,4,3,1


In [6]:
letter = aa['letter']
test_letter = test['letter']
letter = letter.to_frame()
test_letter = test_letter.to_frame()

In [7]:
ytrain = aa['digit']
ytrain = ytrain.to_frame()
ytrain

Unnamed: 0_level_0,digit
id,Unnamed: 1_level_1
1,5
2,0
3,4
4,9
5,6
...,...
2044,6
2045,1
2046,9
2047,0


In [8]:
xtrain = aa.drop(['digit', 'letter'], axis = 1)
test = test.drop(['letter'], axis = 1)

In [9]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()

onehot_ytrain = encoder.fit_transform(ytrain)
onehot_letter = encoder.fit_transform(letter)
onehot_test_letter = encoder.fit_transform(test_letter)

onehot_ytrain = onehot_ytrain.toarray()
onehot_letter = onehot_letter.toarray()
onehot_test_letter = onehot_test_letter.toarray()

print(onehot_ytrain)
print(onehot_letter)
print(onehot_test_letter)

[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [10]:
# digit  디코딩
decoding = np.argmax(onehot_ytrain, axis=1).reshape(-1,1)
decoding = pd.DataFrame(decoding, columns = ['digit'])
decoding

Unnamed: 0,digit
0,5
1,0
2,4
3,9
4,6
...,...
2043,6
2044,1
2045,9
2046,0


In [11]:
# xtrain /= 255.0
# test /= 255.0

xtrain = np.array(xtrain)
test = np.array(test)

print(xtrain.shape)
print(test.shape)

(2048, 784)
(20480, 784)


In [12]:
print(xtrain.shape)
print(onehot_ytrain.shape)
print(onehot_letter.shape)

(2048, 784)
(2048, 10)
(2048, 26)


In [13]:
xtrain = xtrain.reshape((2048, 28, 28, 1))
test = test.reshape((20480, 28, 28, 1))
onehot_letter = onehot_letter.reshape((2048, 1, 26))
onehot_test_letter = onehot_test_letter.reshape((20480, 1, 26))

xtrain = xtrain.astype('float32')
test = test.astype('float32')
onehot_letter = onehot_letter.astype('float32')
onehot_test_letter = onehot_test_letter.astype('float32')

In [14]:
print(xtrain.shape)
print(test.shape)
print(onehot_letter.shape)
print(onehot_test_letter.shape)

(2048, 28, 28, 1)
(20480, 28, 28, 1)
(2048, 1, 26)
(20480, 1, 26)


In [15]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

datagen = ImageDataGenerator(
    rescale = 1./255.0,
    width_shift_range = 5,
    height_shift_range = 5,
    rotation_range = 10,
    zoom_range = 0.05
)
datagen.fit(xtrain)

In [19]:
image_input = Input(shape = (28, 28, 1), dtype = 'float32', name = 'image')
letter_input = Input(shape = (1, 26), dtype = 'float32', name = 'letter')

In [20]:

# cnn = layers.BatchNormalization()(image_input)
cnn = layers.ZeroPadding2D((1, 1))(image_input)
cnn = layers.Conv2D(128, (3, 3), activation = 'relu', padding = 'same')(cnn)
cnn = layers.ZeroPadding2D((1, 1))(cnn)
# cnn = layers.BatchNormalization()(cnn)
cnn = layers.Conv2D(128, (3, 3), activation = 'relu', padding = 'same')(cnn)
cnn = layers.MaxPooling2D(2, 2)(cnn)
# cnn = layers.BatchNormalization()(cnn)
cnn = layers.ZeroPadding2D((1, 1))(cnn)
cnn = layers.Conv2D(256, (3, 3), activation = 'relu', padding = 'same')(cnn)
cnn = layers.ZeroPadding2D((1, 1))(cnn)
# cnn = layers.BatchNormalization()(cnn)
cnn = layers.Conv2D(256, (3, 3), activation = 'relu', padding = 'same')(cnn)
cnn = layers.MaxPooling2D(2, 2)(cnn)
cnn = layers.Dropout(0.2)(cnn)
cnn = layers.Flatten()(cnn)


dnn = layers.LSTM(128, activation = 'relu', return_sequences = True)(letter_input)
dnn = layers.LSTM(256, activation = 'relu', return_sequences = True)(dnn)
dnn = layers.LSTM(256, activation = 'relu')(dnn)

concatenated = layers.concatenate([cnn, dnn])

#result = layers.BatchNormalization()(concatenated)
result = layers.Dense(256, activation = 'relu')(concatenated)
result = layers.Dropout(0.5)(result)
result = layers.Dense(10, activation = 'softmax')(result)

model = Model([image_input, letter_input], result)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
image (InputLayer)              [(None, 28, 28, 1)]  0                                            
__________________________________________________________________________________________________
zero_padding2d_1 (ZeroPadding2D (None, 30, 30, 1)    0           image[0][0]                      
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 30, 30, 128)  1280        zero_padding2d_1[0][0]           
__________________________________________________________________________________________________
zero_padding2d_2 (ZeroPadding2D (None, 32, 32, 128)  0           conv2d[0][0]                     
______________________________________________________________________________________________

In [21]:
# OPTIMIZER = tf.keras.optimizers.Adamax(0.1)
# OPTIMIZER = tf.keras.optimizers.RMSprop(0.01)
# OPTIMIZER = tf.keras.optimizers.SGD(0.1)
# OPTIMIZER = tf.keras.optimizers.Adagrad(0.01)
# OPTIMIZER = tf.keras.optimizers.Adam(0.001)
# OPTIMIZER = RAdam(learning_rate = 0.001, warmup_proportion = 0.1)

In [22]:
EarlyStopping = tf.keras.callbacks.EarlyStopping(monitor = 'val_accuracy', patience = 10)
ModelCheckpoint = tf.keras.callbacks.ModelCheckpoint(filepath = 'best_model.h5', monitor = 'val_loss', save_weights_only = True, save_best_only = True)
ReduceLROnPlateau = tf.keras.callbacks.ReduceLROnPlateau(monitor = 'val_loss', patience = 5, factor = 0.1)

In [23]:
model.compile(RAdam(),
                loss = 'categorical_crossentropy',
                metrics = ["accuracy"])

In [28]:
model.fit_generator(
    datagen.flow({'image' : xtrain, 'letter' : onehot_letter}, onehot_ytrain, batch_size = 32),
    epochs = 150,
    steps_per_epoch = 100,
    verbose = 1,
)

ValueError: `x` (images tensor) and `y` (labels) should have the same length. Found: x.shape = (), y.shape = (2048, 10)

In [22]:
model.fit(
    {'image' : xtrain, 'letter' : onehot_letter}, onehot_ytrain,
    batch_size = 16,
    epochs = 150,
    verbose = 1,
    validation_split = 0.10,
    callbacks = [EarlyStopping, ModelCheckpoint, ReduceLROnPlateau]
)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150


<tensorflow.python.keras.callbacks.History at 0x205884d5a08>

[Tensorboard Link](http://localhost:6006)  
71/71 [==============================] - 13s 181ms/step - loss: 0.0737 - accuracy: 0.9756 - val_loss: 0.5013 - val_accuracy: 0.8244  85%

Epoch 11/150
58/58 [==============================] - 18s 316ms/step - loss: 0.2306 - accuracy: 0.9143 - val_loss: 0.4259 - val_accuracy: 0.8585 - lr: 0.0010 87%

In [23]:
y_pred = model.predict({'image' : test, 'letter' : onehot_test_letter}, batch_size=32)
# digit  디코딩
decoding = np.argmax(y_pred, axis=1).reshape(-1,1)
decoding = pd.DataFrame(decoding, columns = ['digit'])
decoding

Unnamed: 0,digit
0,6
1,9
2,8
3,0
4,3
...,...
20475,4
20476,1
20477,6
20478,8


In [24]:
sorted(decoding['digit'].unique())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [25]:
model.load_weights('best_model.h5')
y_pred = model.predict({'image' : test, 'letter' : onehot_test_letter}, batch_size=32)
# digit  디코딩
decoding = np.argmax(y_pred, axis=1).reshape(-1,1)
decoding = pd.DataFrame(decoding, columns = ['digit'])
decoding

Unnamed: 0,digit
0,6
1,9
2,8
3,0
4,3
...,...
20475,4
20476,1
20477,6
20478,1


In [26]:
sub = pd.read_csv('data/submission.csv')
sub = sub.drop(['digit'], axis = 1)
sub = pd.concat([sub, decoding], axis = 1)
sub.to_csv('sub.csv', index = False, encoding = 'utf-8')
sub

Unnamed: 0,id,digit
0,2049,6
1,2050,9
2,2051,8
3,2052,0
4,2053,3
...,...,...
20475,22524,4
20476,22525,1
20477,22526,6
20478,22527,1
