In [1]:
import tensorflow as tf
import datetime
import keras
import numpy as np
import pandas as pd
from keras import layers, Input
from keras.models import Model, Input
from tensorflow.keras import datasets, layers, models, optimizers
from keras.layers import Conv2D, SeparableConv2D, Dense, MaxPooling2D, GlobalAveragePooling2D
from keras.layers import Activation, BatchNormalization, Dropout
from keras.layers import Add
from keras.regularizers import l2
from keras.optimizers import SGD

import keras.backend as K
%load_ext tensorboard



Using TensorFlow backend.


In [29]:
# Display the installed Keras version string as this cell's output.
keras.__version__

'2.4.2'

In [2]:
from keras_radam import RAdam
from tensorflow.python.keras.optimizer_v2.optimizer_v2 import OptimizerV2
from tensorflow.python import ops, math_ops, state_ops, control_flow_ops
from tensorflow.python.keras import backend as K

# Public names of this cell: only the RAdam optimizer class defined below.
__all__ = ['RAdam']


class RAdam(OptimizerV2):
    """RAdam (Rectified Adam) optimizer implemented as an `OptimizerV2` subclass.

    According to the paper
    [On The Variance Of The Adaptive Learning Rate And Beyond](https://arxiv.org/pdf/1908.03265v1.pdf).

    Besides the Adam-style moment estimates, this implementation supports:
      * an optional AMSGrad variant (extra `vhat` slot per variable),
      * an optional weight-decay term added to the update,
      * an optional linear warmup followed by a linear move towards `min_lr`,
        enabled when `total_steps` is positive.
    """

    def __init__(self,
                 learning_rate=0.001,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-7,
                 weight_decay=0.,
                 amsgrad=False,
                 total_steps=0,
                 warmup_proportion=0.1,
                 min_lr=0.,
                 name='RAdam',
                 **kwargs):
        r"""Construct a new RAdam optimizer.
        Args:
            learning_rate: A Tensor or a floating point value. The learning rate.
            beta_1: A float value or a constant float tensor. The exponential decay
                rate for the 1st moment estimates.
            beta_2: A float value or a constant float tensor. The exponential decay
                rate for the 2nd moment estimates.
            epsilon: A small constant for numerical stability. This epsilon is
                "epsilon hat" in the Kingma and Ba paper (in the formula just before
                Section 2.1), not the epsilon in Algorithm 1 of the paper.
                A falsy value (e.g. 0 or None) falls back to `K.epsilon()`.
            weight_decay: A floating point value. Weight decay for each param.
            amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm from
                the paper "On the Convergence of Adam and beyond".
            total_steps: An integer. Total number of training steps.
                Enable warmup by setting a positive value.
            warmup_proportion: A floating point value. The proportion of increasing steps.
            min_lr: A floating point value. Minimum learning rate after warmup.
            name: Optional name for the operations created when applying gradients.
                Defaults to "RAdam". @compatibility(eager) When eager execution is
                enabled, `learning_rate`, `beta_1`, `beta_2`, and `epsilon` can each be
                a callable that takes no arguments and returns the actual value to use.
                This can be useful for changing these values across different
                invocations of optimizer functions. @end_compatibility
            **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
                `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
                gradients by value, `decay` is included for backward compatibility to
                allow time inverse decay of learning rate. `lr` is included for backward
                compatibility, recommended to use `learning_rate` instead.
        """

        super(RAdam, self).__init__(name, **kwargs)
        # A legacy `lr` keyword, when given, takes precedence over `learning_rate`.
        self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
        self._set_hyper('beta_1', beta_1)
        self._set_hyper('beta_2', beta_2)
        self._set_hyper('decay', self._initial_decay)
        self._set_hyper('weight_decay', weight_decay)
        self._set_hyper('total_steps', float(total_steps))
        self._set_hyper('warmup_proportion', warmup_proportion)
        self._set_hyper('min_lr', min_lr)
        self.epsilon = epsilon or K.epsilon()
        self.amsgrad = amsgrad
        # Plain Python copies of the constructor values; the apply methods use them
        # in ordinary `if` statements to decide whether to build the weight-decay
        # and warmup branches at all.
        self._initial_weight_decay = weight_decay
        self._initial_total_steps = total_steps

    def _create_slots(self, var_list):
        """Create the per-variable state slots.

        Every variable gets an `m` slot and a `v` slot; a `vhat` slot is added
        only when `amsgrad` is enabled.
        """
        for var in var_list:
            self.add_slot(var, 'm')
        for var in var_list:
            self.add_slot(var, 'v')
        if self.amsgrad:
            for var in var_list:
                self.add_slot(var, 'vhat')

    def set_weights(self, weights):
        """Set optimizer weights, truncating a longer three-slot weight list.

        If `weights` has `3 * num_vars + 1` entries it is cut down to the number
        of weights this optimizer currently holds before delegating to the
        parent implementation.
        """
        params = self.weights
        # Assumes the current layout is one iteration counter plus two slots
        # (m, v) per variable -- NOTE(review): presumably this exists to accept
        # weight lists that also carry `vhat` entries; confirm against callers.
        num_vars = int((len(params) - 1) / 2)
        if len(weights) == 3 * num_vars + 1:
            weights = weights[:len(params)]
        super(RAdam, self).set_weights(weights)

    def _resource_apply_dense(self, grad, var):
        """Build the update ops for one variable given a dense gradient.

        Args:
            grad: Gradient tensor for `var`.
            var: The resource variable to update.

        Returns:
            A grouped op covering the variable update and the `m`, `v`
            (and, with AMSGrad, `vhat`) slot updates.
        """
        var_dtype = var.dtype.base_dtype
        lr_t = self._decayed_lr(var_dtype)
        m = self.get_slot(var, 'm')
        v = self.get_slot(var, 'v')
        beta_1_t = self._get_hyper('beta_1', var_dtype)
        beta_2_t = self._get_hyper('beta_2', var_dtype)
        epsilon_t = ops.convert_to_tensor(self.epsilon, var_dtype)
        # Steps are counted from 1 so the bias-correction powers are never beta**0.
        local_step = math_ops.cast(self.iterations + 1, var_dtype)
        beta_1_power = math_ops.pow(beta_1_t, local_step)
        beta_2_power = math_ops.pow(beta_2_t, local_step)

        # Optional schedule: scale lr linearly from 0 up to lr_t during the first
        # `warmup_steps`, then move it linearly towards `min_lr`, which is reached
        # after `decay_steps` further steps and held from then on.
        if self._initial_total_steps > 0:
            total_steps = self._get_hyper('total_steps', var_dtype)
            warmup_steps = total_steps * self._get_hyper('warmup_proportion', var_dtype)
            min_lr = self._get_hyper('min_lr', var_dtype)
            decay_steps = K.maximum(total_steps - warmup_steps, 1)
            decay_rate = (min_lr - lr_t) / decay_steps
            lr_t = tf.where(
                local_step <= warmup_steps,
                lr_t * (local_step / warmup_steps),
                lr_t + decay_rate * K.minimum(local_step - warmup_steps, decay_steps),
            )

        # `sma_inf` depends only on beta_2; `sma_t` approaches it as the step
        # count grows. Both feed the rectification factor `r_t` below.
        sma_inf = 2.0 / (1.0 - beta_2_t) - 1.0
        sma_t = sma_inf - 2.0 * local_step * beta_2_power / (1.0 - beta_2_power)

        # First moment: exponential moving average of the gradient, bias-corrected.
        m_t = state_ops.assign(m,
                               beta_1_t * m + (1.0 - beta_1_t) * grad,
                               use_locking=self._use_locking)
        m_corr_t = m_t / (1.0 - beta_1_power)

        # Second moment: exponential moving average of the squared gradient.
        v_t = state_ops.assign(v,
                               beta_2_t * v + (1.0 - beta_2_t) * math_ops.square(grad),
                               use_locking=self._use_locking)
        if self.amsgrad:
            # AMSGrad keeps the element-wise running maximum of v.
            vhat = self.get_slot(var, 'vhat')
            vhat_t = state_ops.assign(vhat,
                                      math_ops.maximum(vhat, v_t),
                                      use_locking=self._use_locking)
            v_corr_t = math_ops.sqrt(vhat_t / (1.0 - beta_2_power))
        else:
            vhat_t = None
            v_corr_t = math_ops.sqrt(v_t / (1.0 - beta_2_power))

        r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) *
                            (sma_t - 2.0) / (sma_inf - 2.0) *
                            sma_inf / sma_t)

        # Use the rectified adaptive step once sma_t >= 5; before that, fall back
        # to the bias-corrected first moment alone (no second-moment scaling).
        var_t = tf.where(sma_t >= 5.0, r_t * m_corr_t / (v_corr_t + epsilon_t), m_corr_t)

        # Weight decay is added to the step itself, proportional to the variable.
        if self._initial_weight_decay > 0.0:
            var_t += self._get_hyper('weight_decay', var_dtype) * var

        var_update = state_ops.assign_sub(var,
                                          lr_t * var_t,
                                          use_locking=self._use_locking)

        updates = [var_update, m_t, v_t]
        if self.amsgrad:
            updates.append(vhat_t)
        return control_flow_ops.group(*updates)

    def _resource_apply_sparse(self, grad, var, indices):
        """Build the update ops for one variable given a sparse gradient.

        Args:
            grad: Gradient values for the rows selected by `indices`.
            var: The resource variable to update.
            indices: Indices into `var` that `grad` applies to.

        Returns:
            A grouped op covering the variable update and the `m`, `v`
            (and, with AMSGrad, `vhat`) slot updates.
        """
        var_dtype = var.dtype.base_dtype
        lr_t = self._decayed_lr(var_dtype)
        beta_1_t = self._get_hyper('beta_1', var_dtype)
        beta_2_t = self._get_hyper('beta_2', var_dtype)
        epsilon_t = ops.convert_to_tensor(self.epsilon, var_dtype)
        local_step = math_ops.cast(self.iterations + 1, var_dtype)
        beta_1_power = math_ops.pow(beta_1_t, local_step)
        beta_2_power = math_ops.pow(beta_2_t, local_step)

        # Same optional warmup / linear schedule as in the dense path.
        if self._initial_total_steps > 0:
            total_steps = self._get_hyper('total_steps', var_dtype)
            warmup_steps = total_steps * self._get_hyper('warmup_proportion', var_dtype)
            min_lr = self._get_hyper('min_lr', var_dtype)
            decay_steps = K.maximum(total_steps - warmup_steps, 1)
            decay_rate = (min_lr - lr_t) / decay_steps
            lr_t = tf.where(
                local_step <= warmup_steps,
                lr_t * (local_step / warmup_steps),
                lr_t + decay_rate * K.minimum(local_step - warmup_steps, decay_steps),
            )

        sma_inf = 2.0 / (1.0 - beta_2_t) - 1.0
        sma_t = sma_inf - 2.0 * local_step * beta_2_power / (1.0 - beta_2_power)

        # First moment: decay the whole slot, then scatter-add the scaled gradient
        # rows; the control dependency makes the decay run before the scatter.
        m = self.get_slot(var, 'm')
        m_scaled_g_values = grad * (1 - beta_1_t)
        m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking)
        with ops.control_dependencies([m_t]):
            m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)
        m_corr_t = m_t / (1.0 - beta_1_power)

        # Second moment: same decay-then-scatter pattern with squared gradients.
        v = self.get_slot(var, 'v')
        v_scaled_g_values = (grad * grad) * (1 - beta_2_t)
        v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)
        with ops.control_dependencies([v_t]):
            v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)

        if self.amsgrad:
            vhat = self.get_slot(var, 'vhat')
            vhat_t = state_ops.assign(vhat,
                                      math_ops.maximum(vhat, v_t),
                                      use_locking=self._use_locking)
            v_corr_t = math_ops.sqrt(vhat_t / (1.0 - beta_2_power))
        else:
            vhat_t = None
            v_corr_t = math_ops.sqrt(v_t / (1.0 - beta_2_power))

        r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) *
                            (sma_t - 2.0) / (sma_inf - 2.0) *
                            sma_inf / sma_t)

        # Rectified adaptive step once sma_t >= 5, plain corrected momentum before.
        var_t = tf.where(sma_t >= 5.0, r_t * m_corr_t / (v_corr_t + epsilon_t), m_corr_t)

        if self._initial_weight_decay > 0.0:
            var_t += self._get_hyper('weight_decay', var_dtype) * var

        # Only the rows named by `indices` are written back to the variable.
        var_update = self._resource_scatter_add(var, indices, tf.gather(-lr_t * var_t, indices))

        updates = [var_update, m_t, v_t]
        if self.amsgrad:
            updates.append(vhat_t)
        return control_flow_ops.group(*updates)

    def get_config(self):
        """Return the parent config extended with this optimizer's hyperparameters."""
        config = super(RAdam, self).get_config()
        config.update({
            'learning_rate': self._serialize_hyperparameter('learning_rate'),
            'beta_1': self._serialize_hyperparameter('beta_1'),
            'beta_2': self._serialize_hyperparameter('beta_2'),
            'decay': self._serialize_hyperparameter('decay'),
            'weight_decay': self._serialize_hyperparameter('weight_decay'),
            'epsilon': self.epsilon,
            'amsgrad': self.amsgrad,
            'total_steps': self._serialize_hyperparameter('total_steps'),
            'warmup_proportion': self._serialize_hyperparameter('warmup_proportion'),
            'min_lr': self._serialize_hyperparameter('min_lr'),
        })
        return config

In [3]:
# Read both CSV files with the `id` column used directly as the row index.
train = pd.read_csv('data/train.csv', index_col = 'id')
test = pd.read_csv('data/test.csv', index_col = 'id')
# train

In [4]:
# Work on an independent (deep) copy so the loaded `train` frame stays untouched.
copy = train.copy(deep = True)
# copy

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. With shuffle disabled the rows keep
# their file order, so the split is a plain head/tail cut of `copy`.
train_data, validation_data = train_test_split(
    copy,
    test_size = 0.2,
    shuffle = False,
    random_state = 1004,
)

In [6]:
# train_data
# validation_data

In [7]:
# Single-column frames: `digit` is the target, `letter` is the auxiliary input.
ytrain = train_data[['digit']]
letter = train_data[['letter']]

y_val = validation_data[['digit']]
letter_val = validation_data[['letter']]

# The test file has no `digit` column; only its letters are extracted.
test_letter = test[['letter']]
# ytrain

In [8]:
# Everything that is neither the target nor the letter is treated as pixel data.
non_pixel_columns = ['digit', 'letter']
xtrain = train_data.drop(columns = non_pixel_columns)
x_val = validation_data.drop(columns = non_pixel_columns)
test = test.drop(columns = ['letter'])
# xtrain

In [9]:
# # digit  디코딩
# decoding = np.argmax(onehot_ytrain, axis=1).reshape(-1,1)
# decoding = pd.DataFrame(decoding, columns = ['digit'])
# decoding

In [10]:
# onehot
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()

onehot_letter = encoder.fit_transform(letter)
onehot_letter_val = encoder.fit_transform(letter_val)
onehot_test_letter = encoder.fit_transform(test_letter)


onehot_ytrain = tf.keras.utils.to_categorical(ytrain, 10)
onehot_y_val = tf.keras.utils.to_categorical(y_val, 10)

In [11]:
# Convert the pandas frames into plain NumPy arrays.
xtrain, ytrain, x_val, y_val, test = (
    np.array(frame) for frame in (xtrain, ytrain, x_val, y_val, test)
)

# Densify the encoder outputs via their `toarray()` method.
onehot_letter, onehot_letter_val, onehot_test_letter = (
    encoded.toarray()
    for encoded in (onehot_letter, onehot_letter_val, onehot_test_letter)
)

In [12]:
# Report the shape of every array, one line per array, in a fixed order.
shape_reports = [
    ('xtrain.shape = {}', xtrain),
    ('ytrain.shape = {}', ytrain),
    ('onehot_letter.shape = {}', onehot_letter),
    ('x_val.shape = {}', x_val),
    ('y_val.shape = {}', y_val),
    ('onehot_letter_val.shape = {}', onehot_letter_val),
    ('test.shape = {}', test),
    ('onehot_test_letter.shape = {}', onehot_test_letter),
]
for template, array in shape_reports:
    print(template.format(array.shape))



xtrain.shape = (1638, 784)
ytrain.shape = (1638, 1)
onehot_letter.shape = (1638, 26)
x_val.shape = (410, 784)
y_val.shape = (410, 1)
onehot_letter_val.shape = (410, 26)
test.shape = (20480, 784)
onehot_test_letter.shape = (20480, 26)


In [13]:
# reshape
# reshape
# The sample dimension is inferred with -1 instead of hard-coding the row counts,
# so this cell keeps working if the split ratio or the data files change.
# Images become (samples, 28, 28, 1); letters become length-1 sequences of
# 26-dimensional one-hot vectors, i.e. (samples, 1, 26).
xtrain = xtrain.reshape((-1, 28, 28, 1))
onehot_letter = onehot_letter.reshape((-1, 1, 26))

x_val = x_val.reshape((-1, 28, 28, 1))
onehot_letter_val = onehot_letter_val.reshape((-1, 1, 26))

test = test.reshape((-1, 28, 28, 1))
onehot_test_letter = onehot_test_letter.reshape((-1, 1, 26))


# float32
xtrain = xtrain.astype('float32')
x_val = x_val.astype('float32')
onehot_letter = onehot_letter.astype('float32')

onehot_ytrain = onehot_ytrain.astype('float32')
onehot_y_val = onehot_y_val.astype('float32')
onehot_letter_val = onehot_letter_val.astype('float32')

test = test.astype('float32')
onehot_test_letter = onehot_test_letter.astype('float32')


# /255
# Scale the image arrays in place; the one-hot arrays are left as they are.
xtrain /= 255.0
x_val /= 255.0
test /= 255.0

In [14]:
# Report the shape of every array again, now that they have been reshaped.
shape_reports = [
    ('xtrain.shape = {}', xtrain),
    ('ytrain.shape = {}', ytrain),
    ('onehot_letter.shape = {}', onehot_letter),
    ('x_val.shape = {}', x_val),
    ('y_val.shape = {}', y_val),
    ('onehot_letter_val.shape = {}', onehot_letter_val),
    ('test.shape = {}', test),
    ('onehot_test_letter.shape = {}', onehot_test_letter),
]
for template, array in shape_reports:
    print(template.format(array.shape))

xtrain.shape = (1638, 28, 28, 1)
ytrain.shape = (1638, 1)
onehot_letter.shape = (1638, 1, 26)
x_val.shape = (410, 28, 28, 1)
y_val.shape = (410, 1)
onehot_letter_val.shape = (410, 1, 26)
test.shape = (20480, 28, 28, 1)
onehot_test_letter.shape = (20480, 1, 26)


In [15]:
# decoding['digit'].unique()

In [16]:
#OPTIMIZER = tf.keras.optimizers.SGD(lr=0.01)
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
# NOTE(review): the model.compile call in this notebook builds its own RAdam
# instance, so this object is not what the model is trained with.
OPTIMIZER = tf.keras.optimizers.Adam(learning_rate = 0.001)


In [17]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Augmentation settings for the training images: small shifts, rotations and zooms.
augmentation_options = dict(
    width_shift_range = 5,
    height_shift_range = 5,
    rotation_range = 10,
    zoom_range = 0.05,
)
train_datagen = ImageDataGenerator(**augmentation_options)

# The validation generator is created without any augmentation settings.
validation_datagen = ImageDataGenerator()

# train_datagen.fit(xtrain)
# validation_datagen.fit(x_val)

In [18]:
# Create both inputs through `layers` (tensorflow.keras), the same namespace the
# model's layers come from, instead of the `Input` imported from standalone keras.
# image: one 28x28 single-channel picture per sample.
image_input = layers.Input(shape = (28, 28, 1), dtype = 'float32', name = 'image')
# letter: a length-1 sequence holding one 26-dimensional one-hot vector.
letter_input = layers.Input(shape = (1, 26), dtype = 'float32', name = 'letter')

In [22]:
## Entry flow
# Every layer is taken from `layers` (tensorflow.keras) so the graph does not mix
# layer classes imported from two different packages.

# Two plain convolutions keep the 28x28 resolution and raise the channels to 64.
cnn = layers.Conv2D(32, (3, 3), activation = 'relu', padding = 'same')(image_input)
cnn = layers.BatchNormalization()(cnn)
cnn = layers.Conv2D(64, (3, 3), activation = 'relu', padding = 'same')(cnn)
cnn = layers.BatchNormalization()(cnn)

previous_block = cnn

# Three down-sampling residual blocks; spatial size goes 28 -> 14 -> 7 -> 4.
for filters in [128, 256, 728]:

    # Strided 1x1 projection so the shortcut matches the pooled branch in both
    # spatial size and channel count.
    residual = layers.Conv2D(filters, (1, 1), activation = None, padding = 'same', strides = 2)(previous_block)

    cnn = layers.SeparableConv2D(filters, (3, 3), activation = 'relu', padding = 'same')(cnn)
    cnn = layers.BatchNormalization()(cnn)
    cnn = layers.SeparableConv2D(filters, (3, 3), activation = 'relu', padding = 'same')(cnn)
    cnn = layers.BatchNormalization()(cnn)
    cnn = layers.MaxPooling2D((3, 3), padding = 'same', strides = 2)(cnn)

    cnn = layers.Add()([cnn, residual])
    previous_block = cnn


## Middle flow
# Eight identity-shortcut blocks at constant shape (4, 4, 728).
for _ in range(8):

    cnn = layers.SeparableConv2D(728, (3, 3), activation = 'relu', padding = 'same')(cnn)
    cnn = layers.BatchNormalization()(cnn)

    cnn = layers.SeparableConv2D(728, (3, 3), activation = 'relu', padding = 'same')(cnn)
    cnn = layers.BatchNormalization()(cnn)

    # No activation on the last convolution before the residual sum.
    cnn = layers.SeparableConv2D(728, (3, 3), activation = None, padding = 'same')(cnn)
    cnn = layers.BatchNormalization()(cnn)

    cnn = layers.Add()([cnn, previous_block])
    previous_block = cnn


## Exit flow

previous_block = cnn

cnn = layers.SeparableConv2D(728, (3, 3), activation = 'relu', padding = 'same')(cnn)
cnn = layers.BatchNormalization()(cnn)

cnn = layers.SeparableConv2D(1024, (3, 3), activation = None, padding = 'same')(cnn)
cnn = layers.BatchNormalization()(cnn)

cnn = layers.MaxPooling2D((3, 3), padding = 'same', strides = 2)(cnn) # (4, 4, 1024) -> (2, 2, 1024)

residual = layers.Conv2D(1024, (1, 1), strides = 2, activation = None, padding = 'same')(previous_block) # (4, 4, 728) -> (2, 2, 1024)
cnn = layers.Add()([cnn, residual])

cnn = layers.SeparableConv2D(1536, (3, 3), activation = 'relu', padding = 'same')(cnn)
cnn = layers.BatchNormalization()(cnn)
cnn = layers.SeparableConv2D(2048, (3, 3), activation = 'relu', padding = 'same')(cnn)
cnn = layers.BatchNormalization()(cnn)

# Collapse the remaining spatial grid into one 2048-dimensional vector per image.
cnn = layers.GlobalAveragePooling2D()(cnn)

# letter LSTM
# Stacked LSTMs over the letter input; only the last layer drops the time axis.
dnn = layers.LSTM(128, activation = 'relu', return_sequences = True)(letter_input)
dnn = layers.LSTM(256, activation = 'relu', return_sequences = True)(dnn)
dnn = layers.LSTM(256, activation = 'relu')(dnn)

# Join the image features and the letter features into one vector.
concatenated = layers.concatenate([cnn, dnn])


# Classification head: one hidden layer, then a 10-way softmax over the digits.
result = layers.Dense(512, activation = 'relu')(concatenated)
result = layers.BatchNormalization()(result)
result = layers.Dropout(0.5)(result)
result = layers.Dense(10, activation = 'softmax')(result)

model = models.Model([image_input, letter_input], result)
model.summary()

ion_3[0][0]      
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 14, 14, 128)  8320        batch_normalization_1[0][0]      
__________________________________________________________________________________________________
add (Add)                       (None, 14, 14, 128)  0           max_pooling2d[0][0]              
                                                                 conv2d_2[0][0]                   
__________________________________________________________________________________________________
separable_conv2d_2 (SeparableCo (None, 14, 14, 256)  34176       add[0][0]                        
__________________________________________________________________________________________________
batch_normalization_4 (BatchNor (None, 14, 14, 256)  1024        separable_conv2d_2[0][0]         
___________________________________________________________________________________________

In [23]:
# Compile with the RAdam optimizer, cross-entropy on one-hot targets,
# and accuracy as the reported metric.
model.compile(
    optimizer = RAdam(learning_rate = 0.001),
    loss = 'categorical_crossentropy',
    metrics = ["accuracy"],
)

In [24]:

# def Create_model():

#     model = tf.keras.models.Sequential([
#         keras.layers.ZeroPadding2D((1, 1), input_shape = (28, 28, 1)),
#         keras.layers.Conv2D(128, (5, 5), activation = 'relu', padding = 'same'),
#         # keras.layers.BatchNormalization(),
#         # keras.layers.ZeroPadding2D((1, 1)),
#         keras.layers.Conv2D(128, (3, 3), activation = 'relu', padding = 'same'),
#         keras.layers.MaxPooling2D(2, 2),
#         # keras.layers.BatchNormalization(),
#         # keras.layers.ZeroPadding2D((1, 1)),
#         keras.layers.Conv2D(256, (3, 3), activation = 'relu', padding = 'same'),
#         # keras.layers.BatchNormalization(),
#         # keras.layers.ZeroPadding2D((1, 1)),
#         keras.layers.Conv2D(256, (3, 3), activation = 'relu', padding = 'same'),
#         keras.layers.MaxPooling2D(2, 2),
#         keras.layers.Dropout(0.2),
#         keras.layers.Flatten(),
#         # keras.layers.BatchNormalization(),
#         keras.layers.Dense(256, activation = 'relu'),
#         keras.layers.Dropout(0.5),
#         # keras.layers.BatchNormalization(),
#         keras.layers.Dense(10, activation = 'softmax')
#     ])

#     model.compile(RAdam(),
#                   loss = 'categorical_crossentropy',
#                   metrics = ["accuracy"])

#     return model

# model = Create_model()
# model.summary()


In [25]:
# Stop training when validation accuracy has not improved for 10 epochs.
EarlyStopping = tf.keras.callbacks.EarlyStopping(
    monitor = 'val_accuracy',
    patience = 10,
)

# Keep only the weights of the epoch with the best validation loss.
ModelCheckpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath = 'best_model.h5',
    monitor = 'val_loss',
    save_weights_only = True,
    save_best_only = True,
)

# Multiply the learning rate by 0.1 after 5 epochs without validation-loss improvement.
ReduceLROnPlateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor = 'val_loss',
    patience = 5,
    factor = 0.1,
)


In [26]:
# log_dir = "logs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# tensorboard = tf.keras.callbacks.TensorBoard(log_dir = log_dir, histogram_freq = 1)
# model.fit_generator(
#     datagen.flow(xtrain, ytrain1, batch_size = 32),
#     epochs = 150,
#     verbose = 1,
#     callbacks = [tensorboard, checkpoint]
# )
# Samples per batch, shared by the training and validation iterators.
BATCH_SIZE = 16

# Step counts must be whole numbers: len(x) / 16 is a float (e.g. 102.375).
# Rounding up makes one epoch cover every sample, the last batch being partial.
train_steps = int(np.ceil(len(xtrain) / BATCH_SIZE))
val_steps = int(np.ceil(len(x_val) / BATCH_SIZE))

# Each iterator yields ([image_batch, letter_batch], label_batch); the letters
# are passed alongside the images as the second element of the input list.
model.fit(
    train_datagen.flow([xtrain, onehot_letter], onehot_ytrain, seed = 2020, batch_size = BATCH_SIZE),
    epochs = 150,
    steps_per_epoch = train_steps,
    verbose = 1,
    validation_data = validation_datagen.flow([x_val, onehot_letter_val], onehot_y_val, seed = 2020, batch_size = BATCH_SIZE),
    validation_steps = val_steps,
    callbacks = [ModelCheckpoint]
)

# %tensorboard --logdir logs

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
 21/102 [=====>........................] - ETA: 53s - loss: 2.2586 - accuracy: 0.3067

KeyboardInterrupt: 

[Tensorboard Link](http://localhost:6006)  
71/71 [==============================] - 13s 181ms/step - loss: 0.0737 - accuracy: 0.9756 - val_loss: 0.5013 - val_accuracy: 0.8244  85%

Epoch 11/150  
58/58 [==============================] - 18s 316ms/step - loss: 0.2306 - accuracy: 0.9143 - val_loss: 0.4259 - val_accuracy: 0.8585 - lr: 0.0010 87%

In [27]:
# Class probabilities for every test sample, using the model's current weights.
y_pred = model.predict([test, onehot_test_letter], batch_size=32)
# Decode digits: the predicted digit is the index of the largest probability.
predicted_digits = np.argmax(y_pred, axis=1)
decoding = pd.DataFrame({'digit': predicted_digits})
decoding

NameError: name 'model' is not defined

In [28]:
# List which digit classes actually occur in the predictions, in ascending order.
unique_digits = decoding['digit'].unique()
sorted(unique_digits)

NameError: name 'decoding' is not defined

In [29]:
# Restore the weights written by the checkpoint callback before predicting.
model.load_weights('best_model.h5')
y_pred = model.predict([test, onehot_test_letter], batch_size=26)
# Decode digits: the predicted digit is the index of the largest probability.
predicted_digits = np.argmax(y_pred, axis=1)
decoding = pd.DataFrame({'digit': predicted_digits})
decoding

NameError: name 'model' is not defined

In [30]:
# Load the submission template and discard its placeholder `digit` column.
sub = pd.read_csv('data/submission.csv').drop(columns = ['digit'])
# Append the predicted digits as the new `digit` column (aligned on row index).
sub = pd.concat([sub, decoding], axis = 1)
sub.to_csv('sub.csv', index = False, encoding = 'utf-8')
sub

NameError: name 'decoding' is not defined