<a href="https://colab.research.google.com/github/kis-balazs/machine-learning/blob/main/digit_recognizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# setup

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

!pip uninstall kaggle --quiet
!pip install kaggle==1.5.12 --quiet

In [None]:
# Fetch the digit-recognizer competition archive into the working directory
# (the next cell expects it as digit-recognizer.zip).
!kaggle competitions download digit-recognizer

In [None]:
!unzip digit-recognizer.zip

# code

In [None]:
import os
import csv
import numpy as np
import pandas as pd

from PIL import Image

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.image import ImageDataGenerator

from sklearn.model_selection import train_test_split

## NN

### data preprocessing

In [None]:
def get_data(df: pd.DataFrame, train=True):
    """Split a digit-recognizer frame into pixel and label arrays.

    Columns are selected by position, not by name.

    Args:
        df: One row per image. When ``train`` is true, column 0 is taken as
            the label and columns 1..784 as pixel values; otherwise columns
            0..783 are taken as pixel values.
        train: Whether ``df`` starts with a label column.

    Returns:
        Tuple ``(images, labels)`` of float arrays. ``images`` has one row per
        row of ``df``. ``labels`` has one entry per row for training frames
        and is an empty array (shape ``(0,)``) when ``train`` is false.
    """
    n_pixels = 784
    first_pixel = 1 if train else 0

    # Slice the whole frame at once instead of walking it with iterrows():
    # same values, no per-row Series construction, and no reliance on
    # positional integer lookups on a label-indexed Series.
    images = np.array(df.iloc[:, first_pixel:first_pixel + n_pixels]).astype('float')
    labels = np.array(df.iloc[:, 0] if train else []).astype('float')
    return images, labels

In [None]:
# Parse both competition CSVs; the test file has no label column, so test_y
# comes back empty.
train_x, train_y = get_data(pd.read_csv('train.csv'))
test_x, test_y = get_data(pd.read_csv('test.csv'), train=False)

# Divide by 255 so pixel values (presumably 0-255 intensities) land in [0, 1].
train_x = train_x / 255.0
test_x = test_x / 255.0

# Hold out 33% of the labelled rows for validation; the fixed seed keeps the
# split identical between runs.
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=.33, random_state=42)

print(f'\ntrain: {train_x.shape} {train_y.shape}')
print(f'val: {val_x.shape} {val_y.shape}')
print(f'test: {test_x.shape} {test_y.shape}')

In [None]:
if train_x.ndim == 2:  # rows are still flat pixel vectors
    # Turn every 784-long row into a 28x28 image with a trailing channel
    # axis. A single reshape gives the same (N, 28, 28, 1) layout as
    # splitting each row into 28 chunks and expanding axis 3, without a
    # Python loop over every sample.
    train_x = train_x.reshape(-1, 28, 28, 1)
    val_x = val_x.reshape(-1, 28, 28, 1)
    test_x = test_x.reshape(-1, 28, 28, 1)
else:
    # Guard so that re-running the cell does not reshape a second time.
    print('No dimension expansion on images, once done')

In [None]:
# Report the array shapes of each split after the image reshape.
print(f'train: {train_x.shape} {train_y.shape}')
print(f'val: {val_x.shape} {val_y.shape}')
print(f'test: {test_x.shape} {test_y.shape}')

### model + training

In [None]:
# Small CNN: two conv/pool stages, then a dropout-regularized dense head that
# outputs one softmax probability per digit class.
model = keras.Sequential([
    # Feature extractor; the input shape is that of a single training image.
    keras.layers.Conv2D(
        filters=128,
        kernel_size=(3, 3),
        activation='relu',
        input_shape=train_x.shape[1:],
    ),
    keras.layers.MaxPool2D(pool_size=2, strides=2),
    keras.layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    keras.layers.MaxPool2D(pool_size=2, strides=2),

    # Classifier head.
    keras.layers.Flatten(),
    keras.layers.Dropout(rate=.3),
    keras.layers.Dense(units=128, activation='relu'),
    keras.layers.Dense(units=10, activation='softmax'),
])

In [None]:
# Print the layer-by-layer overview of the network defined above.
model.summary()

In [None]:
# Configure training. The sparse variant of categorical cross-entropy is used
# because the targets are plain class values (one number per image), not
# one-hot vectors, while the network emits a 10-way softmax.
model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

In [None]:
# Stop once validation accuracy has not improved for 5 consecutive epochs.
# restore_best_weights rolls the model back to the best-scoring epoch;
# without it the model keeps the weights of the last (worse) epoch, and those
# are the weights later used to produce the submission.
es = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    verbose=1,
    patience=5,
    restore_best_weights=True,
)

In [None]:
# Train on the training split and evaluate on the validation split after each
# epoch. epochs=100 is only an upper bound: the early-stopping callback `es`
# ends training once its monitored metric stops improving. The returned
# history object holds the per-epoch metrics plotted in the next cell.
history = model.fit(
    x=train_x,
    y=train_y,
    epochs=100,
    batch_size=128,
    validation_data=(val_x, val_y),
    callbacks=[es],
    verbose=1
)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()

plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

## KNeighbors

In [None]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train_x = (train.iloc[:,1:].values).astype('float32')
train_y = train.iloc[:,0].values.astype('int32')
test_x = test.values.astype('float32') 

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(train_x, train_y)

## prediction + solution generation

In [None]:
# NN: predict() yields one row of 10 class probabilities per test image; the
# index of the largest entry in each row is the predicted digit.
# test_x has to be in the image layout the network was built for.
yy = model.predict(test_x)
yy_labels = np.argmax(yy, axis=1)

# KNN (alternative; needs the flat pixel rows the classifier was fitted on)
# yy_labels = knn.predict(test_x)

# ###
# Submission table: 1-based image ids paired with the predicted labels.
image_ids = np.arange(1, len(test_x) + 1)
solution = pd.DataFrame({'ImageId': image_ids, 'Label': yy_labels})

solution.to_csv('solution.csv', index=False)

In [None]:
# TODO update comment when submitting
# Uploads solution.csv (written by the previous cell) to the digit-recognizer
# competition; the text after -m is the message attached to the submission.
!kaggle competitions submit -c digit-recognizer -f solution.csv -m "Balazs @ 14.03.2022 -- interesting changes"