# Part 1: Preparing data and linear classification

In [60]:
import seaborn as sb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from keras.preprocessing import image
from keras.models import Sequential
from keras.layers import Conv2D, Lambda
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.utils.np_utils import to_categorical
from keras.datasets import mnist
from keras.layers.advanced_activations import PReLU

import warnings
warnings.filterwarnings('ignore')

Reading test and train data

In [61]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [62]:
train.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
test.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
y_train = train["label"]

x_train = train.drop(labels = ["label"],axis = 1)

In [65]:
y_train.value_counts()

1    4684
7    4401
3    4351
9    4188
2    4177
6    4137
0    4132
4    4072
8    4063
5    3795
Name: label, dtype: int64

Split on train and validation classes

In [66]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=42)

Let's try to classify digits with usual method of machine learning.

Choose the best parameters

In [67]:
parameters = {'C':[1, 10]}
linear_clf = LinearSVC()

opt_linear_clf = GridSearchCV(linear_clf, parameters, cv=5, scoring='accuracy')
opt_linear_clf.fit(x_train, y_train)

print("Best params:", opt_linear_clf.best_params_)

Best params: {'C': 10}


In [68]:
prediction = opt_linear_clf.predict(x_val)
linear_result = accuracy_score(y_val, prediction)

print('Result accuracy:', linear_result)

Result accuracy: 0.8607142857142858


Saving results

In [23]:
def save_prediction(prediction, file_name):
    pd.DataFrame({"ImageId":list(range(1, len(prediction)+1)),"Label":prediction}).to_csv(file_name,
                                                                                          index=False,header=True)
linear_prediction = opt_linear_clf.predict(test)
save_prediction(linear_prediction, 'linear_prediction.csv')

# Part 2: non-linear classification

We will use random forest to improve our result

In [24]:
parameters = {'n_estimators': [100, 200, 500]}
rf_clf = RandomForestClassifier()

opt_rf_clf = GridSearchCV(rf_clf, parameters, cv=5, scoring='accuracy')
opt_rf_clf.fit(x_train, y_train)

print("Best params:", opt_rf_clf.best_params_)

Best params: {'n_estimators': 500}


In [25]:
prediction = opt_rf_clf.predict(x_val)
rf_result = accuracy_score(y_val, prediction)

print('Result accuracy:', rf_result)

Result accuracy: 0.9654761904761905


Saving results

In [26]:
rf_prediction = opt_rf_clf.predict(test)    
save_prediction(prediction, 'rf_prediction.csv')

# Part 3: CNN

Prepare data

In [85]:
(X_train,y_train),(X_test,y_test)=mnist.load_data()
X_train=X_train.reshape(X_train.shape[0],28,28,1).astype('float32')
X_test=X_test.reshape(X_test.shape[0],28,28,1).astype('float32')
print(X_train.shape)
print(X_test.shape)

(60000, 28, 28, 1)
(10000, 28, 28, 1)


In [86]:
y_train=to_categorical(y_train)
y_test=to_categorical(y_test)
num_classes=y_test.shape[1]

Data augmentation

In [87]:
gen=image.ImageDataGenerator()
batches=gen.flow(X_train,y_train,batch_size=64)

Data normalization

In [88]:
mean=np.mean(X_train)
std=np.std(X_train)

def standardize(x):
    return (x-mean)/std

Model definition

In [89]:
def model():
    model=Sequential()
    model.add(Lambda(standardize,input_shape=(28,28,1)))
    model.add(Conv2D(64,(3,3),activation="linear"))
    model.add(PReLU())
    model.add(Conv2D(64,(3,3),activation="linear"))
    model.add(PReLU())
    
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(BatchNormalization())
    model.add(Conv2D(128,(3,3),activation="linear"))
    model.add(PReLU())
    model.add(Conv2D(128,(3,3),activation="linear"))
    model.add(PReLU())
    
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(BatchNormalization())
    model.add(Conv2D(256,(3,3),activation="linear"))
    model.add(PReLU())
    
    model.add(MaxPooling2D(pool_size=(2,2)))
    
    model.add(Flatten())
    model.add(BatchNormalization())
    model.add(Dense(512,activation="linear"))
    model.add(PReLU())
    model.add(Dense(10,activation="softmax"))
    
    model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"])
    model.fit_generator(generator=batches,steps_per_epoch=batches.n,epochs=3)
    return model

Model learing

In [None]:
model=model()

Epoch 1/3
   22/60000 [..............................] - ETA: 15:00:00 - loss: 0.5680 - acc: 0.8274