### Preprocessing

In [58]:
# imports
import numpy as np 
import pandas as pd 
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

In [59]:
# Read the two input CSV files into DataFrames (paths are relative to the
# notebook's working directory): the training data first, then the test data.
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')

In [60]:
from sklearn.model_selection import StratifiedShuffleSplit

# Hold out 10% of the labelled data as a local test set. The split is
# stratified on 'label' so both parts keep the same label distribution, and
# random_state is fixed so that re-running the notebook yields the same split.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
train_index, test_index = next(split.split(df_train, df_train['label']))

# split() yields positional row indices, so rows are selected with .iloc
# (.loc is label-based and only matches on a default RangeIndex).
# .copy() makes both subsets independent frames that can be modified safely.
strat_train_set = df_train.iloc[train_index].copy()
strat_test_set = df_train.iloc[test_index].copy()

# Overlay both label histograms to compare the label distributions and to
# see the size ratio of the test part to the train part.
strat_train_set['label'].hist(label='train')
strat_test_set['label'].hist(label='test')
plt.legend()
plt.show()

In [61]:
# Separate the target array y from the feature matrix X for each split.
# drop() is used without inplace=True: it returns a new frame, so
# strat_train_set / strat_test_set keep their 'label' column and this cell
# can be re-run without a KeyError (and without SettingWithCopyWarning).
y_train = strat_train_set['label'].values
X_train = strat_train_set.drop('label', axis=1).values

y_test = strat_test_set['label'].values
X_test = strat_test_set.drop('label', axis=1).values

# Data loaded from test.csv, to be used for the final prediction.
X_final = df_test.values

In [62]:
# Sanity check: (number of rows, number of feature columns) of the training matrix.
n_rows, n_cols = X_train.shape
print((n_rows, n_cols))

In [63]:
import random

# Plot one randomly chosen training sample together with its label.
# The index is drawn from the whole training set; a hard-coded 0..100 range
# would ignore most samples and fail on a set with fewer than 101 rows.
randomIndex = random.randrange(len(X_train))
some_digit = X_train[randomIndex]
# Reshape the flat feature vector into a 28x28 grid so it can be shown as an image.
some_digit_image = some_digit.reshape(28, 28)
print("Image shape before:", some_digit.shape, ", Image shape after:", some_digit_image.shape)

plt.imshow(some_digit_image, cmap=matplotlib.cm.binary, interpolation='nearest')
plt.axis('off')

# Print the true label before the figure is rendered by plt.show().
print("\nAcctual Number:", y_train[randomIndex], "\nImage:")
plt.show()

### Scale the features:

In [64]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature to zero mean and unit variance.
#
# Why? Without scaling, features with large numeric ranges dominate the
# distance computations of a distance-based model, regardless of how
# informative they actually are.
#
# The scaler is fitted ONCE, on the training data only: fit_transform()
# learns the per-feature mean/std from X_train and applies them. The hold-out
# and final data are then transformed with those same learned parameters via
# transform(). Re-fitting a new scaler on each set would map every set into a
# different feature space than the one the classifier is trained on, and
# would leak statistics of the evaluation data into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_final_scaled = scaler.transform(X_final)


### Train the classifier

In [65]:
# from sklearn.model_selection import GridSearchCV


# search_grid = [{'n_neighbors': [4], 'weights': ['distance'], 'n_jobs': [-1]}]
#grid_search = GridSearchCV(KNeighborsClassifier(), search_grid, cv=3, scoring='accuracy', verbose=3)
#grid_search.fit(X_train_scaled, y_train)


In [66]:
# A grid search over the KNN hyper-parameters takes a very long time, so only
# the best combination it found is used here:
# {'n_neighbors': 4, 'weights': 'distance'}. n_jobs=-1 uses all CPU cores.
from sklearn.neighbors import KNeighborsClassifier

knn_params = {'n_neighbors': 4, 'weights': 'distance', 'n_jobs': -1}
knn_clf = KNeighborsClassifier(**knn_params)
# Train on the scaled training features and their labels.
knn_clf.fit(X_train_scaled, y_train)

In [67]:
from sklearn.metrics import accuracy_score

# Validate the model on the hold-out split: predict its labels, then compare
# the predictions with the true labels.
y_knn_pred = knn_clf.predict(X_test_scaled)
holdout_accuracy = accuracy_score(y_test, y_knn_pred)
print("Predicition on X_test_scaled", holdout_accuracy)

# Disabled: predictions for the final data.
# NOTE(review): y_test holds the labels of the hold-out split, not of X_final,
# so the accuracy_score call below does not look valid as written - confirm
# before enabling.
# y_knn_pred_final = knn_clf.predict(X_final_scaled)
# print("Predicition on X_final_scaled", accuracy_score(y_test, y_knn_pred_final)) # validate the model

In [68]:
# write the output file
# submission = pd.DataFrame({
#         "ImageId": list(range(1,len(y_knn_pred)+1)),
#         "Label": y_knn_pred
#     })
# print(submission.head())
# submission.to_csv('submission.csv', index=False) 