In [None]:
import os
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import offsetbox
from skimage import data, exposure
from skimage.feature import hog
from skimage.feature import local_binary_pattern
from skimage.measure import label, regionprops
from sklearn import manifold
from sklearn import neighbors
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.manifold import TSNE
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

## Loading data

In [None]:
# data-loading
# The dataset directory defaults to the original location but can be redirected
# by setting the DIGIT_DATA_DIR environment variable, so the notebook can run on
# another machine without editing this cell.
DATA_DIR = os.environ.get('DIGIT_DATA_DIR',
                          '/mnt/e/Pattern Recognition/DigitRecognizer/data')

# `images` is indexed below as images[i, :, :]; `labels` supplies the targets.
images = np.load(os.path.join(DATA_DIR, 'images.npy'))
labels = np.load(os.path.join(DATA_DIR, 'labels.npy'))

## Creating binary images by applying a fixed global threshold

In [None]:
# Binary images: a pixel becomes 1 when its intensity is strictly below the
# threshold and 0 otherwise (a pixel exactly at the threshold stays 0).
# NOTE: threshold_local is not used in this cell; the import is left in place
# in case later cells rely on it.
from skimage.filters import threshold_local

BINARY_THRESHOLD = 80

# One vectorised comparison over the whole stack replaces the per-image loop
# and no longer assumes a fixed 28x28 image size.
thresh_images = (images < BINARY_THRESHOLD).astype('uint')
    
     

In [None]:
# Display the raw image at index 558 for a visual sanity check.
plt.imshow(images[558])

In [None]:
# Display the binarised version of the same image (index 558) in grayscale.
plt.imshow(thresh_images[558], cmap='gray')

## Creating HOG feature vector for dimensionality reduction

In [None]:
# Compute one HOG descriptor per binary image.
# The `multichannel` keyword is intentionally not passed: it was removed from
# newer scikit-image releases, and the default already treats a 2-D array as a
# single-channel image.
hog_features = [
    hog(binary_image, orientations=8, pixels_per_cell=(4, 4),
        cells_per_block=(1, 1), visualize=False)
    for binary_image in thresh_images
]

# Keep the feature-major layout (n_features, n_samples) that the following
# cells expect; they transpose it back to (n_samples, n_features). The sample
# count and descriptor length now come from the data instead of being fixed at
# 60000 and 392.
hog_results = np.asarray(hog_features, dtype='float64').T


## To visualize our high-dimensional HOG matrix, let's use t-distributed Stochastic Neighbor Embedding

In [None]:
# Sample-major view of the HOG matrix (one row per image) and its targets.
X = hog_results.T
y = labels
n_samples, n_features = X.shape

# Neighbourhood size constant; not referenced by the cells shown here.
n_neighbors = 30

In [None]:
# Scale and visualize the embedding vectors
def plot_embedding(X, title=None):
    """Plot a 2-D embedding with one label per point plus image thumbnails.

    Each column of ``X`` is min-max scaled to [0, 1]. Every point is drawn as
    its class label (taken from the module-level ``y``), coloured by class.
    Thumbnails of the corresponding images (taken from the module-level
    ``images``) are overlaid, skipping any point that lies too close to an
    already-shown thumbnail.

    Parameters
    ----------
    X : array-like of shape (n_points, 2)
        Embedded coordinates; row ``i`` must correspond to ``y[i]`` and
        ``images[i]``.
    title : str, optional
        Figure title; no title is set when ``None``.
    """
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    span = x_max - x_min
    # A constant coordinate would otherwise cause a division by zero.
    span = np.where(span == 0, 1, span)
    X = (X - x_min) / span

    plt.figure()
    ax = plt.subplot(111)
    for i in range(X.shape[0]):
        plt.text(X[i, 0], X[i, 1], str(y[i]),
                 color=plt.cm.Set1(y[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 9})

    if hasattr(offsetbox, 'AnnotationBbox'):
        # only print thumbnails with matplotlib > 1.0
        shown_images = np.array([[1., 1.]])  # just something big
        for i in range(X.shape[0]):
            dist = np.sum((X[i] - shown_images) ** 2, 1)
            if np.min(dist) < 4e-3:
                # don't show points that are too close
                continue
            shown_images = np.r_[shown_images, [X[i]]]
            # Use the i-th image as the thumbnail. Indexing hog_results here
            # was wrong: it is feature-major, so hog_results[i] selects a
            # feature row (not a sample) and fails once i exceeds the number
            # of features.
            imagebox = offsetbox.AnnotationBbox(
                offsetbox.OffsetImage(images[i], cmap=plt.cm.gray_r),
                X[i])
            ax.add_artist(imagebox)
    plt.xticks([])
    plt.yticks([])
    if title is not None:
        plt.title(title)

In [None]:
# t-SNE embedding of the digits dataset
print("Computing t-SNE embedding")
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
# the original keyword is kept for the version this notebook was written
# against - confirm before upgrading.
tsne = TSNE(n_components=2, perplexity=50, early_exaggeration=4.0,
            n_iter=250, init='pca', random_state=0)

X_tsne = tsne.fit_transform(X)

# Persist the embedding before plotting so the expensive result is not lost
# if the plotting step fails.
np.save('./X_tsne.npy', X_tsne)

plot_embedding(X_tsne)

plt.show()

## PCA 

In [None]:
# Apply PCA to reduce dimensionality: project the HOG features onto
# 16 principal components using the full SVD solver.
X = hog_results.T
y = labels

pca = PCA(svd_solver='full', n_components=16)
a = pca.fit_transform(X)


## Next step is to implement SGD classifier

In [None]:
# Split the dataset into training and test sets: the first TRAIN_SIZE samples
# are used for training, the remainder for testing (no shuffling).
TRAIN_SIZE = 50000

X = hog_results.T
y = labels

X_train, X_test = X[:TRAIN_SIZE], X[TRAIN_SIZE:]
y_train, y_test = y[:TRAIN_SIZE], y[TRAIN_SIZE:]

In [None]:
# SGD classifier trained on the HOG features of the training split.
# NOTE(review): validation_fraction presumably only matters when early
# stopping is enabled - confirm against the scikit-learn documentation.
sgd_clf = SGDClassifier(
    max_iter=100,
    shuffle=True,
    validation_fraction=0.2,
    random_state=55,
    n_jobs=12,
    verbose=0,
)
sgd_clf.fit(X_train, y_train)

In [None]:
# Spot-check: predicted class of one held-out sample.
sample_idx = 6544
sgd_clf.predict(X_test)[sample_idx]

In [None]:
# 3-fold cross-validated accuracy of the SGD classifier on the training split.
cross_val_score(
    sgd_clf,
    X_train,
    y_train,
    scoring='accuracy',
    cv=3,
)

In [None]:
# Random forest classifier, scored the same way to compare the results.
randfor_clf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=12,
    random_state=55,
)
cross_val_score(randfor_clf, X_train, y_train, scoring='accuracy', cv=3)

In [None]:
# Standardise the training features (zero mean, unit variance per column),
# then score the random forest again on the scaled data.
scaler = StandardScaler(with_mean=True, with_std=True, copy=True)
X_train_scaled = scaler.fit_transform(
    X_train.astype(np.float64)
)
cross_val_score(randfor_clf, X_train_scaled, y_train, scoring='accuracy', cv=3)

In [None]:
# k-nearest-neighbours classifier (10 neighbours, uniform weights),
# evaluated with 3-fold cross-validation.
knn_clf = neighbors.KNeighborsClassifier(
    weights='uniform',
    n_neighbors=10,
)
cross_val_score(knn_clf, X_train, y_train, scoring='accuracy', cv=3)

In [None]:
# Ensemble of classifiers: hard (majority) voting over the SGD, random forest
# and kNN estimators defined in the earlier cells.
clf1, clf2, clf3 = sgd_clf, randfor_clf, knn_clf

named_estimators = [
    ('sgd', clf1),
    ('rf', clf2),
    ('knn', clf3),
]
eclf1 = VotingClassifier(estimators=named_estimators, voting='hard')
cross_val_score(eclf1, X_train, y_train, scoring='accuracy', cv=3)