In [122]:
import cv2
import os
import numpy as np
import skimage

from skimage import io, transform, color, filters, data, morphology, measure

import matplotlib
import matplotlib.pyplot as plt

import pandas as pd

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [123]:
def load_images_from_folder(folder):
    images = []
    images_name = []
    for filename in os.listdir(folder):
        img = io.imread(os.path.join(folder,filename))
        if img is not None:
            images.append(img)
            images_name.append(filename[0:4])
    
    images_df = pd.DataFrame({"name": images_name,
                             "image": images})
    #print(images_df)
    return images_df

def display(np_image):
    """
    This is a display function that we have added to show numpy images at full size
    If you pass in an image with 3 channels, it will be displayed in RGB
    If you passn in an image with 1 channel, it will be displayed in grayscale
    """
    dpi = matplotlib.rcParams['figure.dpi']
    if len(np_image.shape) == 3:
        height, width, depth = np_image.shape
    else:
        height, width = np_image.shape

    # What size does the figure need to be in inches to fit the image?
    figsize = width / float(dpi), height / float(dpi)

    # Create a figure of the right size with one axes that takes up the full figure
    fig = plt.figure(figsize=figsize)
    ax = fig.add_axes([0, 0, 1, 1])

    # Hide spines, ticks, etc.
    ax.axis('off')
    
    # Display the image in either RGB or grayscale (depending on the amount of dimensions)
    if (len(np_image.shape) >= 3):
        ax.imshow(np_image)
    else:
        ax.imshow(np_image, cmap='gray')

    plt.show()

all_images_df = load_images_from_folder('../dataset-images/all_images')
all_images_df
train_images_df = all_images_df.sample(frac=0.9, random_state=25)
#print('-------80%------')
test_images_df = all_images_df.drop(train_images_df.index)

train_images_df.reset_index(inplace=True,drop=True)
test_images_df.reset_index(inplace=True,drop=True)
#print(len(list_images))
#print(list_images[0])
#print(list_images_name)
#print('----------done---------')

In [124]:
def initialize_image_grids(image, image_name):
    zipcode = image
    zipcode = zipcode *-1
    gray = color.rgb2gray(zipcode)
    thresh = filters.threshold_otsu(gray)
    binary = gray > thresh
    binary_splitted = [binary[:, :32], binary[:, 32:64], binary[:, 64:96], binary[:, 96:128]]
    
    df = pd.DataFrame()
    lol = len(binary_splitted)
    if (lol < 4):
        print('splitted is smaller than 4')
    for x in range(0, len(binary_splitted)):
        
        test = binary_splitted[x]
        height = 8
        width = 8

        s_width = 0
        s_heigth = 0
        #print('lol: ', image_name[x])
        df.loc[x, 'zip_number'] = image_name[x]

        for i in range(1, 17):
            h = s_heigth+height
            w = s_width+width
            feat = test[s_heigth:h, s_width:w]
            white_pixels = feat[feat==1]
            total_white_pixels = len(white_pixels)
            df.loc[x, i] = total_white_pixels
            if (i%4 == 0):
                s_width = 0
                s_heigth = s_heigth + height
            else:
                s_width = s_width + width
            #plt.subplot(4,4,i)
            #plt.imshow(feat)
            #display(feat)

        #pd.concat([df, df_tmp])
        
    return df

image_properties = ['label', 'area', 'centroid', 
                    'perimeter', 'eccentricity', 'euler_number',
                    'filled_area','perimeter_crofton', 'local_centroid', 
                    'major_axis_length', 'minor_axis_length', 'orientation']

def initialize_image_props(image, image_name):
    zipcode = image
    zipcode = zipcode *-1
    gray = color.rgb2gray(zipcode)
    thresh = filters.threshold_otsu(gray)
    binary = gray > thresh
    #display(binary)
    
    # remove the white small spots/dots
    binary = morphology.binary_opening(binary)
    #display(binary)
    
    # make the white zip numbers thicker
    binary = morphology.binary_dilation(binary)
    #display(binary)
    
    # some numbers may have holes in it. That could seperate the number in 2 different labels. with closing the gap will be closed
    binary = morphology.binary_closing(binary)
    #display(binary)
    
    # return number back to its original state before it was made thicker. make the numbers thinner
    binary = morphology.binary_erosion(binary)
    #display(binary)
    # return_num=True, connectivity=None
    #print(binary)
    label_image, total_labels = measure.label(binary, return_num=True)
    #display(label_image)
    global image_properties
    props = measure.regionprops_table(label_image, properties=image_properties)
    tmp_df = pd.DataFrame(props)
    
    # filter out labels that have too small areas. these labels are small dots that could not be filled with the opening function
    tmp_df = tmp_df.query('area > 50').reset_index(drop=True)
    #print(tmp_df)
    zip_numbers = list(image_name)[:4]
#    if len(tmp_df.index) == 4:
#        #print('-----------------')
#        #print(total_labels)
#        #tmp_df.insert(0, 'zip_number', zip_numbers)
#    else:
#        print('indexed is not 4, but it is: ' + len(tmp_df.index))
#        return None
        
    return tmp_df


In [125]:
result = pd.DataFrame()

for i in range(0, len(train_images_df.index)):
    image_name = train_images_df.iloc[i,0]
    image = train_images_df.iloc[i,1]
    tmp_result_grid = initialize_image_grids(image, image_name)
    tmp_result_props = initialize_image_props(image, image_name)
    tmp_result = pd.concat([tmp_result_grid,tmp_result_props], axis=1)
    result = pd.concat([result, tmp_result])


result.reset_index(inplace=True, drop=True)
result.drop(columns=['label'], inplace=True)
#result  

In [126]:
features = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,
            'major_axis_length', 'minor_axis_length', 'euler_number', 
            'centroid-0', 'eccentricity', 'local_centroid-0', 'perimeter', 
            'perimeter_crofton', 'filled_area', 'orientation']

standardScaler = StandardScaler()
minMaxScaler = MinMaxScaler()

def preprocess(X_train, X_test):
    print('-----Start preprocc----------')
    global features
    X_train[features] = minMaxScaler.fit_transform(X_train[features].to_numpy())
    X_test[features] = minMaxScaler.transform(X_test[features].to_numpy())
    print('---------Preprocess Done--------')
    return X_train, X_test

def splitTrainTest(result):
    # Split data into 50% train and 50% test subsets
    global features
    X_train, X_test, y_train, y_test = train_test_split(
        result[features], result['zip_number'], test_size=0.25, random_state=0)
    
    X_train, X_test = preprocess(X_train, X_test)
    print('-----------dataset splitted------------')
    return X_train, X_test, y_train, y_test

In [127]:
X_train, X_test, y_train, y_test = splitTrainTest(result)

-----Start preprocc----------
---------Preprocess Done--------
-----------dataset splitted------------


In [128]:
def knn_fit_score(X_train, X_test, y_train, y_test):
    train_samples = len(X_train)
    test_samples = len (X_test)
    neigh = KNeighborsClassifier(n_neighbors=5, weights='distance', metric='chebyshev')
    neigh.fit(X_train, y_train.to_numpy().reshape(train_samples))

    score = neigh.score(X_test, y_test.to_numpy().reshape(test_samples))
    return score

def kfolding_knn(dataset):
    kf = KFold(n_splits=5, shuffle=True)
    global features
    X_training = dataset[features]
    y_training = dataset['zip_number']
    k = 1
    for train_index, test_index in kf.split(X_training, y_training):
        
        X_train, X_test = X_training.loc[train_index,:], X_training.loc[test_index,:]
        y_train, y_test = y_training.loc[train_index], y_training.loc[test_index]
        X_train, X_test = preprocess(X_train, X_test)
        
        score = knn_fit_score(X_train, X_test, y_train, y_test)
        print("[fold {0}], score: {1:.5f}".
          format(k, score))
        k = k + 1
        
def searchHyperParams_knn(result):
    print('--------start searching params--------')
    X_train, X_test, y_train, y_test = splitTrainTest(result)
    print('--------splitted train test----------')
    grid_params = {
        'n_neighbors' : [3,5,11,14,19],
        'weights' : ['uniform', 'distance'],
        'metric' : ['euclidean', 'manhattan', 'chebyshev']
    }

    gs = GridSearchCV(
        KNeighborsClassifier(),
        grid_params,
        verbose = 1,
        cv = 3,
        n_jobs = -1
    )
    print('---------grid search started---------')
    gs_results = gs.fit(X_train, y_train)

    print('--------Done--------')
    print('best score:')
    print(gs_results.best_score_)
    print('best estimator:')
    print(gs_results.best_estimator_)
    print('best params:')
    print(gs_results.best_params_)
    
def gaussian_nb_fit_score(result):
    X_train, X_test, y_train, y_test = splitTrainTest(result)
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)

    score = gnb.score(X_test, y_test)
    print(score)
    
def searchHyperParams_svc(result):
    print('--------start searching params--------')
    X_train, X_test, y_train, y_test = splitTrainTest(result)
    print('--------splitted train test----------')
    grid_params = {
        'C': [0.1,1, 10, 100],
        'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': [1,0.1,0.01,0.001]
    }

    gs = GridSearchCV(
        SVC(),
        grid_params,
        verbose = 1,
        cv = 3,
        n_jobs = -1
    )
    print('---------grid search started---------')
    gs_results = gs.fit(X_train, y_train)

    print('--------Done--------')
    print('best score:')
    print(gs_results.best_score_)
    print('best estimator:')
    print(gs_results.best_estimator_)
    print('best params:')
    print(gs_results.best_params_)
    
def searchHyperParams_DT(result):
    print('--------start searching params--------')
    X_train, X_test, y_train, y_test = splitTrainTest(result)
    print('--------splitted train test----------')
    grid_params = {
        'criterion': ['gini', 'entropy'],
        'splitter' : ['best', 'random'],
        'max_depth': [1,5,10,20,50,100,150,None],
        'min_samples_split': np.arange(2,5),
        'min_samples_leaf': np.arange(1,5),
        'min_weight_fraction_leaf': np.arange(0,0.5)
    }

    gs = GridSearchCV(
        DecisionTreeClassifier(),
        grid_params,
        verbose = 1,
        cv = 3,
        n_jobs = -1
    )
    print('---------grid search started---------')
    gs_results = gs.fit(X_train, y_train)

    print('--------Done--------')
    print('best score:')
    print(gs_results.best_score_)
    print('best estimator:')
    print(gs_results.best_estimator_)
    print('best params:')
    print(gs_results.best_params_)

def searchHyperParams_RC(result):
    print('--------start searching params--------')
    X_train, X_test, y_train, y_test = splitTrainTest(result)
    print('--------splitted train test----------')
    grid_params = {
        'bootstrap': [True, False],
         'max_depth': [10, 20, 30, 40, None],
         'max_features': ['auto', 'sqrt'],
         'min_samples_leaf': [1, 2, 4],
         'min_samples_split': [2, 5, 10],
         'n_estimators': [200, 400, 600, 800]
    }

    gs = GridSearchCV(
        RandomForestClassifier(),
        grid_params,
        verbose = 3,
        cv = 3,
        n_jobs = -1
    )
    print('---------grid search started---------')
    gs_results = gs.fit(X_train, y_train)

    print('--------Done--------')
    print('best score:')
    print(gs_results.best_score_)
    print('best estimator:')
    print(gs_results.best_estimator_)
    print('best params:')
    print(gs_results.best_params_)

In [129]:
searchHyperParams_knn(result)

--------start searching params--------
-----Start preprocc----------
---------Preprocess Done--------
-----------dataset splitted------------
--------splitted train test----------
---------grid search started---------
Fitting 3 folds for each of 30 candidates, totalling 90 fits
--------Done--------
best score:
0.9421296296296297
best estimator:
KNeighborsClassifier(metric='manhattan', weights='distance')
best params:
{'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}


In [130]:
gaussian_nb_fit_score(result)

-----Start preprocc----------
---------Preprocess Done--------
-----------dataset splitted------------
0.8356481481481481


In [131]:
searchHyperParams_svc(result)

--------start searching params--------
-----Start preprocc----------
---------Preprocess Done--------
-----------dataset splitted------------
--------splitted train test----------
---------grid search started---------
Fitting 3 folds for each of 64 candidates, totalling 192 fits
--------Done--------
best score:
0.960648148148148
best estimator:
SVC(C=10, gamma=1)
best params:
{'C': 10, 'gamma': 1, 'kernel': 'rbf'}


In [132]:
searchHyperParams_DT(result)

--------start searching params--------
-----Start preprocc----------
---------Preprocess Done--------
-----------dataset splitted------------
--------splitted train test----------
---------grid search started---------
Fitting 3 folds for each of 384 candidates, totalling 1152 fits
--------Done--------
best score:
0.83179012345679
best estimator:
DecisionTreeClassifier(criterion='entropy', max_depth=100, min_samples_split=3)
best params:
{'criterion': 'entropy', 'max_depth': 100, 'min_samples_leaf': 1, 'min_samples_split': 3, 'min_weight_fraction_leaf': 0.0, 'splitter': 'best'}


In [133]:
#searchHyperParams_RC(result)

In [134]:
def svc_fit_score(X_train, X_test, y_train, y_test):
    train_samples = len(X_train)
    test_samples = len (X_test)
    neigh = SVC(C=10, gamma=1)
    neigh.fit(X_train, y_train)

    score = neigh.score(X_test, y_test)
    return score

def kfolding_svc(dataset):
    kf = KFold(n_splits=5, shuffle=True)
    global features
    X_training = dataset[features]
    y_training = dataset['zip_number']
    k = 1
    for train_index, test_index in kf.split(X_training, y_training):
        
        X_train, X_test = X_training.loc[train_index,:], X_training.loc[test_index,:]
        y_train, y_test = y_training.loc[train_index], y_training.loc[test_index]
        X_train, X_test = preprocess(X_train, X_test)
        
        score = svc_fit_score(X_train, X_test, y_train, y_test)
        print("[fold {0}], score: {1:.5f}".
          format(k, score))
        k = k + 1

In [135]:
kfolding_svc(result)

-----Start preprocc----------
---------Preprocess Done--------
[fold 1], score: 0.96243
-----Start preprocc----------
---------Preprocess Done--------
[fold 2], score: 0.97110
-----Start preprocc----------
---------Preprocess Done--------
[fold 3], score: 0.96821
-----Start preprocc----------
---------Preprocess Done--------
[fold 4], score: 0.97101
-----Start preprocc----------
---------Preprocess Done--------
[fold 5], score: 0.95652


In [136]:
all_zipcodes = []
total_images = len(test_images_df.index)

#standardScaler = StandardScaler()
scaler = MinMaxScaler()

X_train, X_test, y_train, y_test = train_test_split(
        result[features], result['zip_number'], test_size=0.2, random_state=0)

X_train[features] = scaler.fit_transform(X_train[features].to_numpy())
X_test[features] = scaler.transform(X_test[features].to_numpy())
#train model
svc_clf = SVC(C=10, gamma=1)
svc_clf.fit(X_train, y_train)

score = svc_clf.score(X_test, y_test)
print('---------score from test------------------- ')
print(score)



---------score from test------------------- 
0.9595375722543352


In [137]:
for i in range(0, total_images):
    
    image_name = test_images_df.iloc[i, 0]
    image = test_images_df.iloc[i, 1]
    tmp_result_grid = initialize_image_grids(image, image_name)
    tmp_result_props = initialize_image_props(image, image_name)
    test_result = pd.concat([tmp_result_grid,tmp_result_props], axis=1)
    test_result.reset_index(inplace=True,drop=True)
    test_result['zip_number'] = test_result['zip_number'].astype(int)
    test_result[features] = scaler.transform(test_result[features].to_numpy())
    
    # classify
    predicted = svc_clf.predict(test_result[features])
    #print(predicted)
    all_zipcodes.append(predicted)


In [138]:
for i in range(0, len(all_zipcodes)):
    print('predicted: ', all_zipcodes[i][0], all_zipcodes[i][1], all_zipcodes[i][2], all_zipcodes[i][3])
    print('original: ',  test_images_df.loc[i,['name']].to_numpy())

predicted:  1 4 8 9
original:  ['1489']
predicted:  1 8 6 5
original:  ['1865']
predicted:  1 9 0 4
original:  ['1904']
predicted:  2 6 0 8
original:  ['2608']
predicted:  2 7 0 8
original:  ['2708']
predicted:  2 9 7 1
original:  ['2971']
predicted:  3 3 4 6
original:  ['3046']
predicted:  3 4 1 7
original:  ['3417']
predicted:  5 5 2 1
original:  ['3521']
predicted:  3 5 3 6
original:  ['3536']
predicted:  3 5 9 1
original:  ['3591']
predicted:  3 6 0 6
original:  ['3606']
predicted:  3 6 4 1
original:  ['3648']
predicted:  3 7 1 1
original:  ['3711']
predicted:  3 7 4 6
original:  ['3746']
predicted:  3 9 2 1
original:  ['3921']
predicted:  3 9 8 9
original:  ['3988']
predicted:  4 0 0 6
original:  ['4006']
predicted:  4 1 3 4
original:  ['4134']
predicted:  4 7 6 9
original:  ['4764']
predicted:  5 0 7 1
original:  ['5071']
predicted:  5 5 3 9
original:  ['5539']
predicted:  5 5 5 1
original:  ['5551']
predicted:  5 5 7 8
original:  ['5578']
predicted:  5 6 4 3
original:  ['5643']
