## Student Number : 408846
## Student Name : Ibrahim Sahin

# Recognizer
This notebook will be used to validate your model and grade your work.

# Collect features and execute model (YOUR IMPLEMENTATION HERE)
You need to implement the following function, which predicts the digits that appear in an image. The function should return an array with 4 elements containing these 4 digits in left-to-right order.

In [7]:
import glob
import os
import sklearn
import joblib
import skimage, skimage.io
import pandas as pd
import numpy as np
from skimage import io, transform, color, filters, data, morphology, measure

In [8]:
# Load the preprocessor (REPLACE WITH YOUR CODE)
# Its `transform` method is applied to the feature columns in `classify_image`.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load .pkl files that come from a trusted source.
preproc = joblib.load('../classifiers/best_preprocessor.pkl') 

# Load your final model (REPLACE WITH YOUR CODE)
# Its `predict` method is called on the transformed features in `classify_image`.
clf = joblib.load('../classifiers/best_classifier.pkl') 

In [9]:
# Feature columns passed to the preprocessor and the classifier, in order:
# the 16 grid-cell columns (labelled 1-16) produced by initialize_image_grids,
# followed by a selection of the columns produced by initialize_image_props.
features = [
    *range(1, 17),
    'major_axis_length',
    'minor_axis_length',
    'euler_number',
    'centroid-0',
    'eccentricity',
    'local_centroid-0',
    'perimeter',
    'perimeter_crofton',
    'filled_area',
    'orientation',
]

def initialize_image_grids(image):
    """Count foreground pixels in a 4x4 grid laid over each of the four digits.

    The image is negated, converted to grayscale and binarised with Otsu's
    threshold. The binary image is cut into four 32-column-wide strips
    (columns 0-127), one per digit from left to right, and the top 32 rows of
    each strip are divided into a 4x4 grid of 8x8 cells.

    Args:
        image: Image array as returned by ``skimage.io.imread``. It is passed
            to ``color.rgb2gray``, so it is assumed to be an RGB image of at
            least 32 rows by 128 columns -- TODO confirm against the dataset.

    Returns:
        ``pandas.DataFrame`` with one row per digit (index 0-3) and columns
        labelled 1-16. Each value is the number of ``True`` pixels in one grid
        cell, cells being numbered row by row starting at the top-left.
        Values are stored as floats.
    """
    digit_count = 4
    digit_width = 32
    cell_size = 8
    cells_per_side = 4

    pixels = np.asarray(image)
    # Multiplying an unsigned array by -1 raises OverflowError on NumPy >= 2.
    # Widen explicitly to the signed type that older NumPy versions promoted
    # to implicitly (uint8 -> int16), so the negated values are unchanged.
    if np.issubdtype(pixels.dtype, np.unsignedinteger):
        pixels = pixels.astype(np.promote_types(pixels.dtype, np.int8))

    # After negation, the originally darker pixels are the ones that end up
    # above the Otsu threshold, i.e. True in the binary image.
    gray = color.rgb2gray(pixels * -1)
    binary = gray > filters.threshold_otsu(gray)

    cell_offsets = [step * cell_size for step in range(cells_per_side)]
    rows = []
    for digit in range(digit_count):
        strip = binary[:, digit * digit_width:(digit + 1) * digit_width]
        # Cells are visited row by row, matching column labels 1..16.
        rows.append([
            np.count_nonzero(strip[top:top + cell_size, left:left + cell_size])
            for top in cell_offsets
            for left in cell_offsets
        ])

    # Build the frame in one go instead of growing it cell by cell; the float
    # dtype matches what the incremental `.loc` enlargement produced.
    columns = list(range(1, cells_per_side * cells_per_side + 1))
    return pd.DataFrame(rows, columns=columns, dtype=float)

# Region properties requested from measure.regionprops_table in
# initialize_image_props.
image_properties = [
    'label',
    'area',
    'centroid',
    'perimeter',
    'eccentricity',
    'euler_number',
    'filled_area',
    'perimeter_crofton',
    'local_centroid',
    'major_axis_length',
    'minor_axis_length',
    'orientation',
]

def initialize_image_props(image):
    """Measure region properties of the connected components in an image.

    The image is negated, converted to grayscale, binarised with Otsu's
    threshold and cleaned up with morphological operations. The connected
    regions of the result are labelled and measured with
    ``measure.regionprops_table`` using the module-level ``image_properties``.

    Args:
        image: Image array as returned by ``skimage.io.imread``. It is passed
            to ``color.rgb2gray``, so it is assumed to be RGB -- TODO confirm.

    Returns:
        ``pandas.DataFrame`` with one row per labelled region whose ``area``
        is greater than 50, re-indexed from 0.

        NOTE(review): rows follow the label numbering of ``measure.label``,
        which is not necessarily left-to-right; confirm this before pairing
        rows with digit positions.
    """
    pixels = np.asarray(image)
    # Multiplying an unsigned array by -1 raises OverflowError on NumPy >= 2.
    # Widen explicitly to the signed type that older NumPy versions promoted
    # to implicitly (uint8 -> int16), so the negated values are unchanged.
    if np.issubdtype(pixels.dtype, np.unsignedinteger):
        pixels = pixels.astype(np.promote_types(pixels.dtype, np.int8))

    gray = color.rgb2gray(pixels * -1)
    binary = gray > filters.threshold_otsu(gray)

    # Remove small white spots/dots.
    binary = morphology.binary_opening(binary)

    # Make the white digits thicker.
    binary = morphology.binary_dilation(binary)

    # A digit may contain gaps that would split it into two separate labels;
    # closing fills those gaps.
    binary = morphology.binary_closing(binary)

    # Thin the digits again to undo the earlier dilation.
    binary = morphology.binary_erosion(binary)

    label_image = measure.label(binary)
    props = measure.regionprops_table(label_image, properties=image_properties)
    regions = pd.DataFrame(props)

    # Drop regions with a small area: these are dots that the opening step
    # did not remove.
    return regions.query('area > 50').reset_index(drop=True)

In [10]:
def classify_image(filename):
    """Predict the four digits shown in an image file.

    The image is read with ``skimage.io.imread``; the grid features and the
    region-property features are placed side by side (joined on row index),
    the ``features`` columns are transformed with ``preproc`` and classified
    with ``clf``.

    Args:
        filename: Path of the image file to classify.

    Returns:
        List with the first four predictions converted to ``str``.

    NOTE(review): the region-property rows are paired with the grid rows
    purely by position; verify that both are in the same (left-to-right)
    order.
    """
    pixels = skimage.io.imread(filename)

    grid_features = initialize_image_grids(pixels)
    region_features = initialize_image_props(pixels)
    samples = pd.concat([grid_features, region_features], axis=1)

    # Overwrite the feature columns with their preprocessed values.
    samples[features] = preproc.transform(samples[features])
    predictions = clf.predict(samples[features])

    return [str(predictions[position]) for position in range(4)]

# Score your model (DON'T CHANGE THIS CODE!!!)

In [11]:
# Scoring harness: runs classify_image on every PNG in DATASET_FOLDER and
# reports per-digit and per-zipcode accuracy.
DATASET_FOLDER = '../dataset-images/'

# Outcomes
correct_classified_digits = 0
incorrect_classified_digits = 0
correct_classified_zipcodes = 0
incorrect_classified_zipcodes = 0

# Score the classifier
files = glob.glob(os.path.join(DATASET_FOLDER, '*.png'))
for f in files:
    # Get the correct label from the filename
    # (the four characters directly before the 4-character '.png' suffix)
    correct_label = f[-8:-4]
    # Predict using the classifier
    predicted_label = classify_image(f)
    
    # Score digits
    # A zipcode only counts as correct when all of its digits match.
    zipcode_correct = True
    for i in range(len(correct_label)):
        if str(correct_label[i]) == str(predicted_label[i]):
            correct_classified_digits += 1
        else:
            incorrect_classified_digits += 1
            zipcode_correct = False
    
    # Score correct zipcodes
    if zipcode_correct:
        correct_classified_zipcodes += 1
    else:
        incorrect_classified_zipcodes += 1

# The figures in parentheses are "correct / incorrect" counts, not
# "correct / total". Both divisions raise ZeroDivisionError when no PNG
# files were found.
print("Digit accuracy: ", (correct_classified_digits / (correct_classified_digits + incorrect_classified_digits)), "(", correct_classified_digits, "/", incorrect_classified_digits, ")")
print("Zipcode accuracy: ", (correct_classified_zipcodes / (correct_classified_zipcodes + incorrect_classified_zipcodes)), "(", correct_classified_zipcodes, "/", incorrect_classified_zipcodes, ")")        


Digit accuracy:  0.9885416666666667 ( 949 / 11 )
Zipcode accuracy:  0.9541666666666667 ( 229 / 11 )
