# ENGR418 Project Stage 2 Group 31

By: Jared Paull (63586572), Liam Ross (75469692)


In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import linear_model
import os
from sklearn.metrics import confusion_matrix
from PIL import Image, ImageFilter
import PIL

## Scraping Image Data




In [279]:
# Feature-extraction settings for this experiment.
image_size = 64     # side length (pixels) of the square edge image
filter_value = 4    # edge intensities at or below this value are zeroed

# Rotation angles in degrees: 0..270 in steps of 15, then 295..340 in steps of 15.
# NOTE(review): the second run is offset by 10 degrees from the 15-degree grid
# (285, 300, ... would continue it) -- confirm this is intentional.
angles = list(range(0, 271, 15)) + list(range(295, 341, 15))

# x / y: training features and labels, xt / yt: testing features and labels
x, y = get_image_feature_data("../data/training", image_size, filter_value, angles)
xt, yt = get_image_feature_data("../data/testing", image_size, filter_value, angles)
print("done")

done


In [280]:
# Fit a logistic regression classifier on the training features/labels.
log_regress = linear_model.LogisticRegression(solver="liblinear")
log_regress.fit(x, y);

# --- Training set: confusion matrix and accuracy ---
pred = log_regress.predict(x)
predicted = confusion_format(pred)
actual = confusion_format(y)
print(pd.crosstab(actual, predicted, rownames=["Shape Actual"], colnames=["Shape Predicted"]))
print(f"Percentage of correct classification from model on training data set: {100-error_percentage(pred,y):.2f}%\n")

# --- Testing set: confusion matrix and accuracy ---
pred = log_regress.predict(xt)
predicted = confusion_format(pred)
actual = confusion_format(yt)
print(pd.crosstab(actual, predicted, rownames=["Shape Actual"], colnames=["Shape Predicted"]))
# Test predictions must be scored against the test labels (yt), not the training labels (y).
print(f"Percentage of correct classification from model on testing data set: {100-error_percentage(pred,yt):.2f}%")

Shape Predicted  Circle  Rectangle  Square
Shape Actual                              
Circle               27          0       0
Rectangle             0         27       0
Square                0          0      27
Percentage of correct classification from model on training data set: 100.00%

Shape Predicted  Circle  Rectangle  Square
Shape Actual                              
Circle               13          8       6
Rectangle             4         17       6
Square                9          4      14
Percentage of correct classification from model on testing data set: 54.32%


In [206]:
# Feature-extraction settings used for the model documented below.
image_size = 64     # side length (pixels) of the square edge image
filter_value = 6    # edge intensities at or below this value are zeroed

# 24 rotation angles in degrees: 0..270 in steps of 15, followed by five extra angles.
# NOTE(review): the trailing angles do not follow the 15-degree grid, and 365 is
# the same rotation as 5 degrees -- confirm these values are intentional.
angles = list(range(0, 271, 15)) + [295, 310, 325, 340, 365]

# x holds one feature row per training image, y the matching integer labels
x, y = get_image_feature_data("../data/training", image_size, filter_value, angles)
print(x.shape, y.shape)

(81, 1536) (81,)


### Creating Logistic Regression Model

Now that all of the image data has been collected and each image has a corresponding label, the data can be fit to a logistic regression model.

In [207]:
# Create a logistic regression model instance that uses the liblinear solver.
# liblinear implements a coordinate descent algorithm, which works well with
# high-dimensional data (1536 features per image here, see the printed shape of x).
log_regress = linear_model.LogisticRegression(solver = "liblinear")
# Fit the model to the training features (x) and labels (y) collected in the previous cell.
# The trailing semicolon suppresses the estimator repr in the cell output.
log_regress.fit(x,y);

## Testing the algorithm

Now that a model exists, image data and labels are scraped from the testing folder, in the exact same fashion as the data collection from the training data folder.

In [209]:
# The code here is the same as that used to get image data from the training folder,
# only pointed at the testing folder and reusing the same image_size, filter_value and angles.
# xt, yt represent the testing image features (xt) and the testing labels (yt).

xt, yt = get_image_feature_data("../data/testing", image_size, filter_value, angles)

## Prediction and Confusion Matrix

The training data is fed into the model and an output is predicted (based on the model). Then the outputs from the model are compared with the correct values to see the model accuracy. First the training data is tested on the model.

In [210]:
# Feed the training data into the model; pred is an array containing the labels predicted by the model.
pred =  log_regress.predict(x)

# Two formatting calls that turn the integer labels into shape names so the
# confusion matrix is easier to read. Refer to the confusion_format function at the bottom.
predicted = confusion_format(pred)
actual = confusion_format(y)

# Print a confusion matrix: rows are the true values, columns are the model's predicted values.
print(pd.crosstab(actual, predicted, rownames=["Shape Actual"], colnames=["Shape Predicted"]))

# The error percentage is the number of misclassified samples divided by the total
# number of samples, times 100. See error_percentage at the bottom of the notebook.
# This cell evaluates the TRAINING set, so the message says so (it previously said "testing").
print(f"\nPercentage of model errors from the training data: {error_percentage(pred,y):.2f}%")

Shape Predicted  Circle  Rectangle  Square
Shape Actual                              
Circle               27          0       0
Rectangle             0         27       0
Square                0          0      27

Percentage of model errors from the testing data: 0.00%


Next, the testing data is tested on the model.

In [211]:
# Feed the testing data into the model; pred is an array containing the labels predicted by the model.
pred =  log_regress.predict(xt)

# Two formatting calls that turn the integer labels into shape names so the
# confusion matrix is easier to read. Refer to the confusion_format function at the bottom.
predicted = confusion_format(pred)
actual = confusion_format(yt)

# Print a confusion matrix: rows are the true values, columns are the model's predicted values.
print(pd.crosstab(actual, predicted, rownames=["Shape Actual"], colnames=["Shape Predicted"]))

# The error percentage is the number of misclassified samples divided by the total
# number of samples, times 100. See error_percentage at the bottom of the notebook.
# Test predictions are compared against the test labels (yt); comparing against the
# training labels (y) is only correct if both folders happen to list in the same order.
print(f"\nPercentage of model errors from the testing data: {error_percentage(pred,yt):.2f}%")

Shape Predicted  Circle  Rectangle  Square
Shape Actual                              
Circle               12         10       5
Rectangle             5         19       3
Square                8          7      12

Percentage of model errors from the testing data: 46.91%


---
---
---
---

# **Functions**

All of these functions **must** be run before anything else. The purpose of each function is discussed, and each one is commented.


In [278]:
def edge_image(image, image_size, filter_value):
    """Return a binary edge map of ``image`` as an ``image_size`` x ``image_size`` PIL image.

    The image is converted to greyscale ("L" mode), passed through PIL's
    ``FIND_EDGES`` filter, resized to ``image_size + 2`` pixels per side and
    then cropped by one pixel on every side. Finally the pixels are
    thresholded: values less than or equal to ``filter_value`` become 0 and
    every remaining non-zero value becomes 1.

    Parameters
    ----------
    image : PIL.Image.Image
        Source picture.
    image_size : int
        Side length, in pixels, of the returned square image.
    filter_value : int
        Edge intensities at or below this value are treated as background.

    Returns
    -------
    PIL.Image.Image
        Image whose pixel values are 0 (background) or 1 (edge).
    """
    image = image.convert("L")
    image = image.filter(ImageFilter.FIND_EDGES)

    # Resize two pixels larger than requested, then trim a one-pixel border
    # (presumably to discard border artefacts of the edge filter -- TODO confirm).
    image = image.resize((image_size + 2, image_size + 2))

    # np.array makes a writable copy; np.asarray on a PIL image can hand back a
    # read-only array, which makes the in-place thresholding below raise.
    data = np.array(image)[1:-1, 1:-1].copy()

    # Threshold to a 0/1 mask. The order matters: weak edges are zeroed first,
    # then everything still non-zero is set to 1.
    data[data <= filter_value] = 0
    data[data > 0] = 1

    return PIL.Image.fromarray(data)

In [205]:
def edge_count(image):
    """Return the sum of the pixel values in each column of ``image``.

    For the 0/1 images produced by ``edge_image`` this is the number of edge
    pixels in every column.

    Parameters
    ----------
    image : PIL.Image.Image or array-like
        Single-channel image; it is converted with ``np.asarray`` and is
        expected to be two-dimensional (rows x columns).

    Returns
    -------
    list of int
        One entry per column, ordered left to right.
    """
    data = np.asarray(image)
    # Summing along axis 0 collapses the rows, leaving one total per column.
    # This is correct for non-square images too (the old flat indexing used the
    # height as the row stride, which is only right when width == height).
    # Accumulating in int64 avoids uint8 wrap-around on tall images.
    return data.sum(axis=0, dtype=np.int64).tolist()

In [203]:
def get_image_feature_data(rel_dir, image_size, filter_value, angles):
    """Build the feature matrix and label vector for every picture in ``rel_dir``.

    Each picture is turned into a binary edge image with ``edge_image``. The
    edge image is then rotated by every angle in ``angles`` and the per-column
    counts from ``edge_count`` of all rotations are concatenated into a single
    feature vector, giving ``len(angles) * image_size`` features per picture.

    The label is taken from the first letter of the file name
    (case-insensitive): "c" -> 0 (circle), "r" -> 1 (rectangle), anything
    else -> 2 (square).

    Parameters
    ----------
    rel_dir : str
        Directory containing the picture files.
    image_size : int
        Side length passed on to ``edge_image``.
    filter_value : int
        Threshold passed on to ``edge_image``.
    angles : iterable of numbers
        Rotation angles in degrees.

    Returns
    -------
    x : numpy.ndarray
        One row of features per picture.
    y : numpy.ndarray
        Integer label for each row of ``x``.
    """
    x = []
    y = []

    # os.listdir returns names in arbitrary order; sorting makes the sample
    # order (and therefore the fitted model) reproducible between runs.
    for pic in sorted(os.listdir(rel_dir)):
        path = os.path.join(rel_dir, pic)

        # Skip sub-directories and hidden files (e.g. notebook checkpoint
        # folders or .DS_Store) that cannot be opened as pictures.
        if pic.startswith(".") or not os.path.isfile(path):
            continue

        # The context manager closes the file handle; edge_image returns a new,
        # fully loaded image, so it stays usable after the file is closed.
        with PIL.Image.open(path) as raw:
            image = edge_image(raw, image_size, filter_value)

        # One block of column counts per rotation, concatenated into one vector.
        vec = np.hstack([edge_count(image.rotate(angle)) for angle in angles])

        # The correct label is encoded in the first letter of the file name.
        first_letter = pic[0].lower()
        if first_letter == "c":
            # classify circles as a 0
            y.append(0)
        elif first_letter == "r":
            # classify rectangles as a 1
            y.append(1)
        else:
            # the only other expected case is a square, classified as a 2
            y.append(2)

        x.append(vec)

    x = np.array(x)
    y = np.array(y)

    return x, y

In [181]:
def confusion_format(labels):
    """Translate integer class labels into shape names.

    0 maps to "Circle", 1 maps to "Rectangle" and every other value maps to
    "Square". The names are returned as a NumPy array in the same order as
    ``labels``.
    """
    names = [
        "Circle" if label == 0 else "Rectangle" if label == 1 else "Square"
        for label in labels
    ]
    return np.array(names)

In [182]:
def error_percentage(pred, y):
    """Return the percentage (0-100) of predictions that differ from the true labels.

    Parameters
    ----------
    pred : array-like
        Labels predicted by the model.
    y : array-like
        Correct labels, in the same order as ``pred``.

    Returns
    -------
    float
        Number of mismatches divided by the number of samples, times 100.
        Returns 0.0 when there are no samples.

    Raises
    ------
    ValueError
        If ``pred`` and ``y`` do not contain the same number of labels, which
        usually means predictions are being scored against the wrong label set.
    """
    # Accept plain lists as well as arrays, and compare element by element.
    pred = np.asarray(pred).ravel()
    y = np.asarray(y).ravel()

    if pred.size != y.size:
        raise ValueError(
            f"pred has {pred.size} labels but y has {y.size}; they must match"
        )

    # Nothing to score: avoid dividing by zero.
    if pred.size == 0:
        return 0.0

    # The number of errors is the number of positions where the labels differ.
    errors = int(np.count_nonzero(pred != y))

    # Percentage of errors = errors / total samples * 100.
    return errors / pred.size * 100