In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os

In [2]:
# function converts rgb images to grayscale images
def rgb2gray(rgb):
    ''' converts rgb images to grayscale images 
    Input: numpy array with pixel values for the rgb image
    Return: numpy array with pixel values for the grey scale image'''
    
    # pictures have 32x32 pixels, to classify the data we need 33, last value is for classification --> 0 or 1
    gray = np.zeros((32, 33))
    # Going through every index of the array, then calculating the average of blue , green and red color. 
    for i in range(len(rgb)):
        for j in range(len(rgb)):
            blue = rgb[i, j, 0]
            green = rgb[i, j, 1]
            red = rgb[i, j, 2]
            # Calculating the average of blue , green and red color allows us to combine the three colors into one number.
            # Average determines how bright the pixel is.
            grayscale_value = (blue + green + red) / 3
            # replacing rgb values by gray-scale values
            gray[i, j] = grayscale_value
    return gray

In [3]:
#1 birds

directory1 = 'C:\\Users\\sabin\\bird\\'
# empty list for all colored images
birds_gray = []

# iterate over the files in the directory
for filename in os.listdir(directory1):
    # pathname
    url1 = os.path.join(directory1, filename)
    # get single picture from directory
    img_rgb1 = mpimg.imread(url1)
    # convert the picture into grayscale
    img_gray1 = rgb2gray(img_rgb1)
    # add image to list of gray images and calssify it as 0
    birds_gray.append(img_gray1)

print('birds_gray', birds_gray)



#2 frogs
directory2 = 'C:\\Users\\sabin\\frog\\'
# empty list for all colored images
frogs_gray = []

# iterate over the files in the directory
for filename in os.listdir(directory2):
    # pathname
    url2 = os.path.join(directory2, filename)
    # get single picture from directory
    img_rgb2 = mpimg.imread(url2)
    # convert the picture into grayscale
    img_gray2 = rgb2gray(img_rgb2)
    # change label for frogs to 1
    for i in range(len(img_gray2)):
        img_gray2[i][-1] = 1
    # add image to list of gray images and classify it as 1
    frogs_gray.append(img_gray2)

# convert list into numpy-array
#frogs_gray = np.array(frogs_gray)

# classify frog pictures with 1
# for i in range(len(frogs_gray[0])):
#     for j in range(len(frogs_gray[1])):
#         frogs_gray[i][j][-1] = 1
#         print(frogs_gray[i][j][-1]) 

print('frogs_gray', frogs_gray)

birds_gray [array([[0.3620915 , 0.29281048, 0.50588238, ..., 0.51764707, 0.51372548,
        0.        ],
       [0.38562091, 0.48496731, 0.53986931, ..., 0.53464051, 0.52941179,
        0.        ],
       [0.52156862, 0.54640524, 0.56078434, ..., 0.53464055, 0.5385621 ,
        0.        ],
       ...,
       [0.46274511, 0.47189546, 0.48496731, ..., 0.61699351, 0.69934646,
        0.        ],
       [0.46666666, 0.46928108, 0.49803921, ..., 0.61437909, 0.66274516,
        0.        ],
       [0.39738564, 0.50718955, 0.49542483, ..., 0.46666666, 0.49803921,
        0.        ]]), array([[0.29150327, 0.28496734, 0.26928107, ..., 0.36862747, 0.36862747,
        0.        ],
       [0.22875818, 0.2379085 , 0.20653596, ..., 0.33333333, 0.33725492,
        0.        ],
       [0.21568628, 0.23529412, 0.19869282, ..., 0.26013072, 0.26535948,
        0.        ],
       ...,
       [0.57777782, 0.47581704, 0.29019608, ..., 0.66013074, 0.65620915,
        0.        ],
       [0.60130723, 0.

In [25]:
# whole dataset:

dataset = birds_gray + frogs_gray
print(len(dataset))
dataset = np.array(dataset)
np.random.shuffle(dataset)

# we take only 1/4 of the whole dataset, to decrease computing time --> 500 images
dataset = dataset[:int(len(dataset)/4)]
print(len(dataset))
print(dataset)

# divide the dataset into 10% test and 90% training set

# 90% training:
xb = int((90 * len(birds_gray)) / 100)
xf = int((90 * len(frogs_gray)) / 100)

print('xb = ', xb)
print('xf = ', xf)

# class1 with label 0
train_1= birds_gray[0:xb]
test_1 = birds_gray[xb:]

# class2 with label 1
train_2 = frogs_gray[0:xf]
test_2 = frogs_gray[xf:]

2000
500
[[[0.01699347 0.0130719  0.00915033 ... 0.23660131 0.2627451  1.        ]
  [0.01830065 0.01568627 0.01045752 ... 0.2405229  0.27058824 1.        ]
  [0.01830065 0.01568627 0.0130719  ... 0.2405229  0.26928107 1.        ]
  ...
  [0.34117647 0.36339871 0.35163399 ... 0.38169936 0.35555557 1.        ]
  [0.38039215 0.3895425  0.34640523 ... 0.39607843 0.36732026 1.        ]
  [0.37777778 0.36470588 0.34117647 ... 0.44705884 0.41699346 1.        ]]

 [[0.32941178 0.26797386 0.32156865 ... 0.48627452 0.44967322 1.        ]
  [0.30457517 0.29934641 0.34248368 ... 0.42222222 0.42614381 1.        ]
  [0.12156863 0.17908498 0.16993465 ... 0.39084967 0.49934642 1.        ]
  ...
  [0.11764706 0.26666667 0.53986931 ... 0.44705884 0.46535949 1.        ]
  [0.16339869 0.37777778 0.50980397 ... 0.4614379  0.44183008 1.        ]
  [0.35686274 0.45228759 0.30588235 ... 0.52810458 0.43529415 1.        ]]

 [[0.09281046 0.09150327 0.09542484 ... 0.09019608 0.09019608 1.        ]
  [0.09411765

In [26]:
# this part is unnessesary
# merge the datasets into one, and divide them into test and training set

test = test_1 + test_2
test = np.array(test)
# shuffle data radomnly, so we get a mixed dataset with class 1 and class 2
np.random.shuffle(test)
test


train = train_1 + train_2
train = np.array(train)
# shuffle data radomnly, so we get a mixed dataset with class 1 and class 2
np.random.shuffle(train)
train

array([[[0.32679741, 0.34509806, 0.36078433, ..., 0.38692812,
         0.35686278, 0.        ],
        [0.34771244, 0.36993468, 0.38300657, ..., 0.40130719,
         0.35424836, 0.        ],
        [0.33856209, 0.34901961, 0.36601309, ..., 0.37516344,
         0.3542484 , 0.        ],
        ...,
        [0.29411765, 0.29411765, 0.2875817 , ..., 0.73333335,
         0.72156858, 0.        ],
        [0.26535948, 0.27712419, 0.28104575, ..., 0.70457522,
         0.61045758, 0.        ],
        [0.19738563, 0.24444445, 0.27450981, ..., 0.57516344,
         0.51372552, 0.        ]],

       [[0.15424837, 0.16209151, 0.16993465, ..., 0.16078431,
         0.16470588, 0.        ],
        [0.15163399, 0.15816995, 0.15816995, ..., 0.14771242,
         0.14901961, 0.        ],
        [0.15424837, 0.14640523, 0.13856209, ..., 0.14117647,
         0.1372549 , 0.        ],
        ...,
        [0.35816995, 0.36732026, 0.36339871, ..., 0.41568629,
         0.42483664, 0.        ],
        [0.4

The kNN task can be broken down into writing 3 primary functions:

- Calculate the distance between two points
- Find the nearest neighbours based on these pairwise distances
- Majority vote on a class labels based on the nearest neighbour list

Goal: 
Build a script that, for each input that needs classification, searches through the entire training set for the k-most similar instances. The class labels of the most similar instances should then be summarised by majority voting and returned as predictions for the test cases.

source: https://cambridgecoding.wordpress.com/2016/01/16/machine-learning-under-the-hood-writing-your-own-k-nearest-neighbour-algorithm/

In [27]:
# euclidean distance

def ec_distance(row1, row2):
    '''calculate the euclidean distance between two vectors
    Input: two vectors: row1 and row2
    Output: distance between the vectors; float'''
    distance = 0.0
    # len(row1) - 1, because last value is label, 0 or 1 
    for i in range(len(row1) - 1):
        for j in range(len(row2) - 1):
            distance += (row1[i][j] - row2[i][j])**2
    return np.sqrt(distance)
    

In [28]:
# Locate the most similar neighbors

def get_neighbors(train, test_row, k):
    '''compares distances between the test_row and the training set and returns the k nearest neighbours
    Input: 
        - train: np.array, X x n
        - test_row: one row of the test set, List: 1xn
    Return:  
        returns the k most similar records in the dataset in comparison to the test_row, 
        in ascending order of similarity;'''
    # list for distances between test_row and the training set
    distances = []
    for train_row in train:
        dist = ec_distance(test_row, train_row)
        distances.append((train_row, dist))
        
    # sort the list, so that the second item (distance) in the tuple (tuple[1]) is used in the sorting operation
    distances.sort(key = lambda tup: tup[1])
    
    neighbors = []
    for i in range(k):
        # append the k - nearest neighbours 
        neighbors.append(distances[i][0])
    return neighbors

neighbors = get_neighbors(train, test[0], 3)
for neighbor in neighbors:
    print(neighbor)

[[0.36862747 0.37254902 0.35947712 ... 0.36862747 0.3398693  1.        ]
 [0.36470588 0.3529412  0.37254902 ... 0.33725492 0.33594771 1.        ]
 [0.37124185 0.38692812 0.42483664 ... 0.32810458 0.39477126 1.        ]
 ...
 [0.41045753 0.40653594 0.41307191 ... 0.40000002 0.41830067 1.        ]
 [0.39477126 0.40784315 0.41176474 ... 0.38562095 0.40130719 1.        ]
 [0.38169936 0.38039219 0.39738564 ... 0.38431374 0.39346409 1.        ]]
[[0.56732027 0.54771245 0.5385621  ... 0.47189546 0.503268   0.        ]
 [0.54640524 0.53725493 0.53725493 ... 0.48235297 0.50457517 0.        ]
 [0.55424841 0.54248369 0.54901961 ... 0.49934642 0.51241831 0.        ]
 ...
 [0.43006539 0.41307191 0.41307191 ... 0.45098042 0.53464055 0.        ]
 [0.41045753 0.39738564 0.41307191 ... 0.44052291 0.45751635 0.        ]
 [0.37254902 0.36601309 0.36470588 ... 0.38823529 0.5163399  0.        ]]
[[0.43137256 0.37777778 0.36862747 ... 0.40653594 0.33856209 0.        ]
 [0.41568629 0.41568629 0.39607847 ... 

In [29]:
# Make a classification prediction with neighbors

def predict_classification(train, test_row, k):
    # get nearest neighbors
    neighbors = get_neighbors(train, test_row, k)
    
    
    # output_values are classification labels 0 or 1
    output_values = []
    for row in neighbors:
        output_values.append(row[-1][-1]) 
    # max() returns the most represented output value. Counts the number of labels, most labels are our prediction
    prediction = max(set(output_values), key = output_values.count)
    return prediction

prediction = predict_classification(train, test[0], 3)
print('Expected %d, Got %d.' % (train[0][-1][-1], prediction))

Expected 0, Got 0.


In [30]:
# kNN Algorithm

def kNN(train, test, num_neighbors):
    predictions = []
    for row in test:
        output =  predict_classification(train, row, num_neighbors)
        predictions.append(output)
    return (predictions)

In [31]:
# Calculate accuracy percentage

def accuracy(actual, predicted):
    '''returns accuracy of prediction [float]'''
    correct = 0.0
    for i in range(actual[0].size):
        if actual[i] == predicted[i]:
            correct += 1
    return correct * 100.0 / float(len(actual))

In [32]:
# Split a dataset into k folds for cross validation
import random

def cross_validation_split(dataset, n_folds):
    dataset_split = []
    dataset_copy = list(dataset)
    # divide the dataset by n_folds 
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = []
        while len(fold) < fold_size:
            # get random index in range of the length of the dataset_copy 
            index = random.randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

In [33]:
# Calculate accuracy percentage

def accuracy(actual, predicted):
    '''returns accuracy of prediction [float]'''
    correct = 0.0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct * 100.0 / float(len(actual))

In [40]:
# Evaluate an algorithm using a cross validation split

def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    # index for removing current fold
    i = 0
    j = 0
    print(j, '%')
    for fold in folds:
        train_set = list(folds)
        # remove the current fold (list) from train_set
        train_set.pop(i)
        i += 1
        train_set = sum(train_set, [])
        test_set = []
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        # print('fold', fold)
        actual = []   
        # apend true classfication to list 
        for row in fold:
            actual.append(row[0][-1])
        # accuracy
        acc = accuracy(actual, predicted)
        scores.append(acc)
        j += 10
        print(j, '%')
        
    return scores

In [41]:
# evaluate algorithm
# dataset is going to be split in 10 parts, 9 parts for training, 1 part for test
n_folds = 10
num_neighbors = 5
#dataset = train
scores = evaluate_algorithm(dataset, kNN, n_folds, num_neighbors)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

0 %
10 %
20 %
30 %
40 %
50 %
60 %
70 %
80 %
90 %
100 %
Scores: [70.0, 60.0, 62.0, 60.0, 66.0, 64.0, 60.0, 44.0, 58.0, 70.0]
Mean Accuracy: 61.400%


In [35]:
# see how it changes for n_folds = 5, so 80 % training, 20 % test
# evaluate algorithm
# number of epochs
n_folds = 5
num_neighbors = 5
#dataset = train
scores = evaluate_algorithm(dataset, kNN, n_folds, num_neighbors)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

0 %
20 %
40 %
60 %
80 %
100 %
Scores: [61.0, 54.0, 68.0, 62.0, 63.0]
Mean Accuracy: 61.600%


In [36]:
# evaluate algorithm
# number of epochs
n_folds = 5
num_neighbors = 7
#dataset = train
scores = evaluate_algorithm(dataset, kNN, n_folds, num_neighbors)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

0 %
20 %
40 %
60 %
80 %
100 %
Scores: [63.0, 59.0, 54.0, 62.0, 55.0]
Mean Accuracy: 58.600%


In [37]:
# evaluate algorithm
# number of epochs
n_folds = 5
# best key should be for sqrt(datapoints), lets see...
num_neighbors = 21
#dataset = train
scores = evaluate_algorithm(dataset, kNN, n_folds, num_neighbors)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

0 %
20 %
40 %
60 %
80 %
100 %
Scores: [46.0, 62.0, 68.0, 59.0, 62.0]
Mean Accuracy: 59.400%


In [38]:
# evaluate algorithm
# number of epochs
n_folds = 5
# best key should be for sqrt(datapoints), lets see...
num_neighbors = 3
#dataset = train
scores = evaluate_algorithm(dataset, kNN, n_folds, num_neighbors)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

0 %
20 %
40 %
60 %
80 %
100 %
Scores: [66.0, 65.0, 64.0, 61.0, 52.0]
Mean Accuracy: 61.600%
