In [37]:
import os
import time

import cv2
import matplotlib.image as Image
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


# Data import & Pre-processing --------------------------------------------------------------------

In [7]:
# Gets paths for images
def get_catalog(file):
    """Map every species folder directly under ``file`` to its file names.

    Each species folder is listed on its own, so a nested directory inside
    one species can no longer shift the files assigned to the next species.
    Species names and file names are sorted so the catalog does not depend
    on the order in which the filesystem happens to list entries.

    Parameters
    ----------
    file : str
        Root directory that contains one sub-directory per species.

    Returns
    -------
    dict
        ``{species_name: [file_name, ...]}`` with bare names (no directory
        part). Only non-directory entries directly inside the species folder
        are included. Empty if ``file`` has no sub-directories.

    Raises
    ------
    FileNotFoundError
        If ``file`` cannot be listed as a directory.
    """
    top_level = next(os.walk(file), None)
    if top_level is None:
        # os.walk swallows listing errors and simply yields nothing.
        raise FileNotFoundError("Cannot list directory: {!r}".format(file))
    root, species_dirs, _ = top_level

    catalog = {}
    for species in sorted(species_dirs):
        # Only the first os.walk entry (the species folder itself) is wanted.
        _, _, files = next(os.walk(os.path.join(root, species)), (None, [], []))
        catalog[species] = sorted(files)
    return catalog

In [8]:
def get_pics_colors(catalog, path, species_index, pic_index, size=(98, 64)):
    """Load pictures as three separate, flattened colour-channel matrices.

    Every image is read with OpenCV, converted from BGR to RGB, split into
    its channels, and each channel is resized with nearest-neighbour
    interpolation and flattened to one row.

    Parameters
    ----------
    catalog : dict
        ``{species: [file_name, ...]}`` as produced by ``get_catalog``.
    path : str
        Root directory holding one folder per species (a trailing separator
        is optional).
    species_index : int
        Stop after this many species, counted from 1 in ``catalog``
        iteration order. A value that never matches (e.g. 0 or larger than
        the catalog) loads every species.
    pic_index : int
        Number of pictures to load per species (the first ``pic_index``
        catalog entries).
    size : tuple of int, optional
        ``dsize`` handed to ``cv2.resize``; defaults to ``(98, 64)``.

    Returns
    -------
    dict
        ``{species: ndarray}`` where the array stacks the channels in the
        order they come out of ``cv2.split`` after the BGR->RGB conversion,
        i.e. index 0 = red, 1 = green, 2 = blue, each holding one flattened
        picture per row.

    Raises
    ------
    OSError
        If ``cv2.imread`` cannot read a file (it returns ``None`` then).
    IndexError
        If a species has fewer than ``pic_index`` catalog entries.
    """
    new_catalog = {}
    for count, specie in enumerate(catalog, start=1):
        channel_rows = ([], [], [])
        for i in range(pic_index):
            image_path = os.path.join(path, specie, catalog[specie][i])
            bgr = cv2.imread(image_path)
            if bgr is None:
                # Fail here with the offending path instead of deep inside OpenCV.
                raise OSError("cv2.imread could not read image: {}".format(image_path))
            channels = cv2.split(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
            for rows, channel in zip(channel_rows, channels):
                resized = cv2.resize(channel, dsize=size, interpolation=cv2.INTER_NEAREST)
                rows.append(resized.flatten())

        new_catalog[specie] = np.array(channel_rows)
        if count == species_index:
            break

    return new_catalog
    

In [9]:
#Gets dictionary of pictures in grayscale from paths
def get_pics_gray(catalog, path, species_index, pic_index, size=(98, 64)):
    """Load pictures as flattened grayscale rows, one matrix per species.

    Parameters
    ----------
    catalog : dict
        ``{species: [file_name, ...]}`` as produced by ``get_catalog``.
    path : str
        Root directory holding one folder per species (a trailing separator
        is optional).
    species_index : int
        Stop after this many species, counted from 1 in ``catalog``
        iteration order. A value that never matches loads every species.
    pic_index : int
        Number of pictures to load per species (the first ``pic_index``
        catalog entries).
    size : tuple of int, optional
        ``dsize`` handed to ``cv2.resize``; defaults to ``(98, 64)``.

    Returns
    -------
    dict
        ``{species: ndarray}`` with one flattened, resized grayscale picture
        per row.

    Raises
    ------
    OSError
        If ``cv2.imread`` cannot read a file (it returns ``None`` then).
    IndexError
        If a species has fewer than ``pic_index`` catalog entries.
    """
    new_catalog = {}
    for count, specie in enumerate(catalog, start=1):
        images = []
        for i in range(pic_index):
            image_path = os.path.join(path, specie, catalog[specie][i])
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            if image is None:
                # Fail here with the offending path instead of deep inside OpenCV.
                raise OSError("cv2.imread could not read image: {}".format(image_path))
            res = cv2.resize(image, dsize=size, interpolation=cv2.INTER_NEAREST)
            images.append(res.flatten())
        new_catalog[specie] = np.array(images)
        if count == species_index:
            break

    return new_catalog
    

In [10]:
# Plots sample pictures
def plot_sample(catalog, n_species, n_pics, flag, shape=None):
    """Show the first ``n_pics`` entries of the first ``n_species`` species.

    Parameters
    ----------
    catalog : dict
        ``{species: sequence}``; ``catalog[species][i]`` must hold exactly
        as many values as ``shape`` requires.
    n_species : int
        Stop after this many species, counted from 1 in iteration order.
        A value that never matches plots every species.
    n_pics : int
        Number of entries to plot per species.
    flag : bool or int
        Truthy selects the grayscale default shape ``(64, 98)``; falsy
        selects the colour default shape ``(112, 112, 3)``. Only used when
        ``shape`` is not given.
    shape : tuple of int, optional
        Explicit target shape for each entry. The loaders in this notebook
        resize to 98x64, so pass e.g. ``(64, 98, 3)`` for rows produced by
        ``get_pics`` instead of relying on the 112x112 colour default.

    Returns
    -------
    None
        Each picture is drawn with ``plt.imshow`` followed by ``plt.show``.
    """
    if shape is None:
        shape = (64, 98) if flag else (112, 112, 3)

    for count, specie in enumerate(catalog, start=1):
        for i in range(n_pics):
            plt.imshow(np.reshape(catalog[specie][i], shape))
            plt.show()
        if count == n_species:
            break
        

In [11]:
# Returns dictionary: species as keys, per-colour-channel arrays of flattened pictures as values.
def import_pics(path, nspec, npic):
    """Build the catalog for ``path`` and load its pictures channel by channel.

    ``nspec`` and ``npic`` are forwarded to ``get_pics_colors`` as the
    species limit and the number of pictures per species. The return value
    is whatever ``get_pics_colors`` returns.
    """
    # Grayscale alternative: get_pics_gray(get_catalog(path), path, nspec, npic)
    return get_pics_colors(get_catalog(path), path, nspec, npic)

In [12]:
# Main function for data preprocessing ----------------------------------------------------------------------------------
seed = 2
# Seed NumPy's global random generator. The previous
# `np.random.RandomState(seed)` only built a generator object and threw it
# away, so nothing was actually seeded.
np.random.seed(seed)


#Declare variables 
# Each *_file is the root folder of one data split; it contains one sub-folder per species.
train_file, valid_file, test_file = 'data/birds/train/', 'data/birds/valid/', 'data/birds/test/'
# (number of species, number of pictures per species) to load for each split.
num_species_train, num_pics_train = 3, 130
num_species_val, num_pics_val = 3, 5
num_species_test, num_pics_test = 3, 5

#get pictures
train_pictures = import_pics(train_file, num_species_train, num_pics_train)
valid_pictures = import_pics(valid_file, num_species_val, num_pics_val)
test_pictures = import_pics(test_file, num_species_test, num_pics_test)

#plot samples of imported images
#plot_sample(test_pictures,2,2,1)


# Dimensionality Reduction -------------------------------------------------------------------------

In [155]:
def eigendecomposition(images):
    """Run an SVD on the mean-centred pictures of every species and channel.

    Parameters
    ----------
    images : dict
        ``{species: channels}`` where iterating ``channels`` yields one 2-D
        array per colour channel with pictures as rows and pixels as
        columns.

    Returns
    -------
    dict
        ``{species: [[U, S, mean], ...]}`` with one ``[U, S, mean]`` entry
        per channel. ``mean`` is the per-pixel mean picture, ``S`` the
        singular values and ``U`` the left singular vectors of the centred,
        /255-scaled data transposed to pixels x pictures. ``U`` has
        ``min(n_pixels, n_pictures)`` columns.
    """
    species_values = {}
    for specie, values in images.items():
        color_values = []
        for color in values:
            mean = np.mean(color, axis=0)
            x = (color - mean) / 255
            # Thin SVD: the full variant allocates a pixels x pixels U whose
            # columns beyond min(pixels, pictures) are an arbitrary basis of
            # the null space and carry no information about the data.
            U, S, _ = np.linalg.svd(x.T, full_matrices=False)
            color_values.append([U, S, mean])
        species_values[specie] = color_values
    print("Done eigendecomposing.")
    return species_values

In [156]:
def get_eigenspecies(eigen, n_comp):
    """Keep the leading ``n_comp`` columns of U for every species and channel.

    ``eigen`` maps a species to a sequence of per-channel entries whose
    element 0 is the U matrix and element 2 is the mean picture. The result
    maps each species to a list of ``(U[:, :n_comp], mean)`` tuples in the
    same channel order.
    """
    species_eigenvectors = {
        name: [(channel[0][:, :n_comp], channel[2]) for channel in channels]
        for name, channels in eigen.items()
    }
    print("Done getting eigenspecies.")
    return species_eigenvectors

In [157]:
def combine_pics(pictures):
    """Join the arrays of all species into one array with ``np.hstack``.

    The arrays are joined in dictionary order. With a single species that
    species' array is returned as-is; with no species the result is an
    empty list.
    """
    stacked = []
    for per_species in pictures.values():
        # The first non-empty value seeds the result; later ones are appended.
        stacked = per_species if len(stacked) == 0 else np.hstack([stacked, per_species])
    print("Done combining pics.")
    return stacked
    

In [158]:
def get_residuals(eigenspecies, test):
    """Squared reconstruction error of every picture against every species.

    For each of the three colour channels, each picture is scaled by 1/255,
    centred with the species' mean picture (also scaled by 1/255), projected
    onto that species' eigenvectors, and the squared Euclidean norm of what
    the projection misses is recorded.

    Parameters
    ----------
    eigenspecies : dict
        ``{species: [(eigenvectors, mean), ...]}`` with one entry per
        channel, as returned by ``get_eigenspecies``.
    test : ndarray
        Pictures indexed as ``test[channel][picture]`` -> flattened pixels.

    Returns
    -------
    ndarray
        Array indexed ``[channel, picture, species]``; species follow the
        iteration order of ``eigenspecies``.
    """
    test = test / 255
    matrix_residual = []
    for i in range(3):  # one pass per colour channel
        residual_color = []
        for image in test[i]:
            per_species_residual = []
            for metrics in eigenspecies.values():
                basis, mean = metrics[i][0], metrics[i][1]
                pre_image = image - (mean / 255)
                # Project as U @ (U.T @ x). Evaluating U @ U.T first would
                # build a pixels x pixels matrix for every image and species.
                projection = basis @ (basis.T @ pre_image)
                residual = np.linalg.norm(pre_image - projection) ** 2
                per_species_residual.append(residual)
            residual_color.append(per_species_residual)
        matrix_residual.append(residual_color)
    print("Done getting residuals.")
    return np.array(matrix_residual)

In [159]:
def get_classes(n_test_species, n_test_pics):
    """Return the labels 1..n_test_species, each repeated n_test_pics times.

    Labels are grouped by species, e.g. ``get_classes(2, 3)`` gives
    ``[1, 1, 1, 2, 2, 2]``.
    """
    species_ids = np.arange(n_test_species) + 1
    return np.array(np.repeat(species_ids, n_test_pics))

In [160]:
# Run and time the eigen-species pipeline: SVD per species/channel on the
# training pictures, then residuals of the test pictures against each species.
start = time.time()
eigenvectors = eigendecomposition(train_pictures)
# Keep the first 48 columns of U for every species and colour channel.
rgb_eigenspecies = get_eigenspecies(eigenvectors, 48)
combined_test_pics = combine_pics(test_pictures)
residuals = get_residuals(rgb_eigenspecies, combined_test_pics)
end = time.time()
# Elapsed wall-clock time in seconds.
print(end-start)

Done eigendecomposing.
Done getting eigenspecies.
Done combining pics.
Done getting residuals.
114.88443279266357


In [161]:
# Collapse the colour axis (axis 0 of `residuals`) into one score per
# (picture, species) pair: the Euclidean norm of the per-channel residuals.
l2_residuals = np.sqrt(np.sum(residuals**2,axis=0))
#l2_residuals = np.sum(residuals,axis=0)**2 / 3
# Predict the species whose score is smallest; +1 because get_classes labels start at 1.
classification = l2_residuals.argmin(axis=1) + 1
true_classes = get_classes(num_species_test, num_pics_test)
print(accuracy_score(true_classes,classification))

0.6666666666666666


In [173]:
# Show the predicted labels next to the per-picture, per-species scores they were picked from.
print(classification)
print(l2_residuals)

[1 2 1 1 1 2 2 2 3 2 1 2 3 2 3]
[[334.55437859 349.08314976 352.9828837 ]
 [278.07041247 274.76463818 293.58270176]
 [150.84676198 160.97574707 164.0289999 ]
 [255.00320823 276.85403663 295.42801958]
 [163.52717896 172.73939565 176.02040037]
 [230.61183267 185.061862   251.44541529]
 [201.77462796 191.90367549 204.81728274]
 [191.91894984 187.20682729 196.75342737]
 [ 72.03089873  72.17577557  71.53028186]
 [193.6605745  190.27408073 208.06516479]
 [220.34211897 239.70417897 227.84762884]
 [172.42792967 171.73335659 175.57842453]
 [139.36921697 162.36485084 134.31910051]
 [156.22106721 150.40616862 151.35097628]
 [159.94480309 166.2249552  155.01787082]]


In [172]:
# Re-weight the residuals before classifying.
# Step 1: for each colour channel, divide every picture's residuals by their
# sum over species, so each row becomes that picture's share per species.
new_residuals = []
for n in residuals:
    weights = (n / np.sum(n,axis=1)[:,None])
    new_residuals.append(weights)
new_residuals = np.array(new_residuals)

# NOTE(review): `c` is never read in this cell.
c = 0
# Step 2: per picture, find the colour channel whose residuals summed over
# species are smallest, and zero that channel's shares for that picture.
most_rep_pixels = np.argmin(np.sum(residuals,axis=2),axis=0)
for i in range(most_rep_pixels.shape[0]):
    new_residuals[most_rep_pixels[i]][i] = new_residuals[most_rep_pixels[i]][i] * 0
# Step 3: turn shares into weights: 1 for the zeroed channel, (1 - share) elsewhere.
new_residuals = 1 - new_residuals

# Step 4: apply the weights to the raw residuals, channel by channel.
final = np.array([residuals[color] * new_residuals[color] for color in range(3)])

# Classify as in the unweighted case: Euclidean norm across channels, then
# the species with the smallest score (+1 for labels starting at 1).
l2_residuals = np.sqrt(np.sum(final**2,axis=0))
#l2_residuals = np.sum(residuals,axis=0)**2 / 3
classification = l2_residuals.argmin(axis=1) + 1
true_classes = get_classes(num_species_test, num_pics_test)
print(accuracy_score(true_classes,classification))
# Bare expression: displayed as the cell's output value.
classification

0.6666666666666666


array([1, 2, 1, 1, 1, 2, 2, 2, 3, 2, 1, 2, 3, 2, 3], dtype=int64)

(3, 25, 5)


In [None]:
def eval_model(eigenspec, test_im, n_test_species, n_test_pics):
    """Score the residual classifier for 1 to 119 retained components.

    Parameters
    ----------
    eigenspec : dict
        Output of ``eigendecomposition``.
    test_im : ndarray
        Test pictures in the layout expected by ``get_residuals``.
    n_test_species, n_test_pics : int
        Used to build the true labels with ``get_classes``.

    Returns
    -------
    (list, list)
        ``score[k]`` is the accuracy obtained with ``principal_comp[k]``
        components.
    """
    score = []
    principal_comp = []
    # The true labels do not depend on the number of components.
    true_classes = get_classes(n_test_species, n_test_pics)
    for i in np.arange(1, 120, 1):
        eigenspecies = get_eigenspecies(eigenspec, i)
        residuals = get_residuals(eigenspecies, test_im)
        if residuals.ndim == 3:
            # get_residuals returns [channel, picture, species]; collapse the
            # channel axis so argmin below picks one species per picture.
            residuals = np.sqrt(np.sum(residuals ** 2, axis=0))
        classification = residuals.argmin(axis=1) + 1
        score.append(accuracy_score(true_classes, classification))
        principal_comp.append(i)
        print(i," iteration complete.")
    return score, principal_comp
    

# Density Estimation -----------------------------------------------------------------------------------

In [21]:
# Peek at the data: print the first entry of the first species in
# train_pictures together with its shape.
# `count` stays at 1 (it is never incremented), so both loops stop after
# their first iteration.
count = 1
for spec, values in train_pictures.items():
    for a in values:
        john = a 
        print(john, john.shape)
        if count == 1:
            break
    if count == 1:
        break

[[180 186 198 ... 173 195 173]
 [ 37  39  40 ...  60  60  60]
 [238 238 234 ...  41  42  41]
 ...
 [ 74  77  83 ...  91 133 110]
 [ 43  44  44 ... 115 103  85]
 [ 91  97 103 ... 104  92  88]] (130, 6272)


# Traditional Classification --------------------------------------------------------------------------

In [59]:
def get_pics(catalog, path, species_index, pic_index, size=(98, 64)):
    """Load pictures as flattened colour rows, one matrix per species.

    Every image is read with ``cv2.imread`` using its default flags, resized
    with nearest-neighbour interpolation and flattened. No colour conversion
    is applied, so the channel order is whatever ``cv2.imread`` returns.

    Parameters
    ----------
    catalog : dict
        ``{species: [file_name, ...]}`` as produced by ``get_catalog``.
    path : str
        Root directory holding one folder per species (a trailing separator
        is optional).
    species_index : int
        Stop after this many species, counted from 1 in ``catalog``
        iteration order. A value that never matches loads every species.
    pic_index : int
        Number of pictures to load per species (the first ``pic_index``
        catalog entries).
    size : tuple of int, optional
        ``dsize`` handed to ``cv2.resize``; defaults to ``(98, 64)``.

    Returns
    -------
    dict
        ``{species: ndarray}`` with one flattened, resized picture per row.

    Raises
    ------
    OSError
        If ``cv2.imread`` cannot read a file (it returns ``None`` then).
    IndexError
        If a species has fewer than ``pic_index`` catalog entries.
    """
    new_catalog = {}
    for count, specie in enumerate(catalog, start=1):
        images = []
        for i in range(pic_index):
            image_path = os.path.join(path, specie, catalog[specie][i])
            image = cv2.imread(image_path)
            if image is None:
                # Fail here with the offending path instead of deep inside OpenCV.
                raise OSError("cv2.imread could not read image: {}".format(image_path))
            res = cv2.resize(image, dsize=size, interpolation=cv2.INTER_NEAREST)
            images.append(res.flatten())
        new_catalog[specie] = np.array(images)
        if count == species_index:
            break

    return new_catalog

In [60]:
def import_pics2(path, nspec, npic):
    """Build the catalog for ``path`` and load its pictures with ``get_pics``.

    ``nspec`` and ``npic`` are forwarded as the species limit and the number
    of pictures per species. The return value is whatever ``get_pics``
    returns.
    """
    return get_pics(get_catalog(path), path, nspec, npic)

In [67]:
# Reload the data for the traditional classifiers, this time through
# import_pics2 (one flattened colour row per picture) and with only 2
# training species.
# NOTE(review): this re-binds num_species_train and the *_pictures names
# assigned by the preprocessing cell further up; cells that read them will
# see these values if they are run after this one.
num_species_train, num_pics_train = 2, 130
num_species_val, num_pics_val = 3, 5
num_species_test, num_pics_test = 3, 5

#get pictures
train_pictures = import_pics2(train_file, num_species_train, num_pics_train)
valid_pictures = import_pics2(valid_file, num_species_val, num_pics_val)
test_pictures = import_pics2(test_file, num_species_test, num_pics_test)


In [70]:
# Sanity check: print the array shape of every species, then the whole dictionary.
for spec, value in train_pictures.items():
    print(value.shape)
print(train_pictures)

(130, 18816)
(130, 18816)
{'ABBOTTS BABBLER': array([[149, 172, 180, ..., 192, 171, 173],
       [ 33,  62,  37, ...,  78,  87,  60],
       [230, 238, 238, ...,  39,  41,  41],
       ...,
       [ 64,  86,  74, ...,  68, 125, 110],
       [ 22,  42,  43, ...,  88,  98,  85],
       [ 72, 132,  91, ...,  91, 128,  88]], dtype=uint8), 'ABBOTTS BOOBY': array([[231, 233, 234, ..., 131, 163, 176],
       [134, 135, 139, ..., 145, 142, 144],
       [ 53, 123, 117, ...,  22,  68,  56],
       ...,
       [ 55,  75,  92, ...,  45,  50,  59],
       [187, 184, 176, ..., 189, 181, 168],
       [167, 151, 139, ..., 149, 162, 148]], dtype=uint8)}


In [75]:
def get_Xy(pictures):
    """Stack all species into one feature matrix plus an integer label vector.

    Species are numbered 1, 2, ... in dictionary order; every row of a
    species' array receives that species' number. Returns ``(X, y)`` as
    NumPy arrays; both are empty arrays when ``pictures`` is empty.
    """
    features, targets = [], []
    for label, rows in enumerate(pictures.values(), start=1):
        row_labels = np.repeat(label, rows.shape[0])
        if len(features) == 0:
            # Nothing accumulated yet: start from this species.
            features, targets = rows, row_labels
        else:
            features = np.concatenate((features, rows), axis=0)
            targets = np.concatenate((targets, row_labels), axis=0)
    return np.array(features), np.array(targets)

In [35]:
def train_model(X, label):
    """Standardise ``X`` and grid-search a k-nearest-neighbours classifier.

    Parameters
    ----------
    X : ndarray
        Feature matrix with one sample per row.
    label : scalar or array-like
        Either one label per row of ``X`` or a single label that is applied
        to every row.

    Returns
    -------
    GridSearchCV
        The search fitted on the standardised features over
        ``n_neighbors`` in 3..8.

    Raises
    ------
    ValueError
        If ``label`` is array-like and its length differs from the number
        of rows in ``X``.
    """
    y = np.asarray(label)
    if y.ndim == 0:
        # A single label: broadcast it to every sample.
        y = np.repeat(y, X.shape[0])
    else:
        y = y.ravel()
    if y.shape[0] != X.shape[0]:
        raise ValueError(
            "label has {} entries but X has {} rows".format(y.shape[0], X.shape[0])
        )

    X_scaled = StandardScaler().fit_transform(X)
    print(X_scaled,X_scaled.shape)

    knn = KNeighborsClassifier()
    n_neighbors = {'n_neighbors':[3,4,5,6,7,8]}

    # TODO: LogisticRegression and SVC (kernel grid: 'linear', 'rbf') were
    # instantiated here but never fitted; wire them in when needed.

    # Fit on the standardised features, not the raw pixels.
    clf_knn = GridSearchCV(knn, n_neighbors).fit(X_scaled, y)
    return clf_knn
    

In [77]:
# Build the feature matrix and label vector from the training pictures and
# confirm there is one label per picture.
X, y = get_Xy(train_pictures)
print(y.shape)

(260,)


# Deep Learning -----------------------------------------------------------------------------------------

In [None]:
# Print the shape of every entry stored under each species in `pictures`.
# NOTE(review): no top-level `pictures` is assigned in the cells visible
# here (the name only appears as a function parameter) — confirm which
# catalog this should iterate, e.g. train_pictures.
for specie in pictures:
    for n in pictures[specie]:
        print(n.shape)