In [1]:
import os
import time

import cv2
import matplotlib.image as Image
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Data import & Pre-processing --------------------------------------------------------------------

In [2]:
# Gets paths for images 
def get_catalog(file):
    """Build a catalog of image file names, keyed by species folder.

    Parameters
    ----------
    file : str
        Directory that contains one sub-directory per species.

    Returns
    -------
    dict
        Maps each species (sub-directory name) to the sorted list of file
        names found directly inside that sub-directory.

    Raises
    ------
    FileNotFoundError
        If ``file`` is not an existing directory.
    """
    if not os.path.isdir(file):
        raise FileNotFoundError("Image directory not found: " + str(file))

    catalog = {}
    # Sorted so that species order (and therefore the numeric labels assigned
    # later from dict order) is the same for every split and every machine.
    for name in sorted(os.listdir(file)):
        species_dir = os.path.join(file, name)
        if not os.path.isdir(species_dir):
            continue
        # Each species is listed on its own, so nested folders inside a
        # species directory cannot shift files onto the wrong species.
        catalog[name] = sorted(
            entry for entry in os.listdir(species_dir)
            if os.path.isfile(os.path.join(species_dir, entry))
        )
    return catalog

In [None]:
def get_pics(catalog,path, species_index,pic_index):
    """Load, resize and flatten images for the leading species of a catalog.

    Parameters
    ----------
    catalog : dict
        Maps species name -> list of image file names.
    path : str
        Directory holding one sub-directory per species (a trailing slash is
        optional).
    species_index : int
        Number of species to load, taken in catalog iteration order. A value
        below 1 never matches the counter, so every species is loaded.
    pic_index : int
        Maximum number of images to load per species; species with fewer
        files contribute all the files they have.

    Returns
    -------
    dict
        Species name -> array with one flattened, resized image per row.

    Raises
    ------
    OSError
        If an image file cannot be read by ``cv2.imread``.
    """
    new_catalog = {}
    for count, specie in enumerate(catalog, start=1):
        images = []
        # Slicing caps the request at the number of files actually available.
        for file_name in catalog[specie][:max(pic_index, 0)]:
            image_path = os.path.join(path, specie, file_name)
            image = cv2.imread(image_path)
            # cv2.imread signals failure by returning None instead of raising.
            if image is None:
                raise OSError("Could not read image: " + image_path)
            res = cv2.resize(image, dsize=(98, 64), interpolation=cv2.INTER_NEAREST)
            images.append(res.flatten())
        new_catalog[specie] = np.array(images)
        if count == species_index:
            break

    return new_catalog

In [None]:
def import_pics2(path, nspec, npic):
    """Catalogue the images under ``path`` and load them.

    ``nspec`` and ``npic`` are forwarded to ``get_pics`` as its species and
    per-species picture limits. Returns whatever ``get_pics`` returns.
    """
    return get_pics(get_catalog(path), path, nspec, npic)

In [None]:
def get_Xy(pictures):
    """Stack per-species image arrays into one matrix with integer labels.

    Species are numbered 1, 2, 3, ... in the iteration order of ``pictures``;
    every row contributed by a species receives that species' number.

    Returns a ``(images, labels)`` pair of NumPy arrays.
    """
    stacked = []
    targets = []
    for number, block in enumerate(pictures.values(), start=1):
        tags = np.repeat(number, block.shape[0])
        if len(stacked) > 0:
            stacked = np.concatenate((stacked, block), axis=0)
            targets = np.concatenate((targets, tags), axis=0)
        else:
            # Nothing accumulated yet: this block becomes the starting point.
            stacked, targets = block, tags
    return np.array(stacked), np.array(targets)

In [None]:
# Data preprocessing driver: seed NumPy, then load the train/validation/test images.
seed = 2
# np.random.RandomState(seed) would only build a generator object and discard
# it; np.random.seed is what seeds NumPy's global random state.
np.random.seed(seed)

# Declare variables: one directory per split, plus how many species and how
# many pictures per species to load from each.
train_file, valid_file, test_file = 'data/birds/train/', 'data/birds/valid/', 'data/birds/test/'
num_species_train, num_pics_train = 5, 100
num_species_val, num_pics_val = 3, 5
num_species_test, num_pics_test = 5, 5

# Get pictures: each result maps species name -> array of flattened images.
train_pictures = import_pics2(train_file, num_species_train, num_pics_train)
valid_pictures = import_pics2(valid_file, num_species_val, num_pics_val)
test_pictures = import_pics2(test_file, num_species_test, num_pics_test)

# Dimensionality Reduction -------------------------------------------------------------------------

In [None]:
def eigendecomposition(images):
    """Run a thin SVD on each species' mean-centred, 255-scaled image matrix.

    ``images`` maps species -> 2-D array with one flattened image per row.
    The result maps species -> ``[U, S, mean]`` where ``U`` and ``S`` are the
    left singular vectors and singular values of the transposed, centred
    matrix, and ``mean`` is the per-pixel mean of the unscaled images.
    """
    decomposed = {}
    for name, pixels in images.items():
        pixel_mean = np.mean(pixels, axis=0)
        centred = (pixels - pixel_mean) / 255
        # Columns of the transposed matrix are images, so U spans pixel space.
        left, singular, _ = np.linalg.svd(centred.T, full_matrices=False)
        decomposed[name] = [left, singular, pixel_mean]
    print("Done eigendecomposing.")
    return decomposed

In [None]:
def get_eigenspecies(eigen,n_comp):
    """Keep the first ``n_comp`` columns of each species' ``U`` matrix.

    ``eigen`` maps species -> ``[U, S, mean]``. The result maps species ->
    ``(U[:, :n_comp], mean)``.
    """
    truncated = {
        name: (parts[0][:, :n_comp], parts[2])
        for name, parts in eigen.items()
    }
    print("Done getting eigenspecies.")
    return truncated

In [None]:
def get_residuals(eigenspecies,test):
    """Squared reconstruction error of each test image in each species' subspace.

    Parameters
    ----------
    eigenspecies : dict
        Maps species -> ``(basis, mean)``; ``basis`` has one basis vector per
        column and ``mean`` is the unscaled per-pixel mean of that species.
    test : ndarray
        One flattened image per row, on the same 0-255 scale as ``mean``.

    Returns
    -------
    ndarray
        Entry ``[i, j]`` is the squared norm of what remains of image ``i``
        (scaled by 1/255 and centred on species ``j``'s mean) after projecting
        it onto species ``j``'s basis. Columns follow ``eigenspecies`` order.
    """
    test = test / 255
    # The scaled mean does not depend on the image, so compute it once per species.
    bases = [(metrics[0], metrics[1] / 255) for metrics in eigenspecies.values()]
    matrix_residual = []
    for image in test:
        per_species_residual = []
        for basis, scaled_mean in bases:
            pre_image = image - scaled_mean
            # Parenthesised so only matrix-vector products are formed;
            # `basis @ basis.T @ v` would first build a pixels-by-pixels matrix.
            projection = basis @ (basis.T @ pre_image)
            per_species_residual.append(np.linalg.norm(pre_image - projection)**2)
        matrix_residual.append(per_species_residual)
    print("Done getting residuals.")
    return np.array(matrix_residual)

In [None]:
# Eigen-species pipeline: decompose the training images per species, keep the
# leading components, and measure each test image's residual per species.
n_eigen_components = 90  # number of leading eigenvectors kept per species

# perf_counter is a monotonic clock meant for measuring elapsed intervals.
start = time.perf_counter()
eigenvectors = eigendecomposition(train_pictures)
eigenspecies = get_eigenspecies(eigenvectors, n_eigen_components)
X_test, y_test = get_Xy(test_pictures)
residuals = get_residuals(eigenspecies,X_test)
end = time.perf_counter()
print(end-start)

In [None]:
# Predict, for every test image, the species column with the smallest residual.
# The +1 matches get_Xy, which numbers labels starting from 1.
# NOTE(review): this assumes train and test species are iterated in the same order.
classification = np.argmin(residuals, axis=1) + 1
print(accuracy_score(y_test, classification))

# Density Estimation -----------------------------------------------------------------------------------

In [None]:
def dim_reduction(X):
    """Standardise ``X`` and project it onto its principal components.

    Parameters
    ----------
    X : ndarray
        One sample per row.

    Returns
    -------
    new_X : ndarray
        ``X`` after standardisation and PCA projection.
    reducer : sklearn Pipeline
        The fitted ``StandardScaler`` followed by the fitted ``PCA``. Its
        ``transform`` applies both steps, so new data is scaled with the
        training statistics before projection. The PCA step itself is
        available as ``reducer[-1]``.
    """
    # Scaler and PCA are kept together: returning the PCA alone would let
    # callers project data that never went through the fitted scaler.
    reducer = make_pipeline(StandardScaler(), PCA(random_state=seed)).fit(X)
    new_X = reducer.transform(X)
    print(X.shape)
    print(new_X.shape)
    print("Dim reduction ready.")
    return new_X, reducer

In [None]:
def retrieve_info(cluster_labels,y_train):
    """Map every cluster id to the most frequent true label among its members.

    Parameters
    ----------
    cluster_labels : array-like of int
        Cluster id assigned to each sample.
    y_train : array-like of non-negative int
        True label of each sample, aligned with ``cluster_labels``.

    Returns
    -------
    dict
        Cluster id (Python int) -> majority label of that cluster. On ties
        the smallest label wins, because ``argmax`` returns the first maximum.
    """
    cluster_labels = np.asarray(cluster_labels)
    y_train = np.asarray(y_train)
    reference_labels = {}

    # Iterate over the ids that actually occur: ids are not guaranteed to be
    # the contiguous range 0..k-1, and an absent id would give an empty bincount.
    for cluster_id in np.unique(cluster_labels):
        members = y_train[cluster_labels == cluster_id]
        reference_labels[int(cluster_id)] = np.bincount(members).argmax()
    return reference_labels

In [None]:
# Debug inspection of the cluster -> label mapping for cluster 0.
# NOTE: `km_model` and `y` are only created by cells further down in this
# notebook, so those cells must have been run before this one.
cluster0_mask = km_model.labels_ == 0
print(km_model.labels_)
print(np.where(cluster0_mask,1,0))
print(y)
# Index with the boolean mask: a 0/1 integer array would be treated as
# positions and return only copies of y[0] and y[1].
print(y[cluster0_mask])
print(np.bincount(y[cluster0_mask]).argmax())
print(retrieve_info(km_model.labels_,y))

In [None]:
# Build feature/label arrays for the train and test splits, fit the
# dimensionality reduction on the training data, then apply it to the test data.
(X_train, y_train), (X_test, y_test) = get_Xy(train_pictures), get_Xy(test_pictures)

print(X_test.shape,X_train.shape)
X_pctrain, dim_reductor = dim_reduction(X_train)
X_pctest = dim_reductor.transform(X_test)

In [None]:
# Shape of the test set after applying the fitted reducer's transform.
print(X_pctest.shape)

In [None]:
# Sweep the number of K-Means clusters and score how well each cluster's
# majority label reproduces the training labels.
# Uses X_pctrain / y_train from the cell above, so the notebook runs top to
# bottom without relying on `X` / `y` from a later cell.
accuracy = []
clusters = []
for j in np.arange(2,500,10):
    km_model = KMeans(n_clusters=j,random_state=seed).fit(X_pctrain)
    dic_labels = retrieve_info(km_model.labels_,y_train)
    actual_labels = [dic_labels[i] for i in km_model.labels_]
    # accuracy_score expects (y_true, y_pred).
    accuracy.append(accuracy_score(y_train, actual_labels))
    clusters.append(j)
    print("Iteration " + str(j) + " complete")
plt.plot(clusters,accuracy)
plt.xlabel("Number of clusters")
plt.ylabel("Training accuracy")
plt.show()

# Traditional Classification --------------------------------------------------------------------------

In [None]:
def dim_reduction(X):
    """Standardise ``X`` and return its projection onto all principal components.

    Note: this re-definition shadows the earlier ``dim_reduction`` in this
    notebook and, unlike that one, returns only the transformed data.
    """
    scaled = StandardScaler().fit_transform(X)
    reducer = PCA(random_state=seed)
    reducer.fit(scaled)
    projected = reducer.transform(scaled)
    print("Dim reduction ready.")
    return projected

In [None]:
def train_model(X, label):
    """Cross-validate KNN, SVM and logistic regression on ``X`` / ``label``.

    KNN and SVM are tuned with ``GridSearchCV``; logistic regression is scored
    with 5-fold ``cross_val_score``. Results are printed in this order: best
    KNN score, best SVM score, mean logistic-regression score. Returns None.
    """
    knn = KNeighborsClassifier()
    n_neighbors = {'n_neighbors': list(range(1, 31))}

    lr = LogisticRegression(max_iter=1000,random_state=seed)

    svm = SVC()
    c_values = [0.1, 1, 10, 100]
    # Two sub-grids: gamma has no effect on the linear kernel, so crossing it
    # with 'linear' would only repeat identical fits.
    svm_grid = [
        {'kernel': ['linear'], 'C': c_values},
        {'kernel': ['rbf'], 'C': c_values, 'gamma': [1, 0.1, 0.01, 0.001]},
    ]

    clf_svm = GridSearchCV(svm,svm_grid).fit(X,label)
    print("SVM done.")
    clf_knn = GridSearchCV(knn,n_neighbors).fit(X,label)
    print("KNN done")

    print(clf_knn.best_score_)
    print(clf_svm.best_score_)
    print(np.mean(cross_val_score(lr,X,label,cv=5)))
    print("LR done")
    

In [None]:
def build_model(X, label):
    """Placeholder for model construction; does nothing and returns None."""
    return None
    

In [None]:
# Traditional classifiers: flatten the training catalog into X / y, reduce
# dimensionality, then compare the models.
X_raw, y = get_Xy(train_pictures)
X = dim_reduction(X_raw)
train_model(X, y)

# Deep Learning -----------------------------------------------------------------------------------------

In [None]:
# No notebook-level `pictures` variable exists, so bind it explicitly here.
# NOTE(review): the training catalog is assumed to be the intended input;
# swap in valid_pictures / test_pictures if another split was meant.
pictures = train_pictures
for specie in pictures:
    for n in pictures[specie]:
        print(n.shape)