In [16]:
from scipy.io import loadmat
import json
import numpy as np
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from scipy.spatial import distance
from sklearn.preprocessing import StandardScaler
import random
from metric_learn import LMNN
from sklearn.cluster import KMeans

In [17]:
def kNN(k,query,qLabels,qID,gallery,gLabels,gID,metric):
    """Rank-k retrieval accuracy of a nearest-neighbour search over a gallery.

    For every query row, gallery rows that share BOTH the query's label and
    the query's camera ID are discarded. The remaining gallery rows are ranked
    by their distance to the query. A query is a hit at rank ``k`` when at
    least one of its ``k`` closest remaining gallery rows has the query's label.

    Parameters
    ----------
    k : int or sequence of int
        Rank(s) to evaluate. Any list, tuple or 1-D array is accepted.
    query : array-like, shape (n_query, n_features)
    qLabels, qID : array-like, shape (n_query,)
        Label and camera ID of each query row.
    gallery : array-like, shape (n_gallery, n_features)
    gLabels, gID : array-like, shape (n_gallery,)
        Label and camera ID of each gallery row.
    metric : str or callable
        Forwarded unchanged to ``scipy.spatial.distance.cdist``.

    Returns
    -------
    float or list of float
        Fraction of queries that are hits; a list (one value per rank, in the
        given order) when ``k`` is a sequence. ``0.0`` when there are no queries.
    """
    # Coerce once so plain Python lists work with the boolean masks below.
    query = np.asarray(query)
    gallery = np.asarray(gallery)
    qLabels, qID = np.asarray(qLabels), np.asarray(qID)
    gLabels, gID = np.asarray(gLabels), np.asarray(gID)
    n_query = len(query)

    # Per query: indices of the gallery rows that may be used (vectorised mask
    # instead of a Python-level scan of the whole gallery).
    usable = []
    for i in range(n_query):
        same_identity_and_camera = (gLabels == qLabels[i]) & (gID == qID[i])
        usable.append(np.flatnonzero(~same_identity_and_camera))
    print('G')

    # Per query: labels of the usable gallery rows, ordered nearest first.
    ranked_labels = []
    for i in range(n_query):
        dist = distance.cdist(np.reshape(query[i],(1,-1)),gallery[usable[i]],metric = metric)
        order = np.argsort(dist)[0]
        ranked_labels.append(gLabels[usable[i]][order])
    print('sorted_idx')

    def accuracy(rank):
        # Guard against division by zero when no query rows were supplied.
        if n_query == 0:
            return 0.0
        hits = 0
        for i in range(n_query):
            if qLabels[i] in ranked_labels[i][:rank]:
                hits += 1
        return hits/n_query
    print('NN')

    # np.ndim is 0 for scalars and >0 for lists, tuples and arrays alike.
    if np.ndim(k) > 0:
        acc = [accuracy(n) for n in k]
    else:
        acc = accuracy(k)

    return acc

# Marker printed after the kNN definition above, confirming the cell ran to the end.
print('working')

working


In [18]:
# Read the CUHK03 protocol configuration and flatten each field to a 1-D array.
data = loadmat('PR_data/cuhk03_new_protocol_config_labeled.mat')
camID, filelist, gallery_idx, labels, query_idx, train_idx = (
    data[key].flatten()
    for key in ('camId', 'filelist', 'gallery_idx', 'labels', 'query_idx', 'train_idx')
)

# Report the shape of every field, in the same order as they were loaded.
for tag, arr in (
    ('camID:', camID),
    ('filelist:', filelist),
    ('gallery_idx:', gallery_idx),
    ('labels:', labels),
    ('query_idx:', query_idx),
    ('train_idx:', train_idx),
):
    print(tag, arr.shape)

camID: (14096,)
filelist: (14096,)
gallery_idx: (5328,)
labels: (14096,)
query_idx: (1400,)
train_idx: (7368,)


In [19]:
# Prefer the .mat feature file; when it is absent, fall back to the JSON dump.
try:
    features = loadmat('PR_data/features.mat')['features']
except FileNotFoundError:
    print('exception handling')
    with open('PR_data/feature_data.json','r') as f:
        # Each row of the resulting array is one feature vector (data instance).
        features = np.asarray(json.load(f))
print(features.shape)

(14096, 2048)


In [20]:
# Every index array is shifted down by one before being used as a row position
# (presumably because the .mat file stores 1-based indices -- TODO confirm).
# The scaler is fitted on the training rows only and then applied to all splits.
scalar = StandardScaler().fit(features[train_idx - 1])
train, query, gallery = (
    scalar.transform(features[idx - 1]) for idx in (train_idx, query_idx, gallery_idx)
)

# Labels and camera IDs for the same three splits.
tLabels, qLabels, gLabels = (labels[idx - 1] for idx in (train_idx, query_idx, gallery_idx))
tID, qID, gID = (camID[idx - 1] for idx in (train_idx, query_idx, gallery_idx))

# Number of distinct labels in the training split.
c = len(np.unique(tLabels)) #767
print(c)

# Number of distinct labels in the gallery split (and, for comparison, the query split).
num_clusters = len(np.unique(gLabels))
print(num_clusters, len(np.unique(qLabels)))
print(len(train)-c) # 6601

767
700 700
6601


In [21]:
class Datapoint:
    """One sample: its feature vector, its label and the camera it came from."""

    def __init__(self, features, label, cam_id):
        # Plain attribute container; the values are stored as given.
        self.features, self.label, self.cam_id = features, label, cam_id

class Dataset(object):
    """Ordered collection of datapoints with column-wise array accessors.

    Each stored item is expected to expose ``features``, ``label`` and
    ``cam_id`` attributes.
    """

    def __init__(self, datapoint_list=None):
        """Wrap ``datapoint_list`` (stored by reference); default is a new empty list.

        The argument is tested against ``None`` rather than for truthiness so
        that an empty list supplied by the caller is kept (not swapped for a
        fresh one) and array-like containers, whose truth value is ambiguous,
        are accepted.
        """
        if datapoint_list is not None:
            self.datapoints = datapoint_list
        else:
            self.datapoints = []

    def features_array(self):
        """Return the ``features`` of every datapoint stacked into one numpy array."""
        return np.array([datapoint.features for datapoint in self.datapoints])

    def labels(self):
        """Return the ``label`` of every datapoint as a numpy array."""
        return np.array([datapoint.label for datapoint in self.datapoints])

    def cam_ids(self):
        """Return the ``cam_id`` of every datapoint as a numpy array."""
        return np.array([datapoint.cam_id for datapoint in self.datapoints])

# Wrap each split (features, labels, camera IDs) in a Dataset of Datapoint objects.
training_dataset = Dataset(
    [Datapoint(row, tLabels[i], tID[i]) for i, row in enumerate(train)]
)
query_dataset = Dataset(
    [Datapoint(row, qLabels[i], qID[i]) for i, row in enumerate(query)]
)
gallery_dataset = Dataset(
    [Datapoint(row, gLabels[i], gID[i]) for i, row in enumerate(gallery)]
)

In [22]:
def fit_model(model, training_dataset, query_dataset, gallery_dataset):
    """Fit ``model`` on the training split and project all three splits with it.

    ``model`` must provide ``fit(X, y)`` (returning the fitted object) and
    ``transform(X)``. Each dataset must provide ``features_array()``; the
    training dataset must also provide ``labels()``.

    Returns the tuple ``(fitted_model, train_proj, query_proj, gallery_proj)``.
    """
    fitted = model.fit(training_dataset.features_array(), training_dataset.labels())

    # Project the splits in the order train -> query -> gallery.
    omega_train, omega_query, omega_gallery = (
        fitted.transform(split.features_array())
        for split in (training_dataset, query_dataset, gallery_dataset)
    )

    return fitted, omega_train, omega_query, omega_gallery

In [29]:

import sklearn.utils.linear_assignment_ as la
from sklearn.metrics import accuracy_score
from sklearn.metrics.cluster import normalized_mutual_info_score
 
def best_map(l1, l2): #returns new cluster labels
    """
    Permute labels of l2 to match l1 as much as possible

    ``l1`` holds the reference (data) labels and ``l2`` the cluster labels.
    Clusters are matched one-to-one to reference labels so that the total
    number of samples on which they agree is maximal (Hungarian assignment).

    Parameters
    ----------
    l1, l2 : array-like of numeric labels, same length

    Returns
    -------
    numpy int array, same length as ``l2``
        ``l2`` with every matched cluster renamed to its reference label.
        If there are more clusters than reference labels, the clusters left
        without a partner receive ``min(l1) - 1``, a value that never occurs
        in ``l1``.

    Raises
    ------
    ValueError
        If ``l1`` and ``l2`` differ in length.

    Notes
    -----
    The assignment is solved with ``scipy.optimize.linear_sum_assignment``
    because ``sklearn.utils.linear_assignment_`` was removed in
    scikit-learn 0.23.
    """
    from scipy.optimize import linear_sum_assignment

    l1 = np.asarray(l1).ravel()
    l2 = np.asarray(l2).ravel()
    if len(l1) != len(l2):
        # Raise instead of exit(0): exiting would stop the whole kernel and
        # report success.
        raise ValueError("L1.shape must == L2.shape")
    if len(l1) == 0:
        return np.zeros(0, dtype=int)

    # inv1[n] / inv2[n] are the positions of sample n's labels in label1 / label2.
    label1, inv1 = np.unique(l1, return_inverse=True)
    label2, inv2 = np.unique(l2, return_inverse=True)

    # Contingency table: overlap[i, j] = #samples with label1[i] and label2[j],
    # accumulated in a single pass over the samples.
    overlap = np.zeros((len(label1), len(label2)))
    np.add.at(overlap, (inv1, inv2), 1)

    # Negate to turn the cost minimiser into an overlap maximiser. Rectangular
    # tables are supported; min(#labels, #clusters) pairs are returned.
    rows, cols = linear_sum_assignment(-overlap)

    # Start from a sentinel that cannot collide with a real reference label.
    unmatched = float(label1.min()) - 1.0
    new_l2 = np.full(l2.shape, unmatched, dtype=float)
    for r, c in zip(rows, cols):
        new_l2[inv2 == c] = label1[r]
    return new_l2.astype(int)
 
 
def evaluation(X_selected, n_clusters, y, random_state=None):
    """
    Cluster the data with K-means and score the clustering with NMI and ACC.

    Input
    -----
    X_selected: {numpy array}, shape (n_samples, n_selected_features)
            input data on the selected features
    n_clusters: {int}
            number of clusters
    y: {numpy array}, shape (n_samples,)
            true labels
    random_state: {int, RandomState instance or None}, optional
            seed forwarded to KMeans; None (the default) keeps the original
            unseeded behaviour, an int makes the result reproducible

    Output
    ------
    nmi: {float}
        Normalized Mutual Information
    acc: {float}
        Accuracy after mapping cluster ids onto true labels with best_map
    """
    # `precompute_distances` and `n_jobs` are no longer passed: both were
    # deprecated in scikit-learn 0.23 and removed in 1.0, where passing them
    # raises TypeError. They only affected how the computation was carried
    # out, not the clustering result.
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300,
                     tol=0.0001, verbose=0, random_state=random_state, copy_x=True)

    k_means.fit(X_selected)
    y_predict = k_means.labels_ #original cluster labels

    print(len(k_means.labels_))

    # calculate NMI (invariant to how the clusters are numbered)
    nmi = normalized_mutual_info_score(y, y_predict)

    # calculate ACC: cluster ids are arbitrary, so rename them to the
    # best-matching true labels before comparing
    y_permuted_predict = best_map(y, y_predict) # new cluster labels
    acc = accuracy_score(y, y_permuted_predict)

    return nmi, acc

In [27]:
# Show the cluster count passed to the K-means calls in the cells that follow.
print(num_clusters)

700


In [31]:
# Score K-means on the gallery features with num_clusters clusters, using the
# gallery labels as ground truth. evaluation() prints the number of clustered samples.
nmi,acc = evaluation(gallery_dataset.features_array(),num_clusters,gallery_dataset.labels())

5328


In [32]:
# Show the normalized mutual information and clustering accuracy from evaluation().
print(nmi,acc)

0.9162185277915547 0.6739864864864865


In [28]:
# Seeded K-means over the gallery features; fit() returns the estimator itself,
# so fitting in a separate statement leaves `kmeans` bound to the same fitted object.
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
kmeans.fit(gallery_dataset.features_array())
print(len(kmeans.labels_))

5328


In [32]:
# Rank-k retrieval accuracy (Euclidean distance) for several values of k.
# NOTE(review): omega_query / omega_gallery are not assigned in any visible cell.
# Presumably they come from fit_model(...) with a PCA model (the 'pca:' tag below
# suggests so) in a cell that is not shown -- confirm, otherwise this cell fails
# on Restart & Run All.
# NOTE(review): this rebinds `acc`, which an earlier cell used for the clustering accuracy.
acc = kNN(k = [1,2,3,5,10,20,50,75,100],query=omega_query,qLabels=query_dataset.labels(),qID=qID,gallery=omega_gallery,
          gLabels=gLabels,gID=gID,metric='euclidean')
print('pca:',acc)

G
sorted_idx
NN
pca: [0.43, 0.5157142857142857, 0.5685714285714286, 0.64, 0.7192857142857143, 0.7892857142857143, 0.8735714285714286, 0.8992857142857142, 0.9157142857142857]
