In [1]:
from scipy.io import loadmat
import json
import numpy as np
import time
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from scipy.spatial import distance
from sklearn.preprocessing import StandardScaler
import random
from metric_learn import LMNN

from sklearn.cluster import KMeans
import sklearn.utils.linear_assignment_ as la
from sklearn.metrics import accuracy_score
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics.cluster import homogeneity_score
from sklearn.metrics.cluster import completeness_score
from collections import Counter

In [2]:
# Read the CUHK03 protocol file and flatten each field of interest to 1-D.
data = loadmat('PR_data/cuhk03_new_protocol_config_labeled.mat')
camID, filelist, gallery_idx, labels, query_idx, train_idx = (
    data[key].flatten()
    for key in ('camId', 'filelist', 'gallery_idx', 'labels', 'query_idx', 'train_idx')
)

# One "<name>: <shape>" line per field, in the same order as they were loaded.
print('\n'.join(
    '{} {}'.format(tag, arr.shape)
    for tag, arr in (
        ('camID:', camID),
        ('filelist:', filelist),
        ('gallery_idx:', gallery_idx),
        ('labels:', labels),
        ('query_idx:', query_idx),
        ('train_idx:', train_idx),
    )
))

camID: (14096,)
filelist: (14096,)
gallery_idx: (5328,)
labels: (14096,)
query_idx: (1400,)
train_idx: (7368,)


In [3]:
# Prefer the .mat copy of the feature matrix; fall back to the JSON dump
# when the .mat file is not present.
try:
    features = loadmat('PR_data/features.mat')['features']
except FileNotFoundError:
    print('exception handling')
    with open('PR_data/feature_data.json', 'r') as f:
        # The JSON holds a nested list; convert it to an array with one row
        # per data instance.
        features = np.asarray(json.load(f))
print(features.shape)

(14096, 2048)


In [4]:
# The *_idx arrays are shifted by one before indexing (presumably they are
# 1-based MATLAB indices -- confirm against the dataset description).

# Standardisation statistics come from the training split only and are then
# applied unchanged to the query and gallery splits.
scalar = StandardScaler().fit(features[train_idx - 1])

train = scalar.transform(features[train_idx - 1])
query = scalar.transform(features[query_idx - 1])
gallery = scalar.transform(features[gallery_idx - 1])

# Labels for each split.
tLabels, qLabels, gLabels = (
    labels[train_idx - 1],
    labels[query_idx - 1],
    labels[gallery_idx - 1],
)

# Camera ids for each split.
tID, qID, gID = (
    camID[train_idx - 1],
    camID[query_idx - 1],
    camID[gallery_idx - 1],
)

# Number of distinct labels in the training split.
c = np.unique(tLabels).size
print(c)

# Number of distinct labels in the gallery; used later as the k-means k.
num_clusters = np.unique(gLabels).size
print(num_clusters, np.unique(qLabels).size)
print(train.shape[0] - c)

767
700 700
6601


In [5]:
class Datapoint:
    """Bundle one feature vector with its label and its camera id."""

    def __init__(self, features, label, cam_id):
        self.features, self.label, self.cam_id = features, label, cam_id

class Dataset:
    """An ordered collection of datapoints with array-valued accessors."""

    def __init__(self, datapoint_list=None):
        # A missing or empty argument yields a fresh, empty list.
        self.datapoints = datapoint_list or []

    def _column(self, attr):
        """Stack the attribute ``attr`` of every datapoint into a numpy array."""
        return np.array([getattr(point, attr) for point in self.datapoints])

    def features_array(self):
        """Return the ``features`` of all datapoints as one array."""
        return self._column('features')

    def labels(self):
        """Return the ``label`` of all datapoints as one array."""
        return self._column('label')

    def cam_ids(self):
        """Return the ``cam_id`` of all datapoints as one array."""
        return self._column('cam_id')

# Wrap each split as a Dataset, pairing every feature row with its label and
# camera id (the three arrays of a split are indexed in lockstep).
training_dataset = Dataset([Datapoint(*sample) for sample in zip(train, tLabels, tID)])
query_dataset = Dataset([Datapoint(*sample) for sample in zip(query, qLabels, qID)])
gallery_dataset = Dataset([Datapoint(*sample) for sample in zip(gallery, gLabels, gID)])

In [6]:
def fit_model(model, training_dataset, query_dataset, gallery_dataset):
    """Fit ``model`` on the training split and project all three splits.

    ``model`` must expose ``fit(X, y)`` (returning the fitted estimator) and
    ``transform(X)``; each dataset must expose ``features_array()`` and the
    training dataset additionally ``labels()``.

    Returns the tuple ``(fitted_model, train_proj, query_proj, gallery_proj)``.
    """
    fitted = model.fit(training_dataset.features_array(), training_dataset.labels())
    projections = [
        fitted.transform(split.features_array())
        for split in (training_dataset, query_dataset, gallery_dataset)
    ]
    return (fitted, *projections)

In [7]:
def best_map(l1, l2): #returns new cluster labels
    """
    Permute labels of l2 to match l1 as much as possible.

    A class-by-cluster contingency table is built and an optimal one-to-one
    assignment (Hungarian method) between clusters of ``l2`` and classes of
    ``l1`` is chosen so that the number of agreeing samples is maximal.

    Parameters
    ----------
    l1 : array-like of int
        Reference (ground-truth) labels.
    l2 : array-like
        Cluster labels to be renamed; must have the same length as ``l1``.

    Returns
    -------
    numpy.ndarray of int, same shape as ``l2``
        ``l2`` with every cluster id replaced by the class label it was
        matched to.  When there are more clusters than classes, the surplus
        clusters have no class to map to and receive a negative sentinel that
        is strictly smaller than every value in ``l1`` (``-1`` for
        non-negative labels), so they never count as correct.

    Raises
    ------
    ValueError
        If ``l1`` and ``l2`` differ in length.
    """
    # Local import: scipy's solver replaces the removed
    # sklearn.utils.linear_assignment_ helper.
    from scipy.optimize import linear_sum_assignment

    #in this case label2 are the cluster labels, label1 are the data labels
    l1 = np.asarray(l1)
    l2 = np.asarray(l2)
    if len(l1) != len(l2):
        # Raise instead of exit(0): exiting would kill the kernel and report success.
        raise ValueError("L1.shape must == L2.shape")
    if l1.size == 0:
        return np.zeros(l2.shape).astype(int)

    label1, inv1 = np.unique(l1, return_inverse=True)
    n_class1 = len(label1)

    label2, inv2 = np.unique(l2, return_inverse=True)
    n_class2 = len(label2)

    # Square matrix so the assignment is well defined even when the number of
    # clusters and classes differ; the extra rows/columns are zero padding.
    n_class = max(n_class1, n_class2)
    G = np.zeros((n_class, n_class))

    # G[i, j] = number of samples with class label1[i] in cluster label2[j],
    # accumulated in a single pass over the samples.
    np.add.at(G, (inv1.reshape(-1), inv2.reshape(-1)), 1)

    # The solver minimises cost, so negate G to maximise agreement.
    rows, cols = linear_sum_assignment(-G)
    A = np.column_stack((rows, cols))
    print('A',A.shape,'\n,')
    print(A)

    new_l2 = np.zeros(l2.shape)
    for row, col in A:
        if col >= n_class2:
            # Padding column: no real cluster behind it.
            continue
        if row >= n_class1:
            # Real cluster matched to a padding row: no class is left for it.
            target = min(int(label1.min()), 0) - 1
        else:
            target = label1[row]
        new_l2[l2 == label2[col]] = target
    return new_l2.astype(int)
 
 
def evaluation(X_selected,n_clusters,y):
    """
    Cluster ``X_selected`` with k-means and score the clustering against ``y``.

    Homogeneity and completeness are printed; NMI and accuracy (after the
    optimal cluster-to-class relabelling done by ``best_map``) are returned.

    Input
    -----
    X_selected: {numpy array}, shape (n_samples, n_selected_features}
            input data on the selected features
    n_clusters: {int}
            number of clusters
    y: {numpy array}, shape (n_samples,)
            true labels

    Output
    ------
    nmi: {float}
        Normalized Mutual Information
    acc: {float}
        Accuracy of the relabelled cluster assignment against ``y``
    centers: {numpy array}, shape (n_clusters, n_selected_features)
        k-means cluster centres
    center_labels: {numpy array}
        class label assigned to each cluster, ordered by cluster id
    y_permuted_predict: {numpy array}, shape (n_samples,)
        per-sample cluster assignment expressed in class labels
    """
    # `precompute_distances` and `n_jobs` are intentionally not passed: they
    # were removed from KMeans in newer scikit-learn releases and only
    # affected speed, not the resulting clustering.
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300,
                     tol=0.0001, verbose=0,
                     random_state=0, copy_x=True)

    k_means.fit(X_selected)
    y_predict = k_means.labels_ #original cluster labels assigned to the gallery_features
    print('y_predict',len(y_predict),len(np.unique(y_predict)))

    # calculate NMI
    nmi = normalized_mutual_info_score(y, y_predict) # y= gallery_labels , y_predict = cluster labels
    h_score = homogeneity_score(y, y_predict) #each cluster only contains members of a single class
    c_score = completeness_score(y, y_predict) #all members of a given class are assigned to the same cluster
    print('homogenity:',h_score,'completeness:',c_score)

    # calculate ACC
    y_permuted_predict = best_map(y, y_predict) # y_permuted_predict = new cluster labels
    print('y_permuted_predict',len(y_permuted_predict),len(np.unique(y_permuted_predict)))
    acc = accuracy_score(y, y_permuted_predict) # fraction of samples whose relabelled cluster equals the true label

    # want to remap clustercentre labels
    centers = k_means.cluster_centers_

    # First sample of every cluster id (ids come back sorted) gives that
    # cluster's class label.  This lines up with the rows of `centers` as long
    # as every cluster has at least one member.
    u, idx = np.unique(y_predict, return_index=True)
    center_labels = y_permuted_predict[idx]
    print(len(center_labels), len(np.unique(center_labels)))

    return nmi, acc, centers, center_labels,y_permuted_predict

In [8]:
# Cluster the gallery features (k = number of gallery identities) and time it.
start = time.time()
nmi, acc, centers, center_labels, gallery_clustered = evaluation(
    X_selected=gallery_dataset.features_array(),
    n_clusters=num_clusters,
    y=gallery_dataset.labels(),
)
end = time.time()
print('time:', end - start)

y_predict 5328 700
homogenity: 0.9132354526702438 completeness: 0.9238192959588092
A (700, 2) 
,
[[  0 582]
 [  1 131]
 [  2 554]
 ...
 [697 196]
 [698 174]
 [699  60]]
y_permuted_predict 5328 700
700 700
time: 185.5173168182373


In [9]:
# Euclidean distance from every query feature vector to every cluster centre.
Dist = distance.cdist(
    query_dataset.features_array(),
    centers,
    metric='euclidean',
)
print(Dist.shape)

# For each query (row), centre indices ordered from nearest to farthest.
sorted_idx = np.argsort(Dist, axis=-1)
print('sorted_idx', sorted_idx.shape, sorted_idx)

(1400, 700)
sorted_idx (1400, 700) [[661 293 135 ... 173 681 550]
 [582 293 661 ... 550 486 396]
 [131 621 102 ... 154 678 562]
 ...
 [588 174 453 ... 494 681 638]
 [ 60 120 350 ... 347 638 681]
 [ 60  76 216 ... 638 347 349]]


In [11]:
# Rank-k accuracy of assigning each query to its k nearest cluster centres.
k = 1
NN = sorted_idx[:, :k]  # indices of the k closest centres for every query
print(NN.shape, NN)
print(center_labels[NN[0]])

# Fetch the labels once; the Dataset accessor rebuilds the array on every call.
query_labels = query_dataset.labels()

# A query is a hit when its own label appears among the class labels of its
# k nearest centres.  (Counted without a variable named `sum`, which would
# shadow the builtin for every later cell.)
is_hit = (center_labels[NN] == query_labels[:, None]).any(axis=1)

# NOTE: this rebinds `acc`, which previously held the clustering accuracy.
acc = int(is_hit.sum()) / len(query_labels)
print('acc', acc*100)
print('NN')

(1400, 1) [[661]
 [582]
 [131]
 ...
 [588]
 [ 60]
 [ 60]]
[1426]
acc 68.42857142857143
NN
