In [1]:
%cd '/scratch/sk7898/deep_radar'
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import os
import numpy as np
import seaborn as sns
import keras
import matplotlib.pyplot as plt
from keras.models import Model, load_model
from keras.layers import LSTM, Dense, Flatten, Dropout, BatchNormalization
from sklearn.cluster import KMeans, SpectralClustering
from sklearn import metrics
from data import get_fft_data
from clustering_utils import *
from sklearn.metrics import mean_squared_error

/scratch/sk7898/deep_radar


Using TensorFlow backend.


In [2]:
def X_kmeans(X, y, n_clusters, n_classes=4):
        
    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=n_classes)
    kmeans.fit(X)
        
    cluster_labels = kmeans.predict(X)        
    cluster_cents = kmeans.cluster_centers_
    cluster_modes = get_cluster_mode(y, cluster_labels, n_clusters=n_clusters)
    
    return kmeans, cluster_labels, cluster_cents, cluster_modes

In [3]:
def get_clust_dist_matrix(cluster_cents, cluster_modes, n_clusters):
        
    if len(cluster_modes.shape) == 1:
        cluster_modes = cluster_modes.reshape(cluster_modes.shape[0], 1)
        
    weights = 1 + np.abs(np.subtract(cluster_modes, cluster_modes.T))
    clust_dists = weights * metrics.pairwise.euclidean_distances(cluster_cents)
        
    return clust_dists

def get_homogeneity_score(labels, threshold):
    label_idx = np.unique(labels, return_inverse=True)[1]
    pi = np.bincount(label_idx).astype(np.float)
    pi = pi[pi > 0]
    pi_sum = np.sum(pi)
    probs = pi/pi_sum
    scores_above_thresh = probs[probs >= threshold]
    
    return_val = max(scores_above_thresh) if len(scores_above_thresh) > 0 else 0
    return return_val

def ideal_homogeneous_clusters(y_true, cluster_labels, cluster_modes, n_clusters, homogeneity_threshold=0.8):
    homogeneous_clusters = []
    homogeneous_labels = [] 
    homogeneity_scores = []
    
    for i in range(n_clusters):
        indexes = cluster_labels.flatten() == i
        score = get_homogeneity_score(y_true[indexes], homogeneity_threshold)
        if score > 0:                        # score > homogeneity_threshold
            homogeneous_clusters.append(i)
            homogeneous_labels.append(cluster_modes[i])
            homogeneity_scores.append(score)

    return np.array(homogeneity_scores), np.array(homogeneous_clusters), np.array(homogeneous_labels)

def ideal_dist_clusters(X,
                        h_clusters, 
                        cluster_labels,
                        cluster_modes, 
                        n_clusters,
                        dist_threshold=0):
    
    def dist_check(to_add, to_remove, dists, threshold):
        all_but_one = np.concatenate([dists[:to_remove], dists[to_remove+1:]])
        if np.min(all_but_one) > threshold:
            return True
        return False

    ideal_clusters, X_min_dists = [], []
    dist_dict = {}
    
    # For all cluster i in Homogeneous clusters H, get the pairwise distance of all samples of cluster i 
    # with all the other clusters in Homogeneous clusters which do not belong to the same class (calculated by the mode)
    for clust_id in h_clusters:
        indexes, smpls_end_idx = [], [0]
        end_idx_dict = {}
        idx = [i for i in h_clusters if cluster_modes[i] != cluster_modes[clust_id]]
        for c_id in idx:
            idxs, _ = np.where(cluster_labels.reshape(-1, 1) == c_id)
            end_idx = smpls_end_idx[-1] + len(idxs) - 1
            smpls_end_idx.append(end_idx)
            end_idx_dict[end_idx] = c_id
            indexes += list(idxs)
           
        X_ref = X[cluster_labels.flatten() == clust_id]
        X_clusters = X[indexes]
        dists = metrics.pairwise.euclidean_distances(X_ref, X_clusters)
        # Get the minimum value from the pairwise distances matrix
        min_dist = np.min(dists)
        
        # Add the cluster to the list of ideal_clusters if the minimum distance is above the dist_threshold
        # Save all the cluster pairs which has violate the dist_threshold requirement
        if min_dist > dist_threshold:
            X_min_dists.append(min_dist)
            ideal_clusters.append(clust_id)
        else:
            smpls_end_idx = np.array(smpls_end_idx)
            idx_x, idx_y = np.where(dists <= dist_threshold)
            for x, y in zip(idx_x, idx_y):
                temp_idxs = smpls_end_idx >= y
                if len(smpls_end_idx[temp_idxs]) > 0:
                    end_idx = smpls_end_idx[temp_idxs][0]
                    dist_dict[(clust_id, end_idx_dict[end_idx])] = dists[x, y]
        
    # For (c1, c2) cluster pair, if c2 is the only one causing violation in c1 and the other way around
    # Include c1 or c2 depending on the class representations we already have in the ideal_clusters
    modes_till_now, mode_counts = np.unique(cluster_modes[ideal_clusters], return_counts=True)
    for key, val in dist_dict.items():
        c1, c2 = key[0], key[1]
        c1_cls, c2_cls = cluster_modes[c1], cluster_modes[c2]
        if (c2, c1) in dist_dict.keys() and c1 not in ideal_clusters and c2 not in ideal_clusters:
            count_c1 = mode_counts[modes_till_now == c1_cls] if c1_cls in modes_till_now else 0
            count_c2 = mode_counts[modes_till_now == c2_cls] if c2_cls in modes_till_now else 0
            if count_c1 < count_c2 and dist_check(c1, c2, dists[c1], dist_threshold):
                ideal_clusters.append(c1)
                X_min_dists.append(val)
            elif dist_check(c2, c1, dists[c2], dist_threshold):
                ideal_clusters.append(c2)
                X_min_dists.append(val)
            else:
                pass
            
    return ideal_clusters, X_min_dists

In [4]:
def get_ideal_clusters(X, y_true, 
                       cluster_labels, 
                       n_clusters, 
                       cluster_cents, 
                       cluster_modes,
                       count_cls=[1, 2, 3, 4],
                       homogeneity_threshold=0.8,
                       dist_threshold=0,
                       mean_stats=False,
                       verbose=0):
    
    cluster_dist_matrix = get_clust_dist_matrix(cluster_cents,
                                                cluster_modes, 
                                                n_clusters)
    h_scores, h_clusters, h_modes = ideal_homogeneous_clusters(y_true, 
                                                               cluster_labels,
                                                               cluster_modes,
                                                               n_clusters,
                                                               homogeneity_threshold=homogeneity_threshold)
    n_samples = get_n_samples(h_clusters, cluster_labels)        
    
    if dist_threshold:
        ideal_clusters, cluster_min_dists = ideal_dist_clusters(X,
                                                                h_clusters,
                                                                cluster_labels,
                                                                cluster_modes, 
                                                                n_clusters, 
                                                                dist_threshold=dist_threshold)
        
        return ideal_clusters, h_clusters, h_scores, h_modes, cluster_min_dists
    
    else:
        return h_clusters, h_clusters, h_scores, h_modes, cluster_dist_matrix

In [5]:
def relabel_points(X_subset, y_subset, 
                   non_h_indexes,
                   ideal_clusters, 
                   cluster_cents, 
                   cluster_modes,
                   dist_diff_thresh=None):
    
    new_y = y_subset.copy()
    was_changed = np.zeros(len(y_subset), dtype='int')
    
    h_cents = cluster_cents[ideal_clusters]
    ideal_clust_modes = cluster_modes[ideal_clusters]
    dists = metrics.pairwise.euclidean_distances(X_subset, h_cents)
    min_dist = np.min(dists, axis=1)
    min_dist_idx = np.argmin(dists, axis=1)
    
    
    for idx, x in enumerate(X_subset):
        closest_cluster = ideal_clusters[min_dist_idx[idx]]
        closest_cluster_cls = cluster_modes[closest_cluster]
        x_clust_dists = dists[idx][ideal_clust_modes != closest_cluster_cls]
        min_diff_dist = np.min(x_clust_dists)/min_dist[idx]
        if min_diff_dist > dist_diff_thresh:
            new_y[idx] = closest_cluster_cls
            was_changed[idx] = 1
            
    return new_y, was_changed

In [6]:
def get_cluster_samples(X_train, y_train,
                        ideal_clusters,
                        h_clusters,
                        cluster_labels):
    
    indexes, non_h_indexes, h_indexes = [], [], []
    cluster_labels = cluster_labels.reshape(-1, 1)
    
    for clust_id in ideal_clusters:
        idxs, _ = np.where(cluster_labels == clust_id)
        indexes += list(idxs)

    for clust_id in h_clusters:
        idxs, _ = np.where(cluster_labels == clust_id)
        h_indexes += list(idxs)
        
    X_subset = X_train[indexes]
    c_labels = cluster_labels[indexes]
    c_labels = c_labels.flatten()
    
    non_h_indexes = [i for i in range(X_train.shape[0]) if i not in indexes]
    X_non_h_subset = X_train[non_h_indexes]
    y_non_h_subset = y_train[non_h_indexes]
    
    return X_subset, c_labels, X_non_h_subset, y_non_h_subset, indexes, non_h_indexes

In [7]:
sel_cls = [1, 2, 3, 4]
n_classes = len(sel_cls)
radar_dir = '/scratch/sk7898/radar_data/pedbike'
cls_str = '1_2_3_4'
layer_name = 'counting_dense_2'

data_dir = os.path.join(radar_dir, 'regression_fft_data')
model_dir = os.path.join(radar_dir, 'models/lstm/{}_amp_512_hidden_128'.format(cls_str)) 
model_path = os.path.join(model_dir, 'model_best_valid_loss_dp_4.h5')
fmodel_path = os.path.join(model_dir, 'best_valid_loss_dp_4_ft.h5')
cmodel = load_model(model_path)    
model = Model(inputs=cmodel.input, outputs=cmodel.get_layer(layer_name).output)
print(model.summary())

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1_input (InputLayer)    (None, 5, 256)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               197120    
_________________________________________________________________
batch_normalization_1 (Batch (None, 128)               512       
_________________________________________________________________
counting_dense_1 (Dense)     (None, 256)               33024     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
counting_dense_2 (Dense)     (None, 64)                16448     
Total params: 247,104
Trainable params: 246,848
Non-trainable params: 256
___________________________________________________

In [8]:
X, X_test, y, y_test, old_y, old_y_test, seqs_train, seqs_test = get_fft_data(data_dir, sel_cls=sel_cls, data_mode='amp')    
old_y, y_test, old_y_test = old_y.flatten(), y_test.flatten(), old_y_test.flatten()
emb_train = model.predict(x=X)
emb_test = model.predict(x=X_test)
# print(seqs_test)
# print(emb_train.shape) #(16017, 64)

n_clusters, homogeneity_threshold, dist_threshold, diff_thresh = 60, 0.98, 2.5, 1.5
kmeans, cluster_labels, cluster_cents, cluster_modes = X_kmeans(emb_train, 
                                                                old_y,
                                                                n_clusters=n_clusters,
                                                                n_classes=n_classes)

ideal_clusters, h_clusters, scores, modes, dists = get_ideal_clusters(emb_train, 
                                                                      old_y, 
                                                                      cluster_labels,
                                                                      n_clusters,
                                                                      cluster_cents,
                                                                      cluster_modes,
                                                                      count_cls=sel_cls,
                                                                      homogeneity_threshold=homogeneity_threshold,
                                                                      dist_threshold=dist_threshold)

X_subset, c_labels, X_non_h_subset, y_non_h_subset, h_indexes, non_h_indexes = get_cluster_samples(emb_train, 
                                                                                                   old_y,
                                                                                                   ideal_clusters,
                                                                                                   h_clusters,
                                                                                                   cluster_labels)
print(ideal_clusters)

# new_y, was_changed = relabel_points(X_non_h_subset, y_non_h_subset, 
#                                     non_h_indexes, 
#                                     ideal_clusters, 
#                                     cluster_cents, 
#                                     cluster_modes, 
#                                     dist_diff_thresh=diff_thresh)

[2, 14, 17, 19, 20, 21, 24, 31, 43, 45, 53]


In [12]:
cluster_modes[ideal_clusters]

array([1, 2, 3, 3, 1, 4, 2, 1, 4, 1, 4])

In [13]:
if not os.path.exists(fmodel_path):
    fmodel = load_model(fmodel_path)
else:
    base_model, fmodel = None, None
    X_train = X[h_indexes]
    y_train = y[h_indexes]
    learning_rate = 1e-3
    batch_size = 32
    epochs = 15
    optimizer = keras.optimizers.Adam(lr=learning_rate)
    base_model = load_model(model_path)  

    outputs = keras.layers.Dense(n_classes)(base_model.get_layer('counting_dense_2').output)
    fmodel = keras.Model(inputs=base_model.input, outputs=outputs)

    fmodel.compile(loss='sparse_categorical_crossentropy', 
                   optimizer=optimizer, 
                   metrics=['sparse_categorical_accuracy'])

    for layer in fmodel.layers[:-1]:
        layer.trainable=False
    
    H_train = fmodel.fit(x=X_train,
                    y=y_train,
                    batch_size=batch_size,
                    validation_split=0.1,
                    epochs=epochs,
                    shuffle=True)
    fmodel.save(fmodel_path)

Train on 2134 samples, validate on 238 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [15]:
aggregate = True

cmodel = load_model(model_path)
predictions_1 = cmodel.predict(X_test) #evaluate(x=X_test, y=y_test)
predictions_2 = fmodel.predict(X_test) #evaluate(x=X_test, y=y_test)
predictions_1 = np.argmax(predictions_1, axis=1)
predictions_2 = np.argmax(predictions_2, axis=1)

if aggregate:
    l = np.chararray.split(seqs_test, '_')
    cuts = np.array([int(c[0]) for c in l])
    seqs = np.array([int(s[1]) for s in l])
    unq_cuts = np.unique(cuts, axis=0)
    y_test_1 = np.zeros(len(unq_cuts), dtype=int)
    predictions_11 = np.zeros(len(unq_cuts), dtype=int)
    predictions_21 = np.zeros(len(unq_cuts), dtype=int)
    
    for i, c in enumerate(unq_cuts):
        idxs = np.where(cuts == c)[0]
        cut_seqs = seqs[idxs]
        predictions_11[i] = np.max(predictions_1[idxs])
        predictions_21[i] = np.max(predictions_2[idxs])
        y_test_1[i] = y_test[idxs][0]
        if len(cut_seqs) > 1:
            sort_index = np.argsort(cut_seqs)
            print(y_test[idxs][0], cut_seqs[sort_index], predictions_1[idxs][sort_index], predictions_2[idxs][sort_index])
    mse_1 = mean_squared_error(y_test_1, predictions_11)
    mse_2 = mean_squared_error(y_test_1, predictions_21)
else:    
    mse_1 = mean_squared_error(y_test, predictions_1)
    mse_2 = mean_squared_error(y_test, predictions_2)
    
print('MSE of previous model: {} \nMSE after FT: {}'.format(mse_1, mse_2))

1 [ 2 12] [1 1] [1 1]
2 [ 4  9 17 19] [1 1 2 2] [1 1 3 0]
0 [ 3  6 10] [0 0 0] [1 3 1]
1 [ 5  6  7 12] [2 1 1 1] [1 1 3 0]
3 [2 3 9] [3 3 3] [1 3 2]
1 [1 5 9] [1 1 1] [1 3 1]
2 [ 7 10] [2 2] [3 3]
1 [2 3 5] [1 2 1] [1 1 3]
3 [ 2  8 12 14 26 29] [3 3 3 3 2 2] [0 0 0 0 0 1]
1 [ 7  9 11 12] [1 1 1 1] [1 1 1 1]
0 [2 4 6] [0 0 0] [0 0 0]
3 [ 3 11] [3 3] [1 3]
1 [4 5] [1 1] [3 3]
3 [0 2 5 8] [3 3 3 3] [2 2 3 3]
3 [ 3  8 13] [3 3 3] [1 3 3]
2 [3 4 5] [2 2 1] [1 3 0]
1 [ 3 11 15 16 18] [1 1 1 1 1] [1 1 0 1 1]
3 [15 17 23] [3 3 3] [3 0 1]
3 [ 4 10 14] [0 3 3] [1 0 3]
2 [2 3 5 8] [2 2 2 2] [1 1 1 1]
3 [ 0 16] [3 3] [1 1]
0 [1 5] [0 0] [0 0]
2 [ 3 12] [2 2] [1 3]
3 [ 4 15 16] [0 3 3] [1 0 0]
1 [ 1  5 10] [0 1 1] [1 1 0]
1 [ 0 15] [2 1] [1 1]
2 [ 2 10] [2 2] [0 0]
1 [3 8] [1 1] [1 3]
2 [11 17 22] [1 1 2] [1 3 0]
0 [2 6] [0 0] [1 1]
3 [12 16 18] [3 1 3] [3 0 1]
0 [3 7 8] [0 0 0] [1 1 1]
1 [ 0  6 11] [1 1 1] [1 3 1]
2 [ 0  2 11 14 15] [3 2 2 2 1] [1 0 0 0 1]
2 [ 3 20] [2 2] [1 1]
2 [ 4 11 14] [2 2 2

1 [14 20 23 28 29] [1 1 1 1 1] [3 1 1 1 1]
2 [16 22 29] [2 2 2] [1 1 1]
2 [ 6 13 17 20] [3 2 2 2] [1 3 0 0]
0 [ 3 12] [0 0] [0 0]
3 [ 3 11 14 20] [3 3 3 3] [1 3 0 3]
3 [ 7  8 11 23] [3 3 3 3] [3 3 3 1]
3 [ 0 13 23 24] [2 3 3 2] [1 3 0 0]
2 [11 20 22] [2 2 2] [3 1 1]
1 [ 1  8 15 27] [1 1 1 1] [1 3 3 1]
3 [ 4 21 24 31] [3 3 3 2] [3 1 1 1]
2 [ 1  9 11] [3 2 2] [1 3 3]
3 [ 7 14 19 21] [3 3 3 3] [0 3 0 0]
3 [ 7 15 16 27] [3 3 3 3] [3 3 3 1]
3 [ 3 20 22] [3 3 3] [1 3 3]
1 [17 20 21] [2 1 1] [3 1 1]
0 [3 7 9] [0 0 0] [0 0 0]
3 [13 15 16 21 22 23 24 28] [3 3 3 3 3 3 3 3] [1 1 1 1 1 1 1 1]
2 [ 0 22 25 26] [2 2 2 2] [1 0 0 1]
2 [12 16 26] [2 2 0] [1 3 0]
2 [ 8 15 25] [2 2 2] [1 3 0]
1 [1 2] [3 1] [1 1]
3 [ 4  6  7 13 17 25 30 32] [3 3 3 3 2 3 2 3] [3 3 3 3 1 0 0 0]
3 [ 0  5 23 27] [2 3 3 3] [1 3 1 0]
2 [ 8 16 22] [2 1 2] [3 1 0]
2 [ 4 12 16 17] [2 2 2 2] [3 3 1 1]
2 [ 9 34] [2 2] [3 1]
3 [ 4 14 25 30] [3 3 3 0] [3 3 1 1]
1 [ 2 15 26] [1 1 2] [3 3 1]
1 [ 2 10 18 19 21] [1 1 1 1 1] [1 0 3 3 1]
2 [

In [17]:
dict_1 = {}
dict_2 = {}
for k, i, j in zip(y_test, predictions_11, predictions_21):
    if k not in dict_1.keys():
        dict_1[k], dict_2[k] = [0, 0, 0, 0], [0, 0, 0, 0]
    dict_1[k][i] += 1
    dict_2[k][j] += 1

In [18]:
dict_1

{2: [50, 65, 48, 49],
 1: [31, 54, 44, 65],
 3: [39, 55, 45, 53],
 0: [20, 24, 29, 35]}

In [19]:
dict_2

{2: [43, 73, 8, 88],
 1: [26, 53, 7, 108],
 3: [27, 62, 8, 95],
 0: [19, 39, 0, 50]}