In [2]:
%cd ../
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from keras.models import Model, load_model
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import kneighbors_graph
from sklearn import metrics
from data import get_fft_data
from clustering_utils import *

/scratch/sk7898


Using TensorFlow backend.


ModuleNotFoundError: No module named 'data'

In [None]:
def clustering_dbscan(X_train, y_train,
                      min_samples_lst=None,
                      eps_lst=None,
                      pca_train=True):
    """Cluster ``X_train`` with DBSCAN, print the scores and plot the labels.

    When both ``min_samples_lst`` and ``eps_lst`` are non-empty, every
    (min_samples, eps) combination is fitted and drawn on a grid with one row
    per ``min_samples`` value and one column per ``eps`` value. Otherwise a
    single model with ``eps=0.3, min_samples=10`` is fitted and drawn as one
    scatter plot.

    Parameters
    ----------
    X_train : array-like
        Samples to cluster. Must be accepted by ``PCA(n_components=2)``.
    y_train : array-like
        Reference labels, forwarded to ``print_scores``.
    min_samples_lst : list or None
        ``min_samples`` values to try (grid rows).
    eps_lst : list or None
        ``eps`` values to try (grid columns).
    pca_train : bool
        If True, DBSCAN is fitted on the 2-component PCA projection of
        ``X_train``; otherwise on ``X_train`` itself. The projection is always
        what gets plotted (same convention as ``clustering_agglomerative``).
    """
    # 2-D projection of the data; used for plotting and, optionally, fitting.
    pca = PCA(n_components=2).fit(X_train)
    X = pca.transform(X_train)
    X_fit = X if pca_train else X_train

    def _fit_and_report(eps, min_samples):
        """Fit one DBSCAN model, print cluster/noise counts, return its labels."""
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit(X_fit).labels_

        # DBSCAN marks noise points with the label -1; do not count it as a cluster.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise_ = list(labels).count(-1)

        print('Estimated number of clusters: %d' % n_clusters_)
        print('Estimated number of noise points: %d' % n_noise_)
        return labels

    if eps_lst and min_samples_lst:
        # NOTE(review): this yields rows*cols names while ``cols`` is only
        # len(eps_lst) -- confirm that get_plot_fig expects one name per panel.
        col_names = ['min_samples:' + str(min_samples) + ' eps:' + str(eps)
                     for min_samples in min_samples_lst
                     for eps in eps_lst]

        fig, axs = get_plot_fig(rows=len(min_samples_lst),
                                cols=len(eps_lst),
                                col_names=col_names)

        for row_idx, min_samples in enumerate(min_samples_lst):
            for col_idx, eps in enumerate(eps_lst):
                labels = _fit_and_report(eps, min_samples)
                labels_palette = sns.color_palette("bright", len(np.unique(labels)))

                plot_data(axs, X, labels,
                          palette=labels_palette,
                          row_idx=row_idx,
                          col_idx=col_idx,
                          legend='full')

                print_scores(y_train, labels)

    else:
        labels = _fit_and_report(eps=0.3, min_samples=10)

        print_scores(y_train, labels)
        labels_palette = sns.color_palette("bright", len(np.unique(labels)))

        # Keyword x/y works on every seaborn release; newer ones reject positional data.
        sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=labels, palette=labels_palette)

In [None]:
def clustering_agglomerative(X_train, y_train, 
                             affinity,
                             linkage,
                             n_clusters,
                             pca_train=True):
    """Run connectivity-constrained agglomerative clustering and plot the result.

    Parameters
    ----------
    X_train : array-like
        Samples to cluster. Must be accepted by ``PCA(n_components=2)``.
    y_train : array-like
        Reference labels, forwarded to ``print_scores``.
    affinity : str
        Forwarded to ``AgglomerativeClustering(affinity=...)``.
        NOTE(review): newer scikit-learn releases rename this keyword to
        ``metric`` -- confirm against the installed version.
    linkage : str
        Forwarded to ``AgglomerativeClustering(linkage=...)``.
    n_clusters : int
        Number of clusters to find.
    pca_train : bool
        If True, clustering runs on the 2-component PCA projection of
        ``X_train``; otherwise on ``X_train`` itself. The projection is always
        what gets plotted.
    """
    # 2-D projection of the data; used for plotting and, optionally, fitting.
    pca = PCA(n_components=2).fit(X_train)
    X = pca.transform(X_train)

    X_train = X if pca_train else X_train

    # Create a graph capturing local connectivity. Larger number of neighbors
    # will give more homogeneous clusters to the cost of computation
    # time. A very large number of neighbors gives more evenly distributed
    # cluster sizes, but may not impose the local manifold structure of
    # the data
    connectivity = kneighbors_graph(X_train, n_neighbors=10, include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    agg = AgglomerativeClustering(linkage=linkage,
                                  affinity=affinity,
                                  connectivity=connectivity,
                                  n_clusters=n_clusters)
    agg.fit(X_train)

    # Print the clustering scores
    print_scores(y_train, agg.labels_)

    labels_palette = sns.color_palette("bright", len(np.unique(agg.labels_)))
    # Keyword x/y works on every seaborn release; newer ones reject positional data.
    sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=agg.labels_, palette=labels_palette)

In [None]:
def clustering_kmeans(X_train, y, old_y, 
                      X_test, y_test, old_y_test,
                      cluster_list,
                      col_names,
                      sel_cls,
                      c_idx, inc_idx,
                      pca_train=True):
    """Fit / apply k-means for each cluster count and optionally plot a panel grid.

    ``col_names`` selects what is produced for every entry of ``cluster_list``:

    * names containing ``'PCA'``            -> scatter of a dataset with its ``old_y`` labels
    * names containing ``'kmeans trained'`` -> fit k-means on a dataset and score it
    * names containing ``'kmeans predict'`` -> apply the most recently fitted model

    The dataset is chosen by the text following the first underscore in the
    name (e.g. ``'kmeans trained with X_c'`` -> ``'c'``). Recognized keys are
    ``'train'``, ``'test'``, ``'c'`` (rows ``c_idx`` of the training data) and
    ``'inc'`` (rows ``inc_idx``). Names with an unknown key are reported with
    ``'Unrecognized Dataset!'`` and skipped.

    Parameters
    ----------
    X_train, y, old_y : training samples and their two label arrays.
    X_test, y_test, old_y_test : test samples and their two label arrays.
    cluster_list : iterable of int
        Cluster counts to try; one grid row each.
    col_names : list of str
        Panel descriptions, see above.
    sel_cls : sequence
        Selected classes; its length is used as the number of classes.
    c_idx, inc_idx : index arrays forwarded to ``get_subset_data``.
    pca_train : bool
        Plots are drawn only when this is truthy.

    Raises
    ------
    ValueError
        If a ``'kmeans predict'`` panel is requested but no k-means model was
        fitted for the current cluster count.
    """

    def _kmeans_train(X, y, old_y,
                      n_clusters,
                      n_classes,
                      cluster_palette,
                      row_idx, col_idx,
                      plot=True):
        """Fit k-means on ``X``, print its scores, optionally plot; return (model, next col_idx)."""
        kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=n_classes)
        kmeans.fit(X)

        clust_labels = kmeans.predict(X)
        cent = kmeans.cluster_centers_
        clust_true_labels = get_cluster_mode(old_y, clust_labels, n_clusters=n_clusters)

        # print the clustering scores
        print_scores(y, clust_labels, n_clusters=n_clusters)

        if not plot:
            return kmeans, col_idx

        plot_data(axs, X=X, y=clust_labels,
                  palette=cluster_palette,
                  y_annotate=clust_true_labels,
                  cents=cent,
                  legend=False,
                  row_idx=row_idx, col_idx=col_idx)
        return kmeans, col_idx + 1

    def _kmeans_predict(kmeans, X, old_y,
                        n_clusters,
                        cluster_palette,
                        row_idx, col_idx,
                        plot=True):
        """Assign ``X`` to the clusters of a fitted model; return (labels, next col_idx)."""
        clust_labels = kmeans.predict(X)
        cent_X = kmeans.cluster_centers_
        clust_true_labels = get_cluster_mode(old_y, clust_labels, n_clusters=n_clusters)

        if not plot:
            return clust_labels, col_idx

        plot_data(axs, X=X, y=clust_labels,
                  palette=cluster_palette,
                  y_annotate=clust_true_labels,
                  cents=cent_X,
                  legend=False,
                  row_idx=row_idx, col_idx=col_idx)
        return clust_labels, col_idx + 1

    def _resolve_dataset(col):
        """Return (key, (X, y, old_y)) for a column name, or (None, None) if unknown."""
        parts = col.split('_')
        key = parts[1] if len(parts) > 1 else None
        if key not in data_dict:
            print('Unrecognized Dataset!')
            return None, None
        return key, data_dict[key]

    n_classes = len(sel_cls)
    plot = bool(pca_train)
    pca_plots = [name for name in col_names if 'PCA' in name]
    train_kmeans = [name for name in col_names if 'kmeans trained' in name]
    test_kmeans = [name for name in col_names if 'kmeans predict' in name]

    if plot:
        # ``axs`` is read by the nested helpers above through the closure.
        fig, axs = get_plot_fig(rows=len(cluster_list),
                                cols=len(col_names),
                                col_names=col_names)
        true_palette = sns.color_palette("bright", n_classes)

    X_c, y_c, old_y_c = get_subset_data(X=X_train,
                                        y=y,
                                        old_y=old_y,
                                        idxs=c_idx)

    X_inc, y_inc, old_y_inc = get_subset_data(X=X_train,
                                              y=y,
                                              old_y=old_y,
                                              idxs=inc_idx)
    data_dict = {
        'train': (X_train, y, old_y),
        'test': (X_test, y_test, old_y_test),
        'c': (X_c, y_c, old_y_c),
        'inc': (X_inc, y_inc, old_y_inc),
    }

    for i, n_clusters in enumerate(cluster_list):
        col_idx = 0
        kmeans = None
        cluster_palette = sns.color_palette("bright", n_clusters)

        print('***********************n_clusters={}*******************************'.format(n_clusters))

        if plot:
            for col in pca_plots:
                key, dataset = _resolve_dataset(col)
                if dataset is None:
                    continue
                # The PCA panels are coloured by the ``old_y`` labels of the dataset.
                X_sel, _, labels_sel = dataset

                cents = cluster_centroids(X_sel, labels_sel, n_classes=n_classes)
                plot_data(axs, X=X_sel, y=labels_sel,
                          palette=true_palette,
                          y_annotate=sel_cls,
                          cents=cents,
                          row_idx=i, col_idx=col_idx)
                col_idx += 1

        for col in train_kmeans:
            key, dataset = _resolve_dataset(col)
            if dataset is None:
                continue
            X_sel, y_sel, old_y_sel = dataset

            # Scores are computed on the same dataset the model is fitted on.
            print('Clustering Scores on X_{} when trained with X_{}'.format(key, key))
            kmeans, col_idx = _kmeans_train(X_sel, y_sel, old_y_sel,
                                            n_clusters=n_clusters,
                                            n_classes=n_classes,
                                            cluster_palette=cluster_palette,
                                            row_idx=i, col_idx=col_idx,
                                            plot=plot)

        for col in test_kmeans:
            key, dataset = _resolve_dataset(col)
            if dataset is None:
                continue
            if kmeans is None:
                raise ValueError(
                    "'{}' needs a fitted model: add a 'kmeans trained with X_<name>' "
                    "entry to col_names".format(col))
            X_sel, _, old_y_sel = dataset

            # Plot cluster predictions of the most recently fitted model
            clust_labels, col_idx = _kmeans_predict(kmeans, X_sel, old_y_sel,
                                                    n_clusters=n_clusters,
                                                    cluster_palette=cluster_palette,
                                                    row_idx=i, col_idx=col_idx,
                                                    plot=plot)
            if key == 'test':
                print('Clustering Scores on X_test')
                print_scores(y_test, clust_labels, n_clusters=n_clusters)

In [None]:
# --- Configuration -----------------------------------------------------------
method = 'kmeans'            # one of: 'kmeans', 'DBSCAN', 'Agglomerative'
model_dir = '/scratch/sk7898/pedbike/models/lstm/'
cls_str_list = ['1_2_3_4']
sel_cls_list = [[1, 2, 3, 4]]
layer_name = 'counting_dense_2'   # layer whose output is used as the embedding
pca_train = True
relabel = True

# --- Embed the data with each trained model and cluster the embeddings -------
for cls_str, sel_cls in zip(cls_str_list, sel_cls_list):
    model_str = os.path.join(cls_str + '_amp_512_hidden_128', 'best_model.h5')
    model_path = os.path.join(model_dir, model_str)

    X, X_test, y, y_test, old_y, old_y_test, _, _ = get_fft_data(sel_cls=sel_cls, data_mode='amp')

    # NOTE(review): ``y`` is deliberately left as returned (not flattened), as
    # in the original cell -- confirm that downstream helpers expect that shape.
    old_y, y_test, old_y_test = old_y.flatten(), y_test.flatten(), old_y_test.flatten()

    c_idx, inc_idx = get_correct_incorrect_idx(model_path,
                                               X,
                                               y_true=y,
                                               n_classes=len(sel_cls))

    # Load the saved network and cut it at ``layer_name`` to obtain embeddings.
    base_model = load_model(model_path)
    model = Model(inputs=base_model.input,
                  outputs=base_model.get_layer(layer_name).output)
    emb_train = model.predict(x=X)
    emb_test = model.predict(x=X_test)

    # From here on X_train / X_test are embeddings (PCA-reduced when pca_train).
    X_train, X_test = get_pca_comps(emb_train, emb_test) if pca_train else (emb_train, emb_test)

    if method == 'kmeans':
        cluster_list = [20]
        # Options: PCA X_train, PCA X_test, PCA X_inc
        #          kmeans trained with X_train, kmeans trained with X_c
        #          kmeans predict X_test, kmeans predict X_inc
        col_names = [
                     'kmeans trained with X_c',
                     #'kmeans predict X_test'
                    ]

        clustering_kmeans(X_train, y, old_y,
                          X_test, y_test, old_y_test,
                          cluster_list=cluster_list,
                          col_names=col_names,
                          sel_cls=sel_cls,
                          c_idx=c_idx, inc_idx=inc_idx,
                          pca_train=pca_train
                         )

    elif method == 'DBSCAN':
        min_samples_lst = [2, 5, 10]
        eps_lst = [0.3, 0.5, 1.0, 1.5]
        clustering_dbscan(X_train, y,
                          min_samples_lst=min_samples_lst,
                          eps_lst=eps_lst,
                          pca_train=pca_train
                         )

    elif method == 'Agglomerative':
        clustering_agglomerative(X_train, y,
                                 affinity='euclidean',
                                 linkage='ward',
                                 n_clusters=8,
                                 pca_train=pca_train
                                )