# Experiment -- Jousselme distance vs. Euclidean metric
This experiment uses the Jousselme distance and the Euclidean metric as two dissimilarity measures and compares the clustering results obtained with EKNNclus.
The data are taken from the sushi3 data set, restricted to the 40 most shared sushi.

In [265]:
%matplotlib qt5
import numpy as np
import pandas as pd
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt

import random
import re

from read import read_voter_file, read_score_file, read_order_file
from tools import all_mass_init, all_certain_mass_init, flatten_pref_mass_mat
from beliefKMeans import k_means, silhouette_score, _k_means_single
from beliefDBSCAN import cal_distances

from jpGeoPlot import scatter_on_map, read_geo_info




# Load the prefecture geo table and the voter / score / order files of the
# "5000.10" sushi3 data.
# NOTE(review): the next cell rebinds both `geo_info` and `vMap` from the
# "com40" files, so this load appears to be superseded -- confirm it is
# still needed.
geo_info = pd.read_csv("../data/jap_prefecture_geo_en.csv")
vMap = read_voter_file("../data/sushi3-2016/sushi3.udata")
# Both readers receive vMap; presumably they attach scores / orders to the
# voter objects in place (their return values are not used here).
read_score_file("../data/sushi3-2016/sushi3b.5000.10.score", vMap)
read_order_file("../data/sushi3-2016/sushi3b.5000.10.order", vMap)

In [266]:
# Load the prefecture geo information and the "com40" subset of the sushi3
# voter / score / order files. These bindings replace the ones made by the
# previous cell.
geo_info = read_geo_info("../data/jap_prefecture_geo_en.csv")
vMap = read_voter_file("../data/sushi3-2016/sushi3.com40.udata")
# Both readers receive vMap; presumably they attach scores / orders to the
# voter objects in place (their return values are not used here).
read_score_file("../data/sushi3-2016/sushi3b.com40.10.score", vMap)
read_order_file("../data/sushi3-2016/sushi3b.com40.10.order", vMap)

In [28]:
# Dimensions of the per-voter mass representation.
n_item = 100
cand_list = list(range(n_item))
n_sample = len(vMap)
vector_size = 16
# Number of unordered item pairs: n_item choose 2.
n_pref_pair = n_item * (n_item - 1) // 2

# One (n_pref_pair, vector_size) slab per voter; the voter key is used
# directly as the index along the first axis.
X = np.empty((n_sample, n_pref_pair, vector_size), dtype=object)
for k, v in vMap.items():
    X[k] = flatten_pref_mass_mat(
        all_mass_init(v.get_scores(), v.get_order_b(), cand_list, nbItem=n_item)
    )

In [29]:
# Distances between the voters' mass representations, requested with the
# 'jousselme' option of cal_distances (from beliefDBSCAN).
dist_mat = cal_distances(X, 'jousselme')

In [44]:
# Rescaled copy of the distance matrix (every entry multiplied by 100); this
# is the matrix passed as D to the clustering call below.
# NOTE(review): the reason for the factor 100 is not visible here -- confirm.
dist_mat_2 = dist_mat*100

In [30]:
def cal_pair_in_common(vMap):
    """Count the items shared by every pair of voters' ``order_b`` lists.

    Parameters
    ----------
    vMap : dict
        Mapping from voter key to an object exposing ``get_order_b()``.
        The keys are used directly as row / column indices of the result,
        so they must be valid indices into a ``len(vMap)`` square array.

    Returns
    -------
    numpy.ndarray
        Float matrix of shape ``(len(vMap), len(vMap))`` whose entry
        ``[i][j]`` is the number of distinct items that appear in both
        voter ``i``'s and voter ``j``'s ``get_order_b()`` result. The raw
        count ``c`` is stored, not the number of shared item pairs
        ``c * (c - 1) / 2``.
    """
    # Build each voter's item set once; the double loop below would
    # otherwise call get_order_b() and rebuild both sets for every pair.
    item_sets = {key: set(voter.get_order_b()) for key, voter in vMap.items()}

    com_mat = np.zeros((len(vMap), len(vMap)))
    for i, items_i in item_sets.items():
        for j, items_j in item_sets.items():
            com_mat[i][j] = len(items_i & items_j)
    return com_mat

In [45]:
# Display the rescaled distance matrix.
dist_mat_2

array([[ 0.        ,  1.28319753,  1.33656021, ...,  1.40209994,
         1.36789739,  1.42079017],
       [ 1.28319753,  0.        ,  1.22326495, ...,  1.25666202,
         1.28564371,  1.14139162],
       [ 1.33656021,  1.22326495,  0.        , ...,  1.30937597,
         1.2862837 ,  1.30183528],
       ..., 
       [ 1.40209994,  1.25666202,  1.30937597, ...,  0.        ,
         1.39425989,  1.34574569],
       [ 1.36789739,  1.28564371,  1.2862837 , ...,  1.39425989,
         0.        ,  1.39348654],
       [ 1.42079017,  1.14139162,  1.30183528, ...,  1.34574569,
         1.39348654,  0.        ]])

In [50]:
import EKNNclus

# get_KNN is asked for 3 neighbours on the unscaled distance matrix; the two
# returned values are presumably neighbour indices and neighbour distances
# (confirm against EKNNclus).
knn_ind, knn_dist = EKNNclus.get_KNN(dist_mat, 3)

# Pairwise count of shared items, divided by 10.
# NOTE(review): 10 presumably is the length of each voter's order list (the
# data files are named "*.10.order"), which would put alpha in [0, 1] -- confirm.
alpha_mat = cal_pair_in_common(vMap) / 10

# knn_alpha[i, k] = alpha_mat[i, knn_ind[i, k]]: the alpha value between each
# voter and each of its neighbours, gathered in one indexing operation
# instead of a per-row Python loop.
knn_alpha = alpha_mat[np.arange(knn_ind.shape[0])[:, None], knn_ind]

# Cluster the voters with the rescaled distances and the alpha matrix; every
# voter starts with its own label (y0 = 0..n-1).
clus = EKNNclus.EKNNclus_Th(
    X,
    K=6,
    D=dist_mat_2,
    alpha_mat=alpha_mat,
    ntrials=10,
    y0=list(range(X.shape[0])),
    tr=True,
)

0 1 273 103
0 2 95 72
0 3 103 45
0 4 112 30
0 5 76 10
0 6 13 4
0 7 4 4
0 8 1 3
0 9 0 3
1 1 274 111
1 2 123 59
1 3 63 45
1 4 132 25
1 5 58 7
1 6 15 3
1 7 0 3
2 1 270 108
2 2 104 67
2 3 75 47
2 4 53 42
2 5 41 35
2 6 71 27
2 7 100 15
2 8 44 6
2 9 5 3
2 10 0 3
3 1 275 109
3 2 135 56
3 3 150 19
3 4 44 5
3 5 3 2
3 6 0 2
4 1 278 105
4 2 140 56
4 3 158 23
4 4 51 4
4 5 5 3
4 6 4 2
4 7 0 2
5 1 273 103
5 2 105 64
5 3 67 46
5 4 77 33
5 5 83 24
5 6 75 18
5 7 91 11
5 8 15 3
5 9 4 2
5 10 0 2
6 1 274 106
6 2 111 64
6 3 65 45
6 4 57 36
6 5 76 26
6 6 95 14
6 7 59 6
6 8 10 3
6 9 0 3
7 1 269 109
7 2 112 68
7 3 82 48
7 4 76 30
7 5 96 16
7 6 42 5
7 7 8 3
7 8 1 2
7 9 0 2
8 1 273 106
8 2 94 65
8 3 72 48
8 4 75 36
8 5 64 30
8 6 42 24
8 7 58 17
8 8 39 9
8 9 17 7
8 10 35 6
8 11 30 5
8 12 13 4
8 13 17 4
8 14 10 3
8 15 20 3
8 16 11 2
8 17 0 2
9 1 279 107
9 2 117 56
9 3 62 43
9 4 48 33
9 5 106 24
9 6 91 9
9 7 34 6
9 8 9 4
9 9 2 4
9 10 2 4
9 11 1 3
9 12 0 3


In [51]:
# Display the cluster label assigned to each voter.
clus

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0,

In [267]:
# Hand the voters, their cluster labels and the geo information to the
# jpGeoPlot helper (presumably draws the voters on a map coloured by
# cluster -- confirm against jpGeoPlot).
scatter_on_map(vMap, clus, geo_info)

  axisbgc = ax.get_axis_bgcolor()
  b = ax.ishold()
    See the API Changes document (http://matplotlib.org/api/api_changes.html)
    for more details.
  ax.hold(b)


## Jousselme metric

In [53]:
from iBelief.distance import JousselmeDistance
from iBelief.Dcalculus import Dcalculus
import numpy as np

In [176]:
# Read the generated mass functions and transpose the table right away; the
# cells below index new_matrix[i] as one sample.
new_matrix = np.genfromtxt(
    "/home/yzhang/ThesisWork/Program/sushiPref/experiment/generatedMass.csv"
).T

In [219]:
# Ground-truth labels: four consecutive groups of 50 samples, labelled 0 to 3
# (stored as floats).
labels_true = np.repeat(np.arange(4.0), 50)
print(labels_true)
# Sanity check: number of samples carrying label 0.
print(np.count_nonzero(labels_true == 0))

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  2.  2.  2.  2.  2.  2.  2.  2.
  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.
  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.
  2.  2.  2.  2.  2.  2.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.
  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.
  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.
  3.  3.]
50


In [220]:
# Pairwise Jousselme distances between the generated mass functions.
# D is built once by Dcalculus(16) and passed to every distance evaluation.
D = Dcalculus(16)
size = new_matrix.shape[0]
generated_mat_distances = np.zeros((size, size))
for i in range(size):
    # Fill only the strict upper triangle (j > i); the diagonal and the
    # lower triangle stay at zero in this cell.
    for j in range(i + 1, size):
        generated_mat_distances[i][j] = JousselmeDistance(new_matrix[i], new_matrix[j], D)

In [221]:
# Turn the upper-triangular matrix into a full symmetric one.
# Only the strict upper triangle is mirrored, so running this cell again on
# an already symmetric matrix leaves it unchanged instead of doubling every
# distance.
generated_mat_distances = np.triu(generated_mat_distances, k=1)
generated_mat_distances = generated_mat_distances + generated_mat_distances.T

In [222]:
# Display the symmetric matrix of pairwise Jousselme distances.
generated_mat_distances

array([[ 0.        ,  0.11560226,  0.13551698, ...,  0.43510741,
         0.37119032,  0.78541015],
       [ 0.11560226,  0.        ,  0.01991473, ...,  0.51566337,
         0.47997347,  0.81096133],
       [ 0.13551698,  0.01991473,  0.        , ...,  0.53084993,
         0.49902318,  0.81693561],
       ..., 
       [ 0.43510741,  0.51566337,  0.53084993, ...,  0.        ,
         0.20763474,  0.4645378 ],
       [ 0.37119032,  0.47997347,  0.49902318, ...,  0.20763474,
         0.        ,  0.67217254],
       [ 0.78541015,  0.81096133,  0.81693561, ...,  0.4645378 ,
         0.67217254,  0.        ]])

In [250]:
# Cluster the generated masses with EKNNclus on the Jousselme distances,
# starting from one label per sample, then report the silhouette score, the
# number of distinct clusters and the adjusted Rand index against labels_true.
clus = EKNNclus.EKNNclus_Th(
    new_matrix,
    K=65,
    D=generated_mat_distances,
    ntrials=30,
    y0=list(range(new_matrix.shape[0])),
    tr=False,
)
jousselme_ss = silhouette_score(new_matrix, clus, generated_mat_distances)
print(
    jousselme_ss,
    len(np.unique(clus)),
    sklearn.metrics.adjusted_rand_score(labels_true, clus),
)

0 1 185 27
0 2 48 10
0 3 4 8
0 4 3 8
0 5 6 8
0 6 1 7
0 7 0 7
1 1 182 27
1 2 53 11
1 3 22 9
1 4 5 7
1 5 0 7
2 1 183 33
2 2 61 10
2 3 14 6
2 4 6 6
2 5 0 6
3 1 185 34
3 2 52 9
3 3 8 8
3 4 1 8
3 5 0 8
4 1 189 24
4 2 32 7
4 3 1 7
4 4 3 7
4 5 7 6
4 6 0 6
5 1 191 28
5 2 47 7
5 3 6 6
5 4 7 6
5 5 4 5
5 6 1 5
5 7 0 5
6 1 185 37
6 2 51 13
6 3 13 10
6 4 25 9
6 5 15 6
6 6 0 6
7 1 184 35
7 2 65 9
7 3 15 7
7 4 4 6
7 5 0 6
8 1 188 26
8 2 48 8
8 3 8 8
8 4 14 6
8 5 3 6
8 6 8 6
8 7 1 5
8 8 11 5
8 9 4 4
8 10 0 4
9 1 180 31
9 2 40 16
9 3 35 14
9 4 18 12
9 5 16 11
9 6 21 9
9 7 13 7
9 8 7 7
9 9 3 6
9 10 0 6
10 1 183 37
10 2 83 10
10 3 29 5
10 4 4 5
10 5 3 5
10 6 0 5
11 1 184 28
11 2 43 12
11 3 12 8
11 4 4 7
11 5 0 7
12 1 184 32
12 2 61 9
12 3 14 6
12 4 2 6
12 5 0 6
13 1 186 31
13 2 59 10
13 3 23 6
13 4 5 5
13 5 0 5
14 1 184 27
14 2 36 12
14 3 12 9
14 4 15 8
14 5 11 7
14 6 1 6
14 7 0 6
15 1 186 29
15 2 46 10
15 3 17 8
15 4 11 7
15 5 3 6
15 6 7 6
15 7 4 5
15 8 0 5
16 1 185 33
16 2 47 9
16 3 10 8
16 4 3 8
16 5 

In [227]:
# Adjusted Rand index between the ground-truth labels and the current
# clustering.
sklearn.metrics.adjusted_rand_score(labels_true, clus)

0.22045042931200096

In [201]:
# Indices of the samples that were assigned cluster label 3.
np.where(clus==3)

(array([152, 155, 157, 158, 159, 160, 162, 164, 168, 171, 173, 175, 176,
        177, 181, 185, 186, 188, 189, 190, 192, 193, 196, 199]),)

In [85]:
from sklearn.preprocessing import LabelEncoder
def check_number_of_labels(n_labels, n_samples):
    """Validate the number of distinct labels for a silhouette computation.

    Returns ``None`` when ``1 < n_labels < n_samples``.

    Raises
    ------
    ValueError
        If ``n_labels`` lies outside that open interval.
    """
    if 1 < n_labels < n_samples:
        return
    message = ("Number of labels is %d. Valid values are 2 "
               "to n_samples - 1 (inclusive)")
    raise ValueError(message % n_labels)
def silhouette_samples(X, labels, distances):
    """Compute the silhouette coefficient of every sample.

    Parameters
    ----------
    X : array-like
        Only ``X.shape[0]`` is read, as the sample count for the label-count
        validation.
    labels : array-like
        Cluster label of each sample; encoded with ``LabelEncoder`` before use.
    distances : array-like
        Square matrix of precomputed pairwise distances; row ``i`` holds the
        distances from sample ``i`` to every sample.

    Returns
    -------
    numpy.ndarray
        One value ``(b - a) / max(a, b)`` per sample, where ``a`` is the mean
        distance to the other members of the sample's own cluster and ``b``
        the smallest mean distance to the members of any other cluster.
        Samples in singleton clusters, and samples for which ``a`` and ``b``
        are both zero, get 0.

    Raises
    ------
    ValueError
        From ``check_number_of_labels`` when the number of distinct labels
        is not strictly between 1 and ``X.shape[0]``.
    """
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    check_number_of_labels(len(le.classes_), X.shape[0])
    unique_labels = le.classes_
    n_samples_per_label = np.bincount(labels, minlength=len(unique_labels))

    distances = np.asarray(distances)
    n_samples = distances.shape[0]

    # Accumulate in floating point regardless of the dtype of `distances`:
    # with an integer distance matrix, integer accumulators would silently
    # truncate the mean distances assigned below.
    # intra_clust_dists[i]: mean distance from sample i to its own cluster.
    intra_clust_dists = np.zeros(n_samples, dtype=float)
    # inter_clust_dists[i]: mean distance from sample i to the nearest
    # other cluster; starts at +inf and is lowered by np.minimum.
    inter_clust_dists = np.full(n_samples, np.inf)

    for curr_label in range(len(unique_labels)):
        mask = labels == curr_label
        current_distances = distances[mask]

        # Leave out the sample itself (its self-distance is part of the row
        # sum, so the divisor is the cluster size minus one).
        n_samples_curr_lab = n_samples_per_label[curr_label] - 1
        if n_samples_curr_lab != 0:
            intra_clust_dists[mask] = np.sum(
                current_distances[:, mask], axis=1) / n_samples_curr_lab

        # Over all other labels, keep the smallest mean distance from each
        # sample of the current cluster to the members of that other cluster.
        for other_label in range(len(unique_labels)):
            if other_label != curr_label:
                other_mask = labels == other_label
                other_distances = np.mean(
                    current_distances[:, other_mask], axis=1)
                inter_clust_dists[mask] = np.minimum(
                    inter_clust_dists[mask], other_distances)

    # Divide only where the denominator is non-zero; a 0/0 sample (both mean
    # distances zero) keeps the score 0 instead of becoming NaN.
    denominators = np.maximum(intra_clust_dists, inter_clust_dists)
    sil_samples = np.zeros(n_samples, dtype=float)
    np.divide(inter_clust_dists - intra_clust_dists, denominators,
              out=sil_samples, where=denominators != 0)
    # score 0 for clusters of size 1, according to the paper
    sil_samples[n_samples_per_label.take(labels) == 1] = 0
    return sil_samples
def silhouette_score(X, labels,distances = None, metric='euclidean', sample_size=None,
                     random_state=None,  **kwds):
    """Return the mean silhouette coefficient over the samples.

    Parameters
    ----------
    X : array-like
        Sample matrix. With ``metric == "precomputed"`` it is treated as the
        square pairwise distance matrix.
    labels : array-like
        Cluster label of each sample.
    distances : array-like, optional
        Precomputed square pairwise distance matrix. May be omitted only
        when ``metric == "precomputed"``, in which case ``X`` is used.
    metric : str
        Only the value ``"precomputed"`` is interpreted; no distance is
        computed from ``X`` for any other value.
    sample_size : int, optional
        If given, the score is computed on a random subset of this many
        samples.
    random_state : None, int or numpy.random.RandomState
        Source of randomness for the subsampling: ``None`` uses NumPy's
        global generator, an int seeds a fresh ``RandomState``, and an
        existing ``RandomState`` is used as is.
    **kwds
        Forwarded unchanged to ``silhouette_samples``.
        NOTE(review): ``silhouette_samples`` in this file takes only
        ``(X, labels, distances)``, so any extra keyword raises TypeError.

    Returns
    -------
    float
        Mean of the per-sample silhouette coefficients.

    Raises
    ------
    ValueError
        If ``distances`` is omitted and ``metric`` is not ``"precomputed"``.
    """
    if distances is None:
        if metric != "precomputed":
            raise ValueError(
                "distances must be provided unless metric == 'precomputed'")
        distances = X

    if sample_size is not None:
        # Resolve random_state with NumPy only; no helper for this is
        # imported in this file.
        if random_state is None:
            permutation = np.random.permutation
        elif isinstance(random_state, np.random.RandomState):
            permutation = random_state.permutation
        else:
            permutation = np.random.RandomState(random_state).permutation
        indices = permutation(X.shape[0])[:sample_size]

        # Subsample labels and the distance matrix together so that they
        # keep describing the same samples.
        labels = np.asarray(labels)[indices]
        distances = np.asarray(distances)[np.ix_(indices, indices)]
        if metric == "precomputed":
            X = X[indices].T[indices].T
        else:
            X = X[indices]
    return np.mean(silhouette_samples(X, labels, distances = distances,**kwds))




In [103]:
# Mean silhouette coefficient of the current clustering, evaluated on the
# precomputed Jousselme distance matrix.
jousselme_ss = silhouette_score(new_matrix, clus, generated_mat_distances)
print(jousselme_ss)

0.142436392191


In [104]:
# Display the silhouette score at full precision.
jousselme_ss

0.1424363921909102

## Euclidean (L2 norm) metric

In [88]:
from sklearn.metrics.pairwise import euclidean_distances
import sklearn.metrics 
from sklearn.cluster import KMeans


In [248]:
# Pairwise Euclidean distances between the generated mass vectors.
# NOTE(review): despite its name, l1_dist_mat holds Euclidean (L2) distances.
l1_dist_mat = euclidean_distances(new_matrix, new_matrix)
# NOTE(review): this rebinds `dist_mat`, which an earlier cell assigned from
# cal_distances(X, 'jousselme'); cells that read `dist_mat` and are re-run
# after this one will see the Euclidean matrix -- confirm this is intended.
dist_mat = l1_dist_mat

# Same clustering set-up as the Jousselme run, but on Euclidean distances.
# NOTE(review): ntrials is 20 here and 30 in the Jousselme run -- confirm.
labels2 = EKNNclus.EKNNclus_Th(
    new_matrix,
    K=65,
    D=l1_dist_mat,
    ntrials=20,
    y0=list(range(new_matrix.shape[0])),
    tr=False,
)
euc_ss = sklearn.metrics.silhouette_score(new_matrix, labels2)
print(
    euc_ss,
    len(np.unique(labels2)),
    sklearn.metrics.adjusted_rand_score(labels_true, labels2),
)

0 1 182 29
0 2 54 9
0 3 8 7
0 4 2 7
0 5 0 7
1 1 181 32
1 2 54 13
1 3 20 10
1 4 10 8
1 5 3 8
1 6 0 8
2 1 183 31
2 2 59 9
2 3 15 7
2 4 7 6
2 5 5 5
2 6 0 5
3 1 184 32
3 2 53 10
3 3 14 6
3 4 6 5
3 5 9 5
3 6 1 4
3 7 0 4
4 1 187 23
4 2 45 9
4 3 11 6
4 4 0 6
5 1 184 26
5 2 46 11
5 3 13 8
5 4 4 8
5 5 0 8
6 1 186 29
6 2 41 11
6 3 16 11
6 4 8 8
6 5 16 8
6 6 4 6
6 7 0 6
7 1 181 31
7 2 70 12
7 3 22 9
7 4 14 8
7 5 7 6
7 6 7 5
7 7 1 5
7 8 0 5
8 1 193 26
8 2 35 6
8 3 3 6
8 4 0 6
9 1 181 31
9 2 71 12
9 3 27 9
9 4 9 7
9 5 7 6
9 6 0 6
10 1 188 29
10 2 46 8
10 3 8 6
10 4 1 6
10 5 4 6
10 6 4 5
10 7 10 5
10 8 5 4
10 9 0 4
11 1 187 30
11 2 54 9
11 3 10 7
11 4 0 7
12 1 176 41
12 2 74 15
12 3 32 10
12 4 8 9
12 5 7 9
12 6 4 8
12 7 3 8
12 8 7 7
12 9 6 7
12 10 8 6
12 11 0 6
13 1 186 25
13 2 47 10
13 3 15 8
13 4 1 8
13 5 0 8
14 1 184 30
14 2 42 11
14 3 12 10
14 4 3 10
14 5 6 9
14 6 5 9
14 7 3 8
14 8 0 8
15 1 191 28
15 2 31 8
15 3 5 7
15 4 0 7
16 1 182 31
16 2 68 12
16 3 43 8
16 4 19 5
16 5 0 5
17 1 181 30
17 2 60

## Euclidean without ignorance

In [264]:
# Shape of new_matrix after removing its last row.
# NOTE(review): `[0:-1][:]` slices rows, so this drops the last sample and
# keeps all 16 columns. If the goal of this section is to remove the
# ignorance component of every mass vector, the slice would presumably be
# `new_matrix[:, 0:-1]` -- confirm.
new_matrix[0:-1][:].shape

(199, 16)

In [None]:
= euclidean_distances(new_matrix[,new_matrix)