In [1]:
import numpy as np
import scipy as sp
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.cluster import KMeans

# Kmeans Clustering on YearPredictionMSD

In [2]:
# reading data
data = pd.read_csv("year_prediction.csv")
data = data.rename(index=str, columns={"label":"year"})

In [3]:
# separate input attributes and output into different dataframes
X = data.iloc[:,1:]
Y = data.iloc[:,0]

# normalized the training set X
scaler = StandardScaler()
scaler.fit(X)
X_std = scaler.transform(X)

num_of_data, _ = X_std.shape

In [121]:
num_of_rep = 20
res = np.zeros(num_of_rep)
num_clusters = 10

for t in range(num_of_rep):
    kmeans = KMeans(n_clusters = num_clusters)
    kmeans.fit(X_std)
    record = kmeans.inertia_ / num_of_data
    res[t] = record
    print(record)

name_str = "./res/kmeans/kmeans-"+str(num_clusters)+"c.npy"    
np.save(name_str, res)

71.65047255247167
71.65037839463994
71.65038445285383
71.65043678725971
71.65055173588914
71.65043621527752
71.65043481087723
71.65066602130086
71.65744870527136
71.65045610855871
71.65057260055757
71.65039727026986
71.65057172896067
71.65042962393274
71.66007752663573
71.6505658605916
71.65057189113001
71.65051012927415
71.65057042708213
71.65041435398359


## Implement DistDimKmeans (Ding et al 2016)

In [7]:
def decode(number, b, t):
    res = []
    for _ in range(t):
        r = number % b
        res.append(r)
        number = (number - r) // b
    res.reverse()
    return res

def encode(number_list, b, t):
    res = 0
    number_list.reverse()
    for i in range(t):
        res += number_list[i] * (b ** i)
    return res

def distkmeans(D_list, w_list, n_c = 10):
    D = np.hstack(D_list)
    num_of_party = len(D_list)
    num_of_data, _ = D_list[0].shape
    kmeans_list = []
    total_dim = 0
    label_list = []
    for j in range(num_of_party):
        kmeans_list.append(KMeans(n_clusters = n_c))
        _, party_dim = D_list[j].shape
        total_dim += party_dim
        label = kmeans_list[j].fit_predict(D_list[j], sample_weight = w_list)
        label_list.append(label)
    
    grids_number = n_c ** num_of_party
    center_list = np.zeros((grids_number, total_dim))
    center_weights = np.zeros(grids_number)
    
    for h in range(grids_number):
        h_decode = decode(h, n_c, num_of_party)
        temp = []
        for j in range(num_of_party):
            temp.append((kmeans_list[j].cluster_centers_)[h_decode[j],:])
        center_list[h, :] = np.concatenate(temp)
    
    for i in range(num_of_data):
        temp = [l[i] for l in label_list]
        idx = encode(temp, n_c, num_of_party)
        center_weights[idx] += w_list[i]
    
    # normalize center_weights to 1
    center_weights = center_weights / np.sum(center_weights)
    
    server_kmeans = KMeans(n_clusters = n_c)
    server_kmeans.fit(center_list, sample_weight = center_weights)
    return server_kmeans

## Implement Coreset Construction

In [8]:
def uniform_kmeans(m, D_list, n_c = 10):
    D = np.hstack(D_list)
    D_df = pd.DataFrame(D)
    C = D_df.sample(n=m, replace=False)
    C = C.to_numpy()
    return C

def coreset_kmeans(m, D_list, n_c = 10):
    alpha = 2
    D = np.hstack(D_list)
    num_of_party = len(D_list)
    num_of_data, _ = D_list[0].shape
    kmeans_list = []
    label_list = []
    groupcost_list = []
    groupcount_list = []
    sensitivity = np.zeros((num_of_data, num_of_party))
    for j in range(num_of_party):
        kmeans_list.append(KMeans(n_clusters = n_c))
        label = kmeans_list[j].fit_predict(D_list[j])
        label_list.append(label)
        groupcost = np.zeros(n_c)
        groupcount = np.zeros(n_c)
        cost = kmeans_list[j].inertia_ / num_of_data
        t = kmeans_list[j].transform(D_list[j])
        for i in range(num_of_data):
            groupcount[label[i]] += 1
            groupcost[label[i]] += t[i,label[i]] ** 2
        for i in range(num_of_data):
            sensitivity[i,j] = alpha * (t[i,label[i]] ** 2) / cost \
                        + 2 * alpha * groupcost[label[i]] / (groupcount[label[i]] * cost) + 4 * num_of_data / groupcount[label[i]]
    s = np.sum(sensitivity, axis=1)
    D_df = pd.DataFrame(np.hstack((D, (1/s).reshape(-1,1))))
    C = D_df.sample(n=m, replace=False, weights=s)
    C = C.to_numpy()
    data = C[:,:-1]
    weights = C[:,-1]
    weights = weights / np.sum(weights)
    return data, weights

In [125]:
num_of_rep = 20
res = np.zeros(num_of_rep)
num_clusters = 10
size = 1000

X1 = X_std[:,:30]
X2 = X_std[:,30:60]
X3 = X_std[:,60:]

X_list = [X1, X2, X3]

for t in range(num_of_rep):
    d, w = coreset_kmeans(size, X_list, n_c = num_clusters)
    ckmeans = KMeans(n_clusters = num_clusters)
    ckmeans.fit(d,sample_weight = w)

    dist = ckmeans.transform(X_std)
    cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
    res[t] = cost
    print(cost)

name_str = "./res/kmeans/kmeans-coreset-"+str(num_clusters)+"c"+str(size)+"s.npy"    
np.save(name_str, res)

73.46019681608169
73.44234567211778
74.23817445185314
73.50473315261743
73.8470045225081
73.83340940298976
74.10437548157843
73.14254030775075
73.37503297719871
73.22802974817957
74.44289338836494
73.44334870173103
74.29590281199722
74.0795713427415
73.44194770990273
74.06784220645517
74.2770960732288
73.5925707763438
73.69805438857364
73.67229988940595


In [30]:
num_of_rep = 20
res = np.zeros(num_of_rep)
num_clusters = 10
size = 1200

for t in range(num_of_rep):
    d = uniform_kmeans(size, X_list, n_c = num_clusters)

    ukmeans = KMeans(n_clusters = num_clusters)
    ukmeans.fit(d)

    dist = ukmeans.transform(X_std)
    cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
    res[t] = cost
    print(cost)
    
name_str = "./res/kmeans/kmeans-uniform-"+str(num_clusters)+"c"+str(size)+"s.npy"    
np.save(name_str, res)

74.09765958686631
74.43134420860241
75.20928803051173
73.90252689073156
74.92758411617388
74.29166864941674
75.57129063612592
73.6838237077466
73.7346013765442
74.42756612921227
74.3671381982716
75.50098534065393
74.44264344906125
74.30753472293168
74.18747438812935
75.10918356548495
74.16285895851007
74.84423556074813
73.91553029201579
74.64825525706348


In [127]:
num_of_rep = 20
res = np.zeros(num_of_rep)
num_clusters = 10

for t in range(num_of_rep):
    dkmeans = distkmeans(X_list, np.ones(num_of_data) / num_of_data, n_c= num_clusters)

    dist = dkmeans.transform(X_std)
    cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
    res[t] = cost
    print(cost)

name_str = "./res/kmeans/distkmeans-"+str(num_clusters)+"c.npy"    
np.save(name_str, res)

75.46073871868204
74.68238959903377
74.90830729395657
74.73988430616967
74.53718666421031
75.84935711949967
74.56518608482786
74.64365223169615
75.20884984994078
74.70615628472683
74.87085874301633
75.24927899168051
75.13922804574419
74.87150920864161
74.65300333219258
74.36218738336815
74.68265369929877
75.35650019304215
74.73695602945946
74.50523114546864


In [9]:
num_of_rep = 20
res = np.zeros(num_of_rep)
num_clusters = 10
size = 1000

X1 = X_std[:,:30]
X2 = X_std[:,30:60]
X3 = X_std[:,60:]

X_list = [X1, X2, X3]

num_of_rep = 20
res = np.zeros(num_of_rep)
num_clusters = 10
size = 1000

for t in range(num_of_rep):
    d, w = coreset_kmeans(size, X_list, n_c = num_clusters)
    d1 = d[:,:30]
    d2 = d[:,30:60]
    d3 = d[:,60:]
    d_list = [d1,d2,d3]

    dkmeans = distkmeans(d_list, w, n_c= num_clusters)

    dist = dkmeans.transform(X_std)
    cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
    res[t] = cost
    print(cost)

name_str = "./res/kmeans/distkmeans-coreset-"+str(num_clusters)+"c"+str(size)+"s.npy"    
np.save(name_str, res)

76.16933092317159
76.45605367780722
77.52983651542255
76.08061189979055
76.56847816883194


  server_kmeans.fit(center_list, sample_weight = center_weights)


77.25550263097848
76.42566651173429
81.19247991466676
76.284550245235
77.87532229003337
79.3088700846753
77.18928903839786


  server_kmeans.fit(center_list, sample_weight = center_weights)


81.52991492224803
77.28176379745612
78.28636103886976
77.18456260234456
76.92814607141437
76.64724271583984
76.85096597898489


  server_kmeans.fit(center_list, sample_weight = center_weights)


82.05321574139471


In [29]:
num_of_rep = 20
res = np.zeros(num_of_rep)
num_clusters = 10
size = 2000

for t in range(num_of_rep):
    d = uniform_kmeans(size, X_list, n_c = 10)
    d1 = d[:,:30]
    d2 = d[:,30:60]
    d3 = d[:,60:]
    d_list = [d1,d2,d3]

    dkmeans = distkmeans(d_list, np.ones(size)/size, n_c= num_clusters)

    dist = dkmeans.transform(X_std)
    cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
    res[t] = cost
    print(cost)

name_str = "./res/kmeans/distkmeans-uniform-"+str(num_clusters)+"c"+str(size)+"s.npy"    
np.save(name_str, res)

79.39085358827454
76.5606840728351


  server_kmeans.fit(center_list, sample_weight = center_weights)


83.18637669950753


  server_kmeans.fit(center_list, sample_weight = center_weights)


77.99548294217148


  server_kmeans.fit(center_list, sample_weight = center_weights)


79.61920222989568
75.58056781913068
76.44567707235083
76.60954679687987
76.29462298199235
79.02140500283592
76.85751445003629
76.78936617684718


  server_kmeans.fit(center_list, sample_weight = center_weights)


82.20262456314619


  server_kmeans.fit(center_list, sample_weight = center_weights)


80.25219473048512


  server_kmeans.fit(center_list, sample_weight = center_weights)


77.63127298677428
75.6717266105124
75.50828962349925
79.17990053311787
78.45062958858257
79.27173004799764


In [33]:
# change the coreset size for Coreset+K and Coreset+D
num_of_rep = 20
num_clusters = 10
size_list = [2000,3000,4000,5000,6000]

for size in size_list:
    print("coreset size %d" % size)
    res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        d, w = coreset_kmeans(size, X_list, n_c = num_clusters)
        d1 = d[:,:30]
        d2 = d[:,30:60]
        d3 = d[:,60:]
        d_list = [d1,d2,d3]

        dkmeans = distkmeans(d_list, w, n_c= num_clusters)

        dist = dkmeans.transform(X_std)
        cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
        res[t] = cost
        print(cost)

    name_str = "./res/kmeans/distkmeans-coreset-"+str(num_clusters)+"c"+str(size)+"s.npy"    
    np.save(name_str, res)

coreset size 2000
75.5363547527181
77.38528668315446
75.53559154011047
75.59207796213053
76.350739317434
75.77777512618921
76.69133547398981
78.13804120896341
75.8709138204326
76.17325606768101
75.66381315428715
76.1998937903758
75.62603211543279
75.80646553061983


  server_kmeans.fit(center_list, sample_weight = center_weights)


78.04105303405287
77.68226659404947
75.27006407733847
75.97180795113466
76.83491150079142
76.25849345148525
coreset size 3000
77.24656687832977
74.94057169824639
75.18381189429951
75.3295937960009
75.66144378099693
76.19558364633686
75.44520705053863
75.15427070928938
75.21048867333423


  server_kmeans.fit(center_list, sample_weight = center_weights)


77.0518804999432
75.11972554758319
75.39883984070383
75.51157300829038
75.10381522260461
76.06856443709773
75.26683755484503
74.89981248558833
75.04700302337615
75.61106776822957
74.93432544166811
coreset size 4000
75.80218102722738
75.19652275025298
74.84841955891132
75.88691771677671
75.73979874436826
75.29957497705124
75.62102698514562
74.95892011503688
75.12702011027417
74.90765721972211
75.60267507605069
76.05031493206421
75.38450238462289
75.8167428922834
76.18911899380858
75.94699330919715
74.86438957102115
74.93108407641333
76.22114446192295
75.3966891382549
coreset size 5000
76.11370380000415
75.62497977654564
75.21242734738549
75.79827494698822
74.72059736331603
75.3480332719696
75.21773664042418
75.03646174321243
74.47337667176993
76.20071900687743
75.71927685793742
74.62078704433387
74.91108808548769
75.5216598459384
75.51837674922942
75.00152204208328
75.88803504599571
74.94495444480945
74.85558645545872
74.7028586988418
coreset size 6000
75.14406624858219
75.4996995508876

In [34]:
num_of_rep = 20
num_clusters = 10
size_list = [2000,3000,4000,5000,6000]

for size in size_list:
    print("coreset size %d" % size)
    res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        d, w = coreset_kmeans(size, X_list, n_c = num_clusters)
        ckmeans = KMeans(n_clusters = num_clusters)
        ckmeans.fit(d,sample_weight = w)

        dist = ckmeans.transform(X_std)
        cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
        res[t] = cost
        print(cost)

    name_str = "./res/kmeans/kmeans-coreset-"+str(num_clusters)+"c"+str(size)+"s.npy"    
    np.save(name_str, res)

coreset size 2000
72.46134317750244
72.52116586551901
72.61014470804933
72.81573168269493
72.33371461289352
72.26337975564925
73.06211024131603
73.62089593151009
73.11230152797461
72.39632365778596
72.67820645948818
73.0021798295595
72.4452990831582
72.93485535299025
72.2830870503119
73.21546904098415
72.47913020763211
72.44097881116309
72.51648313718226
72.50294769810229
coreset size 3000
72.443642053893
72.29599203601175
72.18008283071654
72.29967256103136
72.14654954832103
72.0906063076754
72.17480883473907
72.17939671683786
72.71576122429136
72.33543159607302
72.07657268460652
72.23593545905437
72.16610404418198
71.98147442021279
72.16432199864663
72.45768627221416
72.01440183988669
72.2538161800313
72.41816291473778
72.02112919970678
coreset size 4000
72.01556694584552
72.3091542428201
72.05348581287659
72.07560139161782
72.66235345865563
71.95339056708994
72.23278218265736
72.02783723524247
72.18106017754748
72.05509406898872
71.95371265335658
72.55713881255464
72.06705877656414


In [39]:
# change the coreset size for Uniform+K and Uniform+D
num_of_rep = 20
num_clusters = 10
size_list = [1000]

for size in size_list:
    print("uniform size %d" % size)
    res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        d = uniform_kmeans(size, X_list, n_c = num_clusters)
        d1 = d[:,:30]
        d2 = d[:,30:60]
        d3 = d[:,60:]
        d_list = [d1,d2,d3]

        dkmeans = distkmeans(d_list, np.ones(size)/size, n_c= num_clusters)

        dist = dkmeans.transform(X_std)
        cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
        res[t] = cost
        print(cost)

    name_str = "./res/kmeans/distkmeans-uniform-"+str(num_clusters)+"c"+str(size)+"s.npy"    
    np.save(name_str, res)

uniform size 1000
80.26167986751372
81.99329500553782
76.91167616043923
77.75682460496436
78.0466078070221
76.71822537725775
78.0477706766993


  server_kmeans.fit(center_list, sample_weight = center_weights)


79.17226355741911


  server_kmeans.fit(center_list, sample_weight = center_weights)


79.50080863952059


  server_kmeans.fit(center_list, sample_weight = center_weights)


79.94478846391101
77.69643234386326
81.30715834193082
79.99356675488264
78.19853178262592
79.59947040222353
78.95327589875124
77.77877250341777
78.54525663340135
76.74863838445513
80.144310014674


In [38]:
# change the coreset size for Coreset+K and Coreset+D
num_of_rep = 20
num_clusters = 10
size_list = [1000]

for size in size_list:
    print("uniform size %d" % size)
    res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        d = uniform_kmeans(size, X_list, n_c = num_clusters)

        ukmeans = KMeans(n_clusters = num_clusters)
        ukmeans.fit(d)

        dist = ukmeans.transform(X_std)
        cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
        res[t] = cost
        print(cost)
    
    name_str = "./res/kmeans/kmeans-uniform-"+str(num_clusters)+"c"+str(size)+"s.npy"    
    np.save(name_str, res)

uniform size 1000
74.30569766003276
73.89319646069934
75.26910626431298
76.19973633365068
73.9986084396271
75.35557061985534
74.85766155992279
75.4529064049674
75.09489693326229
74.03946914580823
75.2082713552428
76.5138539191653
73.64047519027662
75.83735016291122
75.73563004018227
73.95187054288306
74.7788408560981
73.99311710707036
74.81175722892104
75.17107539256627


# Clustering on Covtype Dataset

In [90]:
data = pd.read_csv("covtype.csv")

X = data.iloc[:,:-1]

print(X.describe())

X = X.to_numpy()

# normalized the training set X
scaler = StandardScaler()
scaler.fit(X[:,:10])
X_p = scaler.transform(X[:,:10])

X_std = X
X_std[:,:10] = X_p

num_of_data, _ = X_std.shape

           Elevation         Aspect          Slope  \
count  581012.000000  581012.000000  581012.000000   
mean     2959.365301     155.656807      14.103704   
std       279.984734     111.913721       7.488242   
min      1859.000000       0.000000       0.000000   
25%      2809.000000      58.000000       9.000000   
50%      2996.000000     127.000000      13.000000   
75%      3163.000000     260.000000      18.000000   
max      3858.000000     360.000000      66.000000   

       Horizontal_Distance_To_Hydrology  Vertical_Distance_To_Hydrology  \
count                     581012.000000                   581012.000000   
mean                         269.428217                       46.418855   
std                          212.549356                       58.295232   
min                            0.000000                     -173.000000   
25%                          108.000000                        7.000000   
50%                          218.000000                       3

In [91]:
num_of_rep = 20
res = np.zeros(num_of_rep)
num_clusters = 15

for t in range(num_of_rep):
    kmeans = KMeans(n_clusters = num_clusters)
    kmeans.fit(X_std)
    record = kmeans.inertia_ / num_of_data
    res[t] = record
    print(record)

name_str = "./res/kmeans/covtype-kmeans-"+str(num_clusters)+"c.npy"    
np.save(name_str, res)

3.1780513260801846
3.1959430379477545
3.184653949958412
3.1858349912713515
3.199568915142822
3.1726964810199285
3.184297104649487
3.19621003436975
3.2120655025438474
3.1886789882385504
3.186888378622155
3.1853065329395958
3.1841425722191476
3.2031915805006403
3.163123229579288
3.1749974972728743
3.1924252092797287
3.185747136006516
3.1926545721088915
3.1769870503616127


In [92]:
num_of_rep = 20
res = np.zeros(num_of_rep)
num_clusters = 15

list1 = list(range(0,54,3))
list2 = list(range(1,54,3))
list3 = list(range(2,54,3))
X1 = X_std[:,list1]
X2 = X_std[:,list2]
X3 = X_std[:,list3]

X_list = [X1, X2, X3]

X_std_trans = np.hstack(X_list)

for t in range(num_of_rep):
    dkmeans = distkmeans(X_list, np.ones(num_of_data) / num_of_data, n_c= num_clusters)

    dist = dkmeans.transform(X_std_trans)
    cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
    res[t] = cost
    print(cost)

name_str = "./res/kmeans/covtype-distkmeans-"+str(num_clusters)+"c.npy"    
np.save(name_str, res)

3.400514935409818
3.3907367605325804
3.4167273395421085
3.3934343187545637
3.37974429187362
3.4207025866092478
3.400673131225944
3.4170289726922762
3.4024567688690537
3.402768895046087
3.3712883971165346
3.424768985332094
3.4056999535502888
3.380980260179036
3.434287351353755
3.4016020532299818
3.4334676532564314
3.4318648315467666
3.370417704391156
3.36952016017511


In [96]:
num_of_rep = 20
num_clusters = 15
size_list = [1000,2000,3000,4000,5000,6000]

for size in size_list:
    print("coreset size %d" % size)
    res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        d, w = coreset_kmeans(size, X_list, n_c = num_clusters)
        ckmeans = KMeans(n_clusters = num_clusters)
        ckmeans.fit(d,sample_weight = w)

        dist = ckmeans.transform(X_std_trans)
        cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
        res[t] = cost
        print(cost)

    name_str = "./res/kmeans/covtype-kmeans-coreset-"+str(num_clusters)+"c"+str(size)+"s.npy"    
    np.save(name_str, res)

coreset size 1000
3.2898266830379734
3.3086414846217322
3.2924826492582064
3.249668718360195
3.284725320716797
3.2994393605037278
3.2703715282836328
3.297355762929792
3.3087708135522167
3.2844187381181893
3.2942616979845476
3.292084880606381
3.277179835291704
3.3017662230661387
3.3175404105492063
3.326396822321754
3.278218101160364
3.285384239838911
3.277953436477948
3.2809663223689283
coreset size 2000
3.2741304922663246
3.2420962768410018
3.2157663237348646
3.2387068902258784
3.2483696246141696
3.217240498567858
3.2174723797600255
3.24655777493909
3.2282853259059983
3.222359071125093
3.2608391916754007
3.2350952762149947
3.2506141839911202
3.2441009130628884
3.2383328559173608
3.2298465339958935
3.267232970651415
3.269714717619971
3.225855239006034
3.2329982732456255
coreset size 3000
3.22781141759904
3.261541271119441
3.192490845444669
3.2321037494029157
3.2683805733203317
3.2385632246012475
3.218144078275672
3.205305927625544
3.2167541777143924
3.215705669029069
3.247547779921941
3

In [97]:
num_of_rep = 20
num_clusters = 15
size_list = [1000,2000,3000,4000,5000,6000]

for size in size_list:
    print("coreset size %d" % size)
    res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        d, w = coreset_kmeans(size, X_list, n_c = num_clusters)
        d1 = d[:,:18]
        d2 = d[:,18:36]
        d3 = d[:,36:]
        d_list = [d1,d2,d3]

        dkmeans = distkmeans(d_list, w, n_c= num_clusters)

        dist = dkmeans.transform(X_std_trans)
        cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
        res[t] = cost
        print(cost)

    name_str = "./res/kmeans/covtype-distkmeans-coreset-"+str(num_clusters)+"c"+str(size)+"s.npy"    
    np.save(name_str, res)

coreset size 1000
3.4196340533149274
3.501181486055294
3.4678528418371806
3.4288431516678166
3.4761833566133573
3.483486614554225
3.445318320424309
3.4687201965083756
3.4577075687762995
3.4335198879770807
3.4522775639026113
3.4682354822584482
3.425161208913709
3.460937905496778
3.5131938963142377
3.453655725578486
3.445441940941207
3.44667209260458
3.4546379293127054
3.5143833265959663
coreset size 2000
3.429371676882337
3.391631352158863
3.434196589673751
3.3933614535701087
3.434846350437633
3.444434210209303
3.452095430023677
3.4727336109402462
3.4685969645981416
3.404375682446029
3.4631819058115307
3.4001260257312307
3.4350774378860947
3.4105108329029203
3.423016500273501
3.37556068070627
3.428585420523941
3.3958309656528702
3.3733923157863606
3.412079083243275
coreset size 3000
3.469744573982994
3.470611730920805
3.4432386011455645
3.3602998477749737
3.4098668399620626
3.383447554362119
3.427475459734208
3.430393372105963
3.415797512650994
3.391928007274525
3.411034289505124
3.4511

In [94]:
num_of_rep = 20
num_clusters = 15
size_list = [1000,2000,3000,4000,5000,6000]

for size in size_list:
    print("uniform size %d" % size)
    res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        d = uniform_kmeans(size, X_list, n_c = num_clusters)

        ukmeans = KMeans(n_clusters = num_clusters)
        ukmeans.fit(d)

        dist = ukmeans.transform(X_std_trans)
        cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
        res[t] = cost
        print(cost)
    
    name_str = "./res/kmeans/covtype-kmeans-uniform-"+str(num_clusters)+"c"+str(size)+"s.npy"    
    np.save(name_str, res)

uniform size 1000
3.26736749176371
3.2728323195799094
3.2761563283375437
3.3169900246406434
3.291810113916662
3.3153846393904427
3.284151442577258
3.3173644151322716
3.2860709597010764
3.3557919825053393
3.3220633727943487
3.3125700895457393
3.287428178379858
3.2788561591227814
3.245343791752276
3.2803575773479507
3.2991568731234135
3.3152476850486745
3.2787772232486536
3.272919772096808
uniform size 2000
3.2169450795886827
3.2364525532529353
3.2818147520651633
3.2936509663882636
3.2280513243508504
3.234677362461995
3.208243446637511
3.2378444537321984
3.2406090492569275
3.248663052500236
3.2664964798846983
3.235265416052846
3.2453317767582814
3.260922859507817
3.2780220314614192
3.2327066035525047
3.2508435311129005
3.244703591827094
3.2414354834281185
3.275688424386008
uniform size 3000
3.2368446883211917
3.2354171620962897
3.2078106011430525
3.2064093032614154
3.2473835314036803
3.2304340958059887
3.21792682872496
3.2489931654221107
3.248914834220576
3.223506036954928
3.238908583321

In [95]:
num_of_rep = 20
num_clusters = 15
size_list = [1000,2000,3000,4000,5000,6000]

for size in size_list:
    print("uniform size %d" % size)
    res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        d = uniform_kmeans(size, X_list, n_c = num_clusters)
        d1 = d[:,:18]
        d2 = d[:,18:36]
        d3 = d[:,36:]
        d_list = [d1,d2,d3]

        dkmeans = distkmeans(d_list, np.ones(size)/size, n_c= num_clusters)

        dist = dkmeans.transform(X_std_trans)
        cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
        res[t] = cost
        print(cost)

    name_str = "./res/kmeans/covtype-distkmeans-uniform-"+str(num_clusters)+"c"+str(size)+"s.npy"    
    np.save(name_str, res)

uniform size 1000
3.4821484964670235
3.533810428636744
3.5292784962529944
3.486587078127785
3.4049396683663815
3.4069604332482215
3.485294359603747
3.483061450592797
3.5361518392509734
3.4360312709321943
3.5151440414362844
3.4379570868178146
3.4779650654602134
3.4411041199417998
3.4754364220955147
3.4900856679901504
3.4406058479421686
3.5105982231968644
3.4198546060237223
3.4646741567637194
uniform size 2000
3.487079834178164
3.4459780191798823
3.4168762376278674
3.4494343328509607
3.4400505038604554
3.453976479993548
3.4137865778919947
3.383634243395492
3.377735655174665
3.410433776746688
3.434113373373711
3.405682165612733
3.3868844596821095
3.3996679846406366
3.517135192229797
3.465291637080469
3.4033220797089307
3.401814654763947
3.4609786268762734
3.4779700667065896
uniform size 3000
3.4655033676963805
3.4028656941520015
3.440124721654963
3.4309412585372976
3.4348421948059737
3.3735658612736086
3.445336526806892
3.4458274785377667
3.4439304676690665
3.4552823051213917
3.4188021009

# Clustering on 3D Spatial

In [2]:
data = pd.read_csv("3D_spatial_network.txt",header=None)

print(data.describe())

X = data.to_numpy()

# normalized the training set X
scaler = StandardScaler()
scaler.fit(X)
X_std = scaler.transform(X)

num_of_data, _ = X_std.shape
print(X_std.shape)

                  0              1              2              3
count  4.348740e+05  434874.000000  434874.000000  434874.000000
mean   9.786998e+07       9.731836      57.083758      22.185405
std    3.775233e+07       0.627349       0.289479      18.617989
min    4.482444e+06       8.146126      56.582486      -8.608184
25%    8.267897e+07       9.337649      56.846049       7.028053
50%    1.019797e+08       9.887195      57.042498      17.574678
75%    1.259547e+08      10.172359      57.308669      31.810224
max    1.577424e+08      11.199326      57.750511     134.441947
(434874, 4)


In [103]:
num_of_rep = 20
res = np.zeros(num_of_rep)
num_clusters = 15

for t in range(num_of_rep):
    kmeans = KMeans(n_clusters = num_clusters)
    kmeans.fit(X_std)
    record = kmeans.inertia_ / num_of_data
    res[t] = record
    print(record)

name_str = "./res/kmeans/spatial-kmeans-"+str(num_clusters)+"c.npy"    
np.save(name_str, res)

0.7847040302243996
0.7875931695627787
0.7847097171733699
0.7872285863005691
0.7846955419265766
0.7903248205966354
0.7890777220988681
0.7861733053980996
0.7855462413102573
0.7861316299178368
0.7921331901857225
0.7843259969211426
0.7847427231969911
0.7892717575361684
0.789018174798231
0.7875832268037324
0.7847872553344056
0.7856072066621225
0.7856915043811232
0.7888056542383599


In [104]:
num_of_rep = 20
res = np.zeros(num_of_rep)
num_clusters = 15

X1 = X_std[:,:2]
X2 = X_std[:,2:]

X_list = [X1, X2]

X_std_trans = np.hstack(X_list)

for t in range(num_of_rep):
    dkmeans = distkmeans(X_list, np.ones(num_of_data) / num_of_data, n_c= num_clusters)

    dist = dkmeans.transform(X_std_trans)
    cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
    res[t] = cost
    print(cost)

name_str = "./res/kmeans/spatial-distkmeans-"+str(num_clusters)+"c.npy"    
np.save(name_str, res)

0.8450514423234038
0.8498482165030932
0.8437884983898137
0.8690295311330035
0.8414840829332759
0.8433986610199014
0.8540652649487867
0.8247016482717416
0.8237930851830986
0.8772293654777515
0.8451708317005088
0.8518463323431745
0.8484327183474133
0.8298502061716986
0.838736609091271
0.8513961876453311
0.8366581519139412
0.8443419850764868
0.825807461940817
0.8444608402707191


In [107]:
num_of_rep = 20
num_clusters = 15
size_list = [1000,2000,3000,4000,5000,6000]

for size in size_list:
    print("coreset size %d" % size)
    res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        d, w = coreset_kmeans(size, X_list, n_c = num_clusters)
        ckmeans = KMeans(n_clusters = num_clusters)
        ckmeans.fit(d,sample_weight = w)

        dist = ckmeans.transform(X_std_trans)
        cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
        res[t] = cost
        print(cost)

    name_str = "./res/kmeans/spatial-kmeans-coreset-"+str(num_clusters)+"c"+str(size)+"s.npy"    
    np.save(name_str, res)

coreset size 1000
0.8077931624414815
0.8197158771054085
0.8260656415531523
0.8249523507894615
0.7986061987502341
0.8142927053923622
0.8107323699046184
0.8118944585293171
0.8028279077072413
0.7989481960331968
0.8255686822003111
0.8165605632522994
0.8142437687689175
0.8056498792485458
0.8141138088733543
0.822286371928261
0.8082457343346692
0.8036404885896147
0.8048783053740113
0.8060689937782943
coreset size 2000
0.7947454487717492
0.8022803652396693
0.7999686251776565
0.7955826527043445
0.7969361403014993
0.8064728916957987
0.8006835256896426
0.7954158696355621
0.8040857290826507
0.7980356524294989
0.7982928018452227
0.7987446553196498
0.7970422879453178
0.7975442363823534
0.7987985993539002
0.803954453853615
0.79793445353327
0.8030256296510915
0.7983779766852734
0.804082200359874
coreset size 3000
0.7917260931147311
0.8069340641402499
0.7974374721177198
0.8040005242616061
0.8074011484144983
0.7928629245054446
0.7918417132321751
0.789632829691635
0.7965401240401406
0.7919169944244212
0.

In [108]:
num_of_rep = 20
num_clusters = 15
size_list = [1000,2000,3000,4000,5000,6000]

for size in size_list:
    print("coreset size %d" % size)
    res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        d, w = coreset_kmeans(size, X_list, n_c = num_clusters)
        d1 = d[:,:2]
        d2 = d[:,2:]
        d_list = [d1,d2]

        dkmeans = distkmeans(d_list, w, n_c= num_clusters)

        dist = dkmeans.transform(X_std_trans)
        cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
        res[t] = cost
        print(cost)

    name_str = "./res/kmeans/spatial-distkmeans-coreset-"+str(num_clusters)+"c"+str(size)+"s.npy"    
    np.save(name_str, res)

coreset size 1000
0.8461811531504748
0.8609945114296333
0.858992995749023
0.8826224882796893
0.8640108009244357
0.8615669252022418
0.8776535340412118
0.8528398519483159
0.8723138329626635
0.8493953101124191
0.8606183953287111
0.8720132699347405
0.8801100002654031
0.8691686806063559
0.8590075782468715
0.8412918893702911
0.900703428794087
0.8741940401419053
0.8471076814317112
0.877913060479866
coreset size 2000
0.8447417112722602
0.8878277529691234
0.863423777972122
0.8535077560900346
0.8731517678909337
0.8293901946741447
0.8607539054433327
0.8633998295514942
0.8493416431560288
0.8458752278432233
0.8357524982411622
0.8323259744953254
0.8480199571413919
0.8463129865640401
0.8512202939480988
0.8643374819792005
0.8525781906524887
0.8465380417902568
0.85458853740753
0.8691172674742537
coreset size 3000
0.8626340024858979
0.8351579730078912
0.8569179797176403
0.8738198305088937
0.8473541820442084
0.8796889889141012
0.8589544712323437
0.8136615503661843
0.8509119238567737
0.8623066405486682
0.

In [105]:
num_of_rep = 20
num_clusters = 15
size_list = [1000,2000,3000,4000,5000,6000]

for size in size_list:
    print("uniform size %d" % size)
    res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        d = uniform_kmeans(size, X_list, n_c = num_clusters)

        ukmeans = KMeans(n_clusters = num_clusters)
        ukmeans.fit(d)

        dist = ukmeans.transform(X_std_trans)
        cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
        res[t] = cost
        print(cost)
    
    name_str = "./res/kmeans/spatial-kmeans-uniform-"+str(num_clusters)+"c"+str(size)+"s.npy"    
    np.save(name_str, res)

uniform size 1000
0.8267216991991627
0.811277517821705
0.8091104418178058
0.8108551332464147
0.8497368824429135
0.8152098595316768
0.8195572829004788
0.8141765232034749
0.809062032222823
0.8300020145515744
0.8126219057905849
0.8035277641197607
0.8075829549139251
0.821585970769965
0.8210186408185973
0.8330848408423663
0.8038007720741569
0.8186120275629049
0.8132519527844996
0.8770269137316035
uniform size 2000
0.793189940397404
0.8016336519824274
0.8039762807623798
0.7928893504035432
0.809276778560379
0.7987932440430808
0.7939778011944124
0.7947421894352891
0.7949075827590643
0.8003954071793488
0.8099705440299231
0.8046041938552033
0.8049042796122202
0.7957710013945288
0.8105734989628518
0.8055846986728143
0.800032071179148
0.8055775095256887
0.799162547672273
0.8064739021334193
uniform size 3000
0.7958629947452464
0.796686015463961
0.7977284223081818
0.793622791854016
0.792733925606197
0.795644615533378
0.8088607635565336
0.8045575501380132
0.7951420882206736
0.7957527430313651
0.80426

In [106]:
num_of_rep = 20
num_clusters = 15
size_list = [1000,2000,3000,4000,5000,6000]

for size in size_list:
    print("uniform size %d" % size)
    res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        d = uniform_kmeans(size, X_list, n_c = num_clusters)
        d1 = d[:,:2]
        d2 = d[:,2:]
        d_list = [d1,d2]

        dkmeans = distkmeans(d_list, np.ones(size)/size, n_c= num_clusters)

        dist = dkmeans.transform(X_std_trans)
        cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
        res[t] = cost
        print(cost)

    name_str = "./res/kmeans/spatial-distkmeans-uniform-"+str(num_clusters)+"c"+str(size)+"s.npy"    
    np.save(name_str, res)

uniform size 1000
0.8603921119292366
0.8707514586539219
0.8550908674680785
0.8832174226091238
0.8879095155359725
0.8849674160141193
0.858182698942717
0.867193664200089
0.8709263914647631
0.8611400377934386
0.8615185346756039
0.8202394989228053
0.8828444855212487
0.8412124521782437
0.8467296920509502
0.8509133520408517
0.8548951404055302
0.859701151628693
0.858174003106717
0.8307060378976827
uniform size 2000
0.8447959618054987
0.8581213401740962
0.8543031662610981
0.8957000270312151
0.8528841741128923
0.8779824481237809
0.8565814565417921
0.8835648972225693
0.8496215111155243
0.857741527744811
0.857604148022109
0.8604718018045612
0.844161285120432
0.8678001304672828
0.8597453934021806
0.8586405937033236
0.8923178493509342
0.8439213933738555
0.8360390483000034
0.8523859695189664
uniform size 3000
0.8550516701263361
0.8975405368597453
0.8194715993380614
0.8748124501282735
0.8610038857602589
0.8880627817475278
0.8426418557937836
0.8649158574928673
0.8245559703857539
0.8546089274291181
0.8

In [None]:
#KC House Data

In [4]:
# reading data
data = pd.read_csv("kc_house_data.csv")

data.describe()

y = data.iloc[:,2].to_numpy()
X = data.iloc[:,3:].to_numpy()

num_of_data, _ = X.shape
print(X.shape)

scaler = StandardScaler()
scaler.fit(X)
X_std = scaler.transform(X)

print(y[:10])

(21613, 18)
[ 221900.  538000.  180000.  604000.  510000. 1225000.  257500.  291850.
  229500.  323000.]


In [5]:
num_of_rep = 20
res = np.zeros(num_of_rep)
num_clusters = 5

for t in range(num_of_rep):
    kmeans = KMeans(n_clusters = num_clusters)
    kmeans.fit(X_std)
    record = kmeans.inertia_ / num_of_data
    res[t] = record
    print(record)

name_str = "./res/kmeans/kc-kmeans-"+str(num_clusters)+"c.npy"    
np.save(name_str, res)

11.305490640932755
11.433328987072965
11.433268379049498
11.274149144006264
11.433346859692067
11.437495261213812
11.433337685413182
11.43331218009778
11.281967098707677
11.4374918821151
11.433369175655573
11.281971264475233
11.43331218009778
11.490116199561177
11.490081664375573
11.433379981027485
11.433291057652962
11.274405981314807
11.492840303184106
11.433315797988188


In [10]:
num_of_rep = 20
num_clusters = 5
size_list = [100,200,300,400,500,600]

X1 = X_std[:,:9]
X2 = X_std[:,9:]

X_list = [X1, X2]

X_std_trans = np.hstack(X_list)

for t in range(num_of_rep):
    dkmeans = distkmeans(X_list, np.ones(num_of_data) / num_of_data, n_c= num_clusters)

    dist = dkmeans.transform(X_std_trans)
    cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
    res[t] = cost
    print(cost)

name_str = "./res/kmeans/kc-distkmeans-"+str(num_clusters)+"c.npy"    
np.save(name_str, res)

12.170222719510488
12.004861876230374
12.003905986398873
12.080629089455641
12.003634895246089
12.004031792000038
11.968852412807033
11.715374327037882
12.0836689705196
12.004470681230716
12.005168670343693
11.716024653649352
12.003929830188135
11.734436763471965
12.319030672210452
12.496691888435242
12.004593415251547
12.127371988591268
12.00458751165708
12.004306706118031


In [11]:
num_of_rep = 20
num_clusters = 5
size_list = [100,200,300,400,500,600]

for size in size_list:
    print("coreset size %d" % size)
    res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        d, w = coreset_kmeans(size, X_list, n_c = num_clusters)
        ckmeans = KMeans(n_clusters = num_clusters)
        ckmeans.fit(d,sample_weight = w)

        dist = ckmeans.transform(X_std_trans)
        cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
        res[t] = cost
        print(cost)

    name_str = "./res/kmeans/kc-kmeans-coreset-"+str(num_clusters)+"c"+str(size)+"s.npy"    
    np.save(name_str, res)

coreset size 100
11.818976199206558
12.009876159789544
12.756929598079362
12.119352720894224
12.216713583280283
11.86885073267288
12.825818807553476
12.1270614296096
11.9341739403557
11.975674749117522
11.775759409432988
11.870141989515448
11.778758136595101
12.568284190604617
12.51000603318906
12.252521232102149
12.101095995031683
12.183203976239055
12.129685896982773
13.278386954005981
coreset size 200
11.687061216909935
11.706163741314565
11.764974767583665
12.002907090838827
11.54333270670979
11.760505385186338
11.644149776704477
11.624536482146546
12.155378394938479
11.766837428043356
12.001060811090111
11.560582560923809
11.758263226193735
11.860972153984473
11.599124369949353
11.526764311861774
11.557150348567049
11.757870576845274
11.4358908457431
11.913714014518023
coreset size 300
11.908237194329613
11.503798340910077
11.438992085629282
11.795976799569525
11.39205931074042
11.541721947589508
11.583815737411953
11.508012483526565
11.426277960203954
11.432439626599823
11.501102

In [12]:
num_of_rep = 20
num_clusters = 5
size_list = [100,200,300,400,500,600]

for size in size_list:
    print("uniform size %d" % size)
    res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        d = uniform_kmeans(size, X_list, n_c = num_clusters)

        ukmeans = KMeans(n_clusters = num_clusters)
        ukmeans.fit(d)

        dist = ukmeans.transform(X_std_trans)
        cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
        res[t] = cost
        print(cost)
    
    name_str = "./res/kmeans/kc-kmeans-uniform-"+str(num_clusters)+"c"+str(size)+"s.npy"    
    np.save(name_str, res)

uniform size 100
12.178699574655603
11.967466212294886
12.34872254279996
13.001945886322094
12.166274669606814
12.909139369380574
12.358304873258973
12.397044796556017
12.216567051592161
12.086721027782104
12.084798556065802
12.2148418170657
12.169530383042975
12.054251287888274
12.222597711047415
12.753734142906781
12.296377449575484
12.60878947205886
12.781540484522779
12.517985315331138
uniform size 200
11.701997400813958
11.851186218251204
12.272502075570635
12.094226780534305
12.049389513145112
12.37978175809738
12.17277306048411
11.861445438108836
11.881493818603474
12.019889190720038
11.898211516746498
12.134273131845854
11.993345642066352
11.76800980971676
12.571656369591606
12.541819999342337
12.138137237515156
11.74071101008095
12.06150168906134
11.997469609356251
uniform size 300
11.895182369619084
11.618169331323909
11.837951200942944
11.92395184315144
11.743039169581971
12.87722579161743
11.660978519261482
11.708642906486087
11.744812808835423
12.215352786497244
11.7190313

In [116]:
num_of_rep = 20
num_clusters = 5
size_list = [400]

for size in size_list:
    print("coreset size %d" % size)
    res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        d, w = coreset_kmeans(size, X_list, n_c = num_clusters)
        d1 = d[:,:9]
        d2 = d[:,9:]
        d_list = [d1,d2]

        dkmeans = distkmeans(d_list, w, n_c= num_clusters)

        dist = dkmeans.transform(X_std_trans)
        cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
        res[t] = cost
        print(cost)

    name_str = "./res/kmeans/kc-distkmeans-coreset-"+str(num_clusters)+"c"+str(size)+"s.npy"    
    np.save(name_str, res)

coreset size 400
14.715148423713782
12.120997659682748
12.481974683784564
12.542834087622332
14.73128154951885
13.517133712789786
12.230795787070386
12.069178582655146
12.228745485817678
12.820133043091344
12.576805436834047
11.793907966323182
12.608097740645896
11.741336051570187
12.10722296324622
14.579172056671702
12.213048075396543
14.566544365983015
12.551993031544988
12.47262423626019


In [98]:
num_of_rep = 20
num_clusters = 5
size_list = [400]

for size in size_list:
    print("uniform size %d" % size)
    res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        d = uniform_kmeans(size, X_list, n_c = num_clusters)
        d1 = d[:,:9]
        d2 = d[:,9:]
        d_list = [d1,d2]

        dkmeans = distkmeans(d_list, np.ones(size)/size, n_c= num_clusters)

        dist = dkmeans.transform(X_std_trans)
        cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
        res[t] = cost
        print(cost)

    name_str = "./res/kmeans/kc-distkmeans-uniform-"+str(num_clusters)+"c"+str(size)+"s.npy"    
    np.save(name_str, res)

uniform size 400
13.121733020062505


  server_kmeans.fit(center_list, sample_weight = center_weights)


15.607778386565466
13.686173060503421
12.531449778938782
12.515945613561044
12.691097502981078
13.018389633161375


  server_kmeans.fit(center_list, sample_weight = center_weights)


15.560060276334042
12.379007141816007
12.39707972331171
12.508051715940516
12.06552936636562
12.396829168686768
12.781384617921999
12.865320715497694
12.8475887514731
12.251010083243576
11.957804513829926
15.674889554059265
12.207229131241938


  server_kmeans.fit(center_list, sample_weight = center_weights)
