In [9]:
import numpy as np
import scipy as sp
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans

In [5]:
# reading data
data = pd.read_csv("year_prediction.csv")
data = data.rename(index=str, columns={"label":"year"})

In [12]:
# separate input attributes and output into different dataframes
X = data.iloc[:,1:]
Y = data.iloc[:,0]

# normalized the training set X
scaler = StandardScaler()
scaler.fit(X)
X_std = scaler.transform(X)

num_of_data, _ = X_std.shape

In [121]:
num_of_rep = 20
res = np.zeros(num_of_rep)
num_clusters = 10

for t in range(num_of_rep):
    kmeans = KMeans(n_clusters = num_clusters)
    kmeans.fit(X_std)
    record = kmeans.inertia_ / num_of_data
    res[t] = record
    print(record)

name_str = "./res/kmeans/kmeans-"+str(num_clusters)+"c.npy"    
np.save(name_str, res)

71.65047255247167
71.65037839463994
71.65038445285383
71.65043678725971
71.65055173588914
71.65043621527752
71.65043481087723
71.65066602130086
71.65744870527136
71.65045610855871
71.65057260055757
71.65039727026986
71.65057172896067
71.65042962393274
71.66007752663573
71.6505658605916
71.65057189113001
71.65051012927415
71.65057042708213
71.65041435398359


In [122]:
def decode(number, b, t):
    res = []
    for _ in range(t):
        r = number % b
        res.append(r)
        number = (number - r) // b
    res.reverse()
    return res

def encode(number_list, b, t):
    res = 0
    number_list.reverse()
    for i in range(t):
        res += number_list[i] * (b ** i)
    return res

def distkmeans(D_list, w_list, n_c = 10):
    D = np.hstack(D_list)
    num_of_party = len(D_list)
    num_of_data, _ = D_list[0].shape
    kmeans_list = []
    total_dim = 0
    label_list = []
    for j in range(num_of_party):
        kmeans_list.append(KMeans(n_clusters = n_c))
        _, party_dim = D_list[j].shape
        total_dim += party_dim
        label = kmeans_list[j].fit_predict(D_list[j], sample_weight = w_list[j])
        label_list.append(label)
    
    grids_number = n_c ** num_of_party
    center_list = np.zeros((grids_number, total_dim))
    center_weights = np.zeros(grids_number)
    
    for h in range(grids_number):
        h_decode = decode(h, n_c, num_of_party)
        temp = []
        for j in range(num_of_party):
            temp.append((kmeans_list[j].cluster_centers_)[h_decode[j],:])
        center_list[h, :] = np.concatenate(temp)
    
    for i in range(num_of_data):
        temp = [l[i] for l in label_list]
        idx = encode(temp, n_c, num_of_party)
        center_weights[idx] += w_list[i]
    
    # normalize center_weights to 1
    center_weights = center_weights / np.sum(center_weights)
    
    server_kmeans = KMeans(n_clusters = n_c)
    server_kmeans.fit(center_list, sample_weight = center_weights)
    return server_kmeans

In [140]:
def uniform_kmeans(m, D_list, n_c = 10):
    D = np.hstack(D_list)
    D_df = pd.DataFrame(D)
    C = D_df.sample(n=m, replace=False)
    C = C.to_numpy()
    return C

def coreset_kmeans(m, D_list, n_c = 10):
    alpha = 2
    D = np.hstack(D_list)
    num_of_party = len(D_list)
    num_of_data, _ = D_list[0].shape
    kmeans_list = []
    label_list = []
    groupcost_list = []
    groupcount_list = []
    sensitivity = np.zeros((num_of_data, num_of_party))
    for j in range(num_of_party):
        kmeans_list.append(KMeans(n_clusters = n_c))
        label = kmeans_list[j].fit_predict(D_list[j])
        label_list.append(label)
        groupcost = np.zeros(n_c)
        groupcount = np.zeros(n_c)
        cost = kmeans_list[j].inertia_ / num_of_data
        t = kmeans_list[j].transform(D_list[j])
        for i in range(num_of_data):
            groupcount[label[i]] += 1
            groupcost[label[i]] += t[i,label[i]] ** 2
        for i in range(num_of_data):
            sensitivity[i,j] = alpha * (t[i,label[i]] ** 2) / cost \
                        + 2 * alpha * groupcost[label[i]] / (groupcount[label[i]] * cost) + 4 * num_of_data / groupcount[label[i]]
    s = np.sum(sensitivity, axis=1)
    D_df = pd.DataFrame(np.hstack((D, (1/s).reshape(-1,1))))
    C = D_df.sample(n=m, replace=False, weights=s)
    C = C.to_numpy()
    data = C[:,:-1]
    weights = C[:,-1]
    return data, weights

In [125]:
num_of_rep = 20
res = np.zeros(num_of_rep)
num_clusters = 10
size = 1000

X1 = X_std[:,:30]
X2 = X_std[:,30:60]
X3 = X_std[:,60:]

X_list = [X1, X2, X3]

for t in range(num_of_rep):
    d, w = coreset_kmeans(size, X_list, n_c = num_clusters)
    ckmeans = KMeans(n_clusters = num_clusters)
    ckmeans.fit(d,sample_weight = w)

    dist = ckmeans.transform(X_std)
    cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
    res[t] = cost
    print(cost)

name_str = "./res/kmeans/kmeans-coreset-"+str(num_clusters)+"c"+str(size)+"s.npy"    
np.save(name_str, res)

73.46019681608169
73.44234567211778
74.23817445185314
73.50473315261743
73.8470045225081
73.83340940298976
74.10437548157843
73.14254030775075
73.37503297719871
73.22802974817957
74.44289338836494
73.44334870173103
74.29590281199722
74.0795713427415
73.44194770990273
74.06784220645517
74.2770960732288
73.5925707763438
73.69805438857364
73.67229988940595


In [126]:
num_of_rep = 20
res = np.zeros(num_of_rep)
num_clusters = 10
size = 1000

for t in range(num_of_rep):
    d = uniform_kmeans(size, X_list, n_c = num_clusters)

    ukmeans = KMeans(n_clusters = num_clusters)
    ukmeans.fit(d)

    dist = ukmeans.transform(X_std)
    cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
    res[t] = cost
    print(cost)
    
name_str = "./res/kmeans/kmeans-uniform-"+str(num_clusters)+"c"+str(size)+"s.npy"    
np.save(name_str, res)

74.60773761857315
74.46379461667574
74.31445072551573
74.77631191758621
75.17442865607056
74.84934256775868
74.35092542999016
75.22326893919981
73.99118084613146
75.91510853085776
75.21992427041205
74.39086570103453
74.0110563130215
74.55765218811541
75.300768914269
74.22327394436816
73.79944559887997
76.1726561268547
75.17044125955279
74.24681199262298


In [127]:
num_of_rep = 20
res = np.zeros(num_of_rep)
num_clusters = 10

for t in range(num_of_rep):
    dkmeans = distkmeans(X_list, np.ones(num_of_data) / num_of_data, n_c= num_clusters)

    dist = dkmeans.transform(X_std)
    cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
    res[t] = cost
    print(cost)

name_str = "./res/kmeans/distkmeans-"+str(num_clusters)+"c.npy"    
np.save(name_str, res)

75.46073871868204
74.68238959903377
74.90830729395657
74.73988430616967
74.53718666421031
75.84935711949967
74.56518608482786
74.64365223169615
75.20884984994078
74.70615628472683
74.87085874301633
75.24927899168051
75.13922804574419
74.87150920864161
74.65300333219258
74.36218738336815
74.68265369929877
75.35650019304215
74.73695602945946
74.50523114546864


In [135]:
num_of_rep = 20
res = np.zeros(num_of_rep)
num_clusters = 10
size = 1000

for t in range(num_of_rep):
    d, w = coreset_kmeans(size, X_list, n_c = num_clusters)
    d1 = d[:,:30]
    d2 = d[:,30:60]
    d3 = d[:,60:]
    d_list = [d1,d2,d3]

    dkmeans = distkmeans(d_list, w, n_c= num_clusters)

    dist = dkmeans.transform(X_std)
    cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
    res[t] = cost
    print(cost)

name_str = "./res/kmeans/distkmeans-coreset-"+str(num_clusters)+"c"+str(size)+"s.npy"    
np.save(name_str, res)

82.89508344173267
78.17332375380256
82.1344160206706
79.42853395264149
79.06359200244641
80.85760950781842
77.79485577719838
82.27749010803342
80.8792909961697
78.60302115292285
81.51629832240022
81.50396228098326
78.67847565851679
78.09106017734493
79.60122252403028
77.64333474643502
82.94515820001148
81.90834942909983
79.74436504663365
78.99582933040439


In [146]:
num_of_rep = 20
res = np.zeros(num_of_rep)
num_clusters = 10
size = 1000

for t in range(num_of_rep):
    d = uniform_kmeans(1000, X_list, n_c = 10)
    d1 = d[:,:30]
    d2 = d[:,30:60]
    d3 = d[:,60:]
    d_list = [d1,d2,d3]

    dkmeans = distkmeans(d_list, np.ones(1000)/1000, n_c= 10)

    dist = dkmeans.transform(X_std)
    cost = np.sum((np.min(dist, axis=1)) ** 2) / num_of_data
    res[t] = cost
    print(cost)

name_str = "./res/kmeans/distkmeans-uniform-"+str(num_clusters)+"c"+str(size)+"s.npy"    
np.save(name_str, res)

83.7401688838184
76.73987287624985
79.82042045170763
76.66487988730418
79.49220888350868
78.31231394027009
77.51736852769194


  server_kmeans.fit(center_list, sample_weight = center_weights)


85.90073372219145
76.60469045595319


  server_kmeans.fit(center_list, sample_weight = center_weights)


81.09228293706421
82.8331629089136
78.20572496101285
78.39609716472633
76.85923906041907


  server_kmeans.fit(center_list, sample_weight = center_weights)


86.99701133896076


  server_kmeans.fit(center_list, sample_weight = center_weights)


83.75390650887482
80.49824573909274
79.07890096114197
78.44241781896122
78.74596817766131


FileNotFoundError: [Errno 2] No such file or directory: './res/kmeans/distkmeans-uniform-10c1000s.npy'