In [1]:
import pandas as pd
import numpy as np
import random
from math import sqrt
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()

# get data
PATH = './data/Country-data.csv'

data_df = pd.read_csv(PATH)
data_df = data_df.set_index("country")
data_df

Unnamed: 0_level_0,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Afghanistan,90.2,10.0,7.58,44.9,1610,9.44,56.2,5.82,553
Albania,16.6,28.0,6.55,48.6,9930,4.49,76.3,1.65,4090
Algeria,27.3,38.4,4.17,31.4,12900,16.10,76.5,2.89,4460
Angola,119.0,62.3,2.85,42.9,5900,22.40,60.1,6.16,3530
Antigua and Barbuda,10.3,45.5,6.03,58.9,19100,1.44,76.8,2.13,12200
...,...,...,...,...,...,...,...,...,...
Vanuatu,29.2,46.6,5.25,52.7,2950,2.62,63.0,3.50,2970
Venezuela,17.1,28.5,4.91,17.6,16500,45.90,75.4,2.47,13500
Vietnam,23.3,72.0,6.84,80.2,4490,12.10,73.1,1.95,1310
Yemen,56.3,30.0,5.18,34.4,4480,23.60,67.5,4.67,1310


In [2]:
# Normalize data
data_matrix = data_df.to_numpy()
print('Not Normalized \n', data_matrix)
#print(data_matrix[:, 0].sum()/167)
#print(data_matrix[:, 0].std())

data = scaler.fit_transform(data_matrix)
print('Normalized \n', data)

Not Normalized 
 [[9.02e+01 1.00e+01 7.58e+00 ... 5.62e+01 5.82e+00 5.53e+02]
 [1.66e+01 2.80e+01 6.55e+00 ... 7.63e+01 1.65e+00 4.09e+03]
 [2.73e+01 3.84e+01 4.17e+00 ... 7.65e+01 2.89e+00 4.46e+03]
 ...
 [2.33e+01 7.20e+01 6.84e+00 ... 7.31e+01 1.95e+00 1.31e+03]
 [5.63e+01 3.00e+01 5.18e+00 ... 6.75e+01 4.67e+00 1.31e+03]
 [8.31e+01 3.70e+01 5.89e+00 ... 5.20e+01 5.40e+00 1.46e+03]]
Normalized 
 [[ 1.29153238 -1.13827979  0.27908825 ... -1.61909203  1.90288227
  -0.67917961]
 [-0.5389489  -0.47965843 -0.09701618 ...  0.64786643 -0.85997281
  -0.48562324]
 [-0.27283273 -0.09912164 -0.96607302 ...  0.67042323 -0.0384044
  -0.46537561]
 ...
 [-0.37231541  1.13030491  0.0088773  ...  0.28695762 -0.66120626
  -0.63775406]
 [ 0.44841668 -0.40647827 -0.59727159 ... -0.34463279  1.14094382
  -0.63775406]
 [ 1.11495062 -0.15034774 -0.33801514 ... -2.09278484  1.6246091
  -0.62954556]]


In [3]:
# Cost function for later
def cost(u, x, u_x):
    # u is a list of centroids
    # x are our data samples
    # u_x are what centroid index in u data sample at index i is assigned to
    running_sum = 0
    for i in range(len(x)):
        running_sum += ((x[i] - u[u_x[i]])**2).sum()

    return running_sum/len(x)
    

In [23]:
K = 4 # number of clusters
m = len(data)

# Pick K random samples to be our centroids
# note u_k is centroid k 
u = random.sample([i for i in data], K)

x = data.copy()

# K means   
for i in range(30):
    distances = [[] for i in range(K)]
    # for now we use euclidean distance
    for centroid_index in range(K):
        for row in x:
            distances[centroid_index].append(sqrt(((row-u[centroid_index])**2).sum()))

    # find closest centroid for each data point
    m_belong_to_k = [] # data point at index m belongs to cluster k
    for j in range(m):
        min_centroid_index = 0
        for d in range(K):
            if distances[d][j] < distances[min_centroid_index][j]:
                min_centroid_index = d
        m_belong_to_k.append(min_centroid_index)

    # calculate cost before moving
    print(f'Cost at iteration {i}: {cost(u, x, m_belong_to_k)}')

    # move centroids by finding mean of assigned points
    for j in range(len(u)):
        u[j] = np.zeros_like(u[j])
        count = 0
        for ind_belong in range(len(m_belong_to_k)):
            if m_belong_to_k[ind_belong] == j:
                count += 1
                u[j] += x[ind_belong]
        u[j] = u[j] / count

Cost at iteration 0: 7.543165481720137
Cost at iteration 1: 5.0768012120550114
Cost at iteration 2: 4.85380003472334
Cost at iteration 3: 4.666280918850804
Cost at iteration 4: 4.626698935314798
Cost at iteration 5: 4.617488112752445
Cost at iteration 6: 4.617488112752445
Cost at iteration 7: 4.617488112752445
Cost at iteration 8: 4.617488112752445
Cost at iteration 9: 4.617488112752445
Cost at iteration 10: 4.617488112752445
Cost at iteration 11: 4.617488112752445
Cost at iteration 12: 4.617488112752445
Cost at iteration 13: 4.617488112752445
Cost at iteration 14: 4.617488112752445
Cost at iteration 15: 4.617488112752445
Cost at iteration 16: 4.617488112752445
Cost at iteration 17: 4.617488112752445
Cost at iteration 18: 4.617488112752445
Cost at iteration 19: 4.617488112752445
Cost at iteration 20: 4.617488112752445
Cost at iteration 21: 4.617488112752445
Cost at iteration 22: 4.617488112752445
Cost at iteration 23: 4.617488112752445
Cost at iteration 24: 4.617488112752445
Cost at it

In [24]:
# See who is in what cluster
print(m_belong_to_k)
for i in range(K):
    print(f'In cluster {i}:')
    for j in range(len(m_belong_to_k)):
        if m_belong_to_k[j] == i:
            print(data_df.index[j], end=', ')
    print()

[3, 1, 1, 3, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 1, 2, 1, 3, 3, 1, 3, 2, 1, 3, 3, 1, 1, 1, 3, 3, 0, 1, 3, 1, 2, 2, 2, 1, 1, 1, 1, 0, 3, 1, 1, 2, 2, 0, 3, 1, 2, 3, 2, 1, 1, 3, 3, 1, 3, 1, 2, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 1, 3, 3, 2, 1, 3, 1, 1, 3, 3, 1, 1, 2, 1, 3, 3, 1, 1, 3, 2, 3, 1, 1, 1, 0, 1, 1, 3, 1, 3, 1, 2, 2, 3, 0, 2, 1, 3, 1, 1, 1, 1, 1, 2, 2, 1, 1, 3, 1, 1, 3, 1, 1, 3, 2, 2, 2, 1, 3, 2, 2, 1, 1, 3, 1, 2, 2, 1, 3, 1, 3, 3, 1, 1, 1, 1, 3, 1, 2, 2, 2, 1, 1, 1, 0, 1, 0, 3]
In cluster 0:
Congo, Rep., Equatorial Guinea, Gabon, Mongolia, Nigeria, Venezuela, Yemen, 
In cluster 1:
Albania, Algeria, Antigua and Barbuda, Argentina, Armenia, Azerbaijan, Bahamas, Bangladesh, Barbados, Belarus, Belize, Bhutan, Bolivia, Bosnia and Herzegovina, Botswana, Brazil, Bulgaria, Cambodia, Cape Verde, Chile, China, Colombia, Costa Rica, Croatia, Dominican Republic, Ecuador, Egypt, El Salvador, Estonia, Fiji, Georgia, Grenada, Guatemala, Guyana, Hungary, India, Indonesia, Iran, Iraq

In [12]:
# TODO --------
# check if this is correct, is cost supposed to stop decreasing that quickly?
# also apply more clusters and do elbow thing