In [33]:
import pandas as pd
import numpy as np

In [34]:
def _arff_attribute_name(line):
    """
    Helper for readArff.

    Returns the attribute name found in a lower-cased, stripped '@attribute ...' line.
    Works for any attribute type (real, numeric, integer, string, {nominal, ...}).
    """
    rest = line[len('@attribute'):].strip()

    # quoted name (may contain spaces): the name is whatever sits between the quotes
    if rest[:1] in ('"', "'"):
        closing = rest.find(rest[0], 1)
        if closing != -1:
            return rest[1:closing]

    # unquoted name: ends at the first whitespace ...
    tokens = rest.split(None, 1)
    name = tokens[0] if tokens else ''
    # ... or at the '{' of a nominal spec written without a separating space
    return name.split('{', 1)[0]


def readArff(filename):
    """
    Reads './UCI-Data/<filename>.arff' and returns its @data section as a DataFrame.

    filename -- name of the file inside ./UCI-Data/, without the '.arff' extension

    Every line is lower-cased (so column names AND data values come back lower-case)
    and every cell is returned as a string; callers convert to numbers themselves.
    One column is produced per '@attribute' line, whatever its declared type.
    Quoted attribute names are returned without their quotes.
    """
    with open ('./UCI-Data/'+filename+'.arff', 'r') as f:
        raw_lines = f.read().split('\n')

    # trim + lower-case, then drop blank lines and '%' comment lines
    # (trimming first means indented comments and whitespace-only lines are dropped too)
    lines = [line.strip().lower() for line in raw_lines]
    lines = [line for line in lines if line != '' and not line.startswith('%')]

    columns = []
    data = []
    for index, line in enumerate(lines):
        if line.startswith('@attribute'):
            columns.append(_arff_attribute_name(line))
        elif line.startswith('@data'):
            # get the rest of the lines excluding the one that says @data
            data = lines[index+1:]
            break

    # split each row on commas and trim every field, so 'a, b' and 'a ,b' parse alike
    cleaned_data = [[value.strip() for value in row.split(',')] for row in data]

    # create dataframe
    return pd.DataFrame(cleaned_data, columns = columns)

In [79]:
def preprocess_data(df):
    """
    Splits a DataFrame into features and labels.

    Returns (xs, ys): xs is every column except the last, cast to float, as a
    2d numpy array; ys is the last column as a numpy array, left unconverted.
    """
    label_column = df.iloc[:, -1]
    feature_columns = df.iloc[:, :-1]

    # features arrive as strings, so cast before handing back the raw array
    return feature_columns.astype(float).values, label_column.values

In [80]:
# Load ./UCI-Data/iris.arff and split it into the float feature matrix X and the label array y.
X, y = preprocess_data(readArff("iris"))

In [81]:
# Number of clusters: one per distinct value found in y.
k = len(set(y))
k

3

In [130]:
def initialize_centroids(X, k):
    """
    Returns a (k, n_features) float matrix of k randomly chosen instances for the initial centroids

    X -- 2d array-like of shape (n_instances, n_features)
    k -- number of centroids to draw

    Row indices are drawn WITHOUT replacement, so the same instance is never picked
    twice (two identical starting centroids would leave one cluster empty).
    If k is larger than the number of instances, distinct picks are impossible and
    the draw falls back to sampling with replacement instead of raising.
    Note: distinct indices do not guarantee distinct points if X contains duplicate rows.
    """
    data = np.asarray(X, dtype=float)
    n_instances, _ = np.shape(data)

    chosen = np.random.choice(n_instances, size=k, replace=k > n_instances)

    # fancy indexing returns a copy, so the centroids never alias rows of X
    return data[chosen]

In [131]:
# Draw the k starting centroids from X and display them.
centroids = initialize_centroids(X,k)
centroids

array([[6.8, 3.2, 5.9, 2.3],
       [4.8, 3.4, 1.6, 0.2],
       [6.3, 2.3, 4.4, 1.3]])

In [132]:
def euclidean_distance(x1, x2):
    """
    Calculates the euclidean distance between two points

    x1, x2 -- array-likes (numpy arrays, lists, tuples) with the same number of elements

    Both inputs are converted to float arrays first, so plain lists are accepted and
    unsigned / narrow integer arrays cannot wrap around in the subtraction or the square.
    """
    p = np.asarray(x1, dtype=float)
    q = np.asarray(x2, dtype=float)
    assert np.size(p) == np.size(q)

    # Squared distance between each coordinate
    squared = np.square(p - q)
    # axis=0 reduces over the leading axis, the same axis the builtin sum() iterates over
    return np.sqrt(np.sum(squared, axis=0))

In [133]:
# Spot check: distance from the first instance to the first centroid.
euclidean_distance(X[0], centroids[0])

5.2573757712379665

In [135]:
def find_nearest_centroid(instance, centroids):
    """
    Helper method for create_clusters.

    Measures the euclidean_distance from instance to every centroid and returns
    the index of the smallest one. Ties go to the lowest index; -1 is returned
    when no distance is strictly below infinity (e.g. centroids is empty).
    """
    distances = [euclidean_distance(instance, centroid) for centroid in centroids]

    nearest_idx = -1
    shortest = float('inf')
    for idx, distance in enumerate(distances):
        # strict '<' keeps the first of several equally close centroids
        if distance < shortest:
            nearest_idx, shortest = idx, distance
    return nearest_idx

In [218]:
def create_clusters(X, k, centroids):
    """
    Groups the instances of X by their nearest centroid.

    Returns a list of k lists; list j holds the row indices (not the rows
    themselves) of the instances whose nearest centroid is centroids[j].
    """
    n_instances, _ = np.shape(X)

    # nearest centroid index for every row, in row order
    assignments = [find_nearest_centroid(row, centroids) for row in X]

    clusters = [[] for _ in range(k)]
    for row_idx, centroid_idx in enumerate(assignments):
        clusters[centroid_idx].append(row_idx)

    # sanity check: every instance landed in exactly one cluster
    assert sum(len(members) for members in clusters) == n_instances

    return clusters

In [219]:
# Assign every instance in X to its nearest centroid and show the resulting index lists.
clusters = create_clusters(X, k, centroids)
clusters

[[100,
  102,
  103,
  104,
  105,
  107,
  108,
  109,
  110,
  111,
  112,
  114,
  115,
  116,
  117,
  118,
  120,
  122,
  124,
  125,
  128,
  129,
  130,
  131,
  132,
  135,
  136,
  137,
  139,
  140,
  141,
  143,
  144,
  145,
  147,
  148],
 [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49],
 [50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  101,
  106,
  113,
  119,
  121,
  123,
  126,
  127,
  133,
  134,
  138,
  142,
  146,
  149]]

In [220]:
# for each cluster 1...k, calculate new centroid = mean of all points assigned to that cluster
def update_centroids(X, k, clusters):
    """
    Returns a (k, n_features) matrix holding the new centroid of every cluster

    X        -- 2d array-like of shape (n_instances, n_features)
    k        -- number of clusters (rows of the result)
    clusters -- list of index lists; clusters[i] holds the row indices of X in cluster i

    A non-empty cluster's centroid is the mean of its members.
    An empty cluster has no mean (np.mean would return NaN and poison every later
    distance and convergence check), so it is re-seeded at a randomly chosen instance.
    """
    data = np.asarray(X, dtype=float)
    n_instances, n_features = np.shape(data)
    new_centroids = np.zeros((k, n_features))

    for i, members in enumerate(clusters):
        if len(members) > 0:
            new_centroids[i] = np.mean(data[members], axis=0)
        else:
            # re-seed the empty cluster so the centroid stays a finite point
            new_centroids[i] = data[np.random.choice(n_instances)]
    return new_centroids

In [212]:
# Run one update step and inspect how far each centroid moved (new minus old).
new_centroids = update_centroids(X, k, clusters)
delta = new_centroids - centroids
delta

array([[ 0.04444444, -0.11944444, -0.12777778, -0.175     ],
       [ 0.206     ,  0.018     , -0.136     ,  0.044     ],
       [-0.365625  ,  0.4546875 ,  0.01875   ,  0.1234375 ]])

In [242]:
def predict(X, n_clusters=None, max_iter=100, tol=0.0):
    """
    Runs k-means on X and returns the final (n_clusters, n_features) centroid matrix

    X          -- 2d numpy array of shape (n_instances, n_features)
    n_clusters -- number of clusters; when None, falls back to the notebook-level
                  variable k (the behaviour of the original one-argument version)
    max_iter   -- upper bound on assign/update rounds (previously hard-coded to 100)
    tol        -- stop once no centroid coordinate moved by more than tol;
                  the default 0.0 means "stop only when nothing moved at all"

    Note: despite the name, this returns centroids, not per-instance cluster labels.
    """
    if n_clusters is None:
        # backward-compatible default: reads the global k defined earlier in the notebook
        n_clusters = k

    centroids = initialize_centroids(X, n_clusters)

    for _ in range(max_iter):
        clusters = create_clusters(X, n_clusters, centroids)

        prev_centroids = centroids

        centroids = update_centroids(X, n_clusters, clusters)

        # If no centroid moved (beyond tol) => convergence
        delta = centroids - prev_centroids
        if np.all(np.abs(delta) <= tol):
            break
    return centroids

In [243]:
# Run the full clustering loop on X; the cell displays the returned centroids.
predict(X)

array([[5.88360656, 2.74098361, 4.38852459, 1.43442623],
       [5.006     , 3.418     , 1.464     , 0.244     ],
       [6.85384615, 3.07692308, 5.71538462, 2.05384615]])