In [3]:
import pandas as pd
import numpy as np

In [4]:
def _arff_attribute_name(attribute_line):
    """Return the attribute name from one stripped, lower-cased ``@attribute`` line."""
    rest = attribute_line[len('@attribute'):].strip()
    if '{' in rest:
        # nominal attribute: the name is everything before the value set
        return rest[:rest.index('{')].strip()
    # otherwise the final whitespace-delimited token is the datatype
    # (real, numeric, integer, string, ...) and everything before it is the name
    pieces = rest.rsplit(None, 1)
    return pieces[0].strip() if len(pieces) == 2 else rest


def readArff(filename, data_dir='./UCI-Data/'):
    """
    Load ``<data_dir>/<filename>.arff`` into a DataFrame of strings.

    Parameters
    ----------
    filename : str
        File name without the ``.arff`` extension.
    data_dir : str, optional
        Directory that holds the file. Defaults to ``'./UCI-Data/'``.

    Returns
    -------
    pandas.DataFrame
        One column per ``@attribute`` line, one row per data line. Every cell
        is a string; the whole file is lower-cased before parsing, so both the
        column names and the values (including class labels) are lower case.

    Notes
    -----
    Attribute names are taken from the declaration itself rather than by
    searching for the substring ``'real'``, so numeric/integer/nominal
    attributes work in any position and names containing "real" are not cut.
    Declarations whose datatype spans several tokens (e.g. ``date`` with a
    format string) are not handled specially.
    """
    import os  # function-scope import so this cell does not rely on other cells

    with open(os.path.join(data_dir, filename + '.arff'), 'r') as f:
        raw_lines = f.read().split('\n')

    # normalise, then drop comment lines and blank / whitespace-only lines
    lines = [line.strip().lower() for line in raw_lines]
    lines = [line for line in lines if line and not line.startswith('%')]

    columns = []
    data = []
    for index, line in enumerate(lines):
        if line.startswith('@attribute'):
            columns.append(line)

        if line.startswith('@data'):
            # everything after the @data marker is a data row
            data = lines[index + 1:]
            break

    cleaned_columns = [_arff_attribute_name(c) for c in columns]

    # split on commas and trim each value so "a, b" and "a ,b" parse alike
    cleaned_data = [[value.strip() for value in row.split(',')] for row in data]

    return pd.DataFrame(cleaned_data, columns=cleaned_columns)

In [5]:
def preprocess_data(df):
    """
    Split a DataFrame into a feature matrix and a label vector.

    The last column is treated as the labels and returned untouched; every
    other column is cast to float and returned as a 2-D array.

    Returns
    -------
    (xs, ys) : tuple
        ``xs`` holds the float features, ``ys`` the raw label values.
    """
    feature_frame = df.iloc[:, :-1]
    label_column = df.iloc[:, -1]

    # features arrive as strings, so the float cast is what makes them numeric
    return feature_frame.astype(float).values, label_column.values

In [6]:
# Load the iris ARFF file and split it into float features X and labels y.
X, y = preprocess_data(readArff("iris"))

In [7]:
# Use one cluster per distinct class label found in y.
k = len(set(y))
k

3

In [10]:
def initialize_centroids(X, k, rng=None):
    """
    Pick k instances of X as the initial centroids.

    Instances are drawn without replacement, so the same row is never chosen
    twice (a duplicated centroid leaves one cluster empty on the first
    assignment). Rows at different indices may still hold identical values.
    Only when k exceeds the number of instances does sampling fall back to
    drawing with replacement.

    Parameters
    ----------
    X : 2-D array of shape (n_instances, n_features)
    k : int
        Number of centroids to draw.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, the global ``np.random`` state is
        used, so ``np.random.seed`` still controls the result.

    Returns
    -------
    numpy.ndarray of shape (k, n_features), dtype float
    """
    X = np.asarray(X)
    n_instances, n_features = np.shape(X)
    sampler = np.random if rng is None else rng

    # distinct indices whenever enough instances exist
    chosen = sampler.choice(n_instances, size=k, replace=k > n_instances)

    centroids = np.zeros((k, n_features))
    centroids[:] = X[chosen]
    return centroids

In [11]:
# Draw k random instances of X as the starting centroids and display them.
centroids = initialize_centroids(X,k)
centroids

array([[5.7, 2.6, 3.5, 1. ],
       [5.1, 3.7, 1.5, 0.4],
       [5.7, 2.8, 4.1, 1.3]])

In [12]:
def euclidean_distance(x1, x2):
    """
    Return the Euclidean (L2) distance between two points.

    Both arguments must contain the same number of elements; this is enforced
    with an assertion.
    """
    assert np.size(x1) == np.size(x2)

    # per-coordinate squared gaps, accumulated with the builtin sum
    offset = x1 - x2
    total_squared = sum(np.square(offset))
    return np.sqrt(total_squared)

In [14]:
def find_nearest_centroid(instance, centroids):
    """
    Helper for create_clusters.

    Scans the centroids in order and returns the index of the one with the
    smallest euclidean_distance to `instance`. Ties go to the earliest index.
    Returns -1 when no centroid yields a distance below infinity (for
    example, when `centroids` is empty).
    """
    best_idx, best_dist = -1, float('inf')
    for idx, centroid in enumerate(centroids):
        candidate = euclidean_distance(instance, centroid)
        if not (candidate < best_dist):
            # not a strict improvement (also covers NaN distances)
            continue
        best_idx, best_dist = idx, candidate
    return best_idx

In [15]:
def create_clusters(X, k, centroids):
    """
    Assign every instance of X to its nearest centroid.

    Returns a list of k lists. The i-th list holds the row indices (not the
    rows themselves) of the instances whose nearest centroid is centroid i.
    """
    n_instances, _n_features = np.shape(X)

    # one independent bucket per centroid
    clusters = []
    for _ in range(k):
        clusters.append([])

    for row_idx, row in enumerate(X):
        clusters[find_nearest_centroid(row, centroids)].append(row_idx)

    # sanity check: every instance landed in exactly one bucket
    assert sum(len(bucket) for bucket in clusters) == n_instances

    return clusters

In [17]:
# Group instance indices by their nearest initial centroid.
clusters = create_clusters(X, k, centroids)

In [18]:
def update_centroids(X, k, clusters):
    """
    Recompute each centroid as the mean of the instances assigned to it.

    Parameters
    ----------
    X : 2-D array of shape (n_instances, n_features)
    k : int
        Number of centroids (rows of the returned array).
    clusters : list of lists of int
        clusters[i] holds the row indices of X assigned to centroid i.

    Returns
    -------
    numpy.ndarray of shape (k, n_features)

    Notes
    -----
    An empty cluster has no mean (``np.mean`` of an empty slice is NaN, and a
    NaN centroid can never attract a point again). Such a cluster is instead
    re-seeded with a randomly chosen instance of X. If X itself has no rows
    the centroid is left at zeros.
    """
    n_instances, n_features = np.shape(X)[0], np.shape(X)[1]
    new_centroids = np.zeros((k, n_features))

    for i, members in enumerate(clusters):
        if len(members) > 0:
            new_centroids[i] = np.mean(X[members], axis=0)
        elif n_instances > 0:
            # empty cluster: restart it from a random data point
            new_centroids[i] = X[np.random.randint(n_instances)]
    return new_centroids

In [19]:
# Recompute the centroids as cluster means and show how far each one moved
# from the initial centroids (element-wise difference).
new_centroids = update_centroids(X, k, clusters)
delta = new_centroids - centroids
delta

array([[-0.33333333, -0.18888889,  0.02222222,  0.05555556],
       [-0.094     , -0.282     , -0.036     , -0.156     ],
       [ 0.65054945,  0.11758242,  0.94285714,  0.43736264]])

In [35]:
def generate_prediction_groups(X, clusters):
    """
    Flatten the cluster membership lists into one label per instance.

    Returns a float vector with one entry per row of X; entry j is the index
    of the cluster whose member list contains j. Instances that appear in no
    cluster keep the initial value 0.
    """
    preds = np.zeros(np.shape(X)[0])

    # walk the (cluster index, member) pairs in cluster order
    memberships = (
        (cluster_id, member)
        for cluster_id, members in enumerate(clusters)
        for member in members
    )
    for cluster_id, member in memberships:
        preds[member] = cluster_id
    return preds

In [51]:
def generate_predition_map(X, k, clusters, labels=None):
    """
    Map each cluster index to the most common true class among its members.

    *Note:* this uses the true labels, which is why it is kept separate from
    the function that generates the raw cluster predictions.

    Parameters
    ----------
    X : unused; kept so existing calls keep working.
    k : int
        Number of clusters. Every index in ``range(k)`` is present in the
        result, initialised to 0 until a cluster overwrites it.
    clusters : list of lists of int
        clusters[i] holds the instance indices assigned to cluster i.
    labels : sequence, optional
        True class label per instance. When omitted, the notebook-level
        global ``y`` is used, as before.

    Returns
    -------
    dict
        cluster index -> majority class label. Ties, and empty clusters, go to
        the class that appears first in `labels`.
    """
    if labels is None:
        labels = y  # fall back to the notebook-level label vector

    result = {cluster_id: 0 for cluster_id in range(k)}

    # Zeroed tally over every class, built once. Its key order (first
    # appearance in `labels`) is what breaks ties in max() below.
    zero_counts = {class_val: 0 for class_val in labels}

    for i, members in enumerate(clusters):
        counts = dict(zero_counts)
        for instance in members:
            val = labels[instance]
            counts[val] = counts.get(val, 0) + 1

        result[i] = max(counts, key=counts.get)
    return result

In [63]:
n_iter = 100
def predict(X, n_clusters=None, max_iter=None):
    """
    Cluster X with k-means and return one predicted class label per instance.

    Parameters
    ----------
    X : 2-D array of shape (n_instances, n_features)
    n_clusters : int, optional
        Number of clusters. Defaults to the notebook-level global ``k``.
    max_iter : int, optional
        Upper bound on centroid updates. Defaults to the notebook-level
        global ``n_iter``.

    Returns
    -------
    list
        For each instance, the majority true class of the cluster it ended up
        in. The cluster -> class mapping comes from generate_predition_map,
        which reads the true labels, so this is an evaluation aid rather than
        an unsupervised prediction.
    """
    if n_clusters is None:
        n_clusters = k
    if max_iter is None:
        max_iter = n_iter

    centroids = initialize_centroids(X, n_clusters)
    # assign before the loop so `clusters` exists even when max_iter == 0
    clusters = create_clusters(X, n_clusters, centroids)

    for _ in range(max_iter):
        new_centroids = update_centroids(X, n_clusters, clusters)

        # if no centroid moved, the assignment above is already final
        converged = np.array_equal(new_centroids, centroids)
        centroids = new_centroids
        if converged:
            break

        # keep the assignment in step with the centroids just computed
        clusters = create_clusters(X, n_clusters, centroids)

    preds = generate_prediction_groups(X, clusters)
    pred_map = generate_predition_map(X, n_clusters, clusters)
    # preds holds cluster indices as floats; cast so the dict lookup is explicit
    results = [pred_map[int(p)] for p in preds]
    return results

In [64]:
# Run the full clustering pipeline; `out` holds one predicted class label per instance.
out = predict(X)

In [39]:
def accuracy_score(y_true, y_pred):
    """
    Return the fraction of positions where y_true and y_pred agree.

    Both inputs are converted to arrays first. Without that, comparing two
    plain Python lists with ``==`` yields a single bool instead of an
    element-wise result, and the score is wrong.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got "
            "{} and {}".format(y_true.shape, y_pred.shape)
        )

    accuracy = np.sum(y_true == y_pred, axis=0) / len(y_true)
    return accuracy

In [67]:
# Error rate = 1 - accuracy. The predictions are passed in the y_true slot here;
# the score is an equality comparison over equal-length inputs, so the order
# does not change the result.
1 - accuracy_score(out, y)

0.11333333333333329