
# Clustering with K-means 



In [4]:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import altair as alt
%matplotlib inline

ModuleNotFoundError: No module named 'altair'

In [5]:
# Load the toy dataset from a CSV file located relative to the notebook's
# working directory. The clustering cells below read its 'x' and 'y' columns.
df_kmeans = pd.read_csv('data_kmeans.csv')

In [6]:
def init_centroids( df_data, k, random_state = 42 ):
    """Pick k rows of the data at random to serve as starting centroids.

    Parameters
    ----------
    df_data : pandas.DataFrame
        The data points, one per row.
    k : int
        Number of centroids to draw. Rows are sampled without replacement,
        so pandas raises ValueError when k exceeds the number of rows.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample``. The default of 42 keeps the
        draw reproducible; pass another seed (or None) for a different
        initialisation.

    Returns
    -------
    pandas.DataFrame
        The k sampled rows, re-indexed 0..k-1.
    """
    sampled = df_data.sample(n = k, random_state = random_state)
    # Drop the original row labels so that centroid i sits at index i.
    return sampled.reset_index(drop = True)

In [7]:
# Smoke-test init_centroids: draw 7 starting centroids from the data and
# display them (the bare last expression renders the frame).
df_centroids = init_centroids( df_kmeans, 7 )
df_centroids

Unnamed: 0,x,y
0,-0.633437,0.488756
1,-0.597736,0.632194
2,-0.635867,0.166613
3,-0.539351,-0.875532
4,-0.588597,0.293062
5,-0.127985,-0.136639
6,0.342579,0.211717


## Assign every row in a data frame to a centroid

 

In [5]:
def get_distance(row, df_centroids):
    """Return the Euclidean distance from one data point to every centroid.

    Parameters
    ----------
    row : pandas.Series
        A single data point, indexed by column name.
    df_centroids : pandas.DataFrame
        One centroid per row.

    Returns
    -------
    list of float
        Distances in the same order as the rows of ``df_centroids``;
        an empty list when there are no centroids.
    """
    # Subtract the point from all centroids in one step. pandas aligns the
    # point's labels with the centroid columns, exactly as a per-centroid
    # Series subtraction would.
    diffs = np.asarray(df_centroids.sub(row, axis="columns"), dtype=float)
    return np.linalg.norm(diffs, axis=1).tolist()

def assign_to_centroid( df_data, df_centroids ):
    """Label every data point with the position of its nearest centroid.

    Parameters
    ----------
    df_data : pandas.DataFrame
        The data points, one per row.
    df_centroids : pandas.DataFrame
        One centroid per row; positions 0..m-1 are used as cluster labels.

    Returns
    -------
    pandas.Series
        Integer cluster labels carrying the same index as ``df_data``, so
        the result can be attached back to the data by label even when the
        data does not use a default 0..n-1 index.

    Raises
    ------
    ValueError
        If ``df_data`` has rows but ``df_centroids`` is empty.
    """
    nearest = []
    for _, point in df_data.iterrows():
        distances = get_distance(point, df_centroids)
        # list.index returns the first match, so ties go to the
        # lowest-numbered centroid.
        nearest.append(distances.index(min(distances)))
    return pd.Series(nearest, index=df_data.index, dtype="int64")

        

In [6]:
# Assign every data point to its nearest initial centroid, then count how
# many points landed in each cluster (value_counts lists the largest first).
s_centroid_assignment = assign_to_centroid( df_kmeans, df_centroids )
s_centroid_assignment.value_counts()

6    126
1     94
3     71
5     58
0     53
2     26
4     22
dtype: int64

## Recompute the centroids



In [7]:
def compute_centroids( df_data, s_centroid_assignment ):
    """Average the 'x' and 'y' coordinates of the points in each cluster.

    Parameters
    ----------
    df_data : pandas.DataFrame
        The data points; must contain 'x' and 'y' columns.
    s_centroid_assignment : pandas.Series
        Cluster label for each data point, matched to the data by index.

    Returns
    -------
    pandas.DataFrame
        One row per cluster that has at least one point, with columns
        'x' and 'y', ordered by ascending label and indexed 0..m-1.
    """
    # assign() works on a copy, so the caller's frame is left untouched.
    labelled = df_data.assign(Assignment=s_centroid_assignment)
    cluster_means = labelled.groupby('Assignment')[['x', 'y']].mean()
    # Discard the label index; only the positional order is kept.
    return cluster_means.reset_index(drop=True)

In [8]:
# Recompute each centroid as the mean of the points currently assigned to
# it, and display the updated centroids.
df_new_centroids = compute_centroids( df_kmeans, s_centroid_assignment)
df_new_centroids

Unnamed: 0,x,y
0,-0.715983,0.500552
1,-0.721574,0.708343
2,-0.738268,-0.11383
3,-0.696694,-0.699028
4,-0.546913,0.321364
5,-0.102365,-0.188876
6,0.523364,0.238569


## Compare two centroids
 

In [9]:
def compare_centroids( df_centroid_a, df_centroid_b ):
    """Report whether two centroid frames are identical.

    The check is delegated to the pandas ``equals`` method of the first
    argument, so it is an exact comparison with no numeric tolerance.

    Returns
    -------
    bool
        True when both objects are equal, False otherwise.
    """
    is_identical = df_centroid_a.equals(df_centroid_b)
    return is_identical

In [10]:
# Test it out, should print True followed by False:
# a frame compared with itself is equal, while the recomputed centroids
# differ from the initial random sample.
print(compare_centroids( df_new_centroids, df_new_centroids ))
print(compare_centroids( df_new_centroids, df_centroids ))

True
False


## K-means

In [11]:
def k_means( df_data, k, max_iter = None ):
    """Cluster the data into (at most) k groups with the k-means algorithm.

    Starting from k randomly sampled centroids, the function alternates
    between recomputing the centroids and reassigning every point to its
    nearest centroid until an assignment pass changes nothing.

    Parameters
    ----------
    df_data : pandas.DataFrame
        The data points, one per row.
    k : int
        Number of starting centroids.
    max_iter : int or None, optional
        Upper bound on the number of recompute/reassign passes. None (the
        default) iterates until the assignment stops changing.

    Returns
    -------
    pandas.Series
        The final cluster label of every data point. If ``max_iter`` is
        reached first, the most recent assignment is returned.
    """
    assignment = assign_to_centroid(df_data, init_centroids(df_data, k))

    passes = 0
    while max_iter is None or passes < max_iter:
        centroids = compute_centroids(df_data, assignment)
        next_assignment = assign_to_centroid(df_data, centroids)
        # Convergence is detected on the labels rather than the centroid
        # coordinates: once no point changes cluster, the centroids cannot
        # move either. compare_centroids is an exact equality check and
        # works on Series as well as frames.
        if compare_centroids(assignment, next_assignment):
            return next_assignment
        assignment = next_assignment
        passes += 1

    return assignment
        
        
        

In [12]:
# Run the full algorithm with k = 5 and show how many points ended up in
# each cluster.
s_cluster_assignment = k_means( df_kmeans, 5)
s_cluster_assignment.value_counts()

0    178
3     89
2     74
4     63
1     46
dtype: int64

## Plot the results


In [30]:
# One color per cluster label; `dom` and `rng` are matched position by position.
dom = [0,1,2,3,4]
rng = ["red", "blue", "green", "yellow", "black"]

# Attach the labels to a separate frame used only for plotting. Adding the
# column to df_kmeans itself would feed "Assignment" into the distance
# computation whenever a clustering cell is re-run afterwards.
df_clustered = df_kmeans.assign(Assignment=s_cluster_assignment)

# ":N" marks the cluster label as nominal so each cluster gets its own
# discrete color instead of a position on a continuous gradient.
alt.Chart(df_clustered, title="K-Means Scatter Plot with Color Representing Clusters").mark_circle(size=60).encode(
    x = alt.X("x"),
    y = alt.Y("y"),
    color = alt.Color("Assignment:N", scale=alt.Scale(domain=dom, range=rng))
)