In [1]:
using DataFrames, CSV
using StatsBase, Statistics
using LinearAlgebra

US = CSV.read("USArrests.csv", DataFrame)
# murder per 100K
# asssult per 100K
# percent urban population
# rape per 100K

USm = Matrix(US[:,2:5]) # scaling requires a matrix

USsc = standardize(ZScoreTransform, USm, dims=1)
# scale each attribute to have mean 0 and std dev 1

# a function to compute the sqaured euclidean distance between
# two vectors x and y.
function distance(x, y)
    # to be completed
    dist = 0
    for i in range(start = 1, stop = length(x), step = 1)
        dist += (x[i] - y[i])^2
    end
    return dist
end

# a function to compute the centroid of a cluster.
# X is a matrix that contains the cluster,
# with rows as observations and (numeric) columns as attributes.
function centroid(X)
    # to be completed
    (m,n) = size(X)
    centr = zeros(size(X, 2))
    for j in 1:n
        colSum = 0
        for i in 1:m
            colSum += X[i,j]
        end
        centr[j] = colSum / size(X,1)
    end
    return centr
end

# a single iteration of kmeans
function kmeans1(X, k)
    (m,n) = size(X)
    if k > m
        error("number of clusters ", k, " > number of observations ", m)
    end

    clusters0 = rand(1:k, m)  # random initial assignment
    clusters = zeros(Int8, m) # will hold the cluster assignments
    
    ii = 0
    while true
        # compute the cluster centroids
        c = zeros(k, n)
        for i in 1:k
            c[i,:] = centroid( X[ findall(x->x==i, clusters0), : ] )
        end

        # assign each observation to the nearest centroid
        for i in 1:m
            clusters[i] = 1  # initially assign observation to cluster 1
            best = distance(X[i,:], c[1,:])
            for j in 2:k
                candidate = distance(X[i,:], c[j,:])
                if candidate < best  # assign to cluster j if closer
                    best = candidate
                    clusters[i] = j
                end
            end
        end
        if clusters == clusters0
            break
        end
        clusters0 = clusters
        ii += 1
        if ii % 1 == 0
            println("iteration ", ii)
        end
    end
    ncl = length(countmap(clusters))
    if ncl != k
        @warn "clustering solution contains "*string(ncl)*" < "*string(k)*" clusters."
    end
    return clusters
end

# a function compute the value of the kmeans objective
# function, the sum of the within-cluster distances.
# X is the matrix of observations.
# k is the number of clusters.
# cl is the clustering solution.
function objective(X, k, cl)
    # to be completed
    (m,n) = size(X)
    obj = 0
    for i in 1:k
        for j in 1:m
            obj += distance(j, cl[k])
        end
    end
    return obj
end

# driver function for kmeans.
# X is the (scaled) matrix of observations.
# k is the number of clusters.
# niter is the number of times to run the k-means algorithm.
# the best of the niter candidate solutions is returned.
function kmeans(X, k; niter=50)
    # to be completed
    cl0 = kmeans1(X, k)
    solution = cl0
    solution_obj = objective(X, k, cl0)
    for i in 2:niter
        cl = kmeans1(X, k)
        if objective(X, k, cl) < solution_obj #compare values of objective function of current candidate and best solution
            solution = cl
            solution_obj = objective(X, k, cl)
        end
    end
    return solution
end

cl = kmeans(USsc, 4);    # call kmeans driver function
objective(USsc, 4, cl)   # the objective function of the best solution found
cldict = countmap(cl)    # use countmap() to see the number of obs in each cluster

iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1
iteration 1


Dict{Int8, Int64} with 4 entries:
  4 => 8
  2 => 20
  3 => 14
  1 => 8

In [2]:
# obtain information from the clustering to solution to provide
# a qualitative description of each of the four clusters of states.

one_count = 0
two_count = 0
three_count = 0
four_count = 0
one_sums = [0,0,0,0]
two_sums = [0,0,0,0]
three_sums = [0,0,0,0]
four_sums = [0,0,0,0]
for i in 1:length(cl)
    if cl[i] == 1
        one_sums += USsc[i,:]
        one_count += 1
    end
    if cl[i] == 2
        two_sums += USsc[i,:]
        two_count += 1
    end
    if cl[i] == 3
        three_sums += USsc[i,:]
        three_count += 1
    end
    if cl[i] == 4
        four_sums += USsc[i,:]
        four_count += 1
    end
end
println("Average cluster 1 values: ", one_sums / one_count) 
println("Average cluster 2 values: ", two_sums / two_count) 
println("Average cluster 3 values: ", three_sums / three_count) 
println("Average cluster 4 values: ", four_sums / four_count) 

# cluster 1 is a high-crime, low urban-population cluster
# cluster 2 is a low-crime, high urban-population cluster
# cluster 3 is a high-crime, high urban-population cluster
# cluster 4 is a low-crime, low urban-population cluster

Average cluster 1 values: [1.446328981228591, 0.9838289096858123, -0.8317925251409619, 0.35291098598810533]
Average cluster 2 values: [-0.8547793947319698, -0.7914788984373234, 0.1492252370684777, -0.5484506837152924]
Average cluster 3 values: [0.5718208805401622, 0.9336886563303335, 0.9052208163968788, 0.9483152954943685]
Average cluster 4 values: [-0.31006703534394975, -0.639086812170586, -1.125406996224773, -0.6413360438150209]


In [3]:
# The kmeans algorithm places the obs into clusters by interatively finding the center of the cluster then places obs
# into the cluster in which they are closest to the center. It does this for n iterations then returns the final 
# cluster for each obs.

# After running the function multiple tines, my results showed that the cluster with the high-crime, 
# high urban-population and the low-crime, high urban-population clusters tended to have the most observations.