In [0]:
# Imports
import pandas as pd
import io
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import sklearn.cluster
import sklearn.datasets
import sklearn.metrics

# Practicum 1

## Code

In [2]:
# this code is used to load the data that was downloaded from UCI
from google.colab import files
uploaded = files.upload()

Saving auto-mpg.data to auto-mpg (2).data


In [0]:
#
# Get The Data, put it into Pandas
#

# This method is used for transforming the data into the appropriate type
def fixNumVals(x):
  if x == '?':
    return np.nan
  else:
    return float(x)

raw = uploaded['auto-mpg.data'].decode('utf-8').split('\n') # raw input
good = [] # will contain the final 2D array

for i in range(len(raw) - 1):
  splitOnTab = raw[i].split('\t')
  splitOnSpace = splitOnTab[0].split(" ")
  
  vehicleName = splitOnTab[1][1:-1] # remove the extra quotes
  splitOnSpace = list(filter(lambda x: x is not '', splitOnSpace)) # remove empty strings
  splitOnSpace = list(map(fixNumVals, splitOnSpace)) # convert strings to floats
  splitOnSpace.append(vehicleName)
  good.append(splitOnSpace)

headers = ["mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", "model year", "origin", "car name"]

df = pd.DataFrame.from_records(good, columns = headers)
df = df[['mpg', 'displacement', 'horsepower', 'weight', 'acceleration']] # only use the continuous features

In [0]:
# Imputation
meanImp = SimpleImputer(missing_values=np.nan, strategy='mean')

# Impute the labeled data
for (columnName, columnData) in df.iteritems():
  new = meanImp.fit_transform(df[[columnName]]).ravel()
  df[columnName] = new

In [0]:
# Make another dataframe that includes the imputed values but also the 'origin' feature as a target
labeled = df
labeled['origin'] = pd.DataFrame.from_records(good, columns = [headers])[['origin']]

In [0]:
# Perform the clustering with the appropriate parameters
clustering = sklearn.cluster.AgglomerativeClustering(3, 'euclidean', None, None, 'auto', 'average', None).fit(df)
# Add the cluster labels as a column to the
df['clusterID'] = clustering.labels_

In [7]:
# Mean and Variance of each cluster
for i in range(3):
  c = df.loc[df['clusterID'] == i]
  print('Mean for cluster #{}:\n'.format(i), c.mean())
  print()
  print('Variance for cluster #{}:\n'.format(i), c.var())
  print()

Mean for cluster #0:
 mpg               27.365414
displacement     131.934211
horsepower        84.300061
weight          2459.511278
acceleration      16.298120
origin             1.845865
clusterID          0.000000
dtype: float64

Variance for cluster #0:
 mpg                 41.976309
displacement      2828.083391
horsepower         369.143491
weight          182632.099872
acceleration         5.718298
origin               0.727096
clusterID            0.000000
dtype: float64

Mean for cluster #1:
 mpg               13.889062
displacement     358.093750
horsepower       167.046875
weight          4398.593750
acceleration      13.025000
origin             1.000000
clusterID          1.000000
dtype: float64

Variance for cluster #1:
 mpg                 3.359085
displacement     2138.213294
horsepower        756.521577
weight          74312.340278
acceleration        3.591429
origin              0.000000
clusterID           0.000000
dtype: float64

Mean for cluster #2:
 mpg          

In [8]:
# Mean and Variance for each 'origin' label
for i in [1.0, 2.0, 3.0]:
  c = labeled.loc[labeled['origin'] == i]
  print('Mean for origin {}:\n'.format(i), c.mean())
  print()
  print('Variance for origin {}:\n'.format(i), c.var())
  print()

Mean for origin 1.0:
 mpg               20.083534
displacement     245.901606
horsepower       118.814769
weight          3361.931727
acceleration      15.033735
origin             1.000000
clusterID          0.779116
dtype: float64

Variance for origin 1.0:
 mpg                 40.997026
displacement      9702.612255
horsepower        1569.532304
weight          631695.128385
acceleration         7.568615
origin               0.000000
clusterID            0.696981
dtype: float64

Mean for origin 2.0:
 mpg               27.891429
displacement     109.142857
horsepower        81.241983
weight          2423.300000
acceleration      16.787143
origin             2.000000
clusterID          0.085714
dtype: float64

Variance for origin 2.0:
 mpg                 45.211230
displacement       509.950311
horsepower         410.659789
weight          240142.328986
acceleration         9.276209
origin               0.000000
clusterID            0.166460
dtype: float64

Mean for origin 3.0:
 mpg   

## Write-Up

Looking at the Mean and Variance for each of the features in each cluster and comparing them to each 'origin' class, there does not seem to be a clear relationship between the clusters and the origins of the different automobiles. This suggests that with regards to the country of origin of the automobiles, the specific data points do not align themselves into clearly defined clusters.

# Practicum 2

## Code

In [0]:
# Load the Data
boston = sklearn.datasets.load_boston()

In [0]:
# Put it into a Pandas DataFrame
data = pd.DataFrame(data = boston['data'], columns = boston['feature_names'])
#data['target'] = boston['target']

In [0]:
# Scale the data
data = pd.DataFrame(StandardScaler().fit_transform(data))

In [0]:
# Clusterings of different sizes
clusterings = []
for i in range(2,7):
  c = sklearn.cluster.KMeans(i).fit(data)
  clusterings.append(c)

In [13]:
# Analyze Silhouette Scores
for i,c in zip(range(2,7), clusterings):
  score = sklearn.metrics.silhouette_score(data,c.labels_)
  print('Silhouette Score for {} clusters = {}'.format(i, score))

Silhouette Score for 2 clusters = 0.36011768587358606
Silhouette Score for 3 clusters = 0.25740335019439353
Silhouette Score for 4 clusters = 0.2809804562187518
Silhouette Score for 5 clusters = 0.2878157430985236
Silhouette Score for 6 clusters = 0.2617743548302116


In [14]:
# It is determined that k = 2 is the optimal number of clusters
# Print the mean of each cluster
data['clusterID'] = clusterings[0].labels_
for i in range(2):
  c = data.loc[data['clusterID'] == i]
  print('Mean for cluster #{}:\n'.format(i), c.mean())
  print()

Mean for cluster #0:
 0           -0.390124
1            0.262392
2           -0.620368
3            0.002912
4           -0.584675
5            0.243315
6           -0.435108
7            0.457222
8           -0.583801
9           -0.631460
10          -0.285808
11           0.326451
12          -0.446421
clusterID    0.000000
dtype: float64

Mean for cluster #1:
 0            0.725146
1           -0.487722
2            1.153113
3           -0.005412
4            1.086769
5           -0.452263
6            0.808760
7           -0.849865
8            1.085145
9            1.173731
10           0.531248
11          -0.606793
12           0.829787
clusterID    1.000000
dtype: float64



In [15]:
# Get the centroids
for i in range(len(clusterings[0].cluster_centers_)):
  print('Coordinates for centroid {}:'.format(i))
  for j in clusterings[1].cluster_centers_[i]:
    print('\t' + str(j))
  print()

Coordinates for centroid 0:
	0.8054419184118975
	-0.4877223646701331
	1.1190964791862479
	0.015767025133660535
	1.126512523354708
	-0.4648907928112372
	0.7981648863493224
	-0.8551038642839881
	1.223134153967741
	1.2966869367966904
	0.6064067069281246
	-0.6413608899812762
	0.8728532843198098

Coordinates for centroid 1:
	-0.41230636656694325
	1.6531056917075286
	-1.0487348993742147
	-0.07554874573080773
	-1.1177831456426306
	0.5556961127226282
	-1.334513627550436
	1.425656439364847
	-0.6115209177369072
	-0.6441577197051647
	-0.6443176677798693
	0.3606864751509191
	-0.8800479115189067



## Write-Up

The coordinates of the centroids and means of the clusters are relatively close as expected, but the discrepancies suggest that more of the points that lie in to one side or another of the centroid, causing the mean to be slightly different from the representative point.

# Practicum 3

## Code

In [0]:
# Load the Data
wine = sklearn.datasets.load_wine()

In [0]:
# Put it into a Pandas DataFrame
data = pd.DataFrame(data = wine['data'], columns = wine['feature_names'])
#data['target'] = wine['target']

In [0]:
# Scale the data
data = pd.DataFrame(StandardScaler().fit_transform(data))

In [0]:
# Perform the clustering
clustering = sklearn.cluster.KMeans(3).fit(data)

In [20]:
# Metrics
homogeneity = sklearn.metrics.homogeneity_score(wine['target'], clustering.labels_)
completeness = sklearn.metrics.completeness_score(wine['target'], clustering.labels_)
print('Homogeneity =', homogeneity)
print('Completeness =', completeness)

Homogeneity = 0.8788432003662368
Completeness = 0.8729636016078732


## Write-Up

Both of Homogeneity Score and the Completeness Score are measures of how "well-defined" the clusters are. That is, Homogeneity observes whether all points in a cluster are of the same class and Completeness observes whether all points in a class belong to the same cluster. Since both of these scores are relatively high for the clustering above, it can be assumed that, for the most part, every cluster contains one class and every class belongs to one unique cluster. But since they are not equal to 1, we know that this isn't perfect but is relatively close.