# 0) Loading Data

In [33]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.io.arff import loadarff
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

# Silence FutureWarning messages for the rest of the notebook.
warnings.simplefilter("ignore", category=FutureWarning)


# Read the ARFF file and normalize its data.
# loadarff returns a (records, metadata) pair; only the records are needed.
data = loadarff('column_diagnosis.arff')
df = pd.DataFrame(data[0])
# The class column is decoded from bytes so the labels are plain strings.
df['class'] = df['class'].str.decode('utf-8')

# Split features from the target and rescale every feature with MinMaxScaler
# (default feature range [0, 1]).
scaler = MinMaxScaler()
X = np.array(df.drop('class', axis=1))
y = np.array(df['class'])
# MinMaxScaler is unsupervised: fit_transform ignores y, so it is not passed.
X = scaler.fit_transform(X)

## Exercise 1)

In [34]:
# Cluster labels of the K = 3 solution; filled in by the K-means loop below.
Kmeans3_labels = []

def purity_score(y_true, labels):
    """Compute the purity of a clustering against ground-truth classes.

    Each cluster is credited with the number of observations belonging to its
    most frequent true class; purity is the sum of those counts divided by the
    total number of observations.

    Parameters
    ----------
    y_true : sequence
        Ground-truth class of each observation (hashable values).
    labels : sequence
        Cluster assigned to each observation, aligned with ``y_true``.

    Returns
    -------
    float
        Purity in (0, 1]; 1.0 means every cluster holds a single class.

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same length.
    """
    n_samples = len(y_true)
    if n_samples == 0:
        # Purity is undefined without observations (would divide by zero).
        raise ValueError("purity_score requires at least one observation")
    if len(labels) != n_samples:
        raise ValueError(
            f"y_true and labels must have the same length "
            f"(got {n_samples} and {len(labels)})"
        )

    # Contingency counts: cluster -> {true class -> number of observations}.
    cluster_scores = {}
    for closest_cluster, correct_out in zip(labels, y_true):
        class_counts = cluster_scores.setdefault(closest_cluster, {})
        class_counts[correct_out] = class_counts.get(correct_out, 0) + 1

    # Sum the size of the majority class of every cluster.
    majority_total = sum(
        max(class_counts.values()) for class_counts in cluster_scores.values()
    )
    return majority_total / n_samples

# Fit K-means for each candidate number of clusters and report an internal
# (silhouette) and an external (purity) quality measure for the partition.
for k in range(2, 6):
    KmeansCluster = KMeans(n_clusters=k, random_state=0).fit(X)
    labels = KmeansCluster.predict(X)

    # Remember the K = 3 assignment in Kmeans3_labels.
    if k == 3:
        Kmeans3_labels = labels

    silhouette = silhouette_score(X, labels)
    purity = purity_score(y, labels)
    print(f"For a value of K = {k}:\n\tSilhouette value: {silhouette}\n\tPurity value: {purity}")


For a value of K = 2:
	Silhouette value: 0.36044124340441114
	Purity value: 0.632258064516129
For a value of K = 3:
	Silhouette value: 0.29579055730002257
	Purity value: 0.667741935483871
For a value of K = 4:
	Silhouette value: 0.27442402122340176
	Purity value: 0.6612903225806451
For a value of K = 5:
	Silhouette value: 0.23823928397844843
	Purity value: 0.6774193548387096


From the results above, as we increase the number of clusters in our K-means algorithm, the silhouette value decreases steadily, while the purity value tends to increase (although not monotonically: it dips slightly at K = 4). This means that a higher number of clusters gives each cluster better discriminative power with respect to the class labels, but also that each observation matches less well with the other observations inside the same cluster.

## Exercise 2)

### i)

In [36]:
# Fit a two-component PCA on the scaled features and report the fraction of
# the total variance captured by each component.
pca = PCA(n_components=2)
pca.fit(X)
variance = pca.explained_variance_ratio_

print(f"Variance for the first component: {variance[0] * 100}%\nVariance for the second component: {variance[1] * 100}%")
# Format the percentage directly: rounding the ratio and then multiplying by
# 100 can reintroduce floating-point noise, and the "%" unit was missing.
print(f"Total variability explained by the top two principal components: {sum(variance) * 100:.2f}%")

Variance for the first component: 56.18144484299212%
Variance for the second component: 20.955952591361886%
Total variability explained by the top two principal components: 77.14


### ii)