## Lab 7: $k$-means
You can use external libraries for linear algebra operations but you are expected to write your own algorithms.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder

# Exercise 1
- Download the ```breast_cancer.csv``` dataset and preprocess it by using ```sklearn.preprocessing.OrdinalEncoder``` to properly deal with the categorical variables. 

In [None]:
# Load the raw data; the file has no header row, so columns are numbered 0-9.
df = pd.read_csv("../Datasets/breast_cancer.csv", header=None)

# Names of the nine covariates, in column order (used as plot tick labels, so
# they must not carry stray whitespace and should match the attribute list below).
feature_names = ['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiat']
# Attribute information derived from https://archive.ics.uci.edu/ml/datasets/Breast+Cancer
#   0. age: 10-19, 20-29, 30-39, 40-49, 50-59, 60-69, 70-79, 80-89, 90-99.
#   1. menopause: lt40, ge40, premeno.
#   2. tumor-size: 0-4, 5-9, 10-14, 15-19, 20-24, 25-29, 30-34, 35-39, 40-44,
#                  45-49, 50-54, 55-59.
#   3. inv-nodes: 0-2, 3-5, 6-8, 9-11, 12-14, 15-17, 18-20, 21-23, 24-26,
#                 27-29, 30-32, 33-35, 36-39.
#   4. node-caps: yes, no.
#   5. deg-malig: 1, 2, 3.
#   6. breast: left, right.
#   7. breast-quad: left-up, left-low, right-up, right-low, central.
#   8. irradiat: yes, no.
#   9. Class: no-recurrence-events, recurrence-events

# Keep the preview as the final expression: Jupyter only renders the value of
# the last statement in a cell.
df.head()

In [None]:
# Cast every column to string so that any column pandas parsed as numeric
# (presumably deg-malig -- confirm against the file) is treated as categorical
# like the others.
df = df.astype(dtype=str)

In [None]:
# Learn the categories of each column from the raw values, then replace every
# category with its integer code (returned as floats by OrdinalEncoder).
encoder = OrdinalEncoder()
data = encoder.fit(df.to_numpy()).transform(df)

In [None]:
# Preview only the first encoded rows instead of dumping the whole array.
data[:5]

In [None]:
# Columns 0-8 hold the covariates and column 9 the response; the response was
# ordinal-encoded together with the covariates in the same pass.
X, y = data[:, :9], data[:, 9]

- Write your own function to compute the Mutual Information Criterion.
- Compute the Mutual Information between the covariates and the response variable (stored in the last column). Which features appear to be the most significant?

In [None]:
from sklearn.feature_selection import mutual_info_classif

In [None]:
def mutual_information(x, y):
    """Mutual information (in nats) between two discrete 1-D samples.

    Computes I(X; Y) = sum_{a,b} p(a, b) * log( p(a, b) / (p(a) * p(b)) )
    from the empirical joint distribution of the paired observations.

    Parameters
    ----------
    x, y : array-like of shape (n_samples,)
        Paired observations of two discrete variables.

    Returns
    -------
    float
        Non-negative mutual information, using the natural logarithm.

    Raises
    ------
    ValueError
        If the inputs are not 1-D or do not have the same length.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if x.ndim != 1 or y.ndim != 1 or x.shape[0] != y.shape[0]:
        raise ValueError("x and y must be 1-D arrays of equal length")
    if x.shape[0] == 0:
        return 0.0

    # Map each distinct value to a dense index 0..k-1.
    x_values, x_codes = np.unique(x, return_inverse=True)
    y_values, y_codes = np.unique(y, return_inverse=True)

    # Contingency table of co-occurrence counts, normalised to a joint pmf.
    joint = np.zeros((x_values.size, y_values.size))
    np.add.at(joint, (x_codes.ravel(), y_codes.ravel()), 1.0)
    joint /= joint.sum()

    # Product of the marginals, i.e. the joint pmf under independence.
    independent = np.outer(joint.sum(axis=1), joint.sum(axis=0))

    # Empty cells contribute 0 (limit of p * log p as p -> 0), so skip them.
    nonzero = joint > 0
    mi = np.sum(joint[nonzero] * np.log(joint[nonzero] / independent[nonzero]))

    # Guard against tiny negative values caused by floating-point round-off.
    return max(float(mi), 0.0)


# Mutual information between each covariate (column of X) and the response y.
MI = np.array([mutual_information(X[:, j], y) for j in range(X.shape[1])])

In [None]:
# Label each score with its feature and rank them, most informative first.
pd.Series(MI, index=feature_names, name="MI").sort_values(ascending=False)

In [None]:
# Bar chart of the mutual information of each feature with the response.
fig, ax = plt.subplots(figsize=(10, 8))

ax.bar(feature_names, MI)
# Tilt the feature names so neighbouring labels do not overlap.
ax.tick_params(axis="x", labelrotation=30)
ax.set_xlabel("Feature")
ax.set_ylabel("Mutual information (nats)")
ax.set_title("Mutual Information between the features and the response variable")
plt.show()

# Exercise 2
- Use the dataset ```s3.txt``` available in the ```Datasets``` folder.

In [None]:
# Read the text file into a NumPy array (genfromtxt defaults: whitespace
# delimiter, float dtype) and show it.
s3 = np.genfromtxt(fname="../Datasets/s3.txt")
s3

In [None]:
# (number of points, number of coordinates)
np.shape(s3)

In [None]:
# Scatter plot of the raw points: first column against second column.
fig, ax = plt.subplots(figsize=(10, 8))

ax.scatter(s3[:, 0], s3[:, 1])
ax.set_title("Original dataset")
plt.show()

- Write your own implementation of the $k$-means clustering algorithm. 
- Test your implementation with 10 different initializations and $k=15$.

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Example estimator: 15 clusters, random initial centroids, a single run.
kmeans = KMeans(
    n_clusters=15,
    init="random",
    n_init=1,
)
# init: {'k-means++', 'random'}, a callable, or an array-like of shape
#       (n_clusters, n_features); default 'k-means++'.
#   'random' chooses n_clusters observations (rows) at random from the data
#   as the initial centroids.
#
# n_init: 'auto' or int.
#   Number of times the k-means algorithm is run with different centroid seeds;
#   the final result is the best of those runs in terms of inertia.
#   With n_init='auto' the number of runs depends on init:
#     10 if init='random', 1 if init='k-means++'.
#   NOTE(review): the default itself depends on the scikit-learn release
#   (10 in older versions, 'auto' in newer ones) -- check the installed version.

In [None]:
# NOTE(review): the exercise asks for a hand-written k-means; this cell still
# relies on sklearn.cluster.KMeans.
N_CLUSTERS = 15   # k requested by the exercise
N_RUNS = 10       # number of different random initializations to try

results = []

# One single-start run per seed. Using the loop index as random_state gives
# N_RUNS distinct initializations that are reproducible across kernel restarts.
for seed in range(N_RUNS):
    kmeans = KMeans(n_clusters=N_CLUSTERS, init='random', n_init=1, random_state=seed)
    kmeans.fit(s3)
    # Keep the loss together with the fitted model so runs can be ranked later.
    results.append((kmeans.inertia_, kmeans))


# kmeans.inertia_ : sum of SQUARED distances of samples to their closest cluster center

In [None]:
# Show only the inertia of each run; the fitted estimators all print the same
# repr and add nothing to the comparison.
[run[0] for run in results]

- Plot the clustering results for which the loss is, respectively, the highest and the lowest.

In [None]:
# Rank the runs by inertia, lowest (best) first, leaving `results` untouched.
sorted_results = list(results)
sorted_results.sort(key=lambda run: run[0])

In [None]:
# Inertias in ascending order; the estimator reprs are omitted as they are identical.
[run[0] for run in sorted_results]

In [None]:
# Each entry is (inertia, fitted model): the first has the lowest loss and
# the last has the highest.
best_run = sorted_results[0]
worst_run = sorted_results[-1]
kmeans_min = best_run[1]
kmeans_max = worst_run[1]

In [None]:
# Cluster assignment of every point and the centroids for the lowest-inertia run.
labels_min, centers_min = kmeans_min.labels_, kmeans_min.cluster_centers_

In [None]:
# Points coloured by assigned cluster, with the centroids overlaid as red dots.
fig, ax = plt.subplots(figsize=(10, 8))

ax.scatter(s3[:, 0], s3[:, 1], c=labels_min)
ax.plot(centers_min[:, 0], centers_min[:, 1], 'ro')
ax.set_title("Minimum inertia")
plt.show()

In [None]:
# Cluster assignment of every point and the centroids for the highest-inertia run.
labels_max, centers_max = kmeans_max.labels_, kmeans_max.cluster_centers_

In [None]:
# Points coloured by assigned cluster, with the centroids overlaid as red dots.
fig, ax = plt.subplots(figsize=(10, 8))

ax.scatter(s3[:, 0], s3[:, 1], c=labels_max)
ax.plot(centers_max[:, 0], centers_max[:, 1], 'ro')
ax.set_title("Maximum inertia")
plt.show()