#  Basic Data Science in Python - Exercises 13/10  #

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets
from sklearn.cluster import DBSCAN, KMeans, Birch, OPTICS

### Exercise  1: Comparing clustering methods
Below you can see the Moons dataset, which consists of two interleaving half-circles in 2D.

In [None]:
# Generate the two-moons toy dataset (default settings) and colour each
# point by the half-circle it was generated from.
X, y = datasets.make_moons()
plt.scatter(X[:, 0], X[:, 1], c=y)

Try using both DBSCAN and K-Means to label the two half-circles. Which method creates the correct clustering? Can you explain why?

In [None]:
# Cluster the moons with DBSCAN using its default hyperparameters and
# colour each point by the label it was assigned.
db_labels = DBSCAN().fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=db_labels)

In [None]:
# The task is to label the *two* half-circles, so ask K-Means for exactly
# two clusters; any other k cannot reproduce the ground-truth labelling.
k_labels = KMeans(n_clusters=2).fit_predict(X)
plt.scatter(*X.T, c=k_labels)

### Exercise 2: Clustering the Iris Dataset
Use different clustering methods to learn a clustering of the iris dataset. Visualize the clusterings, and use Normalized Mutual Information (NMI) to measure which clustering method performs the best on the dataset.

Try to tune the hyperparameters to get the best clustering out of each method

In [None]:
from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.metrics import homogeneity_score as purity
from sklearn.cluster import Birch, OPTICS

# Load the iris data: X holds the feature matrix, y the target labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target
# Plot feature columns 2 and 3 against each other, coloured by target.
plt.scatter(X[:, 2], X[:, 3], c=y)

In [None]:
# DBSCAN on all iris features with a hand-tuned neighbourhood radius.
db_labels = DBSCAN(min_samples=5, eps=0.4).fit_predict(X)
plt.scatter(X[:, 2], X[:, 3], c=db_labels)

In [None]:
# K-Means on all iris features, asking for three clusters.
k_labels = KMeans(n_clusters=3).fit_predict(X)
plt.scatter(X[:, 2], X[:, 3], c=k_labels)

In [None]:
# BIRCH with default hyperparameters; plot the assignment and also dump
# the raw label vector for inspection.
birch_labels = Birch().fit_predict(X)
plt.scatter(X[:, 2], X[:, 3], c=birch_labels)
print(birch_labels)

In [None]:
# OPTICS on all iris features with min_samples raised to 20.
optic_labels = OPTICS(min_samples=20).fit_predict(X)
plt.scatter(X[:, 2], X[:, 3], c=optic_labels)

In [None]:
# Report the NMI between the true iris labels and each clustering.
print("NMI")
for method, assignment in (
    ("KMeans:", k_labels),
    ("DBSCAN:", db_labels),
    ("BIRCH:", birch_labels),
    ("OPTICS", optic_labels),
):
    print(method, nmi(y, assignment))

In [None]:
# Report the score imported under the name `purity` (true labels first,
# predicted labels second) for each clustering.
print("Purity")
for method, assignment in (
    ("KMeans:", k_labels),
    ("DBSCAN:", db_labels),
    ("BIRCH:", birch_labels),
    ("OPTICS", optic_labels),
):
    print(method, purity(y, assignment))

### Exercise 3: Different Size Clusters (Handin)

Use k-Means to cluster the below dataset. What happens? Which method should you use instead? Use the method you deem most fitting to cluster the dataset.

In [None]:
# Two blobs of very different size and spread: 1000 points with std 1.5
# centred at the origin and 100 points with std 0.5 centred at (3.5, 3.5).
clusters_std = [1.5, 0.5]
X, y = datasets.make_blobs(
    n_samples=[1000, 100],
    centers=[[0.0, 0.0], [3.5, 3.5]],
    cluster_std=clusters_std,
    shuffle=False,
    random_state=0,
)
plt.scatter(X[:, 0], X[:, 1], c=y)

### Exercise 4: Outlier detection
Look at the below dataset. What points do you consider outliers? 

Use Local Outlier Factor to detect the outlier points. How many of the detected outliers agree with your own guess?

In [None]:
# Noisy two-moons data with a handful of artificially displaced points.
X, y = datasets.make_moons(n_samples=1000, noise=0.2)
# Draw ten indices among the first 100 points (repeats are possible) and
# double the coordinates of each drawn point, once per draw.
for i in np.random.randint(0, 100, size=10):
    X[i] *= 2
plt.scatter(X[:, 0], X[:, 1])

In [None]:
### YOUR CODE HERE
from sklearn.neighbors import LocalOutlierFactor as LOF

# Fit Local Outlier Factor with 25 neighbours and colour every point by
# the label returned from fit_predict.
lof_outliers = LOF(n_neighbors=25).fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=lof_outliers)

### Exercise 5: K-Means for Colour Compression
An out-of-the-box use of k-Means is image compression. Below is an image that we want to compress to 10 colours. Since the colours of the pixels can be seen as data points, use k-Means to cluster them, and then replace every pixel's colour with the center of the cluster it belongs to.

In [None]:
china = datasets.load_sample_image('china.jpg')
X = china / 255  # Normalize the data (presumably 8-bit channels -> [0, 1])
# Flatten to one row per pixel; deriving the shape from the image itself
# avoids hard-coding its height and width.
X = X.reshape(-1, china.shape[-1])
plt.imshow(china)

In [None]:
### YOUR CODE HERE
# The exercise asks for a 10-colour palette, so fit exactly 10 clusters.
km = KMeans(n_clusters=10)
# fit_predict fits the model and returns each pixel's cluster index in one
# pass; indexing the centers with it swaps every pixel for its palette colour.
new_colours = km.cluster_centers_[km.fit_predict(X)]
# Restore the original image layout so it can be displayed.
new_china = new_colours.reshape(china.shape)
plt.imshow(new_china)

### Exercise 6: Implement DB-Outliers (Hard)
Another algorithm for finding outliers is Distance-Based Outlier Detection. It works by the following formula:
$$ OutlierSet(\varepsilon, \pi) = \Big\{ p \in X : \frac{|\{x\in X : dist(p, x) < \varepsilon\} |}{n} \leq \pi \Big\} $$
That is, a point $p$ is an outlier if at most a fraction $\pi$ of the points $x\in X$ lie at a distance of less than $\varepsilon$ from $p$ (note that $p$ itself is counted, since $dist(p, p) = 0$).

Implement a simple Distance Based Outlier Detection algorithm and test it on the below dataset. Try tuning the parameters $\pi$ and $\varepsilon$.

In [None]:
def db_outliers(X, eps, pi):
    """Flag distance-based outliers in ``X``.

    A point ``p`` is an outlier when the fraction of points in ``X``
    (``p`` itself included) whose Euclidean distance to ``p`` is strictly
    less than ``eps`` is at most ``pi``.

    Parameters
    ----------
    X : array-like
        The data set, one point per entry along the first axis.
    eps : float
        Neighbourhood radius, compared against the (non-squared)
        Euclidean distance.
    pi : float
        Largest neighbour fraction (between 0 and 1) for which a point is
        still considered an outlier.

    Returns
    -------
    list of int
        One entry per point: 1 if the point is an outlier, 0 otherwise.
    """
    X = np.asarray(X)
    n_points = X.shape[0]
    outlier_set = [0 for _ in range(n_points)]
    if n_points == 0:
        return outlier_set
    # One flat row per point so the row-wise norm works for any point shape.
    points = X.reshape(n_points, -1)
    ### YOUR CODE HERE
    for i, p in enumerate(points):
        # Distances from p to every point at once; compare the distance
        # itself (not its square) with eps, as the definition requires.
        distances = np.linalg.norm(points - p, axis=1)
        nr_of_points = np.count_nonzero(distances < eps)
        if nr_of_points / n_points <= pi:
            outlier_set[i] = 1
    ### YOUR CODE HERE
    return outlier_set

# A single 2-D blob of 100 points to test the detector on.
X, _ = datasets.make_blobs(
    n_samples=100,
    centers=1,
    n_features=2,
    center_box=(0, 10),
    cluster_std=0.7,
)
# Add noise: scale ten randomly drawn points (indices may repeat) by 1.1,
# once per draw.
for i in np.random.randint(0, 100, size=10):
    X[i] *= 1.1

# Try tuning eps and pi
outliers = db_outliers(X, eps=1, pi=0.1)
plt.scatter(X[:, 0], X[:, 1], c=outliers)