# Lab 3 — Clustering

In [76]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pickle
from itertools import combinations
from collections import Counter
import random

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

%matplotlib inline
plt.style.use("ggplot")

In [3]:
from bokeh.io import show, output_notebook, gridplot
from bokeh.models import Div
from bokeh.layouts import column
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import HoverTool, ResetTool, PanTool, WheelZoomTool, SaveTool
output_notebook()

## 3.11 Clustering Tags

Load the data and prepare the points

In [16]:
with open('data.txt', 'r') as source_file:
    data = json.loads(json.load(source_file)) # Why is this neeeded?

In [17]:
tags = [tag for tag in data]
coordinates = np.array([data[tag] for tag in tags])

We also need to perform PCA on the coordinates in order to be able to visualize it on a lower dimensional space: here we want to project 5 dimensional vectors on a bidimensional space.

In [18]:
centered = coordinates - np.mean(coordinates, axis=0)
cov = (centered.T @ centered) / centered.shape[0]
inc_vals, inv_vecs = np.linalg.eigh(cov)
inc_vals = inc_vals[::-1]
inv_vecs = np.fliplr(inv_vecs)

In [19]:
colors = {
    0: '#ff0000',
    1: '#00ff00',
    2: '#0000ff',
    3: '#ffff00',
    4: '#551a8b'
}

def visualize(k, data, tags, projf):
    """ Performes k means on the provided data using k clusters
    and creates a bidimensional plot. The transformation of the
    data to a 2d space is performed by the provided projection
    function.
    """
    clus = KMeans(n_clusters=k).fit(data)
    
    coords2d = projf(data)
    source = ColumnDataSource(
        data = {
            'x' : [x[0] for x in coords2d],
            'y' : [x[1] for x in coords2d],
            'color' : [colors[x] for x in clus.labels_],
            'tagname' : tags
        }    
    )
    centroids_coords = [projf(centr) for centr in clus.cluster_centers_]
    centroids = ColumnDataSource(
        data = {
            'x' : [centr[0] for centr in centroids_coords],
            'y' : [centr[1] for centr in centroids_coords]
        }
    )

    hover = HoverTool(
            tooltips=[('Tag name', '@tagname')]
    )
    tools = [WheelZoomTool(), ResetTool(), PanTool(), SaveTool(), hover]
    p = figure(tools=tools, width=400, height=400, title='K = '+ str(k))
    p.circle('x', 'y', source=source, color='color')
    p.asterisk('x', 'y', size=30,source=centroids, color='#000000')
    
    return p

def four_plots(data, tags, projection_function):
    """ Draws a grid plot consisting of four kmeans 2d plots
    using k between 2 and 5.
    """
    plots = [visualize(k, data, tags, projection_function) for k in range(2,6)]
    plots = [plots[:2], plots[2:]]
    return gridplot(plots)

For each possible combination of length 2 of eingenvectors (aka principal directions) we plot the clustered data using, for each combination, $k\in \{2,3,4,5\}$

In [20]:
#Try projecting on on different principal directions
for i, j in combinations(range(5), 2):
    xy = inv_vecs[:, [i, j]]
    projection_function = lambda point: point @ xy
    p = four_plots(coordinates, tags, projection_function)
    title = 'Using components associated to eigenvalues number' + str((i, j))
    show(column(Div(text=title), p))

The principal directions that separate the cluster well are......

## 3.12 Clustering Movies

### Algorithm Implementation

We try to write a generic function implementing the k-medioids algorithm

In [69]:
def k_medioids(points, k, dist_f, max_dist = 2, max_iterations=200):
    """
    Runs the k-medioids algorithm on the provided (hashable) points.
    
    :param points:   the points to cluster. Points must be comparable and hashable
    :param k:        the number of clusters to create
    :param dist_f:   distance function taking two elements of points and returning
                        a real distance.
    :param max_dist: the maximal distance between two points (needed for min search)
    :max_iterations: the maximal number of iterations the algorithm should run
    """
    medioids = random.sample(points, k) # Random Init
    n_iter = 0 # Count the number of performed iterations
    clusters = [] # Store the clusters to be able to return them
    old_medioids = [] # Store previous medioids to check for convergence
    while (old_medioids != medioids and n_iter < max_iterations):
        old_medioids = medioids
        clusters = [set() for i in range(k)] # Initialize clusters
        
        # Assign points to clusters
        for x in points:
            dists = [(i, dist_f(x, medioids[i])) for i in range(k)]
            i = min(dists, key=lambda t: t[1])[0]
            clusters[i].add(x)
            
        # Recompute medioids
        for i in range(k):
            # Optimized research of x satisfying the argmin
            closest_point = None
            dist_from_closest = max_dist * len(clusters[i])
            for x in clusters[i]:
                sum_of_dists = 0
                for y in clusters[i]:
                    sum_of_dists += dist_f(x, y)
                if sum_of_dists < dist_from_closest:
                    dist_from_closest = sum_of_dists
                    closest_point = x
            medioids[i] = closest_point
        n_iter += 1
    return clusters, medioids, n_iter

### Clustering `most-rated.pickle`

The algorithm ready, it is time to prepare the data:

First we retrieve the movie IDs associated to the the set of genres they fit in:

In [55]:
!hdfs dfs -cat /ix/ml-20m/movies.txt | tail -n 1

{"genres": ["Adventure", "Fantasy", "Horror"], "movieId": 131262, "title": "Innocence (2014)"}


In [56]:
ids_to_genres = sc.textFile("/ix/ml-20m/movies.txt")\
           .map(json.loads)\
           .map(lambda j: (j['movieId'], set(j['genres'])))\
           .collectAsMap()

and then we read the movies in `most-rated.pickle` and extract, for each of them, its genres:

In [57]:
with open('most-rated.pickle', 'rb') as f:
    most_rated = pickle.load(f)

In [58]:
most_rated[:1]

((296, 'Pulp Fiction (1994)'),)

In [59]:
data = {movie[0] : ids_to_genres[movie[0]] for movie in most_rated}

We can now define the jaccard distance as a generic function taking two sets. 

Finally we define the distance function which will pass to `k-medioids` as a closure bringing with itself the `dict` associating movie IDs to the set of genres they fit in:

In [60]:
jaccard_dist = lambda A, B : 1 - (len(A & B) / len(A | B))
dist_f = lambda p1, p2: jaccard_dist(data[p1], data[p2])

We are now able to cluster the movies in `most-rated.pickle` using $k=2$:

In [70]:
clusters, medioids, n_iter = k_medioids(list(data.keys()), 2, dist_f)

In [75]:
# PROBLEM: Random init causes very different inputs
for _ in range(50):
    clusters, medioids, n_iter = k_medioids(list(data.keys()), 2, dist_f)
    print(medioids)

[5, 315]
[4270, 524]
[1094, 1527]
[14, 1527]
[8361, 5]
[111, 1527]
[6333, 11]
[194, 4148]
[11, 2000]
[315, 5]
[194, 1544]
[8361, 11]
[194, 1544]
[194, None]
[2403, 5]
[5, 315]
[194, 1544]
[2403, 11]
[315, 5]
[6333, 11]
[11, 5]
[194, 6333]
[194, 1544]
[194, 1544]
[4270, 11]
[8361, 11]
[11, 1544]
[194, None]
[6333, 194]
[315, 5]
[194, 1544]
[6333, 194]
[194, 1544]
[194, 6333]
[11, 293]
[8361, 11]
[170, 11]
[5, 2403]
[194, 515]
[194, 1573]
[194, None]
[315, 5]
[4270, 14]
[2403, 11]
[194, None]
[194, 6333]
[11, 4720]
[4270, 524]
[11, 1544]
[5, 293]


### Visualization

In [44]:
def count(ls):
    c = Counter()
    for x in ls:
        c[x] += 1
    return c

In [63]:
genres_frequencies_per_cluster = [count([genre for movie in cluster for genre in data[movie]]) for cluster in clusters]

In [35]:
genres_frequencies_per_cluster

[Counter({'Action': 99,
          'Adventure': 148,
          'Animation': 60,
          'Children': 80,
          'Comedy': 336,
          'Crime': 102,
          'Documentary': 6,
          'Drama': 359,
          'Fantasy': 86,
          'Film-Noir': 12,
          'Horror': 62,
          'IMAX': 19,
          'Musical': 42,
          'Mystery': 74,
          'Romance': 166,
          'Sci-Fi': 169,
          'Thriller': 183,
          'Western': 12}),
 Counter({'Action': 191,
          'Adventure': 80,
          'Animation': 2,
          'Children': 2,
          'Comedy': 55,
          'Crime': 59,
          'Drama': 89,
          'Fantasy': 27,
          'Film-Noir': 1,
          'Horror': 10,
          'IMAX': 3,
          'Musical': 2,
          'Mystery': 10,
          'Romance': 32,
          'Thriller': 87,
          'War': 50,
          'Western': 11})]