# Lab 3 — clustering

In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pickle

import random

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


from itertools import combinations

%matplotlib inline
plt.style.use("ggplot")

In [2]:
from bokeh.io import show, output_notebook, gridplot
from bokeh.models import Div
from bokeh.layouts import column
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import HoverTool, ResetTool, PanTool, WheelZoomTool, SaveTool
output_notebook()

## 3.11 Clustering Tags

Load the data and prepare the points

In [3]:
with open('data.txt', 'r') as source_file:
    # Two decoding passes are required: the first one has to yield a string
    # (json.loads only accepts text), i.e. the file apparently stores a JSON
    # document that was itself serialised again as a JSON string.
    data = json.loads(json.loads(source_file.read()))

In [4]:
# Freeze the key order once so that row i of `coordinates` belongs to tags[i].
tags = list(data)
coordinates = np.array([data[name] for name in tags])

We also need to perform PCA on the coordinates in order to visualize them in a lower-dimensional space: here we want to project 5-dimensional vectors onto a 2-dimensional space.

In [5]:
# Centre the data and build its covariance matrix (normalised by the number of rows).
centered = coordinates - coordinates.mean(axis=0)
cov = (centered.T @ centered) / len(centered)
# eigh returns the eigenvalues in ascending order; reverse both the values and
# the matching eigenvector columns so the largest eigenvalue comes first.
inc_vals, inv_vecs = np.linalg.eigh(cov)
inc_vals, inv_vecs = inc_vals[::-1], inv_vecs[:, ::-1]

In [6]:
# Hex colour used for each cluster label; labels 0 to 4 are covered.
colors = dict(enumerate([
    '#ff0000',
    '#00ff00',
    '#0000ff',
    '#ffff00',
    '#551a8b',
]))

def visualize(k, data, tags, projf):
    """Run k-means with ``k`` clusters on ``data`` and plot the result in 2-D.

    ``projf`` maps the data to the plane: it is called once on the whole
    ``data`` array and once on every cluster centre. Each point is coloured
    by its cluster label (looked up in the module-level ``colors`` mapping)
    and shows its entry of ``tags`` on hover; the projected cluster centres
    are drawn as black asterisks.

    Returns the bokeh figure; nothing is displayed here.
    """
    kmeans = KMeans(n_clusters=k).fit(data)

    projected = projf(data)
    point_columns = {
        'x' : [pt[0] for pt in projected],
        'y' : [pt[1] for pt in projected],
        'color' : [colors[label] for label in kmeans.labels_],
        'tagname' : tags
    }
    source = ColumnDataSource(data=point_columns)

    projected_centres = [projf(centre) for centre in kmeans.cluster_centers_]
    centre_columns = {
        'x' : [centre[0] for centre in projected_centres],
        'y' : [centre[1] for centre in projected_centres]
    }
    centroids = ColumnDataSource(data=centre_columns)

    hover = HoverTool(tooltips=[('Tag name', '@tagname')])
    fig = figure(
        tools=[WheelZoomTool(), ResetTool(), PanTool(), SaveTool(), hover],
        width=400,
        height=400,
        title='K = '+ str(k),
    )
    fig.circle('x', 'y', source=source, color='color')
    fig.asterisk('x', 'y', size=30, source=centroids, color='#000000')

    return fig

def four_plots(data, tags, projection_function):
    """Build a 2x2 grid of k-means plots of ``data`` for k = 2, 3, 4 and 5.

    Each panel comes from ``visualize`` with the same ``tags`` and
    ``projection_function``; the top row holds k = 2, 3 and the bottom row
    k = 4, 5.
    """
    panels = []
    for k in (2, 3, 4, 5):
        panels.append(visualize(k, data, tags, projection_function))
    top_row, bottom_row = panels[:2], panels[2:]
    return gridplot([top_row, bottom_row])

For each possible pair of eigenvectors (a.k.a. principal directions) we plot the clustered data using, for each pair, $k\in \{2,3,4,5\}$.

In [7]:
# Try projecting on every pair of principal directions
for i, j in combinations(range(5), 2):
    # Two-column matrix holding the i-th and j-th eigenvectors.
    xy = inv_vecs[:, [i, j]]

    def projection_function(point):
        """Project ``point`` onto the two currently selected eigenvectors."""
        return point @ xy

    p = four_plots(coordinates, tags, projection_function)
    title = 'Using components associated to eigenvalues number' + str((i, j))
    show(column(Div(text=title), p))

The principal directions that separate the clusters well are… (**TODO**: complete after inspecting the plots above.)

## 3.12 Clustering Movies

In [131]:
def jaccard_k_medioids(points, k, dist_f, max_dist = 1, max_iterations=200, seed=None):
    """Partition ``points`` into ``k`` clusters with the k-medoids algorithm.

    Starting from ``k`` randomly sampled medoids, the function alternates
    between (1) assigning every point to its nearest medoid and (2) replacing
    each medoid by the cluster member whose summed distance to all members of
    that cluster is smallest. It stops when the medoids no longer change or
    after ``max_iterations`` passes.

    Parameters
    ----------
    points : iterable of hashable
        Items to cluster (they are stored in sets, so they must be hashable).
    k : int
        Number of clusters.
    dist_f : callable
        ``dist_f(a, b)`` returns the distance between two points.
    max_dist : number, optional
        Ignored; accepted only so that existing calls keep working. It used
        to be the starting bound for the best summed distance, which rejected
        every candidate medoid whose sum was not below it.
    max_iterations : int, optional
        Upper bound on the number of assignment/update passes.
    seed : optional
        When given, the initial medoids are drawn from ``random.Random(seed)``
        so the run is reproducible; otherwise the global ``random`` state is used.

    Returns
    -------
    (clusters, medioids, n_iter)
        ``clusters`` is a list of ``k`` sets, ``medioids`` the list of their
        medoids (same order) and ``n_iter`` the number of passes performed.

    Raises
    ------
    ValueError
        Propagated from ``random.sample`` when ``k`` exceeds the number of points.
    """
    points = list(points)
    rng = random if seed is None else random.Random(seed)
    medioids = rng.sample(points, k) # random init
    n_iter = 0
    clusters = []
    old_medioids = []
    while old_medioids != medioids and n_iter < max_iterations:
        # Snapshot (not alias) the medoids: the list is updated in place below,
        # and the convergence test must compare against the previous values.
        old_medioids = list(medioids)
        clusters = [set() for _ in range(k)]

        # Assign each point to the cluster of its nearest medoid
        # (the lowest index wins on ties).
        for x in points:
            nearest = min(range(k), key=lambda i: dist_f(x, medioids[i]))
            clusters[nearest].add(x)

        # Recompute medoids: the member with the smallest summed distance to
        # the whole cluster. An empty cluster keeps its previous medoid.
        for i, cluster in enumerate(clusters):
            if not cluster:
                continue
            medioids[i] = min(
                cluster,
                key=lambda x: sum(dist_f(x, y) for y in cluster),
            )
        n_iter += 1
    return clusters, medioids, n_iter

In [107]:
!hdfs dfs -cat /ix/ml-20m/movies.txt | tail -n 2

{"genres": ["(no genres listed)"], "movieId": 131260, "title": "Rentun Ruusu (2001)"}
{"genres": ["Adventure", "Fantasy", "Horror"], "movieId": 131262, "title": "Innocence (2014)"}


In [108]:
# Parse one JSON object per line and collect a {movieId: set of genres} dict.
# `sc` is presumably the SparkContext provided by the kernel (it is not
# defined in this notebook).
movies = (
    sc.textFile("/ix/ml-20m/movies.txt")
      .map(json.loads)
      .map(lambda record: (record['movieId'], set(record['genres'])))
      .collectAsMap()
)

In [109]:
# Load the most-rated movies; judging from the preview in the next cell the
# entries are (movieId, title) pairs.
# NOTE(review): pickle.load can execute arbitrary code, so only open files
# that come from a trusted source.
with open('most-rated.pickle', 'rb') as f:
    mr = pickle.load(f)

In [110]:
mr[:2]

((296, 'Pulp Fiction (1994)'), (356, 'Forrest Gump (1994)'))

In [111]:
# Map the id of every most-rated movie to its genre set.
# Note: this rebinds `data`, which earlier in the notebook held the tag coordinates.
data = {movie_id: movies[movie_id] for movie_id, *_ in mr}

In [112]:
def jaccard_dist(A, B):
    """Return the Jaccard distance ``1 - |A & B| / |A | B|`` between two sets.

    Two empty sets are treated as identical (distance 0) rather than raising
    ``ZeroDivisionError`` on the empty union.
    """
    union_size = len(A | B)
    if union_size == 0:
        return 0
    return 1 - (len(A & B) / union_size)


def dist_f(p1, p2):
    """Return the Jaccard distance between the ``data`` entries of two keys.

    ``p1`` and ``p2`` are looked up in the module-level ``data`` mapping at
    call time.
    """
    return jaccard_dist(data[p1], data[p2])

In [127]:
# Split the most-rated movies into two clusters by genre similarity.
clusters, medioids, n_iter = jaccard_k_medioids(list(data), k=2, dist_f=dist_f)

In [128]:
from collections import Counter
def count(ls):
    """Return a ``Counter`` with the number of occurrences of each item of ``ls``."""
    # iter() makes Counter tally the yielded items even when `ls` is a mapping,
    # which is what an explicit loop over `ls` does.
    return Counter(iter(ls))

In [129]:
# One Counter per cluster: how often each genre occurs among its movies.
genres_frequencies_per_cluster = [
    count(genre for movie_id in cluster for genre in data[movie_id])
    for cluster in clusters
]

In [130]:
genres_frequencies_per_cluster

[Counter({'Action': 243,
          'Adventure': 183,
          'Animation': 56,
          'Children': 75,
          'Comedy': 349,
          'Crime': 115,
          'Documentary': 6,
          'Drama': 350,
          'Fantasy': 96,
          'Film-Noir': 6,
          'Horror': 28,
          'IMAX': 14,
          'Musical': 43,
          'Mystery': 28,
          'Romance': 179,
          'Sci-Fi': 52,
          'Thriller': 87,
          'War': 50,
          'Western': 22}),
 Counter({'Action': 47,
          'Adventure': 45,
          'Animation': 6,
          'Children': 7,
          'Comedy': 42,
          'Crime': 46,
          'Drama': 98,
          'Fantasy': 17,
          'Film-Noir': 7,
          'Horror': 44,
          'IMAX': 8,
          'Musical': 1,
          'Mystery': 56,
          'Romance': 19,
          'Sci-Fi': 117,
          'Thriller': 183,
          'Western': 1})]

# ISSUE: VERY DIFFERENT RESULTS ACROSS DIFFERENT EXECUTIONS OF K-MEDOIDS (the initial medoids are sampled at random and no seed is set)