# Lab 3 — clustering

In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pickle

import random

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


from itertools import combinations

%matplotlib inline
plt.style.use("ggplot")

In [2]:
from bokeh.io import show, output_notebook, gridplot
from bokeh.models import Div
from bokeh.layouts import column
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import HoverTool, ResetTool, PanTool, WheelZoomTool, SaveTool
output_notebook()

## 3.11 Clustering Tags

Load the data and prepare the points

In [12]:
with open('data.txt', 'r') as source_file:
    data = json.loads(json.load(source_file)) # Why is this neeeded?

In [13]:
tags = [tag for tag in data]
coordinates = np.array([data[tag] for tag in tags])

We also need to perform PCA on the coordinates in order to be able to visualize it on a lower dimensional space: here we want to project 5 dimensional vectors on a bidimensional space.

In [50]:
centered = coordinates - np.mean(coordinates, axis=0)
cov = (centered.T @ centered) / centered.shape[0]
inc_vals, inv_vecs = np.linalg.eigh(cov)
inc_vals = inc_vals[::-1]
inv_vecs = np.fliplr(inv_vecs)

In [51]:
colors = {
    0: '#ff0000',
    1: '#00ff00',
    2: '#0000ff',
    3: '#ffff00',
    4: '#551a8b'
}

def visualize(k, data, tags, projf):
    """ Performes k means on the provided data using k clusters
    and creates a bidimensional plot. The transformation of the
    data to a 2d space is performed by the provided projection
    function.
    """
    clus = KMeans(n_clusters=k).fit(data)
    
    coords2d = projf(data)
    source = ColumnDataSource(
        data = {
            'x' : [x[0] for x in coords2d],
            'y' : [x[1] for x in coords2d],
            'color' : [colors[x] for x in clus.labels_],
            'tagname' : tags
        }    
    )
    centroids_coords = [projf(centr) for centr in clus.cluster_centers_]
    centroids = ColumnDataSource(
        data = {
            'x' : [centr[0] for centr in centroids_coords],
            'y' : [centr[1] for centr in centroids_coords]
        }
    )

    hover = HoverTool(
            tooltips=[('Tag name', '@tagname')]
    )
    tools = [WheelZoomTool(), ResetTool(), PanTool(), SaveTool(), hover]
    p = figure(tools=tools, width=400, height=400, title='K = '+ str(k))
    p.circle('x', 'y', source=source, color='color')
    p.asterisk('x', 'y', size=30,source=centroids, color='#000000')
    
    return p

def four_plots(data, tags, projection_function):
    """ Draws a grid plot consisting of four kmeans 2d plots
    using k between 2 and 5.
    """
    plots = [visualize(k, data, tags, projection_function) for k in range(2,6)]
    plots = [plots[:2], plots[2:]]
    return gridplot(plots)

For each possible combination of length 2 of eingenvectors (aka principal directions) we plot the clustered data using, for each combination, $k\in \{2,3,4,5\}$

In [52]:
#Try projecting on on different principal directions
for i, j in combinations(range(5), 2):
    xy = inv_vecs[:, [i, j]]
    projection_function = lambda point: point @ xy
    p = four_plots(coordinates, tags, projection_function)
    title = 'Using components associated to eigenvalues number' + str((i, j))
    show(column(Div(text=title), p))

The principal directions that separate the cluster well are......

## 3.12 Clustering Movies

In [64]:
def k_medioids(points, distf, k):
    # Init
    r = range(len(points))
    M_indices = np.array(random.sample(r, k))
    z = 0
    medioids = np.zeros((k, points.shape[1]))
    for i in range(k):
        medioids[i] = points[M_indices[z]]
        z += 1
        
    clusters = [set() for i in range(k)]
    
    # assign points to clusters
    for p in range(len(points)):
        i = np.argmin(distf(medioids, points[p]))
        clusters[i].add(p)
    
    # Recompute medioids
    for i in range(k):
        distances = []
        for xi in clusters[i]:
            x = points[xi]
            sum = 0
            for yi in clusters[i]:
                y = points[yi]
                sum += distf(x, y)
            distances.append((xi, sum))
        medioids[i] = points[min(distances, key=lambda t: t[1])]
    # Until convergence?
        
            
    
    

In [65]:
a = np.zeros((3, 2))
a[1] = np.array([5,6])

k_medioids(a, lambda a,b: a + b, 2)

IndexError: list index out of range

In [44]:
!ls -la

total 3900
drwxr-xr-x 3 vinz root    4096 mag  6 16:14 .
drwxr-xr-x 5 vinz root    4096 mag  5 20:49 ..
-rw-rw-r-- 1 vinz vinz  146301 mag  6 11:15 data.txt
drwxr-xr-x 2 vinz vinz    4096 mag  6 11:27 .ipynb_checkpoints
-rw-rw-r-- 1 vinz vinz 3547584 mag  6 16:14 lab3-cluster.ipynb
-rw-rw-r-- 1 vinz vinz  119483 mag  6 11:15 lab3-dimred.ipynb
-rw-r--r-- 1 vinz root    1549 mag  5 20:50 lab3-recsys.ipynb
-rw-r--r-- 1 vinz root   42338 mag  5 20:50 most-rated.pickle
-rwxr-xr-x 1 vinz root    2185 mag  5 20:50 rate-movies.py
-rw-r--r-- 1 vinz root    1926 mag  5 20:50 selected-movies.pickle
-rw-r--r-- 1 vinz root   98768 mag  5 20:50 snippets.ipynb


In [4]:
with open('most-rated.pickle', 'rb') as f:
    movies = pickle.load(f)

In [7]:
names = {t[0]: t[1] for t in movies}


{296: 'Pulp Fiction (1994)',
 356: 'Forrest Gump (1994)',
 318: 'Shawshank Redemption, The (1994)',
 593: 'Silence of the Lambs, The (1991)',
 480: 'Jurassic Park (1993)',
 260: 'Star Wars: Episode IV - A New Hope (1977)',
 110: 'Braveheart (1995)',
 589: 'Terminator 2: Judgment Day (1991)',
 2571: 'Matrix, The (1999)',
 527: "Schindler's List (1993)",
 1: 'Toy Story (1995)',
 457: 'Fugitive, The (1993)',
 150: 'Apollo 13 (1995)',
 780: 'Independence Day (a.k.a. ID4) (1996)',
 50: 'Usual Suspects, The (1995)',
 1210: 'Star Wars: Episode VI - Return of the Jedi (1983)',
 592: 'Batman (1989)',
 1196: 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 2858: 'American Beauty (1999)',
 32: 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)',
 590: 'Dances with Wolves (1990)',
 1198: 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
 608: 'Fargo (1996)',
 47: 'Seven (a.k.a. Se7en) (1995)',
 380: 'True Lies (1994)',
 588: 'Aladdin (1992)',
 377: 'Speed (1994)',


In [14]:
!tail 

data.txt	    lab3-recsys.ipynb  selected-movies.pickle
lab3-cluster.ipynb  most-rated.pickle  snippets.ipynb
lab3-dimred.ipynb   rate-movies.py
