In [3]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.manifold import TSNE
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, Row, LogScale
from bokeh.layouts import row
from bokeh.models.tools import HoverTool
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
# Configure bokeh so that later show() calls render inline in the notebook.
output_notebook()

In [5]:
# Load the four MovieLens "latest-small" tables in one pass.
links, movies, ratings, tags = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/ml-latest-small/links.csv',
        '../data/ml-latest-small/movies.csv',
        '../data/ml-latest-small/ratings.csv',
        '../data/ml-latest-small/tags.csv',
    )
)

In [6]:
# Distribution of "number of ratings" per movie and per user.
#
# groupby(...).size() gives exactly one count per group. The previous
# groupby(...).count() returned one column per remaining field
# (userId/rating/timestamp), and np.histogram flattens its input, so every
# movie/user was tallied three times.
#
# Bins are the unit-width intervals 0..149; groups with more than 149 ratings
# fall outside the bin range and are not counted.
rating_count_bins = list(range(150))
movie_hist, movie_bin_edges = np.histogram(ratings.groupby('movieId').size(), bins=rating_count_bins)
user_hist, user_bin_edges = np.histogram(ratings.groupby('userId').size(), bins=rating_count_bins)

In [7]:
# Side-by-side histograms: ratings received per movie (left) and ratings
# given per user (right). Both panels draw their bars from a zero baseline so
# that bar height equals the bin count; the movie panel previously used
# bottom=1 on a linear axis, which shortened every bar by one and drew empty
# bins as inverted bars.
ratings_by_movieId = figure(title='Number of Ratings by MovieId',
                            x_axis_label='ratings per movie',
                            y_axis_label='number of movies')
ratings_by_movieId.quad(top=movie_hist, bottom=0,
                        left=movie_bin_edges[:-1], right=movie_bin_edges[1:],
                        fill_color="navy", line_color="white", alpha=0.5)

ratings_by_userId = figure(title='Number of Ratings by UserId',
                           x_axis_label='ratings per user',
                           y_axis_label='number of users')
ratings_by_userId.quad(top=user_hist, bottom=0,
                       left=user_bin_edges[:-1], right=user_bin_edges[1:],
                       fill_color="navy", line_color="white", alpha=0.5)

p = row(ratings_by_movieId, ratings_by_userId, sizing_mode='scale_both')
show(p)

In [8]:
# Movie metadata restricted to titles with more than 40 ratings, carrying the
# per-movie rating count as `num_ratings`.
per_movie_counts = ratings.groupby('movieId').count()
filtered_movies = (
    movies.set_index('movieId')
    .merge(per_movie_counts, left_index=True, right_index=True)
    .loc[lambda merged: merged['userId'] > 40]
    .drop(['userId', 'timestamp'], axis=1)
    .rename(columns={'rating': 'num_ratings'})
)

In [9]:
# User x movie rating matrix (rows: userId, columns: movieId), then keep only
# the columns of the movies that survived the rating-count filter.
user_movie_ratings = ratings.pivot_table(values='rating', index='userId', columns='movieId')
data = user_movie_ratings.loc[:, filtered_movies.index]

In [10]:
# Factorise the rating matrix as D ~ W @ T with 10 latent components.
# Missing ratings are filled with 0 so the input is dense and non-negative.
D = data.fillna(0)

# Seeded so the factorisation (and everything derived from it) is repeatable.
model = NMF(n_components=10, random_state=0)

# Fit exactly once so that W and T belong to the same factorisation. The
# previous code fitted twice and took T from the first fit and W from the
# second, which doubles the cost and does not guarantee a matching pair.
W = model.fit_transform(D.values)   # one row per user
T = model.components_               # one column per movie

In [11]:
# One row per movie title, one column per NMF component.
movie_embedding = pd.DataFrame(T, columns=filtered_movies['title']).T

# 2-D projections of the movie embedding for plotting. t-SNE is stochastic,
# so it is seeded to keep the scatter plot and the clusters built on top of
# it stable across reruns.
tsne_embedding = TSNE(n_components=2, random_state=0).fit_transform(movie_embedding.values)
tsne_frame = pd.DataFrame(tsne_embedding, columns=['x', 'y'])
pca_embedding = PCA(n_components=2).fit_transform(movie_embedding.values)
pca_frame = pd.DataFrame(pca_embedding, columns=['x', 'y'])

In [12]:
# Re-label the rows of both projections with the movieId index of
# filtered_movies (the frames were built positionally, in that same order).
for projection in (tsne_frame, pca_frame):
    projection.index = filtered_movies.index

In [13]:
# Add the movie title to each projection (aligned on the frame index) so the
# plots can show it in a hover tooltip.
for projection in (tsne_frame, pca_frame):
    projection['title'] = filtered_movies['title']

In [14]:
# Scatter plot of the t-SNE projection; hovering a point shows its title.
source = ColumnDataSource(tsne_frame)
f = figure(tooltips=[('name', '@title')])
f.circle(x='x', y='y', source=source)

In [15]:
# Display the t-SNE scatter plot held in `f`.
show(f)

In [16]:
# Partition the 2-D t-SNE points into 12 clusters. KMeans initialises its
# centroids randomly, so it is seeded to keep the clusters (and the titles
# printed for them further down) stable across reruns.
Groups = KMeans(n_clusters=12, random_state=0).fit(tsne_embedding)

In [17]:
# Cluster centroids from the fitted KMeans model: one row per cluster, in the
# same coordinate space the model was fitted on.
centers = Groups.cluster_centers_

In [18]:
# Rebuild the t-SNE scatter and overlay the cluster centroids as translucent
# red discs (radius is given in data units).
source = ColumnDataSource(tsne_frame)
center_x = centers[:, 0]
center_y = centers[:, 1]

f = figure(tooltips=[('name', '@title')])
f.circle(x='x', y='y', source=source)
f.circle(x=center_x, y=center_y, color='red', radius=5, alpha=.3)

In [19]:
# Display the scatter plot with the cluster centroids overlaid.
show(f)

In [20]:
# Nearest-neighbour index over the t-SNE points, returning 5 neighbours per query.
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(tsne_embedding)

In [21]:
# Query the neighbour index with each cluster centroid. `indices` holds, per
# centroid, the row positions of its nearest fitted points; `distances` holds
# the matching distances.
distances, indices = nbrs.kneighbors(centers)

In [22]:
# For every cluster centroid, list the titles of the movies closest to it,
# with a separator line after each cluster.
titles = tsne_frame['title']
for neighbor_rows in indices:
    for row_position in neighbor_rows:
        print(titles.iloc[row_position])
    print('===========')

Firm, The (1993)
Crimson Tide (1995)
Clear and Present Danger (1994)
Outbreak (1995)
Cliffhanger (1993)
Fear and Loathing in Las Vegas (1998)
Casino (1995)
Scarface (1983)
Lock, Stock & Two Smoking Barrels (1998)
Big Lebowski, The (1998)
Pirates of the Caribbean: Dead Man's Chest (2006)
Ice Age (2002)
Ocean's Eleven (2001)
Incredibles, The (2004)
Catch Me If You Can (2002)
Conspiracy Theory (1997)
Ronin (1998)
So I Married an Axe Murderer (1993)
Few Good Men, A (1992)
Payback (1999)
Raising Arizona (1987)
Ed Wood (1994)
Sling Blade (1996)
Young Frankenstein (1974)
Crying Game, The (1992)
Inside Out (2015)
Star Wars: Episode VII - The Force Awakens (2015)
X-Men: First Class (2011)
Scott Pilgrim vs. the World (2010)
How to Train Your Dragon (2010)
Executive Decision (1996)
Dragonheart (1996)
Sabrina (1995)
Phenomenon (1996)
Rumble in the Bronx (Hont faan kui) (1995)
Erin Brockovich (2000)
101 Dalmatians (One Hundred and One Dalmatians) (1961)
About a Boy (2002)
Mulan (1998)
Chocolat (200

In [23]:
# Repeat the cluster -> representative-titles lookup directly on the NMF movie
# embedding instead of on its 2-D t-SNE projection.
# KMeans is seeded so the printed clusters are the same on every run.
Groups = KMeans(n_clusters=15, random_state=0).fit(movie_embedding.values)
centers = Groups.cluster_centers_

# p=2 selects Euclidean distance for the neighbour search.
nbrs = NearestNeighbors(n_neighbors=8, p=2).fit(movie_embedding.values)
distances, indices = nbrs.kneighbors(centers)

# movie_embedding is indexed by title, so a row position maps straight to a name.
for neighbor_rows in indices:
    for row_position in neighbor_rows:
        print(movie_embedding.index[row_position])
    print('===========')

Peter Pan (1953)
Rocky Horror Picture Show, The (1975)
Jungle Book, The (1967)
About a Boy (2002)
101 Dalmatians (One Hundred and One Dalmatians) (1961)
Notting Hill (1999)
Cinderella (1950)
Erin Brockovich (2000)
Annie Hall (1977)
Raising Arizona (1987)
Amadeus (1984)
Vertigo (1958)
Lawrence of Arabia (1962)
Seven Samurai (Shichinin no samurai) (1954)
Sling Blade (1996)
Fish Called Wanda, A (1988)
Social Network, The (2010)
Toy Story 3 (2010)
The Martian (2015)
X-Men: First Class (2011)
How to Train Your Dragon (2010)
Scott Pilgrim vs. the World (2010)
Wolf of Wall Street, The (2013)
The Hunger Games (2012)
Monsters, Inc. (2001)
Catch Me If You Can (2002)
Ocean's Eleven (2001)
Incredibles, The (2004)
Finding Nemo (2003)
Beautiful Mind, A (2001)
Cast Away (2000)
Ice Age (2002)
Batman Forever (1995)
Mask, The (1994)
Speed (1994)
Net, The (1995)
While You Were Sleeping (1995)
Clear and Present Danger (1994)
Stargate (1994)
Ace Ventura: Pet Detective (1994)
Ronin (1998)
Entrapment (1999)
