In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.manifold import TSNE
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, Row, LogScale
from bokeh.layouts import row
from bokeh.models.tools import HoverTool
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
# Configure Bokeh so that subsequent `show(...)` calls render inline in the notebook.
output_notebook()

In [2]:
# Load the four CSV tables of the "ml-latest-small" dataset from the relative data folder.
# Only `movies` and `ratings` are referenced by the cells visible in this notebook;
# `links` and `tags` are loaded but not used in the visible cells.
links = pd.read_csv('../data/ml-latest-small/links.csv')
movies = pd.read_csv('../data/ml-latest-small/movies.csv')
ratings = pd.read_csv('../data/ml-latest-small/ratings.csv')
tags = pd.read_csv('../data/ml-latest-small/tags.csv')

In [3]:
# Histogram of "how many ratings does each movie / each user have".
#
# np.histogram flattens its input, so it must receive ONE count per group.
# Handing it the whole `groupby(...).count()` frame would mix all of that
# frame's count columns together and tally every group once per column.
# Selecting the 'rating' column gives exactly one count per movie / user.
#
# Bin edges are the integers 0..149; groups with more than 149 ratings fall
# outside that range and are therefore not represented in the histogram.
count_bins = list(range(150))
ratings_per_movie = ratings.groupby('movieId')['rating'].count()
ratings_per_user = ratings.groupby('userId')['rating'].count()
movie_hist, movie_bin_edges = np.histogram(ratings_per_movie, bins=count_bins)
user_hist, user_bin_edges = np.histogram(ratings_per_user, bins=count_bins)

In [4]:
def _ratings_histogram_figure(title, hist, bin_edges, x_axis_label, y_axis_label):
    """Draw the output of np.histogram as a bar-style Bokeh figure.

    Parameters
    ----------
    title : str
        Figure title.
    hist : array-like
        Bar heights, one per bin.
    bin_edges : array-like
        Bin boundaries; one element longer than `hist`.
    x_axis_label, y_axis_label : str
        Axis labels.

    Returns
    -------
    The configured Bokeh figure.
    """
    fig = figure(title=title, x_axis_label=x_axis_label, y_axis_label=y_axis_label)
    # Bars start at 0 on the (linear) y axis so that bar height equals the bin
    # count; both plots share this baseline so they can be compared directly.
    fig.quad(top=hist, bottom=0, left=bin_edges[:-1], right=bin_edges[1:],
             fill_color="navy", line_color="white", alpha=0.5)
    return fig


ratings_by_movieId = _ratings_histogram_figure(
    'Number of Ratings by MovieId', movie_hist, movie_bin_edges,
    x_axis_label='Ratings per movie', y_axis_label='Number of movies')
ratings_by_userId = _ratings_histogram_figure(
    'Number of Ratings by UserId', user_hist, user_bin_edges,
    x_axis_label='Ratings per user', y_axis_label='Number of users')

# Show the two histograms side by side.
p = row(ratings_by_movieId, ratings_by_userId, sizing_mode='scale_both')
show(p)

In [5]:
# Keep only movies that received more than 40 ratings and attach that count
# to the movie metadata as `num_ratings` (indexed by movieId).
rating_counts = ratings.groupby('movieId').count()
movies_with_counts = pd.merge(
    movies.set_index('movieId'),
    rating_counts,
    left_index=True,
    right_index=True,
)
has_enough_ratings = movies_with_counts['userId'] > 40
filtered_movies = (
    movies_with_counts[has_enough_ratings]
    .drop(['userId', 'timestamp'], axis=1)
    # The per-movie count of the 'rating' column is the number of ratings.
    .rename(columns={'rating': 'num_ratings'})
)

In [6]:
# User x movie rating matrix (rows: userId, columns: movieId; NaN where a user
# has not rated a movie), restricted to the movies kept in `filtered_movies`
# and ordered like `filtered_movies.index`.
user_movie_ratings = pd.pivot_table(
    ratings,
    values='rating',
    index='userId',
    columns='movieId',
)
data = user_movie_ratings.loc[:, filtered_movies.index]

In [7]:
# Non-negative matrix factorisation of the rating matrix: D ~= W @ T, where
#   W has one row per user   and 10 latent components,
#   T has 10 latent components and one column per movie.
# Missing ratings are filled with 0, so NMF treats "not rated" as a rating of 0.
D = data.fillna(0)

# random_state pins the initialisation so that re-running the notebook gives
# the same factors (and therefore the same downstream embeddings/clusters).
model = NMF(n_components=10, random_state=0)

# Fit exactly once. W and T must come from the SAME fit: calling fit() and then
# fit_transform() refits the model, which doubles the work and can pair a T
# from one solution with a W from another.
W = model.fit_transform(D.values)
T = model.components_

In [8]:
# One row per movie (indexed by title), one column per NMF latent component.
# The columns of T follow the column order of the rating matrix, which was
# selected with `filtered_movies.index`, so the titles line up positionally.
movie_embedding = pd.DataFrame(T, columns=filtered_movies['title']).T

# 2-D t-SNE projection of the movie embedding. t-SNE is stochastic; fixing
# random_state keeps the layout (and anything clustered on it) reproducible.
tsne_embedding = TSNE(n_components=2, random_state=0).fit_transform(movie_embedding.values)
tsne_frame = pd.DataFrame(tsne_embedding, columns=['x', 'y'])

# 2-D PCA projection of the same embedding, for comparison.
pca_embedding = PCA(n_components=2).fit_transform(movie_embedding.values)
pca_frame = pd.DataFrame(pca_embedding, columns=['x', 'y'])

In [9]:
# Label the rows of both projection frames with the movieId index of
# `filtered_movies` (the rows are in the same order as `filtered_movies`).
for projection_frame in (tsne_frame, pca_frame):
    projection_frame.index = filtered_movies.index

In [10]:
# Add the movie title to each projection frame; the assignment aligns on the
# frames' index against the index of `filtered_movies`.
for projection_frame in (tsne_frame, pca_frame):
    projection_frame['title'] = filtered_movies['title']

In [11]:
# Scatter plot of the t-SNE projection; the hover tooltip shows the `title`
# column of the data source under the label "name".
source = ColumnDataSource(tsne_frame)
hover_fields = [('name', '@title')]
f = figure(tooltips=hover_fields)
f.circle(
    x='x',
    y='y',
    source=source,
)

In [12]:
# Render the t-SNE scatter plot built in the previous cell.
show(f)

In [14]:
# Cluster the movies into 12 groups in the 2-D t-SNE space.
# KMeans starts from random centroids; random_state makes the clustering
# (and the representative movies printed further down) reproducible.
Groups = KMeans(n_clusters=12, random_state=0).fit(tsne_embedding)

In [15]:
# Centroids of the fitted clusters: one row per cluster, one column per
# dimension of the data the KMeans model was fitted on.
centers = Groups.cluster_centers_

In [16]:
# Same t-SNE scatter as before, with the cluster centroids overlaid as large,
# semi-transparent red discs (radius is given in data units).
source = ColumnDataSource(tsne_frame)
hover_fields = [('name', '@title')]
f = figure(tooltips=hover_fields)
f.circle(x='x', y='y', source=source)

center_x = centers[:, 0]
center_y = centers[:, 1]
f.circle(x=center_x, y=center_y, color='red', radius=5, alpha=.3)

In [17]:
# Render the scatter plot with the cluster centroids overlaid.
show(f)

In [227]:
# Nearest-neighbour index over the 2-D t-SNE points; queries return the
# 5 closest movies.
nbrs = NearestNeighbors(n_neighbors=5)
nbrs.fit(tsne_embedding)

In [228]:
# For every cluster centroid, look up its nearest movies.
# `indices` holds row positions into the data `nbrs` was fitted on (one row of
# positions per centroid); `distances` holds the matching distances.
distances, indices = nbrs.kneighbors(centers)

In [229]:
# Print the titles of the movies closest to each centroid, one group per
# cluster, with a separator line after every group.
titles = tsne_frame['title']
for neighbor_positions in indices:
    for position in neighbor_positions:
        print(titles.iloc[position])
    print('===========')

Basic Instinct (1992)
Conspiracy Theory (1997)
Cocoon (1985)
Godfather: Part III, The (1990)
Ronin (1998)
Wallace & Gromit: A Close Shave (1995)
Hotel Rwanda (2004)
Pianist, The (2002)
Back to the Future Part II (1989)
Back to the Future Part III (1990)
Star Trek: Generations (1994)
Die Hard: With a Vengeance (1995)
Batman Forever (1995)
Stargate (1994)
Clear and Present Danger (1994)
Inside Out (2015)
Star Wars: Episode VII - The Force Awakens (2015)
X-Men: First Class (2011)
Scott Pilgrim vs. the World (2010)
Big Hero 6 (2014)
Executive Decision (1996)
Sabrina (1995)
Phenomenon (1996)
Tin Cup (1996)
Rumble in the Bronx (Hont faan kui) (1995)
Hook (1991)
Erin Brockovich (2000)
10 Things I Hate About You (1999)
Bug's Life, A (1998)
About a Boy (2002)
Mystic River (2003)
Mulholland Drive (2001)
Fear and Loathing in Las Vegas (1998)
Scarface (1983)
Big Lebowski, The (1998)
E.T. the Extra-Terrestrial (1982)
Princess Bride, The (1987)
Groundhog Day (1993)
Contact (1997)
Ferris Bueller's Da

In [13]:
# Repeat the clustering directly in the NMF embedding space (one row per
# movie in `movie_embedding`) instead of the 2-D t-SNE projection.
# NOTE: this cell rebinds `Groups`, `centers`, `nbrs`, `distances` and
# `indices`, which earlier cells also assign.
#
# random_state fixes the KMeans initialisation so the printed groups are the
# same on every run.
Groups = KMeans(n_clusters=15, random_state=0).fit(movie_embedding.values)
centers = Groups.cluster_centers_

# p=2 selects the Euclidean distance; each centroid gets its 8 closest movies.
nbrs = NearestNeighbors(n_neighbors=8, p=2).fit(movie_embedding.values)
distances, indices = nbrs.kneighbors(centers)

# `movie_embedding` is indexed by title, so positional lookups into its index
# yield the titles of the representative movies of each cluster.
for neighbor_positions in indices:
    for title in movie_embedding.index[neighbor_positions]:
        print(title)
    print('===========')

Da Vinci Code, The (2006)
Mummy Returns, The (2001)
The Butterfly Effect (2004)
Illusionist, The (2006)
Stranger than Fiction (2006)
Terminal, The (2004)
Black Hawk Down (2001)
Austin Powers in Goldmember (2002)
Stargate (1994)
Batman Forever (1995)
Clear and Present Danger (1994)
Speed (1994)
Die Hard: With a Vengeance (1995)
True Lies (1994)
Star Trek: Generations (1994)
Net, The (1995)
Social Network, The (2010)
Toy Story 3 (2010)
The Martian (2015)
X-Men: First Class (2011)
How to Train Your Dragon (2010)
Scott Pilgrim vs. the World (2010)
Wolf of Wall Street, The (2013)
The Hunger Games (2012)
Raising Arizona (1987)
Thelma & Louise (1991)
Blues Brothers, The (1980)
Glory (1989)
Unforgiven (1992)
Sling Blade (1996)
American Graffiti (1973)
Young Frankenstein (1974)
Fear and Loathing in Las Vegas (1998)
Donnie Brasco (1997)
Mystic River (2003)
Lock, Stock & Two Smoking Barrels (1998)
Mulholland Drive (2001)
Magnolia (1999)
Run Lola Run (Lola rennt) (1998)
Traffic (2000)
Phenomenon (