In [21]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.manifold import TSNE
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, Row, LogScale
from bokeh.layouts import row
from bokeh.models.tools import HoverTool
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
output_notebook()

In [22]:
links = pd.read_csv('../data/ml-latest-small/links.csv')
movies = pd.read_csv('../data/ml-latest-small/movies.csv')
ratings = pd.read_csv('../data/ml-latest-small/ratings.csv')
tags = pd.read_csv('../data/ml-latest-small/tags.csv')

In [23]:
movie_hist,movie_bin_edges = np.histogram(ratings.groupby('movieId').count(),bins=list(range(150)))
user_hist, user_bin_edges = np.histogram(ratings.groupby('userId').count(),bins=list(range(150)))

In [24]:
ratings_by_movieId = figure(title='Number of Ratings by MovieId')
ratings_by_movieId.quad(top=movie_hist, bottom=1, left=movie_bin_edges[:-1], right=movie_bin_edges[1:],
           fill_color="navy", line_color="white", alpha=0.5)
ratings_by_userId = figure(title='Number of Ratings by UserId')
ratings_by_userId.quad(top=user_hist, bottom=0, left=user_bin_edges[:-1], right=user_bin_edges[1:],fill_color="navy", line_color="white", alpha=0.5)
p = row(ratings_by_movieId, ratings_by_userId,sizing_mode='scale_both')
show(p)

In [5]:
tmp=pd.merge(movies.set_index('movieId'),ratings.groupby('movieId').count(),left_index=True,right_index=True)
tmp = tmp[tmp['userId']>40].drop(['userId','timestamp'],axis=1)
tmp['num_ratings'] = tmp['rating']
filtered_movies = tmp.drop('rating',axis=1)

The ```filtered_movies``` dataset includes all movies with ratings from at least 40 users.

In [140]:
data=pd.pivot_table(ratings,values='rating',index='userId',columns='movieId')
data = data[filtered_movies.index]

The ```data``` dataframe ratings for users by ratings by filtered movies.

In [224]:

train, test = D.iloc[0:400,:],D.iloc[400:,:]

In [143]:
movies_pca = PCA(n_components=10).fit(train.values)

In [144]:
embedded_users = movies_pca.transform(train.values)
nbrs = NearestNeighbors(n_neighbors=8).fit(embedded_users)

In [222]:
def predict(user):
    embedded_user = movies_pca.transform(user)
    distances, similar_users = nbrs.kneighbors(embedded_user)
    prediction = train.iloc[similar_users[0,:],:].mean()
    recommended = [i for i,x in enumerate(prediction) if x>0]
    liked = [i for i,x in enumerate(user[0,:]) if x>0]
    disliked = [i for i,x in enumerate(user[0,:]) if x<0]
    recommend = set(recommended) - set(liked)
    recommend = [(x, prediction.iloc[x]) for x in recommend]
    recommend = sorted(recommend, key=lambda x: x[1])[0:8]
    liked = [(x, user[0,x]) for x in liked]
    liked = sorted(liked, key=lambda x: x[1])[0:8]
    print("LIKED")
    for x,j in liked:
        title= filtered_movies.iloc[x,:]['title']
        score = user[0,x]
        print(title, score)
    print("RECOMMENDED")
    for x,j in recommend:
        title = filtered_movies.iloc[x,:]['title']
        score = prediction.iloc[x]
        print(title, score)
    return recommend,prediction
    

In [223]:
for j in range(10):
    predict(test.iloc[j,:].values.reshape(1,-1))
    print('---')

LIKED
Toy Story (1995) 0.5
Forrest Gump (1994) 0.5
Lion King, The (1994) 0.5
Nightmare Before Christmas, The (1993) 0.5
Mulan (1998) 0.5
Edward Scissorhands (1990) 0.5
Shrek (2001) 0.5
Monsters, Inc. (2001) 0.5
RECOMMENDED
Casino (1995) 0.0625
Rumble in the Bronx (Hont faan kui) (1995) 0.0625
(500) Days of Summer (2009) 0.0625
Naked Gun 33 1/3: The Final Insult (1994) 0.0625
Avengers, The (2012) 0.0625
Deadpool (2016) 0.0625
Snow White and the Seven Dwarfs (1937) 0.0625
Pinocchio (1940) 0.0625
---
LIKED
Broken Arrow (1996) 1.0
Rumble in the Bronx (Hont faan kui) (1995) 1.0
Lion King, The (1994) 1.0
Demolition Man (1993) 1.0
Fugitive, The (1993) 1.0
Jurassic Park (1993) 1.0
Schindler's List (1993) 1.0
Home Alone (1990) 1.0
RECOMMENDED
Toy Story (1995) 0.125
Father of the Bride Part II (1995) 0.125
Taxi Driver (1976) 0.125
Congo (1995) 0.125
Nine Months (1995) 0.125
Waterworld (1995) 0.125
Natural Born Killers (1994) 0.125
Four Weddings and a Funeral (1994) 0.125
---
LIKED
Fast Times at 

In [205]:
[i for i,x in test.iloc[1,:]>0

movieId
1         False
2         False
3         False
5         False
6         False
          ...  
122882    False
122886    False
122904    False
134130    False
134853    False
Name: 402, Length: 616, dtype: bool

In [173]:
recommend

{110, 150, 292, 318, 344, 349, 590}

In [180]:
filtered_movies.loc[590,:]

title          Dances with Wolves (1990)
genres           Adventure|Drama|Western
num_ratings                          164
Name: 590, dtype: object

In [146]:
D.mean()

movieId
1         0.324590
2         0.077869
3         0.022131
5         0.005738
6         0.158197
            ...   
122882    0.063115
122886    0.057377
122904    0.073770
134130    0.078689
134853    0.057377
Length: 616, dtype: float64

In [147]:
(user!=0).sum()

22

In [148]:
user = test.iloc[1,:].values.reshape(1,-1)

In [159]:
prediction = predict(user)

KeyError: 68

In [150]:
(prediction>0).sum()

78

In [157]:
filtered_movies

Unnamed: 0_level_0,title,genres,num_ratings
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
2,Jumanji (1995),Adventure|Children|Fantasy,110
3,Grumpier Old Men (1995),Comedy|Romance,52
5,Father of the Bride Part II (1995),Comedy,49
6,Heat (1995),Action|Crime|Thriller,102
...,...,...,...
122882,Mad Max: Fury Road (2015),Action|Adventure|Sci-Fi|Thriller,47
122886,Star Wars: Episode VII - The Force Awakens (2015),Action|Adventure|Fantasy|Sci-Fi|IMAX,41
122904,Deadpool (2016),Action|Adventure|Comedy|Sci-Fi,54
134130,The Martian (2015),Adventure|Drama|Sci-Fi,48


In [152]:
recommended = [i for i,x in enumerate(prediction) if x>1]
liked = [i for i,x in enumerate(user[0,:]) if x>0]
disliked = [i for i,x in enumerate(user[0,:]) if x<0]

In [153]:
len(set(disliked) & set(recommended))

0

In [154]:
set(recommended) - set(liked)

{30, 63, 69}

In [155]:
embedded_user.shape

(1, 10)

In [71]:
distances, similar_users = nbrs.kneighbors(embedded_user)

In [72]:
similar_users

array([[305, 113, 142, 147, 247, 340,  51, 212]])

In [75]:
train.iloc[similar_users[0,:],:].mean()

movieId
1        -0.4375
2         0.0000
3         0.0000
5         0.0000
6         0.0000
           ...  
122882   -0.7500
122886   -0.8750
122904   -1.0000
134130   -0.1250
134853   -0.8750
Length: 616, dtype: float64