### Create rating matrix 
* with rows=users, 
* columns=movies, 
* values in the matrix = the user's rating for each movie (e.g. the entry in column movie_id=1 is that user's rating of movie 1)

In [15]:
import pandas as pd
from sklearn.decomposition import NMF
import numpy as np
import pickle

In [3]:
# Read the cleaned ratings table (ratings merged with movie titles, genres and unique movie ids).
ratings_csv = './data/ml-latest-small/dev_ds_ratings_names_uniqueids.csv'
df = pd.read_csv(ratings_csv)
df.head()

Unnamed: 0.1,Unnamed: 0,userId,movieId_x,rating,timestamp,movieId_unique,movieId_y,title,genres
0,0,1,1,4.0,964982703,1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,1,3,4.0,964981247,3,3,Grumpier Old Men (1995),Comedy|Romance
2,2,1,6,4.0,964982224,6,6,Heat (1995),Action|Crime|Thriller
3,3,1,47,5.0,964983815,47,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,4,1,50,5.0,964982931,50,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [4]:
# Build the ratings matrix R: one row per user, one column per unique movie id.
# pivot_table aggregates duplicate (user, movie) pairs with the mean by default;
# dropna=False keeps columns whose entries are all NaN.
R = df.pivot_table(
    values='rating',
    index='userId',
    columns='movieId_unique',
    dropna=False,
)
R.shape  # (n_users, n_movies) -- 610 users x 9719 movies for this dataset

(610, 9719)

### Handle missing data


In [5]:
# Impute unrated (user, movie) cells so that NMF receives a matrix without NaN.
# R.median() is taken column-wise (per movie); the median of those medians gives
# a single global constant that is used for every missing cell.
med_values = R.median().median()
# Assign the result rather than mutating with inplace=True: it keeps the cell
# chainable and avoids silently changing a frame another cell already displayed.
R = R.fillna(med_values)
# Fail here, with a clear message, if imputation left gaps (e.g. med_values is NaN
# because the input was empty) instead of failing later inside NMF.fit.
assert not R.isna().to_numpy().any(), "rating matrix still contains NaN after imputation"
R.head()

movieId_unique,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,3.5,4.0,3.5,3.5,4.0,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
2,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
3,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
4,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
5,4.0,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5


### Train NMF

In [8]:
# Factorise the rating matrix into 20 latent components.
# random_state is pinned so the factorisation is repeatable across kernel
# restarts; without it the result varies between runs (this notebook records
# reconstruction errors of both 274.60 and 274.63 for 20 components).
m = NMF(n_components=20, random_state=42)

In [9]:
# Fit the NMF model on the imputed rating matrix R.
m.fit(R)
# Results noted from earlier runs, written as (P.shape, Q.shape, reconstruction_err_):
# 60 components: ((610, 60), (60, 9719), 228.1942897155037)
# 20 components: ((610, 20), (20, 9719), 274.6005132350494)



NMF(n_components=20)

### Check out the sub-matrices, and the reconstruction error

In [13]:
# P: user-factor matrix (users x components), obtained by projecting R onto the fitted model.
P = m.transform(R)
# Q: movie-factor matrix (components x movies), learned during fit.
Q = m.components_
# reconstruction_err_ is an absolute figure: on its own it says little, it is
# only useful for comparing models fitted on the same data.
error = m.reconstruction_err_
P.shape, Q.shape, error

((610, 20), (20, 9719), 274.63452618657476)

In [14]:
# Rebuild the full rating matrix from the two factors, then show it rounded to
# one decimal with the same user/movie labels as R.
new_R = P @ Q
pd.DataFrame(np.round(new_R, 1), index=R.index, columns=R.columns)

movieId_unique,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.2,3.6,3.7,3.5,3.6,3.8,3.6,3.5,3.5,3.6,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
2,3.6,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
3,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.6,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
4,3.8,3.5,3.5,3.5,3.4,3.5,3.4,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
5,3.6,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.7,3.5,3.6,3.4,3.2,3.6,3.3,3.5,3.5,3.1,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
607,3.7,3.5,3.5,3.5,3.5,3.7,3.5,3.5,3.5,3.6,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
608,2.6,2.0,2.4,3.6,3.2,3.6,3.7,3.6,3.4,3.4,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
609,3.6,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5


In [16]:
# save the model to disk
filename = 'NMF_model.sav'
# Use a context manager so the file handle is flushed and closed even if
# pickling raises; a bare open() would leave the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(m, model_file)

In [18]:
# load the model from disk
# NOTE: pickle.load can execute arbitrary code -- only load files you created yourself.
# The context manager guarantees the file handle is closed after reading.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
loaded_model

NMF(n_components=20)