In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.impute import KNNImputer
import pickle

## Load in the data

In [2]:
movies = pd.read_csv('./ml-latest-small/movies.csv')#, index_col=0)
links = pd.read_csv('./ml-latest-small/links.csv', index_col=0) ### don't know if we need this one
ratings = pd.read_csv('./ml-latest-small/ratings.csv')#,  index_col=0)#, parse_dates=True)
tags = pd.read_csv('./ml-latest-small/tags.csv', index_col=0, parse_dates=True)

In [3]:
movies.columns, links.columns, ratings.columns, tags.columns

(Index(['movieId', 'title', 'genres'], dtype='object'),
 Index(['imdbId', 'tmdbId'], dtype='object'),
 Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object'),
 Index(['movieId', 'tag', 'timestamp'], dtype='object'))

In [4]:
movies.shape, links.shape, ratings.shape, tags.shape

((9742, 3), (9742, 2), (100836, 4), (3683, 3))

In [5]:
movies#.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [6]:
ratings#.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [7]:
tags.head()

Unnamed: 0_level_0,movieId,tag,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,60756,funny,1445714994
2,60756,Highly quotable,1445714996
2,60756,will ferrell,1445714992
2,89774,Boxing story,1445715207
2,89774,MMA,1445715200


In [8]:
tags.dtypes

movieId       int64
tag          object
timestamp     int64
dtype: object

In [9]:
movies.dtypes

movieId     int64
title      object
genres     object
dtype: object

## Prep Data for NMF

In [10]:
df = pd.merge(ratings[['userId','movieId','rating']],
              movies[['title','movieId']], 
              how='inner',
              on='movieId').drop(columns=['movieId'])
df

Unnamed: 0,userId,rating,title
0,1,4.0,Toy Story (1995)
1,5,4.0,Toy Story (1995)
2,7,4.5,Toy Story (1995)
3,15,2.5,Toy Story (1995)
4,17,4.5,Toy Story (1995)
...,...,...,...
100831,610,2.5,Bloodmoon (1997)
100832,610,4.5,Sympathy for the Underdog (1971)
100833,610,3.0,Hazard (2005)
100834,610,3.5,Blair Witch (2016)


In [11]:
u_m_matrix = pd.pivot_table(df, values='rating',index='userId', columns='title')

u_m_matrix

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [12]:
with open('u_m_matrix.pickle', 'wb') as f:
    pickle.dump(u_m_matrix, f)

### `deal with the missings --> knn (or normalize)?`

In [13]:
# percentage of missings 
u_m_matrix.isna().sum().sum() / (u_m_matrix.shape[0]*u_m_matrix.shape[1])

0.9829922460483859

In [14]:
imputer = KNNImputer(n_neighbors=2)

In [15]:
data1 = imputer.fit(u_m_matrix)

In [16]:
with open('knn2.pickle', 'wb') as f:
    # Pickle the model
    pickle.dump(data1, f)

In [24]:
data1 = imputer.fit_transform(u_m_matrix)

In [25]:
u_m_matrix_nona = pd.DataFrame(data1, index=u_m_matrix.index, columns=u_m_matrix.columns)#.values

In [17]:
### easier ALTERNATIVE: fill missings with mean
#print(u_m_matrix.mean().mean())

#u_m_matrix = u_m_matrix.fillna(u_m_matrix.mean().mean())

---

## NMF

* movie_genre = Q
* user_genre = P
* user_movie = R

In [26]:
data2 = u_m_matrix_nona

In [27]:
#need a dataframe for this
R = pd.DataFrame(data2, index=data2.index, columns=data2.columns).values 

In [48]:
with open('R_matrix.pickle', 'wb') as f:
    pickle.dump(R, f)

In [51]:
R

array([[4.  , 4.  , 3.5 , ..., 2.  , 4.  , 1.  ],
       [4.  , 4.  , 3.5 , ..., 1.75, 2.  , 1.  ],
       [4.  , 4.  , 3.5 , ..., 1.75, 2.5 , 1.  ],
       ...,
       [4.  , 4.  , 3.5 , ..., 2.  , 3.25, 1.  ],
       [4.  , 4.  , 3.5 , ..., 2.25, 4.25, 1.  ],
       [4.  , 4.  , 3.5 , ..., 1.5 , 3.25, 1.  ]])

In [28]:
u_m_matrix_nona.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,4.0,3.5,5.0,4.0,1.5,4.0,3.0,4.5,3.75,...,1.5,4.0,3.25,3.0,3.0,3.75,2.0,2.0,4.0,1.0
2,4.0,4.0,3.5,5.0,4.0,1.5,4.25,3.0,4.5,3.0,...,1.5,4.0,3.25,3.0,3.0,4.0,2.25,1.75,2.0,1.0
3,4.0,4.0,3.5,5.0,4.0,1.5,2.5,3.0,3.0,2.75,...,1.5,4.5,3.25,3.0,3.0,2.75,3.25,1.75,2.5,1.0
4,4.0,4.0,3.5,5.0,4.0,1.5,3.75,3.0,4.25,3.25,...,1.5,3.25,3.25,3.0,3.0,3.5,2.25,2.25,3.0,1.0
5,4.0,4.0,3.5,5.0,4.0,1.5,2.5,3.0,4.5,3.25,...,1.5,4.25,4.0,3.0,3.0,4.5,3.5,1.75,3.0,1.0


In [29]:
#create a model and set the hyperparameters
# model assumes R ~ PQ'

model = NMF(n_components=20, ## n_components == genres --> 19 from readme
            init='random',   ## nndsvd - better for sparseness // nndsvda - better when sparsity is not desired ???
            random_state=42, ## for random init
            tol = 0.0001,    ## tolerance
            max_iter = 5000) ## default of 200 easily maxed out with n_compo=19
                             ## but obvs increases fitting time!!!

In [30]:
# fitting the model to R
model.fit(R)

NMF(init='random', max_iter=5000, n_components=20, random_state=42)

In [31]:
with open('nmf.pickle', 'wb') as f:
    # Pickle the model
    pickle.dump(model, f)

In [32]:
Q = model.components_  # movie-genre matrix
Q.shape #genres, no.of.movies

(20, 9719)

In [33]:
P = model.transform(R)  # user-genre matrix
P.shape #no.of users, genre

(610, 20)

In [34]:
print(model.reconstruction_err_) #reconstruction error

nR = np.dot(P, Q)
print(nR.shape) ## The reconstructed matrix!

778.2638686253229
(610, 9719)


## make a prediction for a new user!

--> need to find a way to add values for new users --> how to use the knn imputer for new/complete empty data?

In [35]:
#create an empty array - len= number of movies
new_user = np.zeros((1,R.shape[1]))#, fill_value=u_m_matrix_nona.mean())#.mean()) 
new_user[:] = np.NaN

In [36]:
new_user

array([[nan, nan, nan, ..., nan, nan, nan]])

In [37]:
new_user.shape

(1, 9719)

In [38]:
# ask the user to rate some movies - 17 (2) and 50 (5) the user has seen
new_user[0][17] = 2
new_user[0][50] = 5

In [39]:
new_u = imputer.transform(new_user)

In [40]:
# we need Q
Q
# we need user_P, 
user_P = model.transform(new_u)
Q.shape, user_P.shape

((20, 9719), (1, 20))

In [41]:
# and then we do dp of Q and user_p to get actual user_recommendations
#what shape do I want - 1,9724
actual_recommendations = np.dot(user_P, Q)

In [42]:
#data from the model infiltrated into our recomendations
actual_recommendations

array([[3.98675763, 3.98675763, 3.48841292, ..., 2.08588376, 3.12193069,
        0.99668941]])

In [43]:
# take some recommendations and give them back
np.argsort(actual_recommendations) #index of a sorted array
np.argmax(actual_recommendations)  #index containing the movieid of the best film

3983

In [44]:
u_m_matrix.columns[2124]# this is the movieid of the new users best recommendation

'Damage (Fatale) (1992)'

In [45]:
recommendation = movies[movies['movieId']==2824]['title'] #### add new number here

In [46]:
recommendation

2125    On the Ropes (1999)
Name: title, dtype: object

---
## Parking lot
--- 

In [47]:
# convert series
df['datetime'] = pd.to_datetime(df['datetime'])

KeyError: 'datetime'