# Matrix Factorization with Keras

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

from collections import Counter
from sklearn.utils import shuffle

from scipy.sparse import lil_matrix, csr_matrix

## 1. Loading the data

In [2]:
df = pd.read_csv('data/rating.csv')
len(df)

20000263

In [3]:
mo = pd.read_csv('data/movie.csv')
mo.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Joing the two data frame
df2 = pd.merge(df, mo, how = 'inner', on = ['movieId'])
df2.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy
1,5,2,3.0,1996-12-25 15:26:09,Jumanji (1995),Adventure|Children|Fantasy
2,13,2,3.0,1996-11-27 08:19:02,Jumanji (1995),Adventure|Children|Fantasy
3,29,2,3.0,1996-06-23 20:36:14,Jumanji (1995),Adventure|Children|Fantasy
4,34,2,3.0,1996-10-28 13:29:44,Jumanji (1995),Adventure|Children|Fantasy


In [5]:
df2 = df2.drop(columns = ['timestamp', 'genres'])
df2.head()

Unnamed: 0,userId,movieId,rating,title
0,1,2,3.5,Jumanji (1995)
1,5,2,3.0,Jumanji (1995)
2,13,2,3.0,Jumanji (1995)
3,29,2,3.0,Jumanji (1995)
4,34,2,3.0,Jumanji (1995)


In [6]:
# Make the user Id starts from 0 
df2.userId -= 1
df2.head()

Unnamed: 0,userId,movieId,rating,title
0,0,2,3.5,Jumanji (1995)
1,4,2,3.0,Jumanji (1995)
2,12,2,3.0,Jumanji (1995)
3,28,2,3.0,Jumanji (1995)
4,33,2,3.0,Jumanji (1995)


## 2. Preprocessing 

### 2-1. Assigning movie index

In [7]:
movie_set = set(df.movieId.values)

In [8]:
movie_idx = {}
i = 0

for k in movie_set:
    movie_idx[k] = i
    i += 1

In [9]:
df['movie_idx'] = df.apply(lambda x: movie_idx[x.movieId], axis = 1)

### 2-3. Splitting into train and test set

In [10]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,movie_idx
0,1,2,3.5,2005-04-02 23:53:47,2
1,1,29,3.5,2005-04-02 23:31:16,29
2,1,32,3.5,2005-04-02 23:33:39,32
3,1,47,3.5,2005-04-02 23:32:07,47
4,1,50,3.5,2005-04-02 23:29:40,50


In [11]:
N = df.userId.max() + 1
M = df.movieId.max() + 1

print("The number of Users is ", N)
print("The number of Movies is ", M)

The number of Users is  138494
The number of Movies is  131263


In [12]:
cut = int(0.8*len(df))

df = shuffle(df)
tr = df.iloc[:cut]
te = df.iloc[cut:]

In [14]:
# Construct an empty matrix with the shape of (N, M)
A = lil_matrix((N, M))

# Make the sparse matrix 
def update_train(x):
    
    a = int(x.userId)
    b = int(x.movie_idx)
    
    A[a, b] = x.rating 
    return A[a, b]

In [None]:
tr.apply(update_train, axis = 1)

In [None]:
# Construct an empty matrix with the shape of (N, M)
A_te = lil_matrix((N, M))

# Make the sparse matrix 
def update_train(x):
    
    a = int(x.userId)
    b = int(x.movie_idx)
    
    A_te[a, b] = x.rating 

In [None]:
te.apply(update_train, axis = 1)

In [None]:
# Make a mask to tell which entries exist 
A = A.tocsr()
mask = (A > 0)

In [18]:
rating_avg = tr.rating.mean()    # global average

X_tr = [tr.userId.values, tr.movieId.values]
y_tr = tr.rating.values - rating_avg

X_te = [te.userId.values, te.movieId.values]
y_te = te.rating.values - rating_avg

## 3. Modeling

In [19]:
K = 10                           # Latent Dimensionality
reg = 0                          # regularity penalty
epochs = 10

## 4. Evaluation

In [None]:
# Fitting the model 
r = model.fit(x = X_tr, y = y_tr,
                    epochs = epochs, 
                    batch_size = 128,
                    validation_data = (X_te, y_te))