# Recommender system using matrix factorization
http://users.cecs.anu.edu.au/~akmenon/papers/autorec/autorec-paper.pdf

In [2]:
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")

In [3]:
from keras.models import load_model
from sklearn.model_selection import train_test_split
from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate
from keras.models import Model
warnings.filterwarnings('ignore')
%matplotlib inline

## Analyse Dataset


### MovieLens dataset
Read the MovieLens 1M movie-ratings dataset (downloaded from https://grouplens.org/datasets/movielens/1m/).


In [4]:
import os
import shutil
import zipfile
from os.path import exists
from urllib.request import urlopen

# Fetch and unpack the MovieLens 1M archive only when the extracted data is missing.
cwd = os.getcwd()
file_exists = exists('./ml-1m/movies.dat')
if not file_exists:
    savePath = cwd
    savefile = "./ml-1m.zip"

    print('downloading....')
    # Download with the standard library instead of shelling out to `curl`:
    # os.system() ignored a failed or missing curl, which only surfaced later as a
    # confusing zipfile error. urlopen raises on HTTP/network errors, so the cell
    # stops here with the real cause.
    with urlopen('https://files.grouplens.org/datasets/movielens/ml-1m.zip') as response, \
            open(savefile, 'wb') as archive:
        shutil.copyfileobj(response, archive)
    print('download Complete')

    print('Extracting..')
    # The archive is expected to unpack into `ml-1m/` under the working directory
    # (the path tested by `file_exists` above).
    with zipfile.ZipFile(savefile, 'r') as zip_ref:
        zip_ref.extractall(savePath)
    print('Complete')

In [5]:

# movies.dat is '::'-delimited with no header row. Its titles include non-ASCII
# characters stored as ISO-8859-1, so the encoding is given explicitly; relying on
# the platform/pandas default fails with UnicodeDecodeError where that default is UTF-8.
movie_data = pd.read_csv(
    'ml-1m/movies.dat',
    sep='::',
    engine='python',  # multi-character separators require the python engine
    header=None,
    names=['movie_id', 'title', 'genre'],
    encoding='latin-1',
)

movie_data

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [6]:
# Ratings file: '::'-delimited records with no header row, so column names are supplied here.
ratings_columns = ['user_id', 'movie_id', 'rating', 'time']
data = pd.read_csv(
    'ml-1m/ratings.dat',
    sep='::',
    engine='python',
    names=ratings_columns,
)

data

Unnamed: 0,user_id,movie_id,rating,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


### Generate Train and test data

In [7]:
# Hold out 20% of the ratings for validation; the fixed seed makes the split repeatable.
train_df, val_df = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)


In [8]:
# Preview the first five training rows (head() defaults to 5).
train_df.head()

Unnamed: 0,user_id,movie_id,rating,time
416292,2507,3035,2,974076680
683230,4087,2840,4,965431652
2434,19,457,3,978146863
688533,4118,2804,4,965804599
472584,2907,805,4,971838472


In [9]:
# Preview the first five validation rows (head() defaults to 5).
val_df.head()

Unnamed: 0,user_id,movie_id,rating,time
895536,5412,2683,2,960243649
899739,5440,904,5,959995181
55687,368,3717,4,976311423
63727,425,1721,4,976283587
822011,4942,3697,1,962642480


In [10]:
# Collect every raw id that appears in either split so both splits share one id space.
movie_ids = list({*train_df["movie_id"].unique(), *val_df["movie_id"].unique()})
user_ids = list({*train_df["user_id"].unique(), *val_df["user_id"].unique()})

In [11]:
# Map each raw movie id to a contiguous 0-based position, assigned in ascending id order.
# Built with enumerate() rather than a hand-maintained counter, so no `index`/`ids`
# loop variables are left behind in the notebook namespace.
dict_movies = {raw_id: position for position, raw_id in enumerate(sorted(movie_ids))}

In [12]:
# Map each raw user id to a contiguous 0-based position, assigned in ascending id order.
# Built with enumerate() rather than a hand-maintained counter, so no `index`/`ids`
# loop variables are left behind in the notebook namespace.
dict_users = {raw_id: position for position, raw_id in enumerate(sorted(user_ids))}

In [13]:
# Encode raw ids as contiguous indices. The raw values are read from the untouched
# `data` frame (rows matched by index label) rather than from train_df's own columns:
# mapping an already-encoded column a second time would silently turn ids into NaN or
# wrong indices, so sourcing from `data` keeps this cell safe to re-run.
train_df["movie_id"] = data.loc[train_df.index, "movie_id"].map(dict_movies)
train_df["user_id"] = data.loc[train_df.index, "user_id"].map(dict_users)

# Series.map yields NaN for any id missing from the dictionaries; fail loudly if so.
assert train_df[["movie_id", "user_id"]].notna().all().all(), "unmapped ids in train_df"

In [14]:
# Encode raw ids as contiguous indices. The raw values are read from the untouched
# `data` frame (rows matched by index label) rather than from val_df's own columns:
# mapping an already-encoded column a second time would silently turn ids into NaN or
# wrong indices, so sourcing from `data` keeps this cell safe to re-run.
val_df["movie_id"] = data.loc[val_df.index, "movie_id"].map(dict_movies)
val_df["user_id"] = data.loc[val_df.index, "user_id"].map(dict_users)

# Series.map yields NaN for any id missing from the dictionaries; fail loudly if so.
assert val_df[["movie_id", "user_id"]].notna().all().all(), "unmapped ids in val_df"

In [15]:
# Cast the id and rating columns of both splits to float32.
# NOTE: float32 represents integers exactly only up to 2**24, which bounds the id
# range for which storing ids as floats is lossless.
for frame in (train_df, val_df):
    for col in ("user_id", "movie_id", "rating"):
        frame[col] = frame[col].astype(np.float32)

In [16]:
# Number of distinct encoded ids across both splits; used below to size the embedding tables.
num_unique_users = len({*train_df["user_id"].unique(), *val_df["user_id"].unique()})
num_unique_movies = len({*train_df["movie_id"].unique(), *val_df["movie_id"].unique()})

### Collaborative filtering

In [17]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras import optimizers as opt
from tensorflow.keras import regularizers as rgl
from tensorflow.keras.layers import Embedding, multiply, concatenate, Flatten, Input, Dense, Dropout

In [18]:
# Dropout rate applied after each hidden dense layer.
dropout_rate = 0.05

# Widths of the three hidden dense layers, in order.
dense_1 = 256
dense_2 = 128
dense_3 = 64

# Activation used by the hidden dense layers.
activation_func = "sigmoid"

# L2 penalty applied to the embedding tables.
regularizer = rgl.l2(1e-5)

# Bounds used to rescale the network's sigmoid output into the rating range.
max_rating = 5
min_rating = 0

In [19]:
EMBEDDING_SIZE = 64

# --- User branch: a latent vector plus a scalar bias per user id. ---
# The tables get one more row than there are distinct ids (ids are 0-based).
users_input = Input(shape=(1,), name="user_input")
users_embedding = Embedding(
    input_dim=num_unique_users + 1,
    output_dim=EMBEDDING_SIZE,
    embeddings_regularizer=regularizer,
    name="user_embeddings",
)(users_input)
users_bias = Embedding(
    input_dim=num_unique_users + 1,
    output_dim=1,
    embeddings_regularizer=regularizer,
    name="user_bias",
)(users_input)

# --- Movie branch: same layout as the user branch. ---
movies_input = Input(shape=(1,), name="movie_input")
movies_embedding = Embedding(
    input_dim=num_unique_movies + 1,
    output_dim=EMBEDDING_SIZE,
    embeddings_regularizer=regularizer,
    name="movie_embedding",
)(movies_input)
movies_bias = Embedding(
    input_dim=num_unique_movies + 1,
    output_dim=1,
    embeddings_regularizer=regularizer,
    name="movie_bias",
)(movies_input)

# Element-wise product of the two latent vectors (not reduced to a scalar, despite
# the variable name), joined with both bias terms and flattened for the dense stack.
dot_product_users_movies = multiply([users_embedding, movies_embedding])
input_terms = concatenate([dot_product_users_movies, users_bias, movies_bias])
input_terms = Flatten(name="fl_inputs")(input_terms)

# Three hidden stages, each a dense layer followed by dropout.
output = input_terms
for layer_idx, units in enumerate((dense_1, dense_2, dense_3)):
    output = Dense(units, activation=activation_func, name="dense_{}".format(layer_idx))(output)
    output = Dropout(dropout_rate)(output)

# Sigmoid output in (0, 1), rescaled linearly to (min_rating, max_rating).
rating_span = max_rating - min_rating
output = Dense(1, activation="sigmoid", name="output")(output) * rating_span + min_rating

In [20]:
# Two-input model: (user id, movie id) -> predicted rating.
model = Model(
    inputs=[users_input, movies_input],
    outputs=output,
)

In [21]:
# Use `learning_rate`: the legacy `lr` alias is deprecated in tf.keras and is rejected
# as an unrecognised argument by newer Keras releases.
opt_adam = opt.Adam(learning_rate=0.001)

# Train on mean squared error; also report mean absolute error.
model.compile(optimizer=opt_adam, loss=['mse'], metrics=['mean_absolute_error'])

In [22]:
# Train for 3 epochs, validating on the held-out split after each epoch.
train_inputs = [train_df.user_id, train_df.movie_id]
val_inputs = [val_df.user_id, val_df.movie_id]
model.fit(
    x=train_inputs,
    y=train_df.rating,
    validation_data=(val_inputs, val_df.rating),
    batch_size=512,
    epochs=3,
    verbose=1,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x280432cf6d8>

In [23]:
# Predicted ratings for every (user, movie) pair in the validation split.
model.predict([val_df["user_id"], val_df["movie_id"]])

array([[3.5914702],
       [4.665625 ],
       [3.4766152],
       ...,
       [3.7966003],
       [3.7859251],
       [4.403532 ]], dtype=float32)

In [24]:
# Returns [loss (MSE), mean absolute error] on the validation split.
model.evaluate(
    [val_df["user_id"], val_df["movie_id"]],
    val_df["rating"],
)



[0.8295639157295227, 0.7220215201377869]

In [40]:
# Predictions for the first ten validation rows (selected by position).
first_users = val_df["user_id"].iloc[:10]
first_movies = val_df["movie_id"].iloc[:10]
model.predict([first_users, first_movies])

array([[3.5914702],
       [4.665625 ],
       [3.4766152],
       [3.3732233],
       [2.8491461],
       [3.7023697],
       [2.4310231],
       [2.7757912],
       [3.2255325],
       [2.7391794]], dtype=float32)

In [42]:
# Check which container type the model was given above (a pandas Series).
type(val_df["user_id"].iloc[:10])

pandas.core.series.Series

In [47]:
# Score a single (user, movie) pair, given by its encoded ids.
single_user = pd.Series([9.0])
single_movie = pd.Series([1.0])
model.predict([single_user, single_movie])

array([[3.8083613]], dtype=float32)

In [29]:
# Show the validation frame after id encoding and the float32 casts.
val_df

Unnamed: 0,user_id,movie_id,rating,time
895536,5411.0,2480.0,2.0,960243649
899739,5439.0,843.0,5.0,959995181
55687,367.0,3475.0,4.0,976311423
63727,424.0,1574.0,4.0,976283587
822011,4941.0,3455.0,1.0,962642480
...,...,...,...,...
756007,4504.0,1297.0,4.0,964985875
477775,2933.0,691.0,1.0,971493127
424188,2571.0,907.0,5.0,973913032
293600,1747.0,1492.0,3.0,974711260
