# Model Sturcture : Collaborative Filtering Matrix Factorization

In [1]:
from tensorflow.keras.layers import Embedding, Input, Dot, Flatten
from tensorflow.keras import Model

In [2]:
from tensorflow.keras import optimizers

In [3]:
import tensorflow as tf
tf.keras.backend.clear_session()

In [4]:
class CFMF():
  def __init__(self, u, i):
    self._user_num = u
    self._item_num = i
    self._embed_dim = 10
    self._learning_rate = 0.001

  def create_model(self):
    #inputs
    user_input = Input(shape=(1,), name='input_user')
    item_input = Input(shape=(1,), name='input_item')

    user_embed = Embedding(self._user_num, self._embed_dim, name='user_embedding')(user_input)
    item_embed = Embedding(self._item_num, self._embed_dim, name='item_embedding')(item_input)

    mat = Dot(axes=2)([user_embed, item_embed])

    x = Flatten()(mat)

    model = Model(inputs=[user_input, item_input], outputs=x)
    model.compile(loss='mse', optimizer=optimizers.Adam(learning_rate=self._learning_rate))

    print(model.summary())

    return model

    


# Process MovieLens 100k data

In [5]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [6]:
import pandas as pd

In [7]:
ratings = pd.read_csv('gdrive/My Drive/iir_training_python/Recommendation System/ratings.csv')

In [8]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [9]:
user_num = len(ratings['userId'].unique())
item_num = len(ratings['movieId'].unique())

In [10]:
# get the number of users and items
print(user_num, item_num)

610 9724


The movieId is not continuos, so we map them to a continous sequence


In [None]:
tmpDict = dict(enumerate(ratings['movieId'].unique()))
movieIdDict = {v: k for k, v in tmpDict.items()}
movieIdDict

# Split training and testing data

In [12]:
 from sklearn.model_selection import train_test_split

In [13]:
x_train, x_test = train_test_split(ratings, test_size=0.2)

In [14]:
print(len(x_train), len(x_test))

80668 20168


map movieId to the continous sequence

In [15]:
import numpy as np

In [16]:
m = []
for i in x_train['movieId'].to_numpy():
  m.append(movieIdDict[i])

np.array(m)

array([4249,  465,  766, ...,  856,  621, 3320])

In [17]:
training_data_x = {'input_user': x_train['userId'].to_numpy(),
          'input_item': np.array(m)}

training_data_x

{'input_item': array([4249,  465,  766, ...,  856,  621, 3320]),
 'input_user': array([509, 129, 606, ..., 560, 313, 489])}

In [18]:
cfmf = CFMF(user_num + 1, item_num)

In [19]:
model = cfmf.create_model()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_user (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_item (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 10)        6110        input_user[0][0]                 
__________________________________________________________________________________________________
item_embedding (Embedding)      (None, 1, 10)        97240       input_item[0][0]                 
_______________________________________________________________________________________

In [20]:
model.fit(x=training_data_x, y=x_train['rating'].to_numpy(), batch_size=16, epochs=10, verbose=1, shuffle=True, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f53702850f0>

In [25]:
np.array(3,)

array(3)

In [24]:
model.predict(x={'input_user': np.array(3,), 'input_item':np.array(100,)})

IndexError: ignored