# Model Sturcture : Collaborative Filtering Matrix Factorization

In [1]:
from tensorflow.keras.layers import Embedding, Input, Dot, Flatten
from tensorflow.keras import Model

In [2]:
from tensorflow.keras import optimizers

In [3]:
import tensorflow as tf
tf.keras.backend.clear_session()

In [5]:
class CFMF():
  def __init__(self, u, i):
    self._user_num = u
    self._item_num = i
    self._embed_dim = 10
    self._learning_rate = 0.001

  def create_model(self):
    #inputs
    user_input = Input(shape=(1,), name='input_user')
    item_input = Input(shape=(1,), name='input_item')

    user_embed = Embedding(self._user_num, self._embed_dim, name='user_embedding')(user_input)
    item_embed = Embedding(self._item_num, self._embed_dim, name='item_embedding')(item_input)

    mat = Dot(axes=2)([user_embed, item_embed])

    x = Flatten()(mat)

    model = Model(inputs=[user_input, item_input], outputs=x)
    model.compile(loss='mse', optimizer=optimizers.Adam(learning_rate=self._learning_rate))

    print(model.summary())

    return model

    


# Process MovieLens 100k data

In [6]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [9]:
import pandas as pd

In [85]:
ratings = pd.read_csv('gdrive/My Drive/iir_training_python/Recommendation System/ratings.csv')

In [86]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [12]:
user_num = len(ratings['userId'].unique())
item_num = len(ratings['movieId'].unique())

In [13]:
# get the number of users and items
print(user_num, item_num)

610 9724


The movieId is not continuos, so we map them to a continous sequence


In [14]:
tmpDict = dict(enumerate(ratings['movieId'].unique()))
movieIdDict = {v: k for k, v in tmpDict.items()}
movieIdDict

{1: 0,
 3: 1,
 6: 2,
 47: 3,
 50: 4,
 70: 5,
 101: 6,
 110: 7,
 151: 8,
 157: 9,
 163: 10,
 216: 11,
 223: 12,
 231: 13,
 235: 14,
 260: 15,
 296: 16,
 316: 17,
 333: 18,
 349: 19,
 356: 20,
 362: 21,
 367: 22,
 423: 23,
 441: 24,
 457: 25,
 480: 26,
 500: 27,
 527: 28,
 543: 29,
 552: 30,
 553: 31,
 590: 32,
 592: 33,
 593: 34,
 596: 35,
 608: 36,
 648: 37,
 661: 38,
 673: 39,
 733: 40,
 736: 41,
 780: 42,
 804: 43,
 919: 44,
 923: 45,
 940: 46,
 943: 47,
 954: 48,
 1009: 49,
 1023: 50,
 1024: 51,
 1025: 52,
 1029: 53,
 1030: 54,
 1031: 55,
 1032: 56,
 1042: 57,
 1049: 58,
 1060: 59,
 1073: 60,
 1080: 61,
 1089: 62,
 1090: 63,
 1092: 64,
 1097: 65,
 1127: 66,
 1136: 67,
 1196: 68,
 1197: 69,
 1198: 70,
 1206: 71,
 1208: 72,
 1210: 73,
 1213: 74,
 1214: 75,
 1219: 76,
 1220: 77,
 1222: 78,
 1224: 79,
 1226: 80,
 1240: 81,
 1256: 82,
 1258: 83,
 1265: 84,
 1270: 85,
 1275: 86,
 1278: 87,
 1282: 88,
 1291: 89,
 1298: 90,
 1348: 91,
 1377: 92,
 1396: 93,
 1408: 94,
 1445: 95,
 1473: 96,
 

# Split training and testing data

In [15]:
 from sklearn.model_selection import train_test_split

In [16]:
x_train, x_test = train_test_split(ratings, test_size=0.2)

In [17]:
print(len(x_train), len(x_test))

80668 20168


map movieId to the continous sequence

In [18]:
import numpy as np

In [19]:
m = []
for i in x_train['movieId'].to_numpy():
  m.append(movieIdDict[i])

np.array(m)

array([4970,  762, 1027, ...,  744, 1120,  194])

In [20]:
training_data_x = {'input_user': x_train['userId'].to_numpy(),
          'input_item': np.array(m)}

training_data_x

{'input_item': array([4970,  762, 1027, ...,  744, 1120,  194]),
 'input_user': array([140, 525, 469, ...,  66, 428, 410])}

In [21]:
cfmf = CFMF(user_num + 1, item_num)

In [22]:
model = cfmf.create_model()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_user (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_item (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 10)        6110        input_user[0][0]                 
__________________________________________________________________________________________________
item_embedding (Embedding)      (None, 1, 10)        97240       input_item[0][0]                 
_______________________________________________________________________________________

In [23]:
model.fit(x=training_data_x, y=x_train['rating'].to_numpy(), batch_size=16, epochs=10, verbose=1, shuffle=True, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f5a1046e550>

In [24]:
x_test

Unnamed: 0,userId,movieId,rating,timestamp
78128,484,5159,4.0,1342300866
45313,298,88812,1.5,1467489499
61080,398,1042,4.5,1311207319
5747,41,5377,4.0,1458938982
54476,357,56805,4.5,1348627288
...,...,...,...,...
58759,382,58105,4.0,1515162028
32857,223,1136,3.5,1226210067
70023,448,93287,3.0,1357029139
57449,380,5476,3.0,1494268109


# Test Predict

In [27]:
test_data_x = {'input_user':np.array([251,]),
        'input_item':np.array([movieIdDict[356],])}

In [28]:
model.predict(x=test_data_x)

array([[4.8818984]], dtype=float32)

In [138]:
def predict(in_user, in_item):
  test_data_x = {'input_user':np.array([in_user,]),
        'input_item':np.array([movieIdDict[in_item],])}
  result = model.predict(x=test_data_x)
  return result[0][0]

In [139]:
predict(41, 5377)

3.4620132

# TODO

1. Implement testing (Top n error...)
2. Better Predict Interface


In [89]:
# get max user id
max_movie_id = np.max(ratings['movieId'].to_numpy())

In [93]:
def getRatedMovie(in_user, maxid = max_movie_id):
  rated = ratings.loc[ratings['userId'] == in_user]['movieId'].to_numpy()
  x = np.random.randint(maxid)
  while(x not in rated):
    x = np.random.randint(maxid)
  
  return x

In [146]:
def getNotRatedMovie(in_user, n, maxid = max_movie_id):
  rated = ratings.loc[ratings['userId'] == in_user]['movieId'].to_numpy()
  not_rated = []
  while(len(not_rated) < n):
    x = np.random.randint(maxid)
    if((x not in rated)and (x in movieIdDict.keys())):
      not_rated.append(x)

  return np.array(not_rated)

In [147]:
getNotRatedMovie(2,99)

array([   932,   4774,   6218,  91077,   7025,  46231, 175475, 172887,
         8866, 103210,   7882,  37475,   3042,  27790, 106487, 118082,
         2862, 120919, 138210,    352,   6509,  62383,   5570,   4490,
         4069,   3106,  51174, 128197,  91325,  78142, 134524,   2937,
        93855,  32017,  90576,   3045,   8391,   4678, 155892, 180045,
         1260, 116797,   3635,   5991,   3809, 149334,   1914,   4031,
         2043,   3422, 133281,   2232,   7932, 108689,   8917, 109897,
         1104,  68597,  95175,   3654,   2542,    294,   4614,   4127,
         5135,  71668,   4083, 180031,   8387,   6664, 111113,   4307,
         3673,   1565, 132660,   8014,  61167,   6310,   1194, 174909,
         5752,   6604,  56782,  38294,  94018, 121099,   3863,    638,
         4736, 170399,  61818,   1945, 102066,  81191,   5500, 101962,
         3389,  55241,   4775])

In [148]:
getRatedMovie(2)

333

In [149]:
eval_data = np.concatenate([[getRatedMovie(2)],getNotRatedMovie(2,99)])

# Top 5 hit

In [178]:
eval_data

array([ 80906,   4865, 159077,  27426,   2762,   6817,   1654,  48394,
         2402,  88515,  26726,   8451,   3145,   6798,   2137, 106542,
         3767,   1970,  49524,   8117,  52299,   4024,   1533,   5171,
         2076,   3390, 130840,   1257,   3389,   2210,   8511,   1445,
         1707,   5943, 130840,   3408,    499, 172583, 121035,   7411,
         5629, 102125,  77881,  34528,   5372,   2315, 124484,   6245,
       107436,  78772,   1202,   2179,   3826,  38198,  45440,  60069,
          259,   1513,  74789,   2577, 114044,   4521,   4180,  62799,
        53468, 175303,   3977,   2481,    673, 143559, 100737,  91128,
         5463,   7209,  71838,   3753,    509,  89840,  79224,   6334,
          945, 125974, 116985, 136838,  61394,   6382,  66915,  69122,
         2848,  72479, 152081, 182793,  39435, 109416,  34312,     65,
         5767,   1619, 149508,   2672])

In [150]:
eval_result = []

for i in eval_data:
  # movie id -> rating
  eval_result.append({i:predict(1,i)})

In [176]:
def myCmp(e):
  for i in e.values():
    return i

eval_result.sort(key=myCmp, reverse=True)

In [195]:
# top 5 error
eval_data[0] in [list(x.keys())[0] for x in eval_result[:5]]

True