<a href="https://colab.research.google.com/github/jefffang19/Keras_recommandSys/blob/master/MatrixFac.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Sturcture : Collaborative Filtering Matrix Factorization

In [1]:
from tensorflow.keras.layers import Embedding, Input, Dot, Flatten
from tensorflow.keras import Model

In [2]:
from tensorflow.keras import optimizers

In [3]:
import tensorflow as tf
tf.keras.backend.clear_session()

In [4]:
#custom accuracy calculation
def accuracy_w_threshhold_metric_fn(y_true, y_pred) :
  approx = tf.abs(y_true - y_pred) <= 0.25
  approx = tf.cast(approx, tf.int32)
  return tf.reduce_mean(approx, axis=-1)

In [5]:
class CFMF():
  def __init__(self, u, i):
    self._user_num = u
    self._item_num = i
    self._embed_dim = 10
    self._learning_rate = 0.001

  def create_model(self):
    #inputs
    user_input = Input(shape=(1,), name='input_user')
    item_input = Input(shape=(1,), name='input_item')

    user_embed = Embedding(self._user_num, self._embed_dim, name='user_embedding')(user_input)
    item_embed = Embedding(self._item_num, self._embed_dim, name='item_embedding')(item_input)

    mat = Dot(axes=2)([user_embed, item_embed])

    x = Flatten()(mat)

    model = Model(inputs=[user_input, item_input], outputs=x)
    #accuracy: choose binary accuracy becauese it can approximate the model output to match the label(ground truth)
    model.compile(loss='mse', optimizer=optimizers.Adam(learning_rate=self._learning_rate), metrics=[accuracy_w_threshhold_metric_fn,])

    print(model.summary())

    return model

    


# Process MovieLens 100k data

In [6]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [7]:
import pandas as pd
import numpy as np

In [8]:
ratings = pd.read_csv('gdrive/My Drive/iir_training_python/Recommendation System/ratings.csv')

In [9]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [10]:
query0 = ratings[ratings['userId'] == 1]
query1 = query0[query0['movieId'] == 50]
list(query1.index)[0]

4

take the last movie that each user rated ( leave-one-out )

In [None]:
# { user : movieId }
leaveOneOut = {}
# loop through all the users who had rated
for i in np.unique(ratings['userId'].to_numpy()):
  # get the last rated MovieId of user[i]
  leaveOneOut[i] = ratings[ratings['userId'] == i]['movieId'].to_numpy()[-1]

print(leaveOneOut)
len(leaveOneOut)

In [12]:
# get the number of users and items
user_num = len(ratings['userId'].unique())
item_num = len(ratings['movieId'].unique())
print(user_num, item_num)

610 9724


The movieId is not continuos, so we map them to a continous sequence


In [None]:
# { 'original movieId' : 'mapped movieId (continuous)' }
tmpDict = dict(enumerate(ratings['movieId'].unique()))
movieIdDict = {v: k for k, v in tmpDict.items()}
print(movieIdDict)

# Split training and testing data (WRONG, DO NOT DO)

In [None]:
 from sklearn.model_selection import train_test_split

In [None]:
x_train, x_test = train_test_split(ratings, test_size=0.2)

In [None]:
print(len(x_train), len(x_test))

80668 20168


# Process Training Data

In [14]:
trainset = ratings
# drop the last movie userId[i] rated from the training set
for i in leaveOneOut.keys():
  query0 = trainset[trainset['userId'] == i]
  query1 = query0[query0['movieId'] == leaveOneOut[i]]
  #print(list(query1.index))
  trainset = trainset.drop(list(query1.index))

In [15]:
trainset

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100830,610,166528,4.0,1493879365
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047


shuffle trainset

In [16]:
from sklearn.utils import shuffle

In [17]:
trainset = shuffle(trainset)

In [18]:
trainset

Unnamed: 0,userId,movieId,rating,timestamp
97637,606,1580,2.5,1171310310
15703,103,2294,4.0,1431957820
89632,580,6645,4.0,1167861051
90510,589,736,5.0,856038815
18280,116,586,3.5,1337199910
...,...,...,...,...
75686,477,2701,3.0,1200941197
20372,135,485,5.0,1009692293
71370,458,236,4.0,845652668
11779,72,593,4.5,1217324766


map movieId to the continous sequence

In [19]:
import numpy as np

In [20]:
m = []
for i in trainset['movieId'].to_numpy():
  m.append(movieIdDict[i])

np.array(m)

array([ 101, 1592, 5417, ...,  560,   34, 5557])

In [21]:
training_data_x = {'input_user': trainset['userId'].to_numpy(),
          'input_item': np.array(m)}

training_data_x

{'input_item': array([ 101, 1592, 5417, ...,  560,   34, 5557]),
 'input_user': array([606, 103, 580, ..., 458,  72, 191])}

In [22]:
training_data_y = trainset['rating'].to_numpy()
training_data_y

array([2.5, 4. , 4. , ..., 4. , 4.5, 5. ])

In [23]:
cfmf = CFMF(user_num + 1, item_num)

In [24]:
model = cfmf.create_model()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_user (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_item (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 10)        6110        input_user[0][0]                 
__________________________________________________________________________________________________
item_embedding (Embedding)      (None, 1, 10)        97240       input_item[0][0]                 
_______________________________________________________________________________________

In [None]:
model.fit(x=training_data_x, y=training_data_y, batch_size=16, epochs=50, verbose=1, shuffle=True, validation_split=0.2)

# Test Predict interface

In [26]:
def predict(in_user, in_item):
  test_data_x = {'input_user':np.array([in_user,]),
        'input_item':np.array([movieIdDict[in_item],])}
  # calling predict() with batch is very slow, call model(x) directly is faster
  result = model(test_data_x, training=False)
  return result.numpy()[0,0]

In [27]:
predict(41, 5377)

3.801024

# Create eval data

In [None]:
# get max user id
max_movie_id = np.max(ratings['movieId'].to_numpy())
max_movie_id

193609

In [None]:
def getRatedMovie(in_user, maxid = max_movie_id):
  return leaveOneOut[in_user]

In [None]:
getRatedMovie(1)

5060

In [None]:
def getNotRatedMovie(in_user, n, maxid = max_movie_id):
  rated = ratings.loc[ratings['userId'] == in_user]['movieId'].to_numpy()
  not_rated = []
  while(len(not_rated) < n):
    x = np.random.randint(maxid)
    if((x not in rated)and (x in movieIdDict.keys())):
      not_rated.append(x)

  return np.array(not_rated)

In [None]:
getNotRatedMovie(1,99)

array([  7702,   4728,    984, 136018,   3365,  97701,   1526,   5168,
       106542,   4677,   4211,   2624,    587, 142602,  59333,  69604,
         3528,   3586,   3957,   6898,   4407, 141544, 190213,   1446,
        59784,   7730,   5136, 147330,    841,  82765,   3662,   3306,
         3307,   3586,   8360,   8596,   7802, 107449,  79073,   6763,
          319,   6571,  26797,    837,  61257,  59915,   4259, 128736,
        63853,   5353,  90471,   6567, 120138, 175293,  98697,   5066,
         5283,   6990, 140289,  96829, 136540,  95858,  58105,    161,
         2488,  54962,    147,  97866,   2423, 168144,   7015,  42556,
       100882,  74075, 136471,  49265, 160573,  92643,   4756,  37736,
         2320,   4234,  26681,    520, 152970, 131610,   2979,   3802,
        69805,   4881,    201,   8966,   3929,  55061,   6773,  33781,
         2080,  54648,   1271])

In [None]:
eval_data = []

for i in leaveOneOut.keys():
  eval_data.append(np.concatenate([[getRatedMovie(i)],getNotRatedMovie(i,99)]))


In [None]:
len(eval_data)

610

In [None]:
eval_data

# Top n hit

In [None]:
top_1_hit = 0
top_5_hit = 0

# for each user
for user in range(len(eval_data)):
  eval_result = []

  # predict eval_data
  for i in eval_data[user]:
    eval_result.append((i,predict(user+1,i)))

  eval_result.sort(key= lambda s:s[1], reverse=True)

  #top 1 hit
  if(eval_data[user][0] == eval_result[0][0] ):
    top_1_hit += 1

  top_5_rate = [i[0] for i in eval_result[:5]]
  #top 5 hit
  if(eval_data[user][0] in top_5_rate ):
    top_5_hit += 1
  
  #print('user {} {}'.format(user,top_1_hit))
  #print('user {} {}'.format(user,top_5_hit))
  #print(eval_result)
  print('{}/{}'.format(user+1,len(eval_data)))
  

In [None]:
#top n hit rate
print('top 1 hit rate : {}'.format(top_1_hit/len(eval_data)) )
print('top 5 hit rate : {}'.format(top_5_hit/len(eval_data)) )

top 1 hit rate : 0.03934426229508197
top 5 hit rate : 0.13442622950819672


In [None]:
x=[(1,5),(2,7),(3,3)]

In [None]:
x.sort(key= lambda s:s[1], reverse=True)

# Test Accuracy

In [28]:
ratings = pd.read_csv('gdrive/My Drive/iir_training_python/Recommendation System/ratings.csv')

In [29]:
# the last movie and rating of each user
test_data = []

for i in leaveOneOut.keys():
  query0 = ratings[ratings['userId'] == i]
  query1 = query0[query0['movieId'] == leaveOneOut[i]]
  test_data.append( (movieIdDict[query1['movieId'].values[0]], query1['rating'].values[0]) )
  

In [30]:
test_data = np.array(test_data)
# divide to test_x(data) and test_y(label)
test_x = test_data[...,0]
test_y = test_data[...,1]

test_data_x = {'input_user': np.array(range(len(test_x))),
          'input_item': test_x}

In [31]:
model.evaluate(x=test_data_x, y=test_y)



[3.1609222888946533, 0.1213114783167839]

# Theory
Accuracy is 0 due to our model perdict values are floats, but all the labels are either int or .5