In [5]:
import keras 
from Dataset import Dataset

In [6]:
dataset = Dataset('./data/ml-1m')

In [7]:
train, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives

In [8]:
train.shape # train is sparse matrix 6040x3706

(6040, 3706)

In [12]:
len(testRatings) # 6040 list 

6040

In [13]:
testRatings[:10]

[[0, 25],
 [1, 133],
 [2, 207],
 [3, 208],
 [4, 222],
 [5, 396],
 [6, 74],
 [7, 91],
 [8, 514],
 [9, 659]]

In [17]:
len(testNegatives) # 6040 list

6040

In [26]:
len(testNegatives[:10][2]) # 99

99

In [27]:
num_users,num_items = train.shape

In [70]:
from keras.models import Input, Model
from keras.layers import Embedding, Flatten, merge, Dense,regularizers
from keras.layers import concatenate

In [67]:
user_input = Input(shape = (1,), name = 'user_input',dtype = 'int32')
MLP_Embedding_User = Embedding(input_dim = num_users,output_dim = int(layers[0]/2))
MLP_Embedding_User(user_input)

<tf.Tensor 'embedding_2/Gather:0' shape=(?, 1, 10) dtype=float32>

In [76]:
model = get_model(num_users,num_items)

In [75]:
def get_model(num_users, num_items, layers = [20,10], reg_layers=[0,0]):
    assert len(layers) == len(reg_layers)
    num_layer = len(layers) #Number of layers in the MLP
    # Input variables
    user_input = Input(shape=(1,), dtype='int32', name = 'user_input')
    item_input = Input(shape=(1,), dtype='int32', name = 'item_input')

    MLP_Embedding_User = Embedding(input_dim = num_users,
                                   output_dim = int(layers[0]/2), 
                                   name = 'user_embedding',
                                   embeddings_initializer = 'uniform',
                                   embeddings_regularizer = regularizers.l2(0),
                                   input_length=1) # model size : (None,1,layers[0]/2)
    
    MLP_Embedding_Item = Embedding(input_dim = num_items,
                                   output_dim = int(layers[0]/2),
                                   name = 'item_embedding',
                                   embeddings_initializer = 'uniform',
                                   embeddings_regularizer = regularizers.l2(0),
                                   input_length=1)   
    
    # Crucial to flatten an embedding vector!
    user_latent = Flatten()(MLP_Embedding_User(user_input))
    item_latent = Flatten()(MLP_Embedding_Item(item_input))
    
    # The 0-th layer is the concatenation of embedding layers
    vector = concatenate([user_latent, item_latent])
    
    # MLP layers
    for idx in range(1, num_layer):
        layer = Dense(layers[idx],
                      kernel_regularizer = regularizers.l2(reg_layers[idx]), activation='relu', name = 'layer%d' %idx)
        vector = layer(vector)
        
    # Final prediction layer
    prediction = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name = 'prediction')(vector)
    
    model = Model(inputs=[user_input, item_input], 
                  outputs=prediction)
    
    return model

In [33]:
u_input = Input(shape = (1,), dtype='int32', name = 'user_input')
i_input = Input(shape = (1,), dtype = 'int32', name = 'item_input')

In [34]:
layers = [20,10]
reg_layers = [0,0]

In [31]:
MLP_Embedding_User = Embedding(input_dim = num_users, 
                               output_dim = layers[0]/2,
                               name = 'user_embedding',
                               embeddings_initializer = 'uniform',                               
                               W_regularizer = l2(reg_layers[0]),
                               input_length=1)

TensorShape([Dimension(None), Dimension(1)])

In [47]:
MLP_Embedding_Item = Embedding(input_dim = num_items,
                               output_dim = layers[0]/2, 
                               name = 'item_embedding',
                               embeddings_initializer = 'uniform',
                               embeddings_regularizer = regularizers.l2(0),
                               input_length=1) # model size : (None,1,layers[0]/2)
    

In [44]:
MLP_Embedding_User.get_config()

{'activity_regularizer': None,
 'batch_input_shape': (None, 1),
 'dtype': 'float32',
 'embeddings_constraint': None,
 'embeddings_initializer': {'class_name': 'RandomUniform',
  'config': {'maxval': 0.05, 'minval': -0.05, 'seed': None}},
 'embeddings_regularizer': {'class_name': 'L1L2',
  'config': {'l1': 0.0, 'l2': 0.0}},
 'input_dim': 6040,
 'input_length': 1,
 'mask_zero': False,
 'name': 'user_embedding',
 'output_dim': 10.0,
 'trainable': True}

In [48]:
MLP_Embedding_Item.get_config()

{'activity_regularizer': None,
 'batch_input_shape': (None, 1),
 'dtype': 'float32',
 'embeddings_constraint': None,
 'embeddings_initializer': {'class_name': 'RandomUniform',
  'config': {'maxval': 0.05, 'minval': -0.05, 'seed': None}},
 'embeddings_regularizer': {'class_name': 'L1L2',
  'config': {'l1': 0.0, 'l2': 0.0}},
 'input_dim': 3706,
 'input_length': 1,
 'mask_zero': False,
 'name': 'item_embedding',
 'output_dim': 10.0,
 'trainable': True}

In [106]:
import importlib
importlib.reload(evaluate)

<module 'evaluate' from 'd:\\ihong\\py_repo\\DL\\keras_tutorial\\v2\\evaluate.py'>

In [111]:
import evaluate
import numpy as np 
from time import time

In [121]:
learning_rate = 0.001
model.compile(optimizer=keras.optimizers.Adagrad(lr=learning_rate), loss='binary_crossentropy')

# evaluate_model(model, testRatings, testNegatives, K = 10, num_thread = 1)
t1 = time()
(hits, ndcgs) = evaluate_model(model, testRatings, testNegatives, K=10, num_thread = 1)
hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
print('Init: HR = %.4f, NDCG = %.4f [%.1f]' %(hr, ndcg, time()-t1))


Init: HR = 0.0934, NDCG = 0.0416 [4.2]


In [115]:
def get_train_instances(train, num_negatives):
    user_input, item_input, labels = [],[],[]
    num_users = train.shape[0]
    for (u, i) in train.keys():
        # positive instance
        user_input.append(u)
        item_input.append(i)
        labels.append(1)
        # negative instances
        for t in range(num_negatives):
            j = np.random.randint(num_items)
            while (u and j) in train.keys():
                j = np.random.randint(num_items)
            user_input.append(u)
            item_input.append(j)
            labels.append(0)
    return user_input, item_input, labels

In [127]:
# Train model
num_negatives = 4 # 
epochs = 20 # 
batch_size = 256
layers = [64,32,16,8]
verbose = 1 
topK = 10
evaluation_threads = 1
# out = 1
#####

best_hr, best_ndcg, best_iter = hr, ndcg, -1
for epoch in range(epochs):
    t1 = time()
    # Generate training instances
    user_input, item_input, labels = get_train_instances(train, num_negatives)

    # Training        
    hbist = model.fit([np.array(user_input), np.array(item_input)], #input
                     np.array(labels), # labels 
                     batch_size=batch_size, epochs=1, verbose=0, shuffle=True)
    t2 = time()

    # Evaluation
    if epoch %verbose == 0:
        (hits, ndcgs) = evaluate_model(model, testRatings, testNegatives, topK, evaluation_threads)
        hr, ndcg, loss = np.array(hits).mean(), np.array(ndcgs).mean(), hist.history['loss'][0]
        print('Iteration %d [%.1f s]: HR = %.4f, NDCG = %.4f, loss = %.4f [%.1f s]' 
              % (epoch,  t2-t1, hr, ndcg, loss, time()-t2))
        if hr > best_hr:
            best_hr, best_ndcg, best_iter = hr, ndcg, epoch
#             if out > 0:
#                 model.save_weights(model_out_file, overwrite=True)

print("End. Best Iteration %d:  HR = %.4f, NDCG = %.4f. " %(best_iter, best_hr, best_ndcg))

Iteration 0 [32.8 s]: HR = 0.4536, NDCG = 0.2533, loss = 0.3905 [4.2 s]
Iteration 1 [32.0 s]: HR = 0.4538, NDCG = 0.2536, loss = 0.3904 [4.1 s]
Iteration 2 [31.5 s]: HR = 0.4548, NDCG = 0.2540, loss = 0.3904 [4.2 s]
Iteration 3 [32.6 s]: HR = 0.4540, NDCG = 0.2534, loss = 0.3899 [4.2 s]
Iteration 4 [34.3 s]: HR = 0.4545, NDCG = 0.2534, loss = 0.3900 [4.2 s]
Iteration 5 [32.6 s]: HR = 0.4541, NDCG = 0.2537, loss = 0.3899 [4.2 s]
Iteration 6 [32.1 s]: HR = 0.4535, NDCG = 0.2535, loss = 0.3898 [4.2 s]
Iteration 7 [32.3 s]: HR = 0.4533, NDCG = 0.2534, loss = 0.3897 [4.2 s]
Iteration 8 [31.9 s]: HR = 0.4533, NDCG = 0.2530, loss = 0.3896 [4.2 s]
Iteration 9 [32.2 s]: HR = 0.4535, NDCG = 0.2534, loss = 0.3894 [4.1 s]
Iteration 10 [32.8 s]: HR = 0.4530, NDCG = 0.2536, loss = 0.3894 [4.2 s]
Iteration 11 [31.6 s]: HR = 0.4540, NDCG = 0.2539, loss = 0.3894 [4.2 s]
Iteration 12 [32.1 s]: HR = 0.4533, NDCG = 0.2536, loss = 0.3892 [4.1 s]
Iteration 13 [31.8 s]: HR = 0.4536, NDCG = 0.2538, loss = 0.3

In [138]:
import math
import heapq # for retrieval topK
import multiprocessing
import numpy as np
from time import time

#from numba import jit, autojit

# Global variables that are shared across processes
_model = None
_testRatings = None
_testNegatives = None
_K = None

def evaluate_model(model, testRatings, testNegatives, K, num_thread):
    """
    Evaluate the performance (Hit_Ratio, NDCG) of top-K recommendation
    Return: score of each test rating.
    """
    global _model
    global _testRatings
    global _testNegatives
    global _K
    _model = model
    _testRatings = testRatings
    _testNegatives = testNegatives
    _K = K
        
    hits, ndcgs = [],[]
    if(num_thread > 1): # Multi-thread
        pool = multiprocessing.Pool(processes=num_thread)
        res = pool.map(eval_one_rating, range(len(_testRatings)))
        pool.close()
        pool.join()
        hits = [r[0] for r in res]
        ndcgs = [r[1] for r in res]
        return (hits, ndcgs)
    # Single thread
    for idx in range(len(_testRatings)):
        (hr,ndcg) = eval_one_rating(idx)
        hits.append(hr)
        ndcgs.append(ndcg)      
    return (hits, ndcgs)

def eval_one_rating(idx):
    rating = _testRatings[idx]
    items = _testNegatives[idx]
    u = rating[0]
    gtItem = rating[1]
    items.append(gtItem)
    # Get prediction scores
    map_item_score = {}
    users = np.full(len(items), u, dtype = 'int32')
    predictions = _model.predict([users, np.array(items)], 
                                 batch_size=100, verbose=0)
    for i in range(len(items)):
        item = items[i]
        map_item_score[item] = predictions[i]
    items.pop()
    
    # Evaluate top rank list
    ranklist = heapq.nlargest(_K, map_item_score, key=map_item_score.get)
    hr = getHitRatio(ranklist, gtItem)
    ndcg = getNDCG(ranklist, gtItem)
    return (hr, ndcg)

def getHitRatio(ranklist, gtItem):
    for item in ranklist:
        if item == gtItem:
            return 1
    return 0

def getNDCG(ranklist, gtItem):
    for i in range(len(ranklist)):
        item = ranklist[i]
        if item == gtItem:
            return math.log(2) / math.log(i+2)
    return 0


In [148]:
users


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0])

In [151]:
users

array([133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133,
       133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133,
       133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133,
       133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133,
       133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133,
       133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133,
       133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133,
       133, 133, 133, 133, 133, 133, 133, 133])

In [153]:
np.array(items).shape

(99,)

In [155]:
np.array(items)

array([1072, 3154, 3368, 3644,  549, 1810,  937, 1514, 1713, 2186,  660,
       2303, 2416,  670, 1176,  788,  889, 3120, 2344, 2525, 3301, 2055,
       1436, 2630,   11, 2773, 2176, 1847,  740, 2332, 3561,  263, 3658,
       3282, 1980, 2093, 3287, 3190, 3475,  569, 2315, 1442,  592,  546,
       3133, 1852, 2648,  934,  337,  483, 1017, 3452,  467, 1183, 1765,
        601, 2413, 2602, 2801, 2976,  918,  753, 3540, 3341, 2973, 1580,
       2118, 3511,  526, 1719,  525, 1520,  486,  557, 1353,  500, 2902,
       1687, 1295, 2997, 2415,  797, 2518,  926, 3537, 1746, 1676, 1875,
       3029, 1535,  341, 3525, 1429, 2225, 1628, 2061,  469, 3056, 2553])

In [158]:
rating = testRatings[1];
items = testNegatives[1]; # len = 99
u = rating[1] # 0
users = np.full(len(items), u, dtype = 'int32')
predictions = model.predict([users, np.array(items)],batch_size = 100, verbose=0) # users -> (99,) , items -> (99,)
predictions.shape # (99,1)

(99, 1)

In [160]:
map_item_score = {}
for i in range(len(items)):
    item = items[i]
    map_item_score[item] = predictions[i]
items.pop()

2553

In [161]:
map_item_score

{11: array([ 0.14165312], dtype=float32),
 263: array([ 0.4957276], dtype=float32),
 337: array([ 0.30453429], dtype=float32),
 341: array([ 0.05421362], dtype=float32),
 467: array([ 0.29047358], dtype=float32),
 469: array([ 0.24630435], dtype=float32),
 483: array([ 0.24569876], dtype=float32),
 486: array([ 0.53998804], dtype=float32),
 500: array([ 0.34748152], dtype=float32),
 525: array([ 0.40124953], dtype=float32),
 526: array([ 0.22707066], dtype=float32),
 546: array([ 0.48369297], dtype=float32),
 549: array([ 0.0347945], dtype=float32),
 557: array([ 0.27600738], dtype=float32),
 569: array([ 0.17987074], dtype=float32),
 592: array([ 0.21300307], dtype=float32),
 601: array([ 0.24544401], dtype=float32),
 660: array([ 0.13015649], dtype=float32),
 670: array([ 0.16243596], dtype=float32),
 740: array([ 0.46578985], dtype=float32),
 753: array([ 0.33468956], dtype=float32),
 788: array([ 0.56217146], dtype=float32),
 797: array([ 0.511473], dtype=float32),
 889: array([ 0.

In [140]:
evaluate_model(model,testRatings=testRatings,testNegatives=testNegatives, K=10, num_thread=1)

([0,
  1,
  1,
  0,
  0,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  1,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  1,
  1,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  1,
  0,
  1,
  1,
  0,
  1,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  1,
  0,
  1,
  1,
  1,
  0,
  0,
  1,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  0,
  0,
