In [1]:
import pandas as pd
import numpy as np
import os
import glob
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('data/theepochtimes_com/news_dataset.csv', usecols=['user_id', 'item_id', 'timestamp'])

df.head(10)

Unnamed: 0,user_id,item_id,timestamp
0,0,294703,1529661427
1,0,294703,1529671360
2,1,294703,1529660632
3,2,294703,1529669019
4,2,294703,1529675314
5,3,294703,1529712267
6,2,294703,1529696333
7,0,294703,1529724454
8,4,294703,1529705798
9,2,294703,1529721260


In [3]:
df['rating'] = 1

In [4]:
from datetime import datetime
from pytz import timezone

def make_interval(start_time, end_time, timestamps):
    if start_time == 'first':
        start_timestamp = timestamps[0]
    else:
        start_datetime = datetime.strptime(start_time, '%d/%m/%Y %H:%M:%S')
        start_timestamp = (start_datetime - datetime(1970, 1, 1)).total_seconds()

    if end_time == 'last':
        end_timestamp = timestamps[-1]
    else:
        end_datetime = datetime.strptime(end_time, '%d/%m/%Y %H:%M:%S')
        end_timestamp = (end_datetime - datetime(1970, 1, 1)).total_seconds()

    return df[(df['timestamp'] > start_timestamp) & (df['timestamp'] < end_timestamp)]

df.sort_values(by='timestamp', inplace=True)
print('Start time:')
print(datetime.fromtimestamp(df.iloc[0][2], tz=timezone('US/Eastern')).strftime('%d/%m/%Y %H:%M:%S'))
print('End time:')
print(datetime.fromtimestamp(df.iloc[-1][2], tz=timezone('US/Eastern')).strftime('%d/%m/%Y %H:%M:%S'))

# choose date between 2018-06-22 00:00:00 and 2018-06-28 23:59:59
train = make_interval('22/06/2018 10:00:00', '22/06/2018 18:00:00', df.timestamp)
test  = make_interval('22/06/2018 18:00:00', '22/06/2018 19:00:00', df.timestamp)

print()
print(f'Train #users: {len(train.user_id.unique())}')
print(f'Train #items: {len(train.item_id.unique())}')
print(f'Test #users: {len(test.user_id.unique())}')
print(f'Test #items: {len(test.item_id.unique())}')

Start time:
22/06/2018 00:00:00
End time:
28/06/2018 23:59:59

Train #users: 41060
Train #items: 1681
Test #users: 11617
Test #items: 674


In [5]:
from keras.callbacks import Callback

class Metrics(Callback):

    def on_train_begin(self, logs={}):
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs={}):

        val_recall = calculate_average_recall_at_k(self.model, test, k)
        val_precision = calculate_average_precision_at_k(self.model, test, k)

        self.val_recalls.append(val_recall)
        self.val_precisions.append(val_precision)
        print(f' - precision@{k}: {val_precision} - recall@{k}: {val_recall}')
        return

def calculate_average_precision_at_k(model, test, k=10):
    test['predicted'] = model.predict([test.user_id, test.item_id])
    users = list(set(test['user_id']))

    precisions_for_each_user = []
    for user in users:
        sample = test[test.user_id == user]
        top_k = get_top_k_items(sample['item_id'], sample['predicted'], k)
        precision = sum(sample['rating']) / k
        precisions_for_each_user.append(precision)
    average_precision = np.mean(precisions_for_each_user)
    return np.round(average_precision, 4)

def calculate_average_recall_at_k(model, test, k=10):
    test['predicted'] = model.predict([test.user_id, test.item_id])
    users = list(set(test['user_id']))

    recall_for_each_user = []
    for user in users:
        sample = test[test.user_id == user]
        top_k = get_top_k_items(sample['item_id'], sample['predicted'], k)
        recall = sum(sample['rating'][top_k.index]) / len(sample['rating'])
        recall_for_each_user.append(recall)
    average_recall = np.mean(recall_for_each_user)
    return np.round(average_recall, 4)

def get_top_k_items(items, predicted, k):
    top_k = (predicted.sort_values()[-k:])
    return items[top_k.index]

k = 10
metrics = Metrics()

Using TensorFlow backend.


In [6]:
import keras
from keras.optimizers import Adam
from keras.utils.vis_utils import model_to_dot

n_users, n_items = len(df.user_id.unique()), len(df.item_id.unique())
embedding_size = 5

users_input = keras.layers.Input(shape=[1],name='User')
users_embedding = keras.layers.Embedding(n_users + 1, embedding_size, name='User-Embedding')(users_input)
users_vec = keras.layers.Flatten(name='FlattenUsers')(users_embedding)

items_input = keras.layers.Input(shape=[1],name='Item')
items_embedding = keras.layers.Embedding(n_items + 1, embedding_size, name='Item-Embedding')(items_input)
items_vec = keras.layers.Flatten(name='FlattenItems')(items_embedding)

prod = keras.layers.merge([users_vec, items_vec], mode='dot', name='DotProduct')
model_mf = keras.Model(inputs=[users_input, items_input], outputs=prod)
model_mf.compile(optimizer='adam', loss='mean_squared_error')
model_mf.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
User (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
Item (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
User-Embedding (Embedding)      (None, 1, 5)         2849685     User[0][0]                       
__________________________________________________________________________________________________
Item-Embedding (Embedding)      (None, 1, 5)         24590       Item[0][0]                       
__________________________________________________________________________________________________
FlattenUse

In [7]:
from scipy.sparse import coo_matrix

interactions = coo_matrix((np.ones(len(train),int), (train.user_id.values, train.item_id.values)))

In [None]:
history_mf = model_mf.fit(x=[train.user_id.values, train.item_id.values], y=???, epochs=10, callbacks=[metrics])