In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
# Suppress every warning category for the rest of the session; this keeps cell
# output clean but also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [12]:
# Load only the three columns the analysis uses, then preview the first ten rows.
df = pd.read_csv(
    'data/theepochtimes_com/news_dataset.csv',
    usecols=['user_id', 'item_id', 'timestamp'],
)
df.head(10)

Unnamed: 0,user_id,item_id,timestamp
0,0,294703,1529661427
1,0,294703,1529671360
2,1,294703,1529660632
3,2,294703,1529669019
4,2,294703,1529675314
5,3,294703,1529712267
6,2,294703,1529696333
7,0,294703,1529724454
8,4,294703,1529705798
9,2,294703,1529721260


In [13]:
from datetime import datetime
from pytz import timezone

def make_interval(start_time, end_time, timestamps):
    """Return the rows of the global ``df`` whose ``timestamp`` lies in a time window.

    The window is half-open, ``start <= timestamp < end``, so back-to-back
    windows (e.g. a train window followed by a test window) neither overlap
    nor lose the events recorded exactly on their shared boundary.

    Parameters
    ----------
    start_time : str
        Either ``'first'`` (start at the first entry of ``timestamps``,
        inclusive) or a ``'%d/%m/%Y %H:%M:%S'`` string (inclusive).
    end_time : str
        Either ``'last'`` (end at the last entry of ``timestamps``,
        inclusive) or a ``'%d/%m/%Y %H:%M:%S'`` string (exclusive).
    timestamps : pandas.Series or sequence of numbers
        Epoch seconds; only consulted for the ``'first'`` / ``'last'``
        sentinels, which take its first / last element by position, so it
        is expected to be sorted chronologically.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``.

    Notes
    -----
    Date strings are converted by subtracting a naive ``datetime(1970, 1, 1)``,
    i.e. they are read as UTC wall-clock time, not as local time.
    """
    def to_epoch_seconds(text):
        # Naive datetime minus the naive epoch: the string is interpreted as UTC.
        parsed = datetime.strptime(text, '%d/%m/%Y %H:%M:%S')
        return (parsed - datetime(1970, 1, 1)).total_seconds()

    # Use positional access. Plain `series[0]` / `series[-1]` look up index
    # *labels*: after sorting, label 0 is no longer the first row, and label -1
    # does not exist. Lists and arrays have no `.iloc` and index by position already.
    by_position = getattr(timestamps, 'iloc', timestamps)

    if start_time == 'first':
        start_timestamp = by_position[0]
    else:
        start_timestamp = to_epoch_seconds(start_time)
    in_window = df['timestamp'] >= start_timestamp

    if end_time == 'last':
        # 'last' must keep the final event itself, so this bound is inclusive.
        in_window = in_window & (df['timestamp'] <= by_position[-1])
    else:
        in_window = in_window & (df['timestamp'] < to_epoch_seconds(end_time))

    return df[in_window]

# Order events chronologically; the first and last rows then bound the dataset's time range.
df = df.sort_values(by='timestamp')

# Report the covered range in US/Eastern time. The timestamp is selected by
# column name: `df.iloc[i][2]` depended on Series.__getitem__ treating an integer
# key as a position on a string-labelled row, which pandas has deprecated.
eastern = timezone('US/Eastern')
first_event = datetime.fromtimestamp(df['timestamp'].iloc[0], tz=eastern)
last_event = datetime.fromtimestamp(df['timestamp'].iloc[-1], tz=eastern)
print('Start time:')
print(first_event.strftime('%d/%m/%Y %H:%M:%S'))
print('End time:')
print(last_event.strftime('%d/%m/%Y %H:%M:%S'))

# choose date between 2018-06-22 00:00:00 and 2018-06-28 23:59:59
# NOTE(review): the range printed above is US/Eastern, whereas make_interval
# converts its strings against a naive 1970-01-01 epoch (i.e. as UTC), so
# '10:00:00' below is not 10:00 Eastern -- confirm this is the intended window.
train = make_interval('22/06/2018 10:00:00', '22/06/2018 18:00:00', df.timestamp)
test  = make_interval('22/06/2018 18:00:00', '22/06/2018 19:00:00', df.timestamp)

# Distinct users and items seen in each window.
print()
print(f'Train #users: {train.user_id.nunique()}')
print(f'Train #items: {train.item_id.nunique()}')
print(f'Test #users: {test.user_id.nunique()}')
print(f'Test #items: {test.item_id.nunique()}')

Start time:
22/06/2018 00:00:00
End time:
28/06/2018 23:59:59

Train #users: 41060
Train #items: 1681
Test #users: 11617
Test #items: 674


In [14]:
from spotlight.interactions import Interactions
# Give both interaction sets the same id space. Left unspecified, each set infers
# its size from its own largest id, so ids that occur only in the test window
# could fall outside the embedding tables of a model trained on the train window.
num_users = int(max(train['user_id'].max(), test['user_id'].max())) + 1
num_items = int(max(train['item_id'].max(), test['item_id'].max())) + 1

# Pass timestamps by keyword: the third positional parameter of Interactions is
# `ratings`, so passing them positionally stored the timestamps as ratings.
train_interactions = Interactions(
    train['user_id'].values,
    train['item_id'].values,
    timestamps=train['timestamp'].values,
    num_users=num_users,
    num_items=num_items,
)
test_interactions = Interactions(
    test['user_id'].values,
    test['item_id'].values,
    timestamps=test['timestamp'].values,
    num_users=num_users,
    num_items=num_items,
)

In [22]:
from spotlight.factorization.implicit import ImplicitFactorizationModel

# Implicit-feedback factorization model: 10-dimensional embeddings trained with
# the adaptive hinge loss for a single iteration (n_iter=1).
model = ImplicitFactorizationModel(
    loss='adaptive_hinge',
    embedding_dim=10,
    n_iter=1,
    learning_rate=0.2,
)
model.fit(train_interactions)

In [23]:
from spotlight.evaluation import precision_recall_score

# Score the model's top-k recommendations against the held-out test window.
k = 10
precision_recall = precision_recall_score(model, test_interactions, k=k)

# Element 0 is reported as precision and element 1 as recall; average each one.
precision_at_k = precision_recall[0]
recall_at_k = precision_recall[1]

print(f'Precision@{k}: {precision_at_k.mean()}')
print(f'Recall@{k}: {recall_at_k.mean()}')

Precision@10: 0.0011276577429628991
Recall@10: 0.007146443840512876
