In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import spacy
import warnings
from datetime import datetime
import torch
import torch.nn as nn
import torch.nn.functional as F

warnings.filterwarnings('ignore')
df1 = pd.read_csv('data/news_dataset.csv')
print(df1.shape[0])
df1.head(1)

1620779


Unnamed: 0,user_id,item_id,fullVisitorId,timestamp,pagePath,country,category,title,scores
0,0,294703,88b526ee2d655644615e5c18fbeb7c5c,1529661427,/,United States,"[{""name"": ""/Arts & Entertainment"", ""confidence...","Breaking news, independent China news",5702478


In [2]:
def reindex(df1):
    nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner', 'textcat'])
    unique_items = df1.drop_duplicates('pagePath').copy().reset_index()
    unique_items.drop(['user_id','item_id','fullVisitorId','timestamp','country','scores', 'index'], axis=1, inplace=True)
    unique_items = unique_items.reset_index().rename(columns={"index": "item_id"})
    unique_items["item_id"] = unique_items["item_id"] + 1
    unique_items['vector'] = unique_items['title'].apply(lambda doc: nlp(doc).vector)

    df_result = pd.merge(df1.drop(['user_id','item_id','title','category'], axis=1)
                         , unique_items, on='pagePath', how='left')

    unique_users = df1.drop_duplicates('fullVisitorId').copy().reset_index()
    unique_users = unique_users[['fullVisitorId']].reset_index().rename(columns={"index": "user_id"})
    df_result = pd.merge(df_result, unique_users, on='fullVisitorId', how='left')
    return df_result, unique_items, unique_users

df_result, unique_items, unique_users = reindex(df1)
print(df_result.shape[0])
df_result.head(2)

1620779


Unnamed: 0,fullVisitorId,timestamp,pagePath,country,scores,item_id,category,title,vector,user_id
0,88b526ee2d655644615e5c18fbeb7c5c,1529661427,/,United States,5702478,1,"[{""name"": ""/Arts & Entertainment"", ""confidence...","Breaking news, independent China news","[-0.22008367, 0.48308718, -0.055601496, 0.0240...",0
1,88b526ee2d655644615e5c18fbeb7c5c,1529671360,/,United States,5702478,1,"[{""name"": ""/Arts & Entertainment"", ""confidence...","Breaking news, independent China news","[-0.22008367, 0.48308718, -0.055601496, 0.0240...",0


In [3]:
df = df_result[['user_id', 'item_id', 'timestamp']]
# df['timestamp'] = df['timestamp'].apply(datetime.fromtimestamp)
df.head(3)

Unnamed: 0,user_id,item_id,timestamp
0,0,1,1529661427
1,0,1,1529671360
2,1,1,1529660632


In [4]:
from datetime import datetime
from pytz import timezone

def make_interval(start_time, end_time, timestamps):
    if start_time == 'first':
        start_timestamp = timestamps[0]
    else:
        start_datetime = datetime.strptime(start_time, '%d/%m/%Y %H:%M:%S')
        start_timestamp = (start_datetime - datetime(1970, 1, 1)).total_seconds()

    if end_time == 'last':
        end_timestamp = timestamps[-1]
    else:
        end_datetime = datetime.strptime(end_time, '%d/%m/%Y %H:%M:%S')
        end_timestamp = (end_datetime - datetime(1970, 1, 1)).total_seconds()

    return df[(df['timestamp'] > start_timestamp) & (df['timestamp'] < end_timestamp)]

df.sort_values(by='timestamp', inplace=True)
print('Start time:')
print(datetime.fromtimestamp(df.iloc[0][2], tz=timezone('US/Eastern')).strftime('%d/%m/%Y %H:%M:%S'))
print('End time:')
print(datetime.fromtimestamp(df.iloc[-1][2], tz=timezone('US/Eastern')).strftime('%d/%m/%Y %H:%M:%S'))

# choose date between 2018-06-22 00:00:00 and 2018-06-28 23:59:59
train = make_interval('22/06/2018 10:00:00', '22/06/2018 18:00:00', df.timestamp)
test  = make_interval('22/06/2018 18:00:00', '22/06/2018 19:00:00', df.timestamp)

print(f'Train engagements: {train.shape[0]}')
print(f'Train #users: {len(train.user_id.unique())}')
print(f'Train #items: {len(train.item_id.unique())}')
print(f'Test engagements: {test.shape[0]}')
print(f'Test #users: {len(test.user_id.unique())}')
print(f'Test #items: {len(test.item_id.unique())}')

Start time:
22/06/2018 00:00:00
End time:
28/06/2018 23:59:59
Train engagements: 67710
Train #users: 41060
Train #items: 1681
Test engagements: 13343
Test #users: 11617
Test #items: 674


In [5]:
from spotlight.interactions import Interactions

train_interactions = Interactions(train['user_id'].values, train['item_id'].values, timestamps = train['timestamp'].values)
test_interactions = Interactions(test['user_id'].values, test['item_id'].values, timestamps = test['timestamp'].values)
train_interactions

<Interactions dataset (105984 users x 2401 items x 67710 interactions)>

In [6]:
# from spotlight.factorization.implicit import ImplicitFactorizationModel
# model = ImplicitFactorizationModel(n_iter=10, embedding_dim=10, loss='adaptive_hinge', 
#                                    learning_rate=0.2, use_cuda='CUDA')
# model.fit(train_interactions)

In [7]:
# from spotlight.evaluation import precision_recall_score
# from spotlight.evaluation import mrr_score
# k = 10
# precision_recall = precision_recall_score(model, test_interactions, k=k)

# print(f'Precision@{k}: {precision_recall[0].mean()}')
# print(f'Recall@{k}: {precision_recall[1].mean()}')
# mrr = mrr_score(model, test_interactions)
# mrr

In [8]:
# 10 Precision@10: 0.06753034346216752 Recall@10: 0.6657698897756573
# 100 Precision@10: 0.08117414134458123 Recall@10: 0.8010888189312052
# 1000 Precision@10: 0.08268055435998968 Recall@10: 0.8156248847132896

In [9]:
text_embedding = np.vstack([[0]*300]+unique_items['vector'].tolist())
torch_embedding = nn.Embedding.from_pretrained(torch.FloatTensor(text_embedding))
torch_embedding

Embedding(4918, 300)

In [25]:
train_sequences = train_interactions.to_sequence(5)
test_sequences = test_interactions.to_sequence(5)

from spotlight.datasets.synthetic import generate_sequential
from spotlight.evaluation import sequence_mrr_score
from spotlight.evaluation import sequence_precision_recall_score, _get_precision_recall
from spotlight.sequence.representations import CNNNet, LSTMNet
from spotlight.layers import ScaledEmbedding, ZeroEmbedding
# from spotlight.sequence.implicit import ImplicitSequenceModel
from implicit import ImplicitSequenceModel


PADDING_IDX = 0
class LSTMNet1(nn.Module):
    def __init__(self, num_items, embedding_dim=32,
                 item_embedding_layer=None, sparse=False):

        super(LSTMNet1, self).__init__()

        self.embedding_dim = embedding_dim

        if item_embedding_layer is not None:
            self.item_embeddings = item_embedding_layer
        else:
            self.item_embeddings = ScaledEmbedding(num_items, embedding_dim,
                                                   padding_idx=PADDING_IDX,
                                                   sparse=sparse)
        print(self.item_embeddings)
        self.item_biases = ZeroEmbedding(num_items, 1, sparse=sparse,
                                         padding_idx=PADDING_IDX)

        self.lstm = nn.LSTM(batch_first=True,
                            input_size=embedding_dim,
                            hidden_size=embedding_dim)

    def user_representation(self, item_sequences):

        # Make the embedding dimension the channel dimension
        sequence_embeddings = (self.item_embeddings(item_sequences).permute(0, 2, 1))
        # Add a trailing dimension of 1
        sequence_embeddings = (sequence_embeddings.unsqueeze(3))
        # Pad it with zeros from left
        sequence_embeddings = (F.pad(sequence_embeddings, (0, 0, 1, 0)).squeeze(3))
        sequence_embeddings = sequence_embeddings.permute(0, 2, 1)

        user_representations, _ = self.lstm(sequence_embeddings)
        user_representations = user_representations.permute(0, 2, 1)

        return user_representations[:, :, :-1], user_representations[:, :, -1]

    def forward(self, user_representations, targets):
        target_embedding = (self.item_embeddings(targets).permute(0, 2, 1).squeeze())
        target_bias = self.item_biases(targets).squeeze()

        dot = ((user_representations * target_embedding).sum(1).squeeze())
        return target_bias + dot

from spotlight.layers import BloomEmbedding, ScaledEmbedding
embedding = BloomEmbedding(unique_items.shape[0]+1,
                               32,
                               compression_ratio=1.,
                               num_hash_functions=2)

lstm1 = LSTMNet1(unique_items.shape[0]+1, embedding_dim = 300, item_embedding_layer = torch_embedding)
lstm1.parameters()

Embedding(4918, 300)


<generator object Module.parameters at 0x7f852235eba0>

In [26]:
smodel = ImplicitSequenceModel(n_iter=20, batch_size = 10000, embedding_dim=32,
                              representation=lstm1, num_negative_samples = 5,
                              loss='adaptive_hinge', use_cuda=True) 

smodel.fit(train_sequences)

In [27]:
# mrr = sequence_mrr_score(smodel, test_interactions.to_sequence(5), exclude_preceding = False)
# mrr.shape

In [28]:
def sequence_precision_recall_score1(model, test, k=10, exclude_preceding=False):
    sequences = test.sequences[:, :-1]
    targets = test.sequences[:, -1:]
    precision_recalls = []
    for i in range(len(sequences)):
        predictions = -model.predict(sequences[i])
        if exclude_preceding:
            predictions[sequences[i]] = FLOAT_MAX

        predictions = predictions.argsort()[:k]
        precision_recall = _get_precision_recall(predictions, targets[i], k)
        precision_recalls.append(precision_recall)

    precision = np.array(precision_recalls)[:, 0]
    recall = np.array(precision_recalls)[:, 1]
    return precision, recall

k=10
precision_recall = sequence_precision_recall_score1(smodel, test_sequences, k = k)
print(f'Precision@{k}: {precision_recall[0].mean()}')
print(f'Recall@{k}: {precision_recall[1].mean()}')

Precision@10: 0.08190639759553459
Recall@10: 0.8190639759553456


In [None]:
# bpr
# 'cnn'
# 10 Precision@10: 0.01094031773293259 Recall@10: 0.10940317732932589
# 100 Precision@10: 0.08146844139115501 Recall@10: 0.81468441391155
# 'lstm'
# 100 Precision@10: 0.07891799055388579 Recall@10: 0.7891799055388579
# 1000 Precision@10: 0.0802060970373551 Recall@10: 0.8020609703735508
# adaptive_hinge
# 10 Precision@10: 0.08036066981537142 Recall@10: 0.803606698153714
# 20 Precision@10: 0.08182911120652642 Recall@10: 0.8182911120652641
# 30 Precision@10: 0.08175182481751826 Recall@10: 0.8175182481751825
# text embedding
# 10 Precision@10: 0.08177758694718765  Recall@10: 0.8177758694718763
# 20 Precision@10: 0.08190639759553459  Recall@10: 0.8190639759553456
# 30 Precision@10: 0.08186346071275227 Recall@10: 0.8186346071275226
