In [22]:
import pandas as pd
import numpy as np
import datetime as dt
from datetime import datetime, timezone
from sklearn import preprocessing
import helper_functions

import hashlib
import json
import os
import shutil
import sys

import numpy as np

from sklearn.model_selection import ParameterSampler

from spotlight.cross_validation import user_based_train_test_split
from spotlight.sequence.implicit import ImplicitSequenceModel
from spotlight.sequence.representations import CNNNet
from spotlight.evaluation import sequence_mrr_score


CUDA = (os.environ.get('CUDA') is not None or
        shutil.which('nvidia-smi') is not None)

NUM_SAMPLES = 100

LEARNING_RATES = [1e-3, 1e-2, 5 * 1e-2, 1e-1]
LOSSES = ['bpr', 'hinge', 'adaptive_hinge', 'pointwise']
BATCH_SIZE = [8, 16, 32, 256]
EMBEDDING_DIM = [8, 16, 32, 64, 128, 256]
N_ITER = list(range(5, 20))
L2 = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.0]


## Data Prepration

In [23]:
# df = pd.read_csv('Final_Data.csv')

In [24]:
# df.rename(columns = {'event_time':'Time', 'product_id': 'item_id', 'category_id': 'category'}, inplace = True) 

In [25]:
# Final_Data = df[['user_id', 'item_id', 'Time', 'category']]

In [26]:
# Final_Data['timestamp'] = Final_Data.Time.apply(lambda x: dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S UTC').replace(tzinfo=timezone.utc).timestamp()) #This is not UTC. It does not really matter.


In [27]:
# del(Final_Data['Time'])

In [28]:
# Final_Data.to_csv('Final_Data.csv', index=False)

In [29]:
data = pd.read_csv('Final_Data.csv')

In [30]:
Final_Data = data[['user_id','item_id','timestamp']]

In [31]:
# Threshold data to only include users and models with min 4 products.
threshold_data= helper_functions.threshold_interactions_df(Final_Data, 'user_id', 'item_id', 4, 4)

Starting interactions info
Number of rows: 681451
Number of cols: 47690
Sparsity: 0.023%
Ending interactions info
Number of rows: 203125
Number of columns: 41513
Sparsity: 0.082%


In [32]:
threshold_data.head()

Unnamed: 0,user_id,item_id,timestamp
0,576802932,5712790,1575158000.0
1,412120092,5764655,1575158000.0
2,494077766,4958,1575158000.0
3,348405118,5848413,1575158000.0
4,576005683,5824148,1575158000.0


In [33]:
le_usr = preprocessing.LabelEncoder() # user encoder
le_itm = preprocessing.LabelEncoder() # item encoder

In [34]:
# shift item_ids with +1 (but not user_ids):
item_ids = (le_itm.fit_transform(threshold_data['item_id'])+1).astype('int32')
user_ids = (le_usr.fit_transform(threshold_data['user_id'])).astype('int32')

In [35]:
user_ids

array([149348,  18113,  45906, ...,  20503,  30858,  20503], dtype=int32)

In [36]:
from spotlight.interactions import Interactions
implicit_interactions = Interactions(user_ids, item_ids, timestamps=threshold_data.timestamp)

from spotlight.cross_validation import user_based_train_test_split, random_train_test_split
train, test = user_based_train_test_split(implicit_interactions, 0.3)

In [37]:
sequential_interaction = train.to_sequence()
implicit_sequence_model = ImplicitSequenceModel(use_cuda=False, n_iter=40, loss='bpr', representation='pooling', batch_size = 256)
implicit_sequence_model.fit(sequential_interaction, verbose=True)

Epoch 0: loss 0.12694355212466227
Epoch 1: loss 0.06387347555191246
Epoch 2: loss 0.05390962021601681
Epoch 3: loss 0.04904808868647963
Epoch 4: loss 0.046175982906101684
Epoch 5: loss 0.044368601968859334
Epoch 6: loss 0.043098615105381724
Epoch 7: loss 0.042138821685437075
Epoch 8: loss 0.041179562718960984
Epoch 9: loss 0.04069027460975382
Epoch 10: loss 0.04014982555878559
Epoch 11: loss 0.039571156083748456
Epoch 12: loss 0.03925112645633191
Epoch 13: loss 0.03908930134792806
Epoch 14: loss 0.03876662801954661
Epoch 15: loss 0.038450171428207995
Epoch 16: loss 0.03824082280019509
Epoch 17: loss 0.03811528391860279
Epoch 18: loss 0.037944146804686114
Epoch 19: loss 0.037683549091820416
Epoch 20: loss 0.037481536064035106
Epoch 21: loss 0.037418507529211414
Epoch 22: loss 0.03732553334433617
Epoch 23: loss 0.03727703265699138
Epoch 24: loss 0.03718482616209995
Epoch 25: loss 0.036969433778647295
Epoch 26: loss 0.03702512338946049
Epoch 27: loss 0.036905153674591115
Epoch 28: loss 0.

#### Ground Truth

In [53]:
threshold_data[threshold_data['user_id'] == le_usr.inverse_transform([3])[0]]

Unnamed: 0,user_id,item_id,timestamp
4709875,5493470,5745712,1573495000.0
4709923,5493470,5745712,1573495000.0
4710005,5493470,49674,1573495000.0
4710061,5493470,49674,1573495000.0
4710102,5493470,5587748,1573495000.0
4710500,5493470,4590,1573496000.0
4710537,5493470,4590,1573496000.0
4710624,5493470,5815680,1573496000.0
4710729,5493470,34763,1573496000.0
4710739,5493470,31589,1573496000.0


#### Prediction

In [58]:
predictions = implicit_sequence_model.predict([2, 3, 4, 5])
item_ids= (-predictions).argsort()[:5] # last 10 items
print(le_itm.inverse_transform(item_ids))
print(predictions[item_ids])

[5809161    3776    4203 5809162    7890]
[105.83562  105.34837  104.38099  100.691154 100.09545 ]
