In [1]:
from bert4rec import BertDataset, RECBERTO, BERTEmbedding

In [2]:
import pandas as pd
import torch
import pytorch_lightning as pl
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import torch.utils.data as data
from pytorch_lightning.callbacks import ModelCheckpoint
import numpy as np
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from torchrecsys.datasets import InteractionsDataset, SequenceDataset
from torchrecsys.models import BaseModel
from torchrecsys.task import Ranking
from torchrecsys.layers import BruteForceLayer
import torchrecsys as trs

import warnings
warnings.filterwarnings('ignore')

# Data

In [3]:
# Sequence length passed as `max_len` to both BertDataset and RECBERTO below
# (presumably the number of session items kept per sample -- confirm in bert4rec).
MAX_LEN = 10

In [4]:
# Load the raw tables. Paths are relative to the notebook's working directory.
DATA_DIR = "data"

candidates = pd.read_csv(f"{DATA_DIR}/candidate_items.csv")
train_purchases = pd.read_csv(f"{DATA_DIR}/train_purchases.csv", parse_dates=['date'])
train_sessions = pd.read_csv(f"{DATA_DIR}/train_sessions.csv", parse_dates=['date'])

test_sessions = pd.read_csv(f"{DATA_DIR}/test_leaderboard_sessions.csv", parse_dates=['date'])
# NOTE(review): unlike the other session tables, this one is read without
# parse_dates -- confirm whether its `date` column needs parsing before use.
final_test = pd.read_csv(f"{DATA_DIR}/test_final_sessions.csv")

# Views, purchases and leaderboard sessions are stacked into one interaction log.
all_interactions = pd.concat([train_sessions, train_purchases, test_sessions], ignore_index=True)

features = pd.read_csv(f"{DATA_DIR}/item_features.csv")

# Item ids are used directly as indices, so the vocabulary size is max id + 1.
n_items = all_interactions.item_id.max() + 1

In [5]:
# Pivot the long (item_id, feature_category_id, feature_value_id) table into a
# wide one: one row per item and one `feature_<category id>` column per category.
# Duplicate (item, category) pairs are dropped first (keeping the first
# occurrence) because pivot() requires unique index/column pairs.
result = (
    features
    .drop_duplicates(subset=['item_id', 'feature_category_id'])
    .pivot(index="item_id", columns="feature_category_id", values="feature_value_id")
)
result.columns = ["feature_" + str(x) for x in result.columns]

# reset_index() moves item_id out of the index into the first column and
# leaves a fresh 0..n-1 row index.
result = result.reset_index()

In [6]:
# A column is treated as categorical when its number of distinct values is
# below a quarter of its non-null row count.
category_features = [
    column
    for column in result.columns
    if result[column].nunique() < 0.25 * len(result[column].dropna())
]
result = result.astype({column: 'category' for column in category_features})

# Keep only the columns in which fewer than 50% of the rows are missing.
keep_columns = [
    column
    for column in result.columns
    if result[column].isnull().sum() * 100 / len(result) < 50
]
result = result[keep_columns]

In [7]:
# Build catalog: item_id (first column of each row) -> full feature row.
# The original author marked this as unused for now, pending an analysis of
# which features are categorical and which are continuous.
n_columns = len(result.columns)

feature_catalog = {}
for item_row in tqdm(result.values, total=len(result)):
    # Replace NaN with 0
    item_row = np.nan_to_num(item_row, nan=0.0, posinf=None, neginf=None)
    feature_catalog[item_row[0]] = item_row

# Ids 0 and 1 are reserved; these assignments overwrite any real item row
# stored under the same id.
feature_catalog[0] = [0] * n_columns  # PAD
feature_catalog[1] = [1] * n_columns  # Mask

# Every id below 30000 without a feature row gets an all-zero (PAD) row.
for item_id in range(30000):
    if item_id not in feature_catalog:
        feature_catalog[item_id] = [0] * n_columns  # PAD

  0%|          | 0/23691 [00:00<?, ?it/s]

In [8]:
# Collapse the interaction log into one row per session holding the list of
# item ids ordered by timestamp.
# TODO: we do not know whether the purchased item always comes after the
# latest browsed item -- check this.
interactions_by_time = all_interactions.sort_values(['date'])
items_per_session = interactions_by_time.groupby('session_id')["item_id"].apply(list)
session_sequences = items_per_session.to_frame().reset_index()

# Keep only sessions with more than one item.
has_several_items = session_sequences['item_id'].map(len) > 1
all_interactions = session_sequences[has_several_items]

In [9]:
class featureEmbedder(torch.nn.Module):
    """Embed every feature slot with its own table and concatenate the results.

    Args:
        n_features: number of feature slots read from the last axis of the input.
        embedding_size: per-slot number of embeddings; only the first
            ``n_features`` entries are read.
        embedding_dimensions: per-slot embedding width; only the first
            ``n_features`` entries are read.
    """

    def __init__(self, n_features, embedding_size, embedding_dimensions):
        super().__init__()
        self.n_features = n_features
        # Tables are registered as attributes named `embedding_feature_<i>`,
        # which is also the name they get in parameters() / state_dict().
        for feature in range(self.n_features):
            table = torch.nn.Embedding(embedding_size[feature], embedding_dimensions[feature])
            setattr(self, self._table_name(feature), table)

    @staticmethod
    def _table_name(feature):
        """Attribute name under which the table of slot ``feature`` is stored."""
        return f'embedding_feature_{feature}'

    def forward(self, x):
        """Look up each slot ``x[:, :, i]`` in table ``i`` and concatenate on dim 2.

        Args:
            x: 3-D integer index tensor whose last axis holds the feature slots.

        Returns:
            Tensor with the first two axes of ``x`` and a last axis equal to the
            sum of the per-slot embedding widths.
        """
        # No getattr default: a missing table should fail as an AttributeError
        # naming the attribute, not as "'int' object is not callable".
        embedded = [
            getattr(self, self._table_name(feature))(x[:, :, feature])
            for feature in range(self.n_features)
        ]
        return torch.cat(embedded, dim=2)

# 17 feature slots: slot 0 gets a 30000-entry, 256-wide table; the remaining
# slots get 1028-entry, 16-wide tables. Both lists hold 18 entries, of which
# featureEmbedder reads only the first 17.
emb = featureEmbedder(17, [30000] + [1028] * 17, [256] + [16] * 17)

In [10]:
# One Python list of item ids per session.
dataset = all_interactions.item_id.values

# Arguments shared by the training and validation datasets.
shared_dataset_kwargs = dict(
    max_len=MAX_LEN,
    num_items=n_items,
    item_catalog=feature_catalog,
)

# Training split: shuffled every epoch, masking probability 0.3.
train_dataset = BertDataset(dataset, mode="train", mask_prob=0.3, **shared_dataset_kwargs)
train = data.DataLoader(train_dataset, batch_size=2096, shuffle=True, num_workers=3)

# Validation split: built from the same sessions in "validate" mode, served in fixed order.
validate_dataset = BertDataset(dataset, mode="validate", **shared_dataset_kwargs)
validate = data.DataLoader(validate_dataset, batch_size=4192, shuffle=False, num_workers=6)

In [11]:
# Scratch check of shuffling a single session. `dataset` has already been
# handed to the train/validate BertDataset objects, and np.random.shuffle
# would reorder the first session's item list in place. Permute a copy instead
# so the interaction history stays untouched.
shuffled_first_session = np.random.permutation(dataset[0])

In [12]:
# Smoke test: pull a single batch from the training loader ...
f = next(iter(train))

# ... and push its first element through a freshly initialised model.
smoke_test_config = dict(
    max_len=MAX_LEN,
    num_items=n_items,
    learning_rate=0.001,
    training_metrics=False,
)
model = RECBERTO(**smoke_test_config)

model(f[0]).shape

torch.Size([2096, 10, 28145])

# Train

In [13]:
# Fresh model for the actual training run.
model = RECBERTO(max_len=MAX_LEN, num_items=n_items, learning_rate=1e-3, training_metrics=False)

# Checkpointing and early stopping both watch the same logged metric.
monitored_metric = "val/loss"

# Keep only the single checkpoint with the lowest monitored value.
checkpoint_callback = ModelCheckpoint(
    filename="best-checkpoint",
    save_top_k=1,
    monitor=monitored_metric,
    mode="min",
)
# NOTE(review): validation only runs every 3 epochs (see the Trainer below);
# if patience counts validation checks rather than epochs, 3 means up to
# 9 epochs without improvement -- confirm this is intended.
early_stopping = EarlyStopping(monitor=monitored_metric, patience=3)

trainer = pl.Trainer(
    auto_select_gpus=True,
    gpus=-1,
    max_epochs=200,
    precision=32,
    callbacks=[early_stopping, checkpoint_callback],
    check_val_every_n_epoch=3,
)

# Train the model ⚡
trainer.fit(model, train, validate)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type             | Params
--------------------------------------------------------
0 | embedding          | BERTEmbedding    | 7.9 M 
1 | transformer_blocks | ModuleList       | 12.6 M
2 | out                | Linear           | 14.4 M
3 | criterion          | CrossEntropyLoss | 0     
4 | acc                | Accuracy         | 0     
5 | recall             | Recall           | 0     
--------------------------------------------------------
35.0 M    Trainable params
0         Non-trainable params
35.0 M    Total params
139.985   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

# Score

In [14]:
# Collapse the leaderboard sessions into one row per session holding the list
# of item ids ordered by timestamp.
test_sessions_by_time = test_sessions.sort_values(['date'])
test_items_per_session = test_sessions_by_time.groupby('session_id')["item_id"].apply(list)
test_sessions = test_items_per_session.to_frame().reset_index()

In [15]:
# Item sequences of the sessions to score, with session ids passed alongside.
inference_sequences = test_sessions.item_id.values
inference_session_ids = test_sessions.session_id.values

inference_dataset = BertDataset(
    inference_sequences,
    mode="inference",
    max_len=MAX_LEN,
    num_items=n_items,
    session_ids=inference_session_ids,
    item_catalog=feature_catalog,
)

# Fixed order (no shuffling) when serving the inference batches.
inference = data.DataLoader(inference_dataset, batch_size=1028, shuffle=False, num_workers=6)

In [16]:
# trainer.test(model, dataloaders=inference, ckpt_path="best" )

In [17]:
# Load previously saved predictions. Nothing in the visible cells writes
# predict.csv; presumably it is produced by the commented-out
# trainer.test(...) call -- confirm the file is up to date before relying on it.
r = pd.read_csv("predict.csv")

In [18]:
# r["item_id"] = r.item_id.apply(lambda x: x[0])
# r = r.explode('topindices')
# r["rank"] = r.groupby(["sessionid"]).cumcount()+1
# Display the loaded predictions as the cell output.
r

Unnamed: 0,session_id,item_id,rank
0,26,3260,1
1,26,5383,2
2,26,2213,3
3,26,27630,4
4,26,7368,5
...,...,...,...
4999995,4439757,19150,96
4999996,4439757,26,97
4999997,4439757,15140,98
4999998,4439757,23405,99
