# MSGIFSR Session Recommender
# @author: Ivan Vrkic

This notebook is set up to be run in Google Colab.


# Setup

In [1]:
# Print the GPU, driver and CUDA version attached to this runtime.
!nvidia-smi

Sat Jul  9 14:40:22 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install gdown
!pip install dgl
!pip install wandb
!pip install dgl-cu101 #order matter

!gdown --fuzzy https://drive.google.com/file/d/1Bo2PHNcGyiQJE-dldYhLXL1RgYPenv7u/view?usp=sharing
!gdown --fuzzy https://drive.google.com/file/d/127tRcgb06QbdV7hTWRpj9K2C1xMnbFIR/view?usp=sharing
!gdown --fuzzy https://drive.google.com/file/d/1ZLwbIHXy8-CARacsMPPzzIU10NxvcVhz/view?usp=sharing

!git clone https://github.com/ivanvrkic/MSGIFSR-SessionRec-pytorch
!cp -r /content/MSGIFSR-SessionRec-pytorch/* /content/

!mkdir datasets/dressipy
!mv test_final_sessions.csv datasets/dressipy
!mv train_sessions.csv datasets/dressipy
!mv train_purchases.csv datasets/dressipy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dgl
  Downloading dgl-0.6.1-cp37-cp37m-manylinux1_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 7.0 MB/s 
Installing collected packages: dgl
Successfully installed dgl-0.6.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.12.21-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 7.3 MB/s 
Collecting setproctitle
  Downloading setproctitle-1.2.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.6.0-py2.py3-none-any.whl (145 kB)
[K     |████████████████████████████████| 145 kB 52.6 MB/s 
[?25hCollecting GitPython>=1.0.0
  Downloadin

In [1]:
import argparse
import os
import numpy as np
import torch
import random
import sys
import pickle
import pandas as pd

sys.path.append('/content')
sys.path.append('/content/src')

from pathlib import Path
import os
import numpy as np
import torch as th
from torch.utils.data import DataLoader, SequentialSampler
from src.utils.data.dataset import read_dataset, AugmentedDataset
from src.utils.data.collate import (
    seq_to_ccs_graph,
    collate_fn_factory_ccs
)
from src.utils.train import TrainRunner, prepare_batch
from src.models import MSGIFSR


Using backend: pytorch


# Preprocess


Place the dressipy files, i.e. `train_sessions.csv`, `train_purchases.csv` and `test_final_sessions.csv`, in `dataset_dir`.

If `train_for_leaderboard` is set to `True` when calling `preprocess_dressipy`, the model will be trained on the full training dataset.

In [6]:
# Download the already-preprocessed dataset files (train.txt, num_items.txt and
# test.txt, per the recorded download log) ...
!gdown --fuzzy https://drive.google.com/file/d/1Eqmc_cKAZbhDF-R9H2P5mWF4E-BIXl6M/view?usp=sharing
!gdown --fuzzy https://drive.google.com/file/d/1uWwBuFLJ9h5yn4d6ax-l1nk41s_wikp1/view?usp=sharing
!gdown --fuzzy https://drive.google.com/file/d/1nEFz-YVgevUlnXqdwgm0WOUdWaNo1Tex/view?usp=sharing
# ... and move them into the dataset directory.
!mv train.txt datasets/dressipy
!mv test.txt datasets/dressipy
!mv num_items.txt datasets/dressipy

Downloading...
From: https://drive.google.com/uc?id=1Eqmc_cKAZbhDF-R9H2P5mWF4E-BIXl6M
To: /content/train.txt
100% 24.5M/24.5M [00:00<00:00, 221MB/s]
Downloading...
From: https://drive.google.com/uc?id=1uWwBuFLJ9h5yn4d6ax-l1nk41s_wikp1
To: /content/num_items.txt
100% 5.00/5.00 [00:00<00:00, 9.25kB/s]
Downloading...
From: https://drive.google.com/uc?id=1nEFz-YVgevUlnXqdwgm0WOUdWaNo1Tex
To: /content/test.txt
100% 2.23M/2.23M [00:00<00:00, 189MB/s]


In [2]:
# Directory that holds the dressipy files (raw CSVs and/or preprocessed .txt).
dataset_dir = Path('datasets/dressipy/')

# True  -> rely on preprocessed files already present in `dataset_dir`.
# False -> run the project's preprocessing on the raw CSV files first.
use_preprocessed = True

if not use_preprocessed:
    from utils.data.preprocess import preprocess_dressipy

    # Pass train_for_leaderboard=True instead to train on the full training
    # dataset.
    preprocess_dressipy(dataset_dir, train_for_leaderboard=False)

# Config

In [3]:

def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and pin the cuDNN flags.

    Args:
        seed: Value convertible to ``int``; used for every generator.
    """
    seed = int(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Each of these accepts the integer seed as its single argument.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Ask cuDNN for deterministic kernels and disable its auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    cudnn.enabled = True


seed_torch(123)

def get_freer_gpu():
    """Return the index of the GPU with the most free memory.

    Queries ``nvidia-smi`` through its machine-readable CSV interface instead
    of grepping the human-readable report, so the result does not depend on
    the report's line layout, no temporary file is written and no file handle
    is left open.

    Returns:
        int: Index of the GPU reporting the most free memory (the first one on
        ties), or ``-1`` when ``nvidia-smi`` is unavailable, fails, or reports
        no GPU.
    """
    import subprocess  # local import keeps this definition self-contained

    query = [
        'nvidia-smi',
        '--query-gpu=memory.free',
        '--format=csv,noheader,nounits',
    ]
    try:
        result = subprocess.run(
            query, capture_output=True, text=True, check=True
        )
    except (OSError, subprocess.CalledProcessError):
        # No driver / no binary / non-zero exit: treat as "no GPU found".
        return -1

    memory_available = []
    for line in result.stdout.splitlines():
        value = line.strip()
        if not value:
            continue
        # Keep one entry per reported GPU so that list positions stay aligned
        # with GPU indices; unreadable values rank below every real reading.
        memory_available.append(int(value) if value.isdigit() else -1)

    if len(memory_available) == 0:
        return -1
    return max(range(len(memory_available)), key=memory_available.__getitem__)

# Restrict CUDA to the GPU picked by get_freer_gpu(). When that helper returns
# -1 the variable becomes "-1".
# NOTE(review): "-1" presumably hides every GPU from CUDA — confirm this is the
# intended fallback when the nvidia-smi query yields nothing.
os.environ["CUDA_VISIBLE_DEVICES"] = str(get_freer_gpu())

class Args:
  """Empty attribute container standing in for parsed command-line arguments."""

  def __init__(self):
    pass


args = Args()

# Data location, model size and optimisation settings.
vars(args).update(
    dataset_dir=str(dataset_dir),
    embedding_dim=256,
    num_layers=1,
    feat_drop=0.1,
    lr=0.1,
    batch_size=512,
    epochs=30,
    weight_decay=1e-4,
    patience=3,
    num_workers=4,
    valid_split=None,
    log_interval=100,
    order=1,
    reducer='mean',
    norm=True,
)

# Optional model switches, left unset here.
vars(args).update(extra=None, fusion=None)

In [4]:
# Read the sessions from `args.dataset_dir` and wrap them in DataLoaders.
device = th.device('cuda' if th.cuda.is_available() else 'cpu')
dataset_dir = Path(args.dataset_dir)
print('reading dataset')
train_sessions, test_sessions, num_items = read_dataset(dataset_dir)
# num_items += 5

if args.valid_split is not None:
    # Hold out the tail of the training sessions and evaluate on it instead of
    # the test sessions that were read from disk.
    num_valid = int(len(train_sessions) * args.valid_split)
    # A count of 0 would make `[-0:]` select everything and `[:-0]` select
    # nothing; a count >= len would empty the training split. Fail loudly.
    if not 0 < num_valid < len(train_sessions):
        raise ValueError(
            f'valid_split={args.valid_split!r} selects {num_valid} of '
            f'{len(train_sessions)} training sessions; both splits must be non-empty'
        )
    test_sessions  = train_sessions[-num_valid:]
    train_sessions = train_sessions[:-num_valid]

train_set = AugmentedDataset(train_sessions)
test_set  = AugmentedDataset(test_sessions)
print(len(train_set))
print(len(test_set))

collate_fn = collate_fn_factory_ccs((seq_to_ccs_graph,), order=args.order)

# Training batches are drawn in dataset order (explicit SequentialSampler);
# shuffling and drop_last are intentionally left disabled.
train_loader = DataLoader(
    train_set,
    batch_size=args.batch_size,
    # shuffle=True,
    # drop_last=True,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True,
    sampler=SequentialSampler(train_set)
)

# Evaluation must not shuffle: the metrics do not depend on batch order, and
# anything that identifies a sample by its position in the loader's output
# would otherwise refer to an arbitrary, non-reproducible sample.
test_loader = DataLoader(
    test_set,
    batch_size=args.batch_size,
    shuffle=False,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True
)

reading dataset
3999069
363780


# Train

In [None]:
# Instantiate the recommender from the configured hyper-parameters and place
# it on the selected device.
model = MSGIFSR(
    num_items,
    args.dataset_dir,
    args.embedding_dim,
    args.num_layers,
    dropout=args.feat_drop,
    reducer=args.reducer,
    order=args.order,
    norm=args.norm,
    extra=args.extra,
    fusion=args.fusion,
    device=device,
).to(device)

print(model)

# Optimisation settings handed to the training loop.
runner_options = dict(
    device=device,
    lr=args.lr,
    weight_decay=args.weight_decay,
    patience=args.patience,
)
runner = TrainRunner(
    args.dataset_dir, model, train_loader, test_loader, **runner_options
)

print('start training')
# train() hands back the two metrics that are reported below as percentages.
mrr, hit = runner.train(args.epochs, args.log_interval)
print('MRR@20\tHR@20')
print(f'{mrr * 100:.3f}%\t{hit * 100:.3f}%')


# Evaluation

In [7]:
# Download the pretrained model (saved as pretrainedMSGIFSR.model, per the
# recorded download log).
!gdown --fuzzy https://drive.google.com/file/d/1NXsbnmB7NxZp2A_yHEhRXe6-lItYnJdb/view?usp=sharing

Downloading...
From: https://drive.google.com/uc?id=1NXsbnmB7NxZp2A_yHEhRXe6-lItYnJdb
To: /content/pretrainedMSGIFSR.model
  0% 0.00/36.5M [00:00<?, ?B/s] 55% 19.9M/36.5M [00:00<00:00, 198MB/s]100% 36.5M/36.5M [00:00<00:00, 258MB/s]


In [9]:
# Set to True to replace `model` with the downloaded pretrained checkpoint
# instead of using a model trained in this session.
preload = False

if preload:
  print('Loading pretrained model...')
  trained_for_leaderboard = False
  # SECURITY: pickle.load executes arbitrary code embedded in the file; only
  # load checkpoints that come from a trusted source.
  with Path("pretrainedMSGIFSR.model").open("rb") as checkpoint_file:
    model = pickle.load(checkpoint_file)
  print('Ready for evaluation.')

Loading pretrained model...
Ready for evaluation.


In [10]:
# Evaluate `model` on `test_loader`.
#
# Produces:
#   mrr / hit     running sums; divided by `num_samples` when printed.
#   ranking_df    long-format frame with one row per (sample, rank) and the
#                 columns session_id, item_id, rank.
#   ground_truth  dict mapping session_id -> label of that sample.
#
# `session_id` is the 0-based position of a sample in the order in which
# `test_loader` yields it.
TOP_K = 100  # number of highest-scoring items kept per sample

model.eval()
mrr = 0
hit = 0
num_samples = 0

# Per-batch results are collected and assembled once after the loop; growing a
# DataFrame with pd.concat on every batch is quadratic in the number of batches.
topk_batches = []
label_batches = []
with th.no_grad():

    for batch in test_loader:
        _, inputs, labels = prepare_batch(batch, device)
        logits = model(*inputs)
        num_samples += logits.size(0)
        topk         = logits.topk(k=TOP_K)[1]      # indices of the top items
        labels       = labels.unsqueeze(-1)         # broadcast against topk
        # 1-based column where the label occurs; samples whose label is not
        # among the top items contribute nothing to either metric.
        hit_ranks    = torch.where(topk == labels)[1] + 1
        hit         += hit_ranks.numel()
        mrr         += hit_ranks.float().reciprocal().sum().item()
        topk_batches.append(topk.cpu().numpy())
        label_batches.append(labels.cpu().numpy().ravel())

if num_samples == 0:
    raise ValueError('test_loader yielded no samples; nothing to evaluate')

topk_items = np.concatenate(topk_batches)    # one row per sample
all_labels = np.concatenate(label_batches)
session_ids = np.arange(num_samples)
items_per_sample = topk_items.shape[1]

ground_truth = dict(zip(session_ids.tolist(), all_labels.tolist()))
ranking_df = pd.DataFrame({
    'session_id': np.repeat(session_ids, items_per_sample),
    'item_id': topk_items.ravel(),
    'rank': np.tile(np.arange(1, items_per_sample + 1), num_samples),
})
print("MRR ",mrr/num_samples,"Hit percentage", hit/num_samples)

MRR  0.19559177901183958 Hit percentage 0.6684946945956347


In [None]:
# Spot-check: label stored for session_id 363779, the last id when the test set
# holds 363780 samples (the size printed by the data-loading cell).
ground_truth[363779]

In [11]:
# Display the long-format ranking table (one row per session_id / rank pair).
ranking_df

Unnamed: 0,session_id,item_id,rank
0,0,13877,1
1,0,8049,2
2,0,1088,3
3,0,1087,4
4,0,6283,5
...,...,...,...
36377995,363779,7815,96
36377996,363779,4541,97
36377997,363779,353,98
36377998,363779,3346,99


In [None]:
# with open('/content/drive/MyDrive/colab/recsys-2022/ranking_df', "wb") as f:
#   pickle.dump(ranking_df, f)
# with open('/content/drive/MyDrive/colab/recsys-2022/ground_truth', "wb") as f:
#   pickle.dump(ground_truth, f)


In [None]:
# evaluator = MeanReciprocitalEvaluator()
# evaluator.get_ranking_metrics(ranking_df, ground_truth, check_if_input_correct = True)