In [1]:
import os
import random
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle 

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, SequentialSampler

import sys

# Drop duplicate sys.path entries while KEEPING their order: the import system
# searches sys.path front to back, so reordering it (as a set() round-trip
# would) can change which copy of a module gets imported.
sys.path = list(dict.fromkeys(sys.path))
# NOTE(review): absolute, machine-specific checkout location.
root_dir = '/home/ec2-user/SageMaker/sequence-based-recommendation'
model_name = "NARM"
model_path = os.path.join(root_dir, model_name)
# Append the model package directory once, so re-running this cell does not
# pile up duplicate entries.
if model_path not in sys.path:
    sys.path.append(model_path)

from NARM import metric
from NARM.utils import collate_fn
from NARM.narm import NARM
from NARM.dataset import load_data, RecSysDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def seed_everything(seed):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: integer seed handed to ``random.seed``, ``np.random.seed``,
            ``torch.manual_seed`` and ``torch.cuda.manual_seed_all``; its
            string form is also stored in ``os.environ['PYTHONHASHSEED']``.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    # Trade cuDNN autotuning for run-to-run repeatability.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

    os.environ['PYTHONHASHSEED'] = str(seed)


seed_everything(101)

## NARM Model

In [3]:
def validate(valid_loader, model, device, topk=None):
    """Evaluate ``model`` on ``valid_loader`` and return mean Recall@k, MRR@k and loss.

    Args:
        valid_loader: iterable yielding ``(seq, target, lens)`` batches.
        model: module called as ``model(seq, lens)``; it is switched to eval
            mode and its output is scored against ``target``.
        device: device that ``seq`` and ``target`` are moved to.
        topk: cut-off ``k`` passed to ``metric.evaluate``. When ``None``
            (the default) the notebook-global ``args.topk`` is used.

    Returns:
        ``(mean_recall, mean_mrr, mean_loss)``: plain means over the per-batch
        values. All three are NaN when the loader yields no batch.
    """
    if topk is None:
        # Fallback keeps existing three-argument call sites working.
        topk = args.topk

    # The loss module is stateless, so build it once rather than per batch.
    criterion = nn.CrossEntropyLoss()

    model.eval()
    recalls, mrrs, losses = [], [], []
    with torch.no_grad():
        for seq, target, lens in valid_loader:
            seq = seq.to(device)
            target = target.to(device)
            outputs = model(seq, lens)
            loss = criterion(outputs, target)
            logits = F.softmax(outputs, dim=1)
            recall, mrr = metric.evaluate(logits, target, k=topk)
            recalls.append(recall)
            mrrs.append(mrr)
            losses.append(loss.item())

    if not losses:
        # Nothing was evaluated; avoid NumPy's mean-of-empty warning.
        return float('nan'), float('nan'), float('nan')

    # NOTE(review): every batch carries equal weight here. If metric.evaluate
    # returns per-batch averages, a smaller final batch is over-weighted -- confirm.
    mean_recall = np.mean(recalls)
    mean_mrr = np.mean(mrrs)
    mean_loss = np.mean(losses)

    return mean_recall, mean_mrr, mean_loss

In [4]:
# Default evaluation settings for the NARM cells below. The dataset cells
# overwrite individual fields on `args` before use. Defined unconditionally
# (no `__main__` guard) so `args` exists however this code is executed.
parser = argparse.ArgumentParser()
parser.add_argument('--dataset_path', default='./YOOCHOOSE_data/yoochoose1_64/',
                    help='dataset directory path: datasets/amex/yoochoose1_4/yoochoose1_64')

parser.add_argument('--n_items', type=int, default=37484, help='number of unique items. 37484 for yoochoose')
parser.add_argument('--batch_size', type=int, default=100, help='input batch size')
parser.add_argument('--hidden_size', type=int, default=128, help='hidden state size of gru module')
parser.add_argument('--embed_dim', type=int, default=128, help='the dimension of item embedding')
parser.add_argument('--topk', type=int, default=20, help='number of top score items selected for calculating recall and mrr metrics')
parser.add_argument("--model_checkpoint", type=str, default="amex_checkpoint.pth")
# parse_known_args returns (namespace, leftovers); argv entries this parser
# does not define are tolerated instead of aborting.
args, _ = parser.parse_known_args()
print(args)

Namespace(batch_size=100, dataset_path='./YOOCHOOSE_data/yoochoose1_64/', embed_dim=128, hidden_size=128, model_checkpoint='amex_checkpoint.pth', n_items=37484, topk=20)


#### yoochoose1_64

In [5]:
# Evaluate the saved NARM checkpoint on the yoochoose1_64 test split.
model_name = "NARM"

# Dataset-specific overrides of the parsed defaults.
args.dataset_path = './YOOCHOOSE_data/yoochoose1_64/'
args.n_items = 37484
args.batch_size = 100
args.epoch = 30
args.embed_dim = 128
args.hidden_size = 128
args.model_checkpoint = 'yoochoose1_64_checkpoint.pth'

train, test = load_data(args.dataset_path)
train_data = RecSysDataset(train)
test_data = RecSysDataset(test)
# In this cell the training loader is only used to report its batch count.
train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_data, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)

print('{:<30}{:<10,} '.format("training batch", len(train_loader)))
print('{:<30}{:<10,} '.format("test batch", len(test_loader)))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = NARM(args.n_items, args.hidden_size, args.embed_dim, args.batch_size).to(device)

# The checkpoint is resolved relative to the current working directory.
model_path = os.path.join(os.getcwd(), model_name, args.model_checkpoint)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU still loads when the CPU fallback above is taken.
# torch.load unpickles the file: only load checkpoints from a trusted source.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
recall_v1, mrr_v1, loss_v1 = validate(test_loader, model, device)
print()
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v1, args.topk, mrr_v1))

--------------------------------------------------
Dataset info:
Number of sessions: 369859
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 55898
--------------------------------------------------
training batch                3,699      
test batch                    559        

Test: Recall@20: 0.6979, MRR@20: 0.2958


#### diginetica

In [6]:
# Evaluate the saved NARM checkpoint on the diginetica test split.
model_name = "NARM"

# Dataset-specific overrides of the parsed defaults.
args.dataset_path = './diginetica_data/'
args.n_items = 43098
args.batch_size = 100
args.epoch = 30
args.embed_dim = 128
args.hidden_size = 128
args.model_checkpoint = 'diginetica_checkpoint.pth'

train, test = load_data(args.dataset_path)
train_data = RecSysDataset(train)
test_data = RecSysDataset(test)
# In this cell the training loader is only used to report its batch count.
train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_data, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)

print('{:<30}{:<10,} '.format("training batch", len(train_loader)))
print('{:<30}{:<10,} '.format("test batch", len(test_loader)))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = NARM(args.n_items, args.hidden_size, args.embed_dim, args.batch_size).to(device)
# The checkpoint is resolved relative to the current working directory.
model_path = os.path.join(os.getcwd(), model_name, args.model_checkpoint)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU still loads when the CPU fallback above is taken.
# torch.load unpickles the file: only load checkpoints from a trusted source.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
recall_v2, mrr_v2, loss_v2 = validate(test_loader, model, device)
print()
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v2, args.topk, mrr_v2))

--------------------------------------------------
Dataset info:
Number of sessions: 719470
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 60858
--------------------------------------------------
training batch                7,195      
test batch                    609        

Test: Recall@20: 0.4743, MRR@20: 0.1533


#### Amex

In [7]:
# Evaluate the saved NARM checkpoint on the Amex POI-category test split.
model_name = "NARM"

# Dataset-specific overrides of the parsed defaults.
args.dataset_path = "./dataset/amex_explorepoi-poi_category/"
args.n_items = 556
args.batch_size = 32
args.epoch = 30
args.embed_dim = 256
args.hidden_size = 256
args.model_checkpoint = 'amex_checkpoint.pth'

train, test = load_data(args.dataset_path)
train_data = RecSysDataset(train)
test_data = RecSysDataset(test)
# In this cell the training loader is only used to report its batch count.
train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_data, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)

print('{:<30}{:<10,} '.format("training batch", len(train_loader)))
print('{:<30}{:<10,} '.format("test batch", len(test_loader)))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = NARM(args.n_items, args.hidden_size, args.embed_dim, args.batch_size).to(device)

# The checkpoint is resolved relative to the current working directory.
model_path = os.path.join(os.getcwd(), model_name, args.model_checkpoint)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU still loads when the CPU fallback above is taken.
# torch.load unpickles the file: only load checkpoints from a trusted source.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
recall_v3, mrr_v3, loss_v3 = validate(test_loader, model, device)
print()
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v3, args.topk, mrr_v3))

--------------------------------------------------
Dataset info:
Number of sessions: 3536
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 305
--------------------------------------------------
training batch                111        
test batch                    10         

Test: Recall@20: 0.6827, MRR@20: 0.4614


In [8]:
# Start the cross-model results table with the three NARM test scores
# computed in the cells above; later cells add rows for the other models.
output_df = pd.DataFrame(
    {
        "Model": ["NARM"] * 3,
        "Dataset": ["yoochoose1_64", "diginetica", "amex-poi-category"],
        "Recall@20": [recall_v1, recall_v2, recall_v3],
        "MRR@20": [mrr_v1, mrr_v2, mrr_v3],
    }
)
# Render both metric columns as percentages with two decimals.
output_df.style.format({'Recall@20': '{:.2%}', 'MRR@20': '{:.2%}'})

Unnamed: 0,Model,Dataset,Recall@20,MRR@20
0,NARM,yoochoose1_64,69.79%,29.58%
1,NARM,diginetica,47.43%,15.33%
2,NARM,amex-poi-category,68.27%,46.14%


## SRGNN

In [9]:
root_dir = '/home/ec2-user/SageMaker/sequence-based-recommendation'
old_model_name = "NARM"
old_model_path = os.path.join(root_dir, old_model_name)
# De-duplicate sys.path in its existing order (a set() round-trip would
# scramble the import search order) and drop the previous model's directory.
sys.path = [entry for entry in dict.fromkeys(sys.path) if entry != old_model_path]

model_name = "SRGNN"
model_path = os.path.join(root_dir, model_name)
# Append only once so re-running the cell does not add duplicates.
if model_path not in sys.path:
    sys.path.append(model_path)

from SRGNN.srgnn import SRGNN
from SRGNN.collate import (collate_fn_factory, seq_to_session_graph)
from SRGNN import metric
from SRGNN.dataset import load_data,RecSysDataset

Using backend: pytorch


In [10]:
def prepare_batch(batch, device):
    """Move one collated batch onto ``device``.

    Args:
        batch: pair ``(inputs, labels)``; ``inputs`` is an iterable whose
            elements each expose ``.to(device)``, ``labels`` exposes
            ``.to(device)`` as well.
        device: target device handed to every ``.to`` call.

    Returns:
        ``(inputs_on_device, labels_on_device)`` where the first item is a list.
    """
    features, targets = batch
    return [item.to(device) for item in features], targets.to(device)

def validate(valid_loader, model, device, topk=None):
    """Evaluate a graph-input model and return mean Recall@k, MRR@k and loss.

    This definition replaces the ``validate`` declared earlier in the
    notebook: batches here are ``(inputs, labels)`` pairs unpacked by
    ``prepare_batch`` and the model is called as ``model(*inputs)``.

    Args:
        valid_loader: sized iterable of batches accepted by ``prepare_batch``.
        model: module switched to eval mode and called as ``model(*inputs)``.
        device: device the batch is moved to.
        topk: cut-off ``k`` passed to ``metric.evaluate``. When ``None``
            (the default) the notebook-global ``args.topk`` is used.

    Returns:
        ``(mean_recall, mean_mrr, mean_loss)``: plain means over the per-batch
        values. All three are NaN when the loader yields no batch.
    """
    if topk is None:
        # Fallback keeps existing three-argument call sites working.
        topk = args.topk

    # The loss module is stateless, so build it once rather than per batch.
    criterion = nn.CrossEntropyLoss()

    model.eval()
    recalls, mrrs, losses = [], [], []
    with torch.no_grad():
        for batch in tqdm(valid_loader, total=len(valid_loader), position=0, leave=True):
            inputs, labels = prepare_batch(batch, device)
            outputs = model(*inputs)
            loss = criterion(outputs, labels)
            logits = F.softmax(outputs, dim=1)
            recall, mrr = metric.evaluate(logits, labels, k=topk)
            recalls.append(recall)
            mrrs.append(mrr)
            losses.append(loss.item())

    if not losses:
        # Nothing was evaluated; avoid NumPy's mean-of-empty warning.
        return float('nan'), float('nan'), float('nan')

    # NOTE(review): every batch carries equal weight here. If metric.evaluate
    # returns per-batch averages, a smaller final batch is over-weighted -- confirm.
    mean_recall = np.mean(recalls)
    mean_mrr = np.mean(mrrs)
    mean_loss = np.mean(losses)

    return mean_recall, mean_mrr, mean_loss

In [11]:
# Default evaluation settings for the SRGNN cells below. The dataset cells
# overwrite individual fields on `args` before use. Defined unconditionally
# (no `__main__` guard) so `args` exists however this code is executed.
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
    '--dataset-dir', default='../YOOCHOOSE_data/yoochoose1_64/', help='the dataset directory'
)
parser.add_argument(
    "--seed", type=int, default=101,
    help="random seed for np.random.seed, torch.manual_seed and torch.cuda.manual_seed.",
)
parser.add_argument('--batch-size', type=int, default=100, help='the batch size for training')

parser.add_argument('--n_items', type=int, default=37484, help='number of unique items. 37484 for yoochoose')
parser.add_argument('--embedding-dim', type=int, default=100, help='the embedding size')
parser.add_argument('--num-layers', type=int, default=1, help='the number of layers')
parser.add_argument('--feat-drop', type=float, default=0.1, help='the dropout ratio for features')
parser.add_argument(
    '--valid-split',
    type=float,
    default=0.1,
    help='the fraction for the validation set',
)
parser.add_argument(
    '--num-workers',
    type=int,
    default=0,
    help='the number of processes to load the input graphs',
)

parser.add_argument(
    '--topk',
    type=int,
    default=20,
    help='number of top score items selected for calculating recall and mrr metrics',
)

parser.add_argument("--model_checkpoint", type=str, default="amex_checkpoint.pth")

# parse_known_args returns (namespace, leftovers); argv entries this parser
# does not define are tolerated instead of aborting.
args, _ = parser.parse_known_args()
print(args)

Namespace(batch_size=100, dataset_dir='../YOOCHOOSE_data/yoochoose1_64/', embedding_dim=100, feat_drop=0.1, model_checkpoint='amex_checkpoint.pth', n_items=37484, num_layers=1, num_workers=0, seed=101, topk=20, valid_split=0.1)


#### yoochoose1_64

In [12]:
# Evaluate the saved SRGNN checkpoint on the yoochoose1_64 test split.
model_name = "SRGNN"

# Dataset-specific overrides of the parsed defaults.
args.dataset_path = './YOOCHOOSE_data/yoochoose1_64/'
args.n_items = 37484
args.batch_size = 100
args.epoch = 30
args.embedding_dim = 128
args.num_layers = 1
args.model_checkpoint = 'yoochoose1_64_checkpoint.pth'

train, test = load_data(args.dataset_path)
train_data = RecSysDataset(train)
test_data = RecSysDataset(test)

collate_fn = collate_fn_factory(seq_to_session_graph)
# In this cell the training loader is only used to report its batch count.
# SequentialSampler iterates in dataset order, i.e. nothing is shuffled.
train_loader = DataLoader(
    train_data,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True,
    sampler=SequentialSampler(train_data),
)

test_loader = DataLoader(
    test_data,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
)
print()
print('{:<30}{:<10,} '.format("training mini-batch", len(train_loader)))
print('{:<30}{:<10,} '.format("test mini-batch", len(test_loader)))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SRGNN(args.n_items, args.embedding_dim, args.num_layers, feat_drop=args.feat_drop).to(device)

# The checkpoint is resolved relative to the current working directory.
model_path = os.path.join(os.getcwd(), model_name, args.model_checkpoint)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU still loads when the CPU fallback above is taken.
# torch.load unpickles the file: only load checkpoints from a trusted source.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
recall_v1, mrr_v1, loss_v1 = validate(test_loader, model, device)
print()
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v1, args.topk, mrr_v1))

--------------------------------------------------
Dataset info:
Number of sessions: 369859
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 55898
--------------------------------------------------

training mini-batch           3,699      
test mini-batch               559        


100%|██████████| 559/559 [00:18<00:00, 29.87it/s]


Test: Recall@20: 0.7074, MRR@20: 0.3130





#### diginetica

In [13]:
# Evaluate the saved SRGNN checkpoint on the diginetica test split.
model_name = "SRGNN"

# Dataset-specific overrides of the parsed defaults.
args.dataset_path = './diginetica_data/'
args.n_items = 43098
args.batch_size = 100
args.epoch = 30
args.embedding_dim = 128
args.num_layers = 1
args.model_checkpoint = 'diginetica_checkpoint.pth'

train, test = load_data(args.dataset_path)
train_data = RecSysDataset(train)
test_data = RecSysDataset(test)

collate_fn = collate_fn_factory(seq_to_session_graph)
# In this cell the training loader is only used to report its batch count.
# SequentialSampler iterates in dataset order, i.e. nothing is shuffled.
train_loader = DataLoader(
    train_data,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True,
    sampler=SequentialSampler(train_data),
)

test_loader = DataLoader(
    test_data,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
)
print()
print('{:<30}{:<10,} '.format("training mini-batch", len(train_loader)))
print('{:<30}{:<10,} '.format("test mini-batch", len(test_loader)))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SRGNN(args.n_items, args.embedding_dim, args.num_layers, feat_drop=args.feat_drop).to(device)

# The checkpoint is resolved relative to the current working directory.
model_path = os.path.join(os.getcwd(), model_name, args.model_checkpoint)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU still loads when the CPU fallback above is taken.
# torch.load unpickles the file: only load checkpoints from a trusted source.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
recall_v2, mrr_v2, loss_v2 = validate(test_loader, model, device)
print()
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v2, args.topk, mrr_v2))

--------------------------------------------------
Dataset info:
Number of sessions: 719470
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 60858
--------------------------------------------------

training mini-batch           7,195      
test mini-batch               609        


100%|██████████| 609/609 [00:20<00:00, 30.39it/s]


Test: Recall@20: 0.5157, MRR@20: 0.1716





#### Amex

In [14]:
# Evaluate the saved SRGNN checkpoint on the Amex POI-category test split.
model_name = "SRGNN"

# Dataset-specific overrides of the parsed defaults.
args.dataset_path = "./dataset/amex_explorepoi-poi_category/"
args.n_items = 556
args.batch_size = 32
args.epoch = 30
args.embedding_dim = 256
args.num_layers = 1
args.model_checkpoint = 'amex_checkpoint.pth'

train, test = load_data(args.dataset_path)
train_data = RecSysDataset(train)
test_data = RecSysDataset(test)

collate_fn = collate_fn_factory(seq_to_session_graph)
# In this cell the training loader is only used to report its batch count.
# SequentialSampler iterates in dataset order, i.e. nothing is shuffled.
train_loader = DataLoader(
    train_data,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True,
    sampler=SequentialSampler(train_data),
)

test_loader = DataLoader(
    test_data,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
)
print()
print('{:<30}{:<10,} '.format("training mini-batch", len(train_loader)))
print('{:<30}{:<10,} '.format("test mini-batch", len(test_loader)))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SRGNN(args.n_items, args.embedding_dim, args.num_layers, feat_drop=args.feat_drop).to(device)

# The checkpoint is resolved relative to the current working directory.
model_path = os.path.join(os.getcwd(), model_name, args.model_checkpoint)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU still loads when the CPU fallback above is taken.
# torch.load unpickles the file: only load checkpoints from a trusted source.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
recall_v3, mrr_v3, loss_v3 = validate(test_loader, model, device)
print()
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v3, args.topk, mrr_v3))

--------------------------------------------------
Dataset info:
Number of sessions: 3536
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 305
--------------------------------------------------

training mini-batch           111        
test mini-batch               10         


100%|██████████| 10/10 [00:00<00:00, 65.79it/s]


Test: Recall@20: 0.7386, MRR@20: 0.4179





In [15]:
# Add the three SRGNN test scores to the cross-model results table.
tempt = pd.DataFrame(
    {
        "Model": ["SRGNN"] * 3,
        "Dataset": ["yoochoose1_64", "diginetica", "amex-poi-category"],
        "Recall@20": [recall_v1, recall_v2, recall_v3],
        "MRR@20": [mrr_v1, mrr_v2, mrr_v3],
    }
)
# pd.concat replaces DataFrame.append, which pandas 2.0 removed. Rows already
# recorded for this model are dropped first, so re-running the cell replaces
# them instead of duplicating them.
output_df = pd.concat(
    [output_df[output_df["Model"] != "SRGNN"], tempt],
    ignore_index=True,
)
output_df.style.format({'Recall@20': '{:.2%}', 'MRR@20': '{:.2%}'})

Unnamed: 0,Model,Dataset,Recall@20,MRR@20
0,NARM,yoochoose1_64,69.79%,29.58%
1,NARM,diginetica,47.43%,15.33%
2,NARM,amex-poi-category,68.27%,46.14%
3,SRGNN,yoochoose1_64,70.74%,31.30%
4,SRGNN,diginetica,51.57%,17.16%
5,SRGNN,amex-poi-category,73.86%,41.79%


## NISER

In [16]:
root_dir = '/home/ec2-user/SageMaker/sequence-based-recommendation'
old_model_name = "SRGNN"
old_model_path = os.path.join(root_dir, old_model_name)
# De-duplicate sys.path in its existing order (a set() round-trip would
# scramble the import search order) and drop the previous model's directory.
sys.path = [entry for entry in dict.fromkeys(sys.path) if entry != old_model_path]

model_name = "NISER"
model_path = os.path.join(root_dir, model_name)
# Append only once so re-running the cell does not add duplicates.
if model_path not in sys.path:
    sys.path.append(model_path)

from NISER.niser import NISER
from NISER.collate import (collate_fn_factory, seq_to_session_graph)
from NISER import metric
from NISER.dataset import load_data,RecSysDataset

In [17]:
# Default evaluation settings for the NISER cells below; the dataset cells
# overwrite individual fields on `args` before use.
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# One (flag, keyword-options) entry per option, registered in this order.
for _flag, _options in (
    ('--dataset-dir', dict(default='../YOOCHOOSE_data/yoochoose1_64/', help='the dataset directory')),
    ("--seed", dict(type=int, default=101,
                    help="random seed for np.random.seed, torch.manual_seed and torch.cuda.manual_seed.")),
    ('--n_items', dict(type=int, default=37484, help='number of unique items. 37484 for yoochoose')),
    ('--embedding-dim', dict(type=int, default=128, help='the embedding size')),
    ('--num-layers', dict(type=int, default=1, help='the number of layers')),
    ('--feat-drop', dict(type=float, default=0.1, help='the dropout ratio for features')),
    ('--batch-size', dict(type=int, default=100, help='the batch size for training')),
    ('--epochs', dict(type=int, default=30, help='the number of training epochs')),
    ("--output_name", dict(type=str, default="amex_metrics.txt")),
    ("--model_checkpoint", dict(type=str, default="amex_checkpoint.pth")),
    ('--topk', dict(type=int, default=20,
                    help='number of top score items selected for calculating recall and mrr metrics')),
    ('--num-workers', dict(type=int, default=0, help='the number of processes to load the input graphs')),
):
    parser.add_argument(_flag, **_options)
del _flag, _options  # keep the loop temporaries out of the notebook namespace

# parse_known_args returns (namespace, leftovers); unknown argv entries are tolerated.
args, _ = parser.parse_known_args()
print(args)

Namespace(batch_size=100, dataset_dir='../YOOCHOOSE_data/yoochoose1_64/', embedding_dim=128, epochs=30, feat_drop=0.1, model_checkpoint='amex_checkpoint.pth', n_items=37484, num_layers=1, num_workers=0, output_name='amex_metrics.txt', seed=101, topk=20)


#### yoochoose1_64

In [18]:
# Evaluate the saved NISER checkpoint on the yoochoose1_64 test split.
model_name = "NISER"

# Dataset-specific overrides of the parsed defaults.
args.dataset_path = './YOOCHOOSE_data/yoochoose1_64/'
args.n_items = 37484
args.batch_size = 100
args.epoch = 30
args.embedding_dim = 128
args.num_layers = 1
args.model_checkpoint = 'yoochoose1_64_checkpoint.pth'

train, test = load_data(args.dataset_path)
train_data = RecSysDataset(train)
test_data = RecSysDataset(test)

collate_fn = collate_fn_factory(seq_to_session_graph)
# In this cell the training loader is only used to report its batch count.
# SequentialSampler iterates in dataset order, i.e. nothing is shuffled.
train_loader = DataLoader(
    train_data,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True,
    sampler=SequentialSampler(train_data),
)

test_loader = DataLoader(
    test_data,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
)
print()
print('{:<30}{:<10,} '.format("training mini-batch", len(train_loader)))
print('{:<30}{:<10,} '.format("test mini-batch", len(test_loader)))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = NISER(args.n_items, args.embedding_dim, args.num_layers, feat_drop=args.feat_drop).to(device)

# The checkpoint is resolved relative to the current working directory.
model_path = os.path.join(os.getcwd(), model_name, args.model_checkpoint)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU still loads when the CPU fallback above is taken.
# torch.load unpickles the file: only load checkpoints from a trusted source.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
recall_v1, mrr_v1, loss_v1 = validate(test_loader, model, device)
print()
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v1, args.topk, mrr_v1))

--------------------------------------------------
Dataset info:
Number of sessions: 369859
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 55898
--------------------------------------------------

training mini-batch           3,699      
test mini-batch               559        


100%|██████████| 559/559 [00:18<00:00, 29.85it/s]


Test: Recall@20: 0.7193, MRR@20: 0.3171





#### diginetica

In [19]:
# Evaluate the saved NISER checkpoint on the diginetica test split.
model_name = "NISER"

# Dataset-specific overrides of the parsed defaults.
args.dataset_path = './diginetica_data/'
args.n_items = 43098
args.batch_size = 100
args.epoch = 30
args.embedding_dim = 128
args.num_layers = 1
args.model_checkpoint = 'diginetica_checkpoint.pth'

train, test = load_data(args.dataset_path)
train_data = RecSysDataset(train)
test_data = RecSysDataset(test)

collate_fn = collate_fn_factory(seq_to_session_graph)
# In this cell the training loader is only used to report its batch count.
# SequentialSampler iterates in dataset order, i.e. nothing is shuffled.
train_loader = DataLoader(
    train_data,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True,
    sampler=SequentialSampler(train_data),
)

test_loader = DataLoader(
    test_data,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
)
print()
print('{:<30}{:<10,} '.format("training mini-batch", len(train_loader)))
print('{:<30}{:<10,} '.format("test mini-batch", len(test_loader)))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = NISER(args.n_items, args.embedding_dim, args.num_layers, feat_drop=args.feat_drop).to(device)

# The checkpoint is resolved relative to the current working directory.
model_path = os.path.join(os.getcwd(), model_name, args.model_checkpoint)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU still loads when the CPU fallback above is taken.
# torch.load unpickles the file: only load checkpoints from a trusted source.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
recall_v2, mrr_v2, loss_v2 = validate(test_loader, model, device)
print()
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v2, args.topk, mrr_v2))

--------------------------------------------------
Dataset info:
Number of sessions: 719470
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 60858
--------------------------------------------------

training mini-batch           7,195      
test mini-batch               609        


100%|██████████| 609/609 [00:20<00:00, 30.21it/s]


Test: Recall@20: 0.5500, MRR@20: 0.1874





#### Amex

In [20]:
# Evaluate the saved NISER checkpoint on the Amex POI-category test split.
model_name = "NISER"

# Dataset-specific overrides of the parsed defaults.
args.dataset_path = "./dataset/amex_explorepoi-poi_category/"
args.n_items = 556
args.batch_size = 32
args.epoch = 30
args.embedding_dim = 256
args.num_layers = 1
args.model_checkpoint = 'amex_checkpoint.pth'

train, test = load_data(args.dataset_path)
train_data = RecSysDataset(train)
test_data = RecSysDataset(test)

collate_fn = collate_fn_factory(seq_to_session_graph)
# In this cell the training loader is only used to report its batch count.
# SequentialSampler iterates in dataset order, i.e. nothing is shuffled.
train_loader = DataLoader(
    train_data,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True,
    sampler=SequentialSampler(train_data),
)

test_loader = DataLoader(
    test_data,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
)
print()
print('{:<30}{:<10,} '.format("training mini-batch", len(train_loader)))
print('{:<30}{:<10,} '.format("test mini-batch", len(test_loader)))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = NISER(args.n_items, args.embedding_dim, args.num_layers, feat_drop=args.feat_drop).to(device)

# The checkpoint is resolved relative to the current working directory.
model_path = os.path.join(os.getcwd(), model_name, args.model_checkpoint)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU still loads when the CPU fallback above is taken.
# torch.load unpickles the file: only load checkpoints from a trusted source.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
recall_v3, mrr_v3, loss_v3 = validate(test_loader, model, device)
print()
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v3, args.topk, mrr_v3))

--------------------------------------------------
Dataset info:
Number of sessions: 3536
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 305
--------------------------------------------------

training mini-batch           111        
test mini-batch               10         


100%|██████████| 10/10 [00:00<00:00, 64.80it/s]


Test: Recall@20: 0.8511, MRR@20: 0.5949





In [21]:
# Add the three NISER test scores to the cross-model results table.
tempt = pd.DataFrame(
    {
        "Model": ["NISER"] * 3,
        "Dataset": ["yoochoose1_64", "diginetica", "amex-poi-category"],
        "Recall@20": [recall_v1, recall_v2, recall_v3],
        "MRR@20": [mrr_v1, mrr_v2, mrr_v3],
    }
)
# pd.concat replaces DataFrame.append, which pandas 2.0 removed. Rows already
# recorded for this model are dropped first, so re-running the cell replaces
# them instead of duplicating them.
output_df = pd.concat(
    [output_df[output_df["Model"] != "NISER"], tempt],
    ignore_index=True,
)
output_df.style.format({'Recall@20': '{:.2%}', 'MRR@20': '{:.2%}'})

Unnamed: 0,Model,Dataset,Recall@20,MRR@20
0,NARM,yoochoose1_64,69.79%,29.58%
1,NARM,diginetica,47.43%,15.33%
2,NARM,amex-poi-category,68.27%,46.14%
3,SRGNN,yoochoose1_64,70.74%,31.30%
4,SRGNN,diginetica,51.57%,17.16%
5,SRGNN,amex-poi-category,73.86%,41.79%
6,NISER,yoochoose1_64,71.93%,31.71%
7,NISER,diginetica,55.00%,18.74%
8,NISER,amex-poi-category,85.11%,59.49%


## TAGNN

In [22]:
root_dir = '/home/ec2-user/SageMaker/sequence-based-recommendation'
old_model_name = "NISER"
old_model_path = os.path.join(root_dir, old_model_name)
# De-duplicate sys.path in its existing order (a set() round-trip would
# scramble the import search order) and drop the previous model's directory.
sys.path = [entry for entry in dict.fromkeys(sys.path) if entry != old_model_path]

model_name = "TAGNN"
model_path = os.path.join(root_dir, model_name)
# Append only once so re-running the cell does not add duplicates.
if model_path not in sys.path:
    sys.path.append(model_path)

from TAGNN.utils import build_graph, Data, split_validation
from TAGNN.model import *

In [23]:
# Default settings for the TAGNN cells below; the dataset cells overwrite
# individual fields on `args` before handing it to SessionGraph.
parser = argparse.ArgumentParser()
# One (flag, keyword-options) entry per option, registered in this order.
for _flag, _options in (
    ('--dataset', dict(default='../YOOCHOOSE_data/yoochoose1_64/', help='the dataset directory')),
    ('--batchSize', dict(type=int, default=100, help='input batch size')),
    ('--hiddenSize', dict(type=int, default=100, help='hidden state size')),
    ('--epoch', dict(type=int, default=30, help='the number of epochs to train for')),
    # candidate learning rates: [0.001, 0.0005, 0.0001]
    ('--lr', dict(type=float, default=0.001, help='learning rate')),
    ('--lr_dc', dict(type=float, default=0.1, help='learning rate decay rate')),
    ('--lr_dc_step', dict(type=int, default=3,
                          help='the number of steps after which the learning rate decay')),
    # candidate penalties: [0.001, 0.0005, 0.0001, 0.00005, 0.00001]
    ('--l2', dict(type=float, default=1e-5, help='l2 penalty')),
    ("--gradient_accumulation", dict(action='store_true', help='gradient accumulation or not')),
    ("--accumulation_steps", dict(
        type=int, default=2,
        help="Number of updates steps to accumulate before performing a backward/update pass.")),
    ('--step', dict(type=int, default=1, help='gnn propogation steps')),
    ('--patience', dict(type=int, default=10, help='the number of epoch to wait before early stop ')),
    ('--nonhybrid', dict(action='store_true', help='only use the global preference to predict')),
    ('--validation', dict(action='store_true', help='validation')),
    ('--valid_portion', dict(type=float, default=0.1,
                             help='split the portion of training set as validation set')),
    ("--output_name", dict(type=str, default="amex_metrics.txt")),
    ("--model_checkpoint", dict(type=str, default="amex_checkpoint.pth")),
    ('--topk', dict(type=int, default=20,
                    help='number of top score items selected for calculating recall and mrr metrics')),
):
    parser.add_argument(_flag, **_options)
del _flag, _options  # keep the loop temporaries out of the notebook namespace

# parse_known_args returns (namespace, leftovers); unknown argv entries are tolerated.
args, _ = parser.parse_known_args()
print(args)

Namespace(accumulation_steps=2, batchSize=100, dataset='../YOOCHOOSE_data/yoochoose1_64/', epoch=30, gradient_accumulation=False, hiddenSize=100, l2=1e-05, lr=0.001, lr_dc=0.1, lr_dc_step=3, model_checkpoint='amex_checkpoint.pth', nonhybrid=False, output_name='amex_metrics.txt', patience=10, step=1, topk=20, valid_portion=0.1, validation=False)


#### yoochoose1_64

In [24]:
# Evaluate the saved TAGNN checkpoint on the yoochoose1_64 test split.
model_name = "TAGNN"

# Dataset-specific overrides of the parsed defaults.
args.dataset = './YOOCHOOSE_data/yoochoose1_64/'
args.batchSize = 100
args.epoch = 30
args.hiddenSize = 128
args.step = 1
args.model_checkpoint = "yoochoose1_64_checkpoint.pth"

n_node = 37484

train_path_data = os.path.join(args.dataset, "train.txt")
test_path_data = os.path.join(args.dataset, "test.txt")
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only point these paths at dataset files from a trusted source.
with open(train_path_data, 'rb') as f1:
    train_data = pickle.load(f1)
with open(test_path_data, 'rb') as f2:
    test_data = pickle.load(f2)

train_data = Data(train_data, shuffle=True)
test_data = Data(test_data, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SessionGraph(args, n_node).to(device)

# The checkpoint is resolved relative to the current working directory.
model_path = os.path.join(os.getcwd(), model_name, args.model_checkpoint)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU still loads when the CPU fallback above is taken.
# torch.load unpickles the file: only load checkpoints from a trusted source.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])

model.eval()
hit, mrr = [], []
slices = test_data.generate_batch(model.batch_size)
# Inference only: skip autograd bookkeeping for the whole loop.
with torch.no_grad():
    for i in tqdm(slices, total=len(slices), position=0, leave=True):
        targets, scores = forward(model, i, test_data)
        # Use the same k that the report line below prints.
        sub_scores = scores.topk(args.topk)[1]
        sub_scores = trans_to_cpu(sub_scores).detach().numpy()
        for score, target in zip(sub_scores, targets):
            # `target - 1`: presumably targets are 1-based item ids while score
            # columns are 0-based -- TODO confirm against TAGNN.model.
            rank = np.where(score == target - 1)[0]
            hit.append(len(rank) > 0)
            # Reciprocal of the 1-based rank, or 0 when the target is not in the top k.
            mrr.append(1 / (rank[0] + 1) if len(rank) else 0)
recall_v1 = np.mean(hit)
mrr_v1 = np.mean(mrr)
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v1, args.topk, mrr_v1))

  A = trans_to_cuda(torch.Tensor(A).float())
100%|██████████| 559/559 [00:50<00:00, 11.02it/s]

Test: Recall@20: 0.7066, MRR@20: 0.3074





#### diginetica

In [25]:
# Evaluate the saved TAGNN checkpoint on the diginetica test split.
model_name = "TAGNN"

# Dataset-specific overrides of the parsed defaults.
args.dataset = './diginetica_data/'
args.batchSize = 100
args.epoch = 30
args.hiddenSize = 128
args.step = 1
args.model_checkpoint = "diginetica_checkpoint.pth"

n_node = 43098

train_path_data = os.path.join(args.dataset, "train.txt")
test_path_data = os.path.join(args.dataset, "test.txt")
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only point these paths at dataset files from a trusted source.
with open(train_path_data, 'rb') as f1:
    train_data = pickle.load(f1)
with open(test_path_data, 'rb') as f2:
    test_data = pickle.load(f2)

train_data = Data(train_data, shuffle=True)
test_data = Data(test_data, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SessionGraph(args, n_node).to(device)

# The checkpoint is resolved relative to the current working directory.
model_path = os.path.join(os.getcwd(), model_name, args.model_checkpoint)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU still loads when the CPU fallback above is taken.
# torch.load unpickles the file: only load checkpoints from a trusted source.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])

model.eval()
hit, mrr = [], []
slices = test_data.generate_batch(model.batch_size)
# Inference only: skip autograd bookkeeping for the whole loop.
with torch.no_grad():
    for i in tqdm(slices, total=len(slices), position=0, leave=True):
        targets, scores = forward(model, i, test_data)
        # Use the same k that the report line below prints.
        sub_scores = scores.topk(args.topk)[1]
        sub_scores = trans_to_cpu(sub_scores).detach().numpy()
        for score, target in zip(sub_scores, targets):
            # `target - 1`: presumably targets are 1-based item ids while score
            # columns are 0-based -- TODO confirm against TAGNN.model.
            rank = np.where(score == target - 1)[0]
            hit.append(len(rank) > 0)
            # Reciprocal of the 1-based rank, or 0 when the target is not in the top k.
            mrr.append(1 / (rank[0] + 1) if len(rank) else 0)
recall_v2 = np.mean(hit)
mrr_v2 = np.mean(mrr)
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v2, args.topk, mrr_v2))

100%|██████████| 609/609 [00:44<00:00, 13.72it/s]


Test: Recall@20: 0.5247, MRR@20: 0.1837


#### Amex

In [26]:
# Evaluate the saved TAGNN checkpoint on the Amex POI-category test split.
model_name = "TAGNN"

# Dataset-specific overrides of the parsed defaults.
args.dataset = './dataset/amex_explorepoi-poi_category/'
args.batchSize = 32
args.epoch = 30
args.hiddenSize = 256
args.step = 1
args.model_checkpoint = "amex_checkpoint.pth"

n_node = 556

train_path_data = os.path.join(args.dataset, "train.txt")
test_path_data = os.path.join(args.dataset, "test.txt")
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only point these paths at dataset files from a trusted source.
with open(train_path_data, 'rb') as f1:
    train_data = pickle.load(f1)
with open(test_path_data, 'rb') as f2:
    test_data = pickle.load(f2)

train_data = Data(train_data, shuffle=True)
test_data = Data(test_data, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SessionGraph(args, n_node).to(device)

# The checkpoint is resolved relative to the current working directory.
model_path = os.path.join(os.getcwd(), model_name, args.model_checkpoint)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU still loads when the CPU fallback above is taken.
# torch.load unpickles the file: only load checkpoints from a trusted source.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])

model.eval()
hit, mrr = [], []
slices = test_data.generate_batch(model.batch_size)
# Inference only: skip autograd bookkeeping for the whole loop.
with torch.no_grad():
    for i in tqdm(slices, total=len(slices), position=0, leave=True):
        targets, scores = forward(model, i, test_data)
        # Use the same k that the report line below prints.
        sub_scores = scores.topk(args.topk)[1]
        sub_scores = trans_to_cpu(sub_scores).detach().numpy()
        for score, target in zip(sub_scores, targets):
            # `target - 1`: presumably targets are 1-based item ids while score
            # columns are 0-based -- TODO confirm against TAGNN.model.
            rank = np.where(score == target - 1)[0]
            hit.append(len(rank) > 0)
            # Reciprocal of the 1-based rank, or 0 when the target is not in the top k.
            mrr.append(1 / (rank[0] + 1) if len(rank) else 0)
recall_v3 = np.mean(hit)
mrr_v3 = np.mean(mrr)
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v3, args.topk, mrr_v3))

100%|██████████| 10/10 [00:00<00:00, 100.43it/s]

Test: Recall@20: 0.7311, MRR@20: 0.4154





In [27]:
# Add the three TAGNN test scores to the cross-model results table.
tempt = pd.DataFrame(
    {
        "Model": ["TAGNN"] * 3,
        "Dataset": ["yoochoose1_64", "diginetica", "amex-poi-category"],
        "Recall@20": [recall_v1, recall_v2, recall_v3],
        "MRR@20": [mrr_v1, mrr_v2, mrr_v3],
    }
)
# pd.concat replaces DataFrame.append, which pandas 2.0 removed. Rows already
# recorded for this model are dropped first, so re-running the cell replaces
# them instead of duplicating them.
output_df = pd.concat(
    [output_df[output_df["Model"] != "TAGNN"], tempt],
    ignore_index=True,
)
output_df.style.format({'Recall@20': '{:.2%}', 'MRR@20': '{:.2%}'})

Unnamed: 0,Model,Dataset,Recall@20,MRR@20
0,NARM,yoochoose1_64,69.79%,29.58%
1,NARM,diginetica,47.43%,15.33%
2,NARM,amex-poi-category,68.27%,46.14%
3,SRGNN,yoochoose1_64,70.74%,31.30%
4,SRGNN,diginetica,51.57%,17.16%
5,SRGNN,amex-poi-category,73.86%,41.79%
6,NISER,yoochoose1_64,71.93%,31.71%
7,NISER,diginetica,55.00%,18.74%
8,NISER,amex-poi-category,85.11%,59.49%
9,TAGNN,yoochoose1_64,70.66%,30.74%


## LESSR

In [28]:
root_dir = '/home/ec2-user/SageMaker/sequence-based-recommendation'
old_model_name = "TAGNN"
old_model_path = os.path.join(root_dir, old_model_name)
# De-duplicate sys.path in its existing order (a set() round-trip would
# scramble the import search order) and drop the previous model's directory.
sys.path = [entry for entry in dict.fromkeys(sys.path) if entry != old_model_path]

model_name = "LESSR"
model_path = os.path.join(root_dir, model_name)
# Append only once so re-running the cell does not add duplicates.
if model_path not in sys.path:
    sys.path.append(model_path)

from LESSR.lessr import LESSR
from LESSR.collate import (collate_fn_factory, seq_to_eop_multigraph,seq_to_shortcut_graph)
from LESSR import metric
from LESSR.dataset import load_data,RecSysDataset

In [29]:
# Default configuration for the LESSR evaluation cells, declared as an option table.
# parse_known_args() is used so that arguments this parser does not declare are
# returned separately instead of raising an error.
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
for flag, options in (
    ('--dataset-dir', dict(default='../YOOCHOOSE_data/yoochoose1_64/', help='the dataset directory')),
    ('--n_items', dict(type=int, default=37484, help='number of unique items. 37484 for yoochoose')),
    ('--embedding-dim', dict(type=int, default=128, help='the embedding size')),
    ('--num-layers', dict(type=int, default=1, help='the number of layers')),
    ('--feat-drop', dict(type=float, default=0.1, help='the dropout ratio for features')),
    ('--step', dict(type=int, default=1, help='gnn propogation steps')),
    ('--batch-size', dict(type=int, default=100, help='the batch size for training')),
    ('--epochs', dict(type=int, default=30, help='the number of training epochs')),
    ("--model_checkpoint", dict(type=str, default="amex_checkpoint.pth")),
    ('--topk', dict(type=int, default=20, help='number of top score items selected for calculating recall and mrr metrics')),
    ('--num-workers', dict(type=int, default=0, help='the number of processes to load the input graphs')),
):
    parser.add_argument(flag, **options)

# Only the recognised options are kept; the leftover argument list is discarded.
args, _ = parser.parse_known_args()
print(args)

Namespace(batch_size=100, dataset_dir='../YOOCHOOSE_data/yoochoose1_64/', embedding_dim=128, epochs=30, feat_drop=0.1, model_checkpoint='amex_checkpoint.pth', n_items=37484, num_layers=1, num_workers=0, step=1, topk=20)


#### yoochoose1_64

In [30]:
# Evaluate the saved LESSR checkpoint on the yoochoose1_64 test split.
model_name="LESSR"

# Per-dataset settings; these override the parser defaults for this run.
args.dataset_path='./YOOCHOOSE_data/yoochoose1_64/'
args.n_items=37484
args.batch_size=100
# NOTE(review): the parser declares --epochs; `args.epoch` is a separate attribute that
# nothing in this cell reads.
args.epoch=30
args.embedding_dim=256
args.num_layers=3
args.model_checkpoint='yoochoose1_64_checkpoint.pth'

train, test = load_data(args.dataset_path)
train_data = RecSysDataset(train)
test_data = RecSysDataset(test)

# With more than one layer the shortcut graph is built in addition to the EOP multigraph.
if args.num_layers > 1:
    collate_fn = collate_fn_factory(seq_to_eop_multigraph, seq_to_shortcut_graph)
else:
    collate_fn = collate_fn_factory(seq_to_eop_multigraph)

# The training loader is only used here to report the number of mini-batches.
train_loader = DataLoader(
    train_data,
    batch_size=args.batch_size,
    # shuffle=True,  # Remove shuffle=True in this case as SubsetRandomSampler shuffles data already
    # drop_last=True,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True,
    sampler=SequentialSampler(train_data)
)

test_loader = DataLoader(
    test_data,
    batch_size=args.batch_size,
    # shuffle=True,
    num_workers=args.num_workers,
    collate_fn=collate_fn
)
print()
print('{:<30}{:<10,} '.format("training mini-batch",len(train_loader)))
print('{:<30}{:<10,} '.format("test mini-batch",len(test_loader)))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LESSR(args.n_items, args.embedding_dim, args.num_layers, feat_drop=args.feat_drop).to(device)

# Checkpoint location: <cwd>/<model_name>/<checkpoint file>.
model_path=os.path.join(os.getcwd(),model_name, args.model_checkpoint)
# map_location remaps the stored tensors onto the active device, so a checkpoint saved on a
# GPU machine can still be loaded when only the CPU is available.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
# validate() yields (recall, mrr, loss) over the test loader.
recall_v1, mrr_v1,loss_v1 = validate(test_loader, model,device)
print()
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v1, args.topk, mrr_v1))

--------------------------------------------------
Dataset info:
Number of sessions: 369859
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 55898
--------------------------------------------------

training mini-batch           3,699      
test mini-batch               559        


100%|██████████| 559/559 [00:27<00:00, 20.16it/s]


Test: Recall@20: 0.6888, MRR@20: 0.2979





In [31]:
# root_dir=os.path.join(os.getcwd(), "LESSR","output_metrics")
# file_name="test_yoochoose1_64_metrics.txt"
# output=[]
# with open(os.path.join(root_dir,file_name),'r') as f:
#     for line in f:
#         output.append((line.split(",")[1].strip("\n"),line.split(",")[2].strip("\n")))

#### diginetica

In [32]:
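# NOTE(review): this cell is disabled; the diginetica numbers are read from the saved metrics file in the next cell.
# As written it references `seq_to_session_graph`, which the LESSR import cell does not import.
# model_name="LESSR"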

# args.dataset_path='./diginetica_data/'
# args.n_items=43098
# args.batch_size=100 
# args.epoch=30 
# args.embedding_dim=256
# args.num_layers=3
# args.model_checkpoint='diginetica_checkpoint.pth'

# train, test = load_data(args.dataset_path)
# train_data = RecSysDataset(train)
# test_data = RecSysDataset(test)

# collate_fn = collate_fn_factory(seq_to_session_graph)
# train_loader = DataLoader(
#     train_data,
#     batch_size=args.batch_size,
#     # shuffle=True,  # Remove shuffle=True in this case as SubsetRandomSampler shuffles data already
#     # drop_last=True,
#     num_workers=args.num_workers,
#     collate_fn=collate_fn,
#     pin_memory=True,
#     sampler=SequentialSampler(train_data)
# )

# test_loader = DataLoader(
#     test_data,
#     batch_size=args.batch_size,
#     # shuffle=True,
#     num_workers=args.num_workers,
#     collate_fn=collate_fn
# )
# print()
# print('{:<30}{:<10,} '.format("training mini-batch",len(train_loader)))
# print('{:<30}{:<10,} '.format("test mini-batch",len(test_loader)))

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = LESSR(args.n_items, args.embedding_dim, args.num_layers, feat_drop=args.feat_drop).to(device)

# model_path=os.path.join(os.getcwd(),model_name, args.model_checkpoint)
# ckpt = torch.load(model_path)
# model.load_state_dict(ckpt['state_dict'])
# recall_v2, mrr_v2,loss_v2 = validate(test_loader, model,device)
# print()
# print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v2, args.topk, mrr_v2))

In [45]:
# Read the LESSR test metrics logged for diginetica and keep the line with the highest MRR.
# Each line is comma-separated; field 1 is used as recall and field 2 as MRR.
# NOTE(review): the best line is selected on test metrics; confirm that is the intended protocol.
root_dir=os.path.join(os.getcwd(), "LESSR","output_metrics")
file_name="test_diginetica_metrics.txt"
output=[]
with open(os.path.join(root_dir,file_name),'r') as f:
    for line in f:
        fields=line.split(",")
        # Convert to float right away: ranking the raw strings would compare them
        # lexicographically (e.g. "9e-05" > "0.5"), not numerically.
        output.append((float(fields[1]),float(fields[2])))

if not output:
    raise ValueError("no metric lines found in {}".format(os.path.join(root_dir,file_name)))

import operator
# Highest MRR first.
output=sorted(output, key=operator.itemgetter(1),reverse=True)
recall_v2=output[0][0]
mrr_v2=output[0][1]

#### Amex

In [34]:
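# NOTE(review): this cell is disabled; the Amex numbers are read from the saved metrics file in the next cell.
# As written it references `seq_to_session_graph`, which the LESSR import cell does not import.
# model_name="LESSR"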

# args.dataset_path="./dataset/amex_explorepoi-poi_category/"
# args.n_items=556
# args.batch_size=32 
# args.epoch=30 
# args.embedding_dim=256
# args.num_layers=3
# args.model_checkpoint='amex_checkpoint.pth'

# train, test = load_data(args.dataset_path)
# train_data = RecSysDataset(train)
# test_data = RecSysDataset(test)

# collate_fn = collate_fn_factory(seq_to_session_graph)
# train_loader = DataLoader(
#     train_data,
#     batch_size=args.batch_size,
#     # shuffle=True,  # Remove shuffle=True in this case as SubsetRandomSampler shuffles data already
#     # drop_last=True,
#     num_workers=args.num_workers,
#     collate_fn=collate_fn,
#     pin_memory=True,
#     sampler=SequentialSampler(train_data)
# )

# test_loader = DataLoader(
#     test_data,
#     batch_size=args.batch_size,
#     # shuffle=True,
#     num_workers=args.num_workers,
#     collate_fn=collate_fn
# )
# print()
# print('{:<30}{:<10,} '.format("training mini-batch",len(train_loader)))
# print('{:<30}{:<10,} '.format("test mini-batch",len(test_loader)))

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = LESSR(args.n_items, args.embedding_dim, args.num_layers, feat_drop=args.feat_drop).to(device)

# model_path=os.path.join(os.getcwd(),model_name, args.model_checkpoint)
# ckpt = torch.load(model_path)
# model.load_state_dict(ckpt['state_dict'])
# recall_v3, mrr_v3,loss_v3 = validate(test_loader, model,device)
# print()
# print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v3, args.topk, mrr_v3))

In [48]:
# Read the LESSR test metrics logged for the Amex dataset and keep the line with the highest MRR.
# Each line is comma-separated; field 1 is used as recall and field 2 as MRR.
# NOTE(review): the best line is selected on test metrics; confirm that is the intended protocol.
root_dir=os.path.join(os.getcwd(), "LESSR","output_metrics")
file_name="test_amex_metrics.txt"
output=[]
with open(os.path.join(root_dir,file_name),'r') as f:
    for line in f:
        fields=line.split(",")
        # Convert to float right away: ranking the raw strings would compare them
        # lexicographically (e.g. "9e-05" > "0.5"), not numerically.
        output.append((float(fields[1]),float(fields[2])))

if not output:
    raise ValueError("no metric lines found in {}".format(os.path.join(root_dir,file_name)))

import operator
# Highest MRR first.
output=sorted(output, key=operator.itemgetter(1),reverse=True)
recall_v3=output[0][0]
mrr_v3=output[0][1]

In [69]:
# Gather the three LESSR test results into one frame and add them to the running
# cross-model comparison table `output_df`.
tempt={"Model":["LESSR"]*3, "Dataset":["yoochoose1_64","diginetica","amex-poi-category"],
      "Recall@20":[recall_v1,recall_v2,recall_v3],"MRR@20":[mrr_v1,mrr_v2,mrr_v3]}
tempt=pd.DataFrame(tempt)
# Some of these values come from a text file, so force both metric columns to float.
tempt['Recall@20']=tempt['Recall@20'].astype(float)
tempt['MRR@20']=tempt['MRR@20'].astype(float)
# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
# NOTE: this cell is not idempotent; re-running it appends the same three rows again.
output_df=pd.concat([output_df,tempt],ignore_index=True)
# Last expression: render both metric columns as percentages.
output_df.style.format({'Recall@20':'{:.2%}','MRR@20':'{:.2%}'})

Unnamed: 0,Model,Dataset,Recall@20,MRR@20
0,NARM,yoochoose1_64,69.79%,29.58%
1,NARM,diginetica,47.43%,15.33%
2,NARM,amex-poi-category,68.27%,46.14%
3,SRGNN,yoochoose1_64,70.74%,31.30%
4,SRGNN,diginetica,51.57%,17.16%
5,SRGNN,amex-poi-category,73.86%,41.79%
6,NISER,yoochoose1_64,71.93%,31.71%
7,NISER,diginetica,55.00%,18.74%
8,NISER,amex-poi-category,85.11%,59.49%
9,TAGNN,yoochoose1_64,70.66%,30.74%


## MSGIFSR

In [70]:
# Switch the import path from the previous model (LESSR) to MSGIFSR.
# De-duplicate sys.path while KEEPING its order: list(set(...)) would shuffle the entries,
# and the order of sys.path decides which copy of a module name is imported first.
sys.path=list(dict.fromkeys(sys.path))
root_dir='/home/ec2-user/SageMaker/sequence-based-recommendation'
# The previous model directory added to sys.path was LESSR's. An empty name here would make
# old_model_path equal to root_dir plus a trailing separator, so the filter below would not
# remove the LESSR directory.
old_model_name="LESSR"
old_model_path=os.path.join(root_dir,old_model_name)
sys.path=[x for x in sys.path if x !=old_model_path]

model_name="MSGIFSR"
model_path=os.path.join(root_dir,model_name)
sys.path.append(model_path)

# NOTE: `metric`, `load_data` and `RecSysDataset` rebind the names imported from the
# previous model's package; cells below use the MSGIFSR versions.
from MSGIFSR.msgifsr import MSGIFSR
from MSGIFSR.collate import (collate_fn_factory_ccs, seq_to_ccs_graph)
from MSGIFSR import metric
from MSGIFSR.dataset import load_data,RecSysDataset

In [71]:
def _str2bool(value):
    """Parse a command-line boolean.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so any non-empty
    text (including "False") becomes True. This converter accepts the usual
    true/false spellings and rejects everything else.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised boolean spelling.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got {!r}'.format(value))


# Default configuration for the MSGIFSR evaluation cells.
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
    '--dataset-dir', default='../YOOCHOOSE_data/yoochoose1_64/', help='the dataset directory'
)
parser.add_argument('--n_items', type=int, default=37484, help='number of unique items. 37484 for yoochoose')
parser.add_argument('--embedding-dim', type=int, default=128, help='the embedding size')
parser.add_argument('--num-layers', type=int, default=1, help='the number of layers')
parser.add_argument('--feat-drop', type=float, default=0.1, help='the dropout ratio for features')
parser.add_argument('--step', type=int, default=1, help='gnn propogation steps')
parser.add_argument('--batch-size', type=int, default=100, help='the batch size for training')
parser.add_argument('--epochs', type=int, default=30, help='the number of training epochs')
parser.add_argument("--model_checkpoint", type=str, default="amex_checkpoint.pth")
parser.add_argument('--topk', type=int, default=20, help='number of top score items selected for calculating recall and mrr metrics')
parser.add_argument('--num-workers',type=int,default=0,help='the number of processes to load the input graphs')

# MSGIFSR-specific options.
parser.add_argument('--order',type=int,default=3,help='order of msg')
parser.add_argument('--reducer',type=str,default='mean',help='method for reducer')
# _str2bool instead of type=bool, so "--norm False" really disables the option.
parser.add_argument('--norm',type=_str2bool,default=True,help='whether use l2 norm')
parser.add_argument('--extra',action='store_true',help='whether use REnorm.')
parser.add_argument('--fusion',action='store_true',help='whether use IFR.')

# parse_known_args() returns unrecognised arguments separately instead of raising.
args,_= parser.parse_known_args()
print(args)

Namespace(batch_size=100, dataset_dir='../YOOCHOOSE_data/yoochoose1_64/', embedding_dim=128, epochs=30, extra=False, feat_drop=0.1, fusion=False, model_checkpoint='amex_checkpoint.pth', n_items=37484, norm=True, num_layers=1, num_workers=0, order=3, reducer='mean', step=1, topk=20)


#### yoochoose1_64

In [83]:
# Evaluate the saved MSGIFSR checkpoint on the yoochoose1_64 test split.
model_name="MSGIFSR"

# Per-dataset settings; these override the parser defaults for this run.
args.dataset_path='./YOOCHOOSE_data/yoochoose1_64/'
args.n_items=37484
args.batch_size=100
# NOTE(review): the parser declares --epochs; `args.epoch` is a separate attribute that
# nothing in this cell reads.
args.epoch=30
args.embedding_dim=128
args.num_layers=1
args.order=3
args.model_checkpoint='yoochoose1_64_checkpoint.pth'

train, test = load_data(args.dataset_path)
train_data = RecSysDataset(train)
test_data = RecSysDataset(test)

collate_fn = collate_fn_factory_ccs((seq_to_ccs_graph,), order=args.order)

# The training loader is only used here to report the number of mini-batches.
train_loader = DataLoader(
    train_data,
    batch_size=args.batch_size,
    # shuffle=True,  # Remove shuffle=True in this case as SubsetRandomSampler shuffles data already
    # drop_last=True,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True,
    sampler=SequentialSampler(train_data)
)

test_loader = DataLoader(
    test_data,
    batch_size=args.batch_size,
    # shuffle=True,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True
)
print()
print('{:<30}{:<10,} '.format("training mini-batch",len(train_loader)))
print('{:<30}{:<10,} '.format("test mini-batch",len(test_loader)))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MSGIFSR(args.n_items, args.embedding_dim, args.num_layers, dropout=args.feat_drop, reducer=args.reducer, order=args.order,
                norm=args.norm, extra=args.extra, fusion=args.fusion, device=device)
model=model.to(device)

# Checkpoint location: <cwd>/<model_name>/<checkpoint file>.
model_path=os.path.join(os.getcwd(),model_name, args.model_checkpoint)
# map_location remaps the stored tensors onto the active device, so a checkpoint saved on a
# GPU machine can still be loaded when only the CPU is available.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
# validate() yields (recall, mrr, loss) over the test loader.
recall_v1, mrr_v1,loss_v1 = validate(test_loader, model,device)
print()
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v1, args.topk, mrr_v1))

--------------------------------------------------
Dataset info:
Number of sessions: 369859
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 55898
--------------------------------------------------

training mini-batch           3,699      
test mini-batch               559        


100%|██████████| 559/559 [02:25<00:00,  3.85it/s]


Test: Recall@20: 0.7225, MRR@20: 0.3218





#### diginetica

In [84]:
# Evaluate the saved MSGIFSR checkpoint on the diginetica test split.
model_name="MSGIFSR"

# Per-dataset settings; these override the parser defaults for this run.
args.dataset_path='./diginetica_data/'
args.n_items=43098
args.batch_size=100
# NOTE(review): the parser declares --epochs; `args.epoch` is a separate attribute that
# nothing in this cell reads.
args.epoch=30
args.embedding_dim=128
args.num_layers=1
args.order=3
args.model_checkpoint='diginetica_checkpoint.pth'

train, test = load_data(args.dataset_path)
train_data = RecSysDataset(train)
test_data = RecSysDataset(test)

collate_fn = collate_fn_factory_ccs((seq_to_ccs_graph,), order=args.order)

# The training loader is only used here to report the number of mini-batches.
train_loader = DataLoader(
    train_data,
    batch_size=args.batch_size,
    # shuffle=True,  # Remove shuffle=True in this case as SubsetRandomSampler shuffles data already
    # drop_last=True,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True,
    sampler=SequentialSampler(train_data)
)

test_loader = DataLoader(
    test_data,
    batch_size=args.batch_size,
    # shuffle=True,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True
)
print()
print('{:<30}{:<10,} '.format("training mini-batch",len(train_loader)))
print('{:<30}{:<10,} '.format("test mini-batch",len(test_loader)))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MSGIFSR(args.n_items, args.embedding_dim, args.num_layers, dropout=args.feat_drop, reducer=args.reducer, order=args.order,
                norm=args.norm, extra=args.extra, fusion=args.fusion, device=device)
model=model.to(device)

# Checkpoint location: <cwd>/<model_name>/<checkpoint file>.
model_path=os.path.join(os.getcwd(),model_name, args.model_checkpoint)
# map_location remaps the stored tensors onto the active device, so a checkpoint saved on a
# GPU machine can still be loaded when only the CPU is available.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
# validate() yields (recall, mrr, loss) over the test loader.
recall_v2, mrr_v2,loss_v2 = validate(test_loader, model,device)
print()
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v2, args.topk, mrr_v2))

--------------------------------------------------
Dataset info:
Number of sessions: 719470
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 60858
--------------------------------------------------

training mini-batch           7,195      
test mini-batch               609        


100%|██████████| 609/609 [02:35<00:00,  3.91it/s]


Test: Recall@20: 0.5564, MRR@20: 0.1918





#### Amex

In [85]:
# Evaluate the saved MSGIFSR checkpoint on the Amex POI-category test split.
model_name="MSGIFSR"

# Per-dataset settings; these override the parser defaults for this run.
args.dataset_path='./dataset/amex_explorepoi-poi_category/'
args.n_items=556
args.batch_size=32
# NOTE(review): the parser declares --epochs; `args.epoch` is a separate attribute that
# nothing in this cell reads.
args.epoch=30
args.embedding_dim=256
args.num_layers=1
args.order=3
args.model_checkpoint='amex_checkpoint.pth'

train, test = load_data(args.dataset_path)
train_data = RecSysDataset(train)
test_data = RecSysDataset(test)

collate_fn = collate_fn_factory_ccs((seq_to_ccs_graph,), order=args.order)

# The training loader is only used here to report the number of mini-batches.
train_loader = DataLoader(
    train_data,
    batch_size=args.batch_size,
    # shuffle=True,  # Remove shuffle=True in this case as SubsetRandomSampler shuffles data already
    # drop_last=True,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True,
    sampler=SequentialSampler(train_data)
)

test_loader = DataLoader(
    test_data,
    batch_size=args.batch_size,
    # shuffle=True,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True
)
print()
print('{:<30}{:<10,} '.format("training mini-batch",len(train_loader)))
print('{:<30}{:<10,} '.format("test mini-batch",len(test_loader)))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MSGIFSR(args.n_items, args.embedding_dim, args.num_layers, dropout=args.feat_drop, reducer=args.reducer, order=args.order,
                norm=args.norm, extra=args.extra, fusion=args.fusion, device=device)
model=model.to(device)

# Checkpoint location: <cwd>/<model_name>/<checkpoint file>.
model_path=os.path.join(os.getcwd(),model_name, args.model_checkpoint)
# map_location remaps the stored tensors onto the active device, so a checkpoint saved on a
# GPU machine can still be loaded when only the CPU is available.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
# validate() yields (recall, mrr, loss) over the test loader.
recall_v3, mrr_v3,loss_v3 = validate(test_loader, model,device)
print()
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v3, args.topk, mrr_v3))

--------------------------------------------------
Dataset info:
Number of sessions: 3536
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 305
--------------------------------------------------

training mini-batch           111        
test mini-batch               10         


100%|██████████| 10/10 [00:00<00:00, 10.55it/s]


Test: Recall@20: 0.8664, MRR@20: 0.5979





In [92]:
# Gather the three MSGIFSR test results into one frame and add them to the running
# cross-model comparison table `output_df`.
tempt={"Model":["MSGIFSR"]*3, "Dataset":["yoochoose1_64","diginetica","amex-poi-category"],
      "Recall@20":[recall_v1,recall_v2,recall_v3],"MRR@20":[mrr_v1,mrr_v2,mrr_v3]}
tempt=pd.DataFrame(tempt)
# Force both metric columns to float before merging into the table.
tempt['Recall@20']=tempt['Recall@20'].astype(float)
tempt['MRR@20']=tempt['MRR@20'].astype(float)
# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
# NOTE: this cell is not idempotent; re-running it appends the same three rows again.
output_df=pd.concat([output_df,tempt],ignore_index=True)
# Last expression: render both metric columns as percentages.
output_df.style.format({'Recall@20':'{:.2%}','MRR@20':'{:.2%}'})

Unnamed: 0,Model,Dataset,Recall@20,MRR@20
0,NARM,yoochoose1_64,69.79%,29.58%
1,NARM,diginetica,47.43%,15.33%
2,NARM,amex-poi-category,68.27%,46.14%
3,SRGNN,yoochoose1_64,70.74%,31.30%
4,SRGNN,diginetica,51.57%,17.16%
5,SRGNN,amex-poi-category,73.86%,41.79%
6,NISER,yoochoose1_64,71.93%,31.71%
7,NISER,diginetica,55.00%,18.74%
8,NISER,amex-poi-category,85.11%,59.49%
9,TAGNN,yoochoose1_64,70.66%,30.74%


In [93]:
# output_df=output_df.loc[output_df.Model!="MSGIFSR"]
# output_df

In [94]:
# Persist the accumulated comparison table to result_df.csv in the current working directory.
# The DataFrame index is written too (pandas default), so it comes back as an extra
# unnamed first column when the file is read without index_col.
output_df.to_csv("result_df.csv")

In [None]:
# def bar_plot(data, colors=None, total_width=0.8, single_width=1, legend=True,title=None,subtitle=None,axis_truncation=0.5):
#     """Draws a bar plot with multiple bars per data point.

#     Parameters
#     ----------
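#     ax : matplotlib.pyplot.axis
#         The axis the plot is drawn on. NOTE(review): this is not a parameter in the
#         signature above; the function creates its own axis with plt.subplots().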

#     data: dictionary
#         A dictionary containing the data we want to plot. Keys are the names of the
#         data series; each value is a list of that series' values.

#         Example:
#         data = {
#             "x":[1,2,3],
#             "y":[1,2,3],
#             "z":[1,2,3],
#         }

#     colors : array-like, optional
#         A list of colors which are used for the bars. If None, the colors
#         will be the standard matplotlib color cycle. (default: None)

#     total_width : float, optional, default: 0.8
#         The width of a bar group. 0.8 means that 80% of the x-axis is covered
#         by bars and 20% will be spaces between the bars.

#     single_width: float, optional, default: 1
#         The relative width of a single bar within a group. 1 means the bars
#         will touch each other within a group; values less than 1 will make
#         these bars thinner.

#     legend: bool, optional, default: True
#         If this is set to true, a legend will be added to the axis.
#     """

#     # Check if colors were provided, otherwise use the default color cycle
    
#     fig, ax = plt.subplots(figsize =(15, 8))
    
#     if colors is None:
#         colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
    
#     # Number of bars per group
#     n_bars = len(data)

#     # The width of a single bar
#     bar_width = total_width / n_bars

#     # List containing handles for the drawn bars, used for the legend
#     bars = []

#     # Iterate over all data
#     for i, (name, values) in enumerate(data.items()):
#         # The offset in x direction of that bar
#         x_offset = (i - n_bars / 2) * bar_width + bar_width / 2

#         # Draw a bar for every value of that type
#         for x, y in enumerate(values):
#             bar = ax.bar(x + x_offset, y, width=bar_width * single_width, color=colors[i % len(colors)])

#         # Add a handle to the last drawn bar, which we'll need for the legend
#         bars.append(bar[0])

#     # Draw legend if we need
#     if legend:
#         ax.legend(bars, data.keys())
    
#     ax.set_ylabel('Accuracy Rate')
#     ind=np.arange(len(data[list(data.keys())[0]]))
#     ax.set_xticks(ind)
#     ax.set_xticklabels( ('top 5% score', 'top 10% score', 'top 15% score','top 20% score') )
#     ax.set_title(f"Top Predicted Score \n {subtitle} {title} ",fontsize=15)
    
#     #     plt.xlim([0, 1])
#     plt.ylim([axis_truncation, 1])
#     plt.show()

In [None]:
# if __name__ == "__main__":
#     data = {
# #         "pretrained_longformer": response_cust_gs,
#         "longformer : Pretrain + Fine-Tune": response_cust_gs_v2,
#         "longformer : Fine-Tune": response_gs,
#         "bag-of-word": response_gs_bm
#     }

    
#     CL=['r', 'g', 'b', 'c', 'y', 'darkorange', 'lime', 'grey','gold','bisque', 'lightseagreen', 'purple']
#     bar_plot(data, colors=CL,total_width=.7, single_width=1,title="(MSR+Member Transcript)",subtitle="Training Set ",axis_truncation=0.50)
