In [52]:
import os
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, SequentialSampler

import sys

# De-duplicate the module search path while KEEPING its order. Round-tripping
# through set() returns the entries in arbitrary order, which can change which
# same-named module (utils, dataset, metric, ...) is found first.
sys.path = list(dict.fromkeys(sys.path))

root_dir = '/home/ec2-user/SageMaker/sequence-based-recommendation'
model_name = "NARM"
model_path = os.path.join(root_dir, model_name)
# Make the NARM directory importable; skip the append if it is already listed
# so re-running the cell does not grow sys.path.
if model_path not in sys.path:
    sys.path.append(model_path)

import metric
from utils import collate_fn
from narm import NARM
from dataset import load_data, RecSysDataset

## NARM Model

In [28]:
# Evaluation settings for the NARM runs.
# The parser is built unconditionally: every later cell reads `args`, so hiding
# it behind an `if __name__ == '__main__'` guard only creates a NameError path
# when this code is executed as an imported module instead of as a notebook.
parser = argparse.ArgumentParser()
parser.add_argument('--dataset_path', default='./YOOCHOOSE_data/yoochoose1_64/',
                    help='dataset directory path: datasets/amex/yoochoose1_4/yoochoose1_64')

parser.add_argument('--n_items', type=int, default=37484, help='number of unique items. 37484 for yoochoose')
parser.add_argument('--batch_size', type=int, default=512, help='input batch size')
parser.add_argument('--hidden_size', type=int, default=100, help='hidden state size of gru module')
parser.add_argument('--embed_dim', type=int, default=50, help='the dimension of item embedding')
parser.add_argument('--topk', type=int, default=20, help='number of top score items selected for calculating recall and mrr metrics')
parser.add_argument('--valid_portion', type=float, default=0.1, help='split the portion of training set as validation set')
# parse_known_args() tolerates the extra command-line arguments the Jupyter
# kernel is started with; the unrecognised remainder is discarded.
args, _ = parser.parse_known_args()
print(args)

Namespace(batch_size=512, dataset_path='./YOOCHOOSE_data/yoochoose1_64/', embed_dim=50, hidden_size=100, n_items=37484, topk=20, valid_portion=0.1)


In [29]:
# Load the yoochoose1_64 split and wrap each part in a DataLoader for NARM.
args.dataset_path = './YOOCHOOSE_data/yoochoose1_64/'
train, valid, test = load_data(args.dataset_path, valid_portion=args.valid_portion)
train_data, valid_data, test_data = RecSysDataset(train), RecSysDataset(valid), RecSysDataset(test)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader_yoochoose1_64 = DataLoader(
    train_data,
    batch_size=args.batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
valid_loader_yoochoose1_64 = DataLoader(
    valid_data,
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)
test_loader_yoochoose1_64 = DataLoader(
    test_data,
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

# Report how many batches each loader yields.
for _label, _loader in (
    ("training batch", train_loader_yoochoose1_64),
    ("validation batch", valid_loader_yoochoose1_64),
    ("test batch", test_loader_yoochoose1_64),
):
    print('{:<30}{:<10,} '.format(_label, len(_loader)))

--------------------------------------------------
Dataset info:
Number of sessions: 332873
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 36986
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 55898
--------------------------------------------------
training batch                651        
validation batch              73         
test batch                    110        


In [30]:
# Load the yoochoose1_4 split and wrap each part in a DataLoader for NARM.
args.dataset_path = './YOOCHOOSE_data/yoochoose1_4/'
train, valid, test = load_data(args.dataset_path, valid_portion=args.valid_portion)
train_data, valid_data, test_data = RecSysDataset(train), RecSysDataset(valid), RecSysDataset(test)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader_yoochoose1_4 = DataLoader(
    train_data,
    batch_size=args.batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
valid_loader_yoochoose1_4 = DataLoader(
    valid_data,
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)
test_loader_yoochoose1_4 = DataLoader(
    test_data,
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

# Report how many batches each loader yields.
for _label, _loader in (
    ("training batch", train_loader_yoochoose1_4),
    ("validation batch", valid_loader_yoochoose1_4),
    ("test batch", test_loader_yoochoose1_4),
):
    print('{:<30}{:<10,} '.format(_label, len(_loader)))

--------------------------------------------------
Dataset info:
Number of sessions: 5325970
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 591775
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 55898
--------------------------------------------------
training batch                10,403     
validation batch              1,156      
test batch                    110        


In [31]:
def validate(valid_loader, model, criterion):
    """Evaluate `model` on every batch of `valid_loader` without gradients.

    Reads two notebook-level names in addition to its arguments: `device`
    (where the batch tensors are moved) and `args.topk` (the cut-off handed
    to `metric.evaluate`).

    Args:
        valid_loader: iterable yielding `(seq, target, lens)` batches.
        model: module called as `model(seq, lens)`; switched to eval mode here.
        criterion: loss callable applied to `(outputs, target)`.

    Returns:
        Tuple `(mean_recall, mean_mrr, mean_loss)`. Each value is the plain
        (unweighted) mean of the per-batch values, so a shorter final batch
        counts as much as a full one.
    """
    model.eval()
    per_batch = {"recall": [], "mrr": [], "loss": []}

    with torch.no_grad():
        for seq, target, lens in valid_loader:
            seq, target = seq.to(device), target.to(device)
            scores = model(seq, lens)
            batch_loss = criterion(scores, target)
            # The ranking metric receives the softmax-normalised scores.
            probabilities = F.softmax(scores, dim=1)
            batch_recall, batch_mrr = metric.evaluate(probabilities, target, k=args.topk)
            per_batch["recall"].append(batch_recall)
            per_batch["mrr"].append(batch_mrr)
            per_batch["loss"].append(batch_loss.item())

    return (
        np.mean(per_batch["recall"]),
        np.mean(per_batch["mrr"]),
        np.mean(per_batch["loss"]),
    )

In [32]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Build NARM from the parsed hyper-parameters, then place it on that device.
model = NARM(args.n_items, args.hidden_size, args.embed_dim, args.batch_size)
model = model.to(device)

In [33]:
# Restore the yoochoose1_64 NARM checkpoint and score it on the matching test loader.
model_name = 'yoochoose1_64_latest_checkpoint.pth'
model_path = os.path.join(os.getcwd(), "NARM", model_name)
# map_location remaps the stored tensors onto the selected `device`, so a
# checkpoint written on a GPU still loads when `device` fell back to the CPU.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
criterion = nn.CrossEntropyLoss()
recall_v1, mrr_v1, loss_v1 = validate(test_loader_yoochoose1_64, model, criterion)
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v1, args.topk, mrr_v1))

Test: Recall@20: 0.6704, MRR@20: 0.2836


In [34]:
# Restore the yoochoose1_4 NARM checkpoint and score it on the matching test loader.
model_name = 'yoochoose1_4_latest_checkpoint.pth'
model_path = os.path.join(os.getcwd(), "NARM", model_name)
# map_location remaps the stored tensors onto the selected `device`, so a
# checkpoint written on a GPU still loads when `device` fell back to the CPU.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
criterion = nn.CrossEntropyLoss()
recall_v2, mrr_v2, loss_v2 = validate(test_loader_yoochoose1_4, model, criterion)
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v2, args.topk, mrr_v2))

Test: Recall@20: 0.6950, MRR@20: 0.2909


In [36]:
# amex POI-category run: smaller item vocabulary and batch size than yoochoose.
# NOTE: these assignments overwrite the shared `args` for every later cell.
args.n_items = 556
args.batch_size = 64
args.dataset_path = "./dataset/amex_explorepoi-poi_category/"
train, valid, test = load_data(args.dataset_path, valid_portion=args.valid_portion)
train_data = RecSysDataset(train)
valid_data = RecSysDataset(valid)
test_data = RecSysDataset(test)
train_loader_amex = DataLoader(train_data, batch_size=args.batch_size, shuffle=True, collate_fn=collate_fn)
valid_loader_amex = DataLoader(valid_data, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)
test_loader_amex = DataLoader(test_data, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)

print('{:<30}{:<10,} '.format("training batch", len(train_loader_amex)))
print('{:<30}{:<10,} '.format("validation batch", len(valid_loader_amex)))
print('{:<30}{:<10,} '.format("test batch", len(test_loader_amex)))

# A fresh model is required because n_items differs from the yoochoose model.
model = NARM(args.n_items, args.hidden_size, args.embed_dim, args.batch_size).to(device)

model_name = 'amex_explorepoi-poi_category_latest_checkpoint.pth'
model_path = os.path.join(os.getcwd(), "NARM", model_name)
# map_location remaps the stored tensors onto the selected `device`, so a
# checkpoint written on a GPU still loads when `device` fell back to the CPU.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])
recall_v3, mrr_v3, loss_v3 = validate(test_loader_amex, model, criterion=nn.CrossEntropyLoss())
print()
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v3, args.topk, mrr_v3))

--------------------------------------------------
Dataset info:
Number of sessions: 3182
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 354
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 305
--------------------------------------------------
training batch                50         
validation batch              6          
test batch                    5          

Test: Recall@20: 0.6362, MRR@20: 0.5320


In [37]:
# Collect the three NARM test results into one table, one row per dataset.
output_df = pd.DataFrame(
    {
        "Model": ["NARM"] * 3,
        "Dataset": ["yoochoose1_64", "yoochoose1_4", "amex-poi-category"],
        "Recall@20": [recall_v1, recall_v2, recall_v3],
        "MRR@20": [mrr_v1, mrr_v2, mrr_v3],
    }
)
# Render both metric columns as percentages with two decimals.
output_df.style.format({'Recall@20': '{:.2%}', 'MRR@20': '{:.2%}'})

Unnamed: 0,Model,Dataset,Recall@20,MRR@20
0,NARM,yoochoose1_64,67.04%,28.36%
1,NARM,yoochoose1_4,69.50%,29.09%
2,NARM,amex-poi-category,63.62%,53.20%


## SRGNN

In [38]:
# Swap the importable model directory from NARM to SRGNN.
# De-duplicate sys.path while keeping its order; going through set() would
# return the entries in arbitrary order and change import precedence.
sys.path = list(dict.fromkeys(sys.path))
root_dir = '/home/ec2-user/SageMaker/sequence-based-recommendation'
old_model_name = "NARM"
old_model_path = os.path.join(root_dir, old_model_name)
sys.path = [x for x in sys.path if x != old_model_path]

model_name = "SRGNN"
model_path = os.path.join(root_dir, model_name)
if model_path not in sys.path:
    sys.path.append(model_path)

# `metric` and `dataset` were already imported earlier in this notebook, while
# the NARM directory was on the path, so they are cached in sys.modules and a
# plain `import` below would silently return those cached copies. Evict the
# cached entry whenever the SRGNN directory provides its own file of that name;
# if it does not, the cached module keeps being used exactly as before.
for _stale in ("metric", "dataset"):
    if os.path.isfile(os.path.join(model_path, _stale + ".py")):
        sys.modules.pop(_stale, None)

from srgnn import SRGNN
from collate import (collate_fn_factory, seq_to_session_graph)
import metric
from dataset import load_data, RecSysDataset

In [45]:
def prepare_batch(batch, device):
    """Move one `(inputs, labels)` batch onto `device`.

    Args:
        batch: pair whose first element is an iterable of objects exposing
            `.to(device)` and whose second element exposes `.to(device)`.
        device: target passed unchanged to every `.to(...)` call.

    Returns:
        `(inputs_on_device, labels_on_device)` where the first item is a new
        list holding the moved inputs in their original order.
    """
    features, targets = batch
    moved_features = []
    for feature in features:
        moved_features.append(feature.to(device))
    moved_targets = targets.to(device)
    return moved_features, moved_targets

def validate(valid_loader, model, device):
    """Evaluate `model` on every batch of `valid_loader` with a progress bar.

    Besides its arguments this reads the notebook-level `args.topk` as the
    cut-off handed to `metric.evaluate`.

    Args:
        valid_loader: sized iterable of batches accepted by `prepare_batch`.
        model: module called as `model(*inputs)`; switched to eval mode here.
        device: device the batch is moved to before the forward pass.

    Returns:
        Tuple `(mean_recall, mean_mrr, mean_loss)`, each the plain
        (unweighted) mean of the per-batch values.
    """
    model.eval()
    recall_log, mrr_log, loss_log = [], [], []

    with torch.no_grad():
        progress = tqdm(enumerate(valid_loader), total=len(valid_loader), position=0, leave=True)
        for _, batch in progress:
            inputs, labels = prepare_batch(batch, device)
            outputs = model(*inputs)
            # NOTE(review): nll_loss expects log-probabilities, while the softmax
            # below treats the same tensor as unnormalised scores -- confirm what
            # SRGNN actually returns. Only the loss value depends on this.
            batch_loss = F.nll_loss(outputs, labels)
            probabilities = F.softmax(outputs, dim=1)
            batch_recall, batch_mrr = metric.evaluate(probabilities, labels, k=args.topk)
            recall_log.append(batch_recall)
            mrr_log.append(batch_mrr)
            loss_log.append(batch_loss.item())

    return np.mean(recall_log), np.mean(mrr_log), np.mean(loss_log)

In [56]:
# Evaluation settings for the SRGNN runs.
# The parser is built unconditionally: every later cell reads `args`, so hiding
# it behind an `if __name__ == '__main__'` guard only creates a NameError path
# when this code is executed as an imported module instead of as a notebook.
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
    '--dataset-dir', default='../YOOCHOOSE_data/yoochoose1_64/', help='the dataset directory'
)
# NOTE: --seed is parsed but never applied anywhere in the visible cells.
parser.add_argument("--seed", type=int, default=101,
        help="random seed for np.random.seed, torch.manual_seed and torch.cuda.manual_seed.")
parser.add_argument(
    '--batch-size', type=int, default=512, help='the batch size for training'
)

parser.add_argument('--n_items', type=int, default=37484, help='number of unique items. 37484 for yoochoose')
parser.add_argument('--embedding-dim', type=int, default=256, help='the embedding size')
parser.add_argument('--num-layers', type=int, default=1, help='the number of layers')
parser.add_argument('--feat-drop', type=float, default=0.1, help='the dropout ratio for features')
parser.add_argument(
    '--valid-split',
    type=float,
    default=0.1,
    help='the fraction for the validation set',
)
parser.add_argument(
    '--num-workers',
    type=int,
    default=0,
    help='the number of processes to load the input graphs',
)

parser.add_argument(
    '--topk',
    type=int,
    default=20,
    help='number of top score items selected for calculating recall and mrr metrics',
)

# parse_known_args() tolerates the extra command-line arguments the Jupyter
# kernel is started with; the unrecognised remainder is discarded.
args, _ = parser.parse_known_args()
print(args)

Namespace(batch_size=512, dataset_dir='../YOOCHOOSE_data/yoochoose1_64/', embedding_dim=256, feat_drop=0.1, n_items=37484, num_layers=1, num_workers=0, seed=101, topk=20, valid_split=0.1)


In [57]:
# Load the yoochoose1_64 split and build the SRGNN loaders for it.
args.dataset_path = './YOOCHOOSE_data/yoochoose1_64/'
train, valid, test = load_data(args.dataset_path, valid_portion=args.valid_split)
train_data = RecSysDataset(train)
valid_data = RecSysDataset(valid)
test_data = RecSysDataset(test)

# Collate function built by the SRGNN `collate` module from seq_to_session_graph.
collate_fn = collate_fn_factory(seq_to_session_graph)
train_loader_yoochoose1_64 = DataLoader(
    train_data,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True,
    sampler=SequentialSampler(train_data)
)

valid_loader_yoochoose1_64 = DataLoader(
    valid_data,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True,
    sampler=SequentialSampler(valid_data)
)

# The test loader must not shuffle: validate() averages per-batch metrics, so
# shuffling changes which samples land in the short final batch and makes the
# reported Recall/MRR vary from run to run.
test_loader_yoochoose1_64 = DataLoader(
    test_data,
    batch_size=args.batch_size,
    shuffle=False,
    num_workers=args.num_workers,
    collate_fn=collate_fn
)

print('{:<30}{:<10,} '.format("training batch", len(train_loader_yoochoose1_64)))
print('{:<30}{:<10,} '.format("validation batch", len(valid_loader_yoochoose1_64)))
print('{:<30}{:<10,} '.format("test batch", len(test_loader_yoochoose1_64)))

--------------------------------------------------
Dataset info:
Number of sessions: 332873
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 36986
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 55898
--------------------------------------------------
training batch                651        
validation batch              73         
test batch                    110        


In [60]:
# Load the yoochoose1_4 split and build the SRGNN loaders for it.
args.dataset_path = './YOOCHOOSE_data/yoochoose1_4/'
train, valid, test = load_data(args.dataset_path, valid_portion=args.valid_split)
train_data = RecSysDataset(train)
valid_data = RecSysDataset(valid)
test_data = RecSysDataset(test)

# Collate function built by the SRGNN `collate` module from seq_to_session_graph.
collate_fn = collate_fn_factory(seq_to_session_graph)
train_loader_yoochoose1_4 = DataLoader(
    train_data,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True,
    sampler=SequentialSampler(train_data)
)

valid_loader_yoochoose1_4 = DataLoader(
    valid_data,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True,
    sampler=SequentialSampler(valid_data)
)

# The test loader must not shuffle: validate() averages per-batch metrics, so
# shuffling changes which samples land in the short final batch and makes the
# reported Recall/MRR vary from run to run.
test_loader_yoochoose1_4 = DataLoader(
    test_data,
    batch_size=args.batch_size,
    shuffle=False,
    num_workers=args.num_workers,
    collate_fn=collate_fn
)

print('{:<30}{:<10,} '.format("training batch", len(train_loader_yoochoose1_4)))
print('{:<30}{:<10,} '.format("validation batch", len(valid_loader_yoochoose1_4)))
print('{:<30}{:<10,} '.format("test batch", len(test_loader_yoochoose1_4)))

--------------------------------------------------
Dataset info:
Number of sessions: 5325970
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 591775
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 55898
--------------------------------------------------
training batch                10,403     
validation batch              1,156      
test batch                    110        


In [63]:
# amex POI-category run: smaller item vocabulary and batch size than yoochoose.
# NOTE: these assignments overwrite the shared `args` for every later cell.
args.n_items = 556
args.batch_size = 64
args.dataset_path = "./dataset/amex_explorepoi-poi_category/"
train, valid, test = load_data(args.dataset_path, valid_portion=args.valid_split)
train_data = RecSysDataset(train)
valid_data = RecSysDataset(valid)
test_data = RecSysDataset(test)

# Collate function built by the SRGNN `collate` module from seq_to_session_graph.
collate_fn = collate_fn_factory(seq_to_session_graph)
train_loader_amex = DataLoader(
    train_data,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True,
    sampler=SequentialSampler(train_data)
)

valid_loader_amex = DataLoader(
    valid_data,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    collate_fn=collate_fn,
    pin_memory=True,
    sampler=SequentialSampler(valid_data)
)

# The test loader must not shuffle: validate() averages per-batch metrics, so
# shuffling changes which samples land in the short final batch and makes the
# reported Recall/MRR vary from run to run.
test_loader_amex = DataLoader(
    test_data,
    batch_size=args.batch_size,
    shuffle=False,
    num_workers=args.num_workers,
    collate_fn=collate_fn
)

print('{:<30}{:<10,} '.format("training batch", len(train_loader_amex)))
print('{:<30}{:<10,} '.format("validation batch", len(valid_loader_amex)))
print('{:<30}{:<10,} '.format("test batch", len(test_loader_amex)))

--------------------------------------------------
Dataset info:
Number of sessions: 3182
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 354
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 305
--------------------------------------------------
training batch                50         
validation batch              6          
test batch                    5          


In [58]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# This model receives the yoochoose checkpoints in the following cells, so it
# is sized from the parser default for --n_items rather than from
# `args.n_items`: the amex data cell that precedes this one in the notebook
# overwrites `args.n_items` with 556, and a top-to-bottom run would then build
# a model whose item dimension does not match the yoochoose state dicts.
yoochoose_n_items = parser.get_default('n_items')
model = SRGNN(yoochoose_n_items, args.embedding_dim, args.num_layers, feat_drop=args.feat_drop)
model = model.to(device)

In [59]:
# Restore the yoochoose1_64 SRGNN checkpoint and score it on the matching test loader.
model_name = 'yoochoose1_64_latest_checkpoint.pth'
model_path = os.path.join(os.getcwd(), "SRGNN", model_name)
# map_location remaps the stored tensors onto the selected `device`, so a
# checkpoint written on a GPU still loads when `device` fell back to the CPU.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])

recall_v1, mrr_v1, loss_v1 = validate(test_loader_yoochoose1_64, model, device)
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v1, args.topk, mrr_v1))

100%|██████████| 110/110 [00:22<00:00,  4.91it/s]

Test: Recall@20: 0.6786, MRR@20: 0.2925





In [64]:
# Restore the yoochoose1_4 SRGNN checkpoint and score it on the matching test loader.
model_name = 'yoochoose1_4_latest_checkpoint.pth'
model_path = os.path.join(os.getcwd(), "SRGNN", model_name)
# map_location remaps the stored tensors onto the selected `device`, so a
# checkpoint written on a GPU still loads when `device` fell back to the CPU.
ckpt = torch.load(model_path, map_location=device)
model.load_state_dict(ckpt['state_dict'])

recall_v2, mrr_v2, loss_v2 = validate(test_loader_yoochoose1_4, model, device)
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v2, args.topk, mrr_v2))

100%|██████████| 110/110 [00:20<00:00,  5.28it/s]

Test: Recall@20: 0.5924, MRR@20: 0.2368





In [67]:
# Restore the amex POI-category SRGNN checkpoint and score it on the amex test loader.
model_name = 'amex_explorepoi-poi_category_latest_checkpoint.pth'
model_path = os.path.join(os.getcwd(), "SRGNN", model_name)
# map_location remaps the stored tensors onto the selected `device`, so a
# checkpoint written on a GPU still loads when `device` fell back to the CPU.
ckpt = torch.load(model_path, map_location=device)

# The amex item vocabulary differs from yoochoose, so a fresh model is built
# with the matching n_items before the weights are loaded.
args.n_items = 556
model = SRGNN(args.n_items, args.embedding_dim, args.num_layers, feat_drop=args.feat_drop).to(device)
model.load_state_dict(ckpt['state_dict'])

recall_v3, mrr_v3, loss_v3 = validate(test_loader_amex, model, device)
print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall_v3, args.topk, mrr_v3))

100%|██████████| 5/5 [00:00<00:00, 35.82it/s]

Test: Recall@20: 0.6265, MRR@20: 0.2709





In [71]:
# Add the three SRGNN test results beneath the NARM rows of the summary table.
tempt = pd.DataFrame(
    {
        "Model": ["SRGNN"] * 3,
        "Dataset": ["yoochoose1_64", "yoochoose1_4", "amex-poi-category"],
        "Recall@20": [recall_v1, recall_v2, recall_v3],
        "MRR@20": [mrr_v1, mrr_v2, mrr_v3],
    }
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Any SRGNN rows left by a previous run of this cell are dropped first, so
# re-running the cell replaces them instead of stacking duplicates.
output_df = pd.concat(
    [output_df[output_df["Model"] != "SRGNN"], tempt],
    ignore_index=True,
)
# Render both metric columns as percentages with two decimals.
output_df.style.format({'Recall@20': '{:.2%}', 'MRR@20': '{:.2%}'})

Unnamed: 0,Model,Dataset,Recall@20,MRR@20
0,NARM,yoochoose1_64,67.04%,28.36%
1,NARM,yoochoose1_4,69.50%,29.09%
2,NARM,amex-poi-category,63.62%,53.20%
3,SRGNN,yoochoose1_64,67.86%,29.25%
4,SRGNN,yoochoose1_4,59.24%,23.68%
5,SRGNN,amex-poi-category,62.65%,27.09%
