In [1]:
import argparse
from logging import getLogger
import os
from recbole.config import Config
from recbole.data import create_dataset
from recbole.data.utils import get_dataloader, create_samplers
from recbole.model.sequential_recommender.mbht import MBHT
from recbole.utils import init_logger, init_seed, get_model, get_trainer, set_color

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Default run options and their command-line format.
def get_args():
    """Parse the run options, falling back to the defaults declared below.

    Arguments that the parser does not recognise are ignored rather than
    rejected, because only the first element of ``parse_known_args()`` is
    returned.

    Returns:
        argparse.Namespace: carries ``model``, ``dataset``, ``validation``,
        ``valid_portion``, ``gpu_id`` and ``batch_size``.
    """
    # One (flags, settings) entry per option, registered in declaration order.
    option_specs = (
        (('--model', '-m'), dict(type=str, default='MBHT', help='Model for session-based rec.')),
        (('--dataset', '-d'), dict(type=str, default='tmall_beh', help='Benchmarks for session-based rec.')),
        (('--validation',), dict(action='store_true', help='Whether evaluating on validation set (split from train set), otherwise on test set.')),
        (('--valid_portion',), dict(type=float, default=0.1, help='ratio of validation set.')),
        (('--gpu_id',), dict(type=int, default=0)),
        (('--batch_size',), dict(type=int, default=2048)),
    )

    parser = argparse.ArgumentParser()
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)

    known_args, _ignored = parser.parse_known_args()
    return known_args

In [3]:
# Load the run options (parser defaults unless overridden on the command line).
args = get_args()

In [4]:
# Configuration initialization: overrides handed to RecBole's Config.
# The ijcai_beh dataset gets its own batch sizes and hyper_len; every other
# dataset shares the second set of values.
is_ijcai = args.dataset == 'ijcai_beh'

config_dict = {
    # Field names and data loading.
    'USER_ID_FIELD': 'session_id',
    'load_col': None,
    # Negative sampling is switched off; {'uniform': 1} was the alternative tried.
    'neg_sampling': None,
    'benchmark_filename': ['train', 'test'],
    'alias_of_item_id': ['item_id_list'],
    # Evaluation.
    'topk': [5, 10, 101],
    'metrics': ['Recall', 'NDCG', 'MRR'],
    'valid_metric': 'NDCG@10',
    'eval_args': {
        'mode': 'full',
        'order': 'TO',
    },
    # Runtime and batching.
    'gpu_id': args.gpu_id,
    'MAX_ITEM_LIST_LENGTH': 200,
    'train_batch_size': 32 if is_ijcai else 64,
    'eval_batch_size': 24 if is_ijcai else 128,
    # Model-specific switches.
    'hyper_len': 10 if is_ijcai else 6,
    'scales': [10, 4, 20],
    'enable_hg': 1,
    'enable_ms': 1,
    'customized_eval': 1,
    # NOTE(review): the key is spelled 'abaltion' (sic). It is kept as-is
    # because whatever reads it is not visible here; confirm before renaming.
    'abaltion': '',
}

In [5]:
# retail_beh uses its own scales; hyper_len is pinned to 6 alongside it.
if args.dataset == "retail_beh":
    config_dict.update(scales=[5, 4, 20], hyper_len=6)

In [6]:
# Build the RecBole configuration from the parsed options and the overrides
# collected in config_dict. The model name now comes from --model (default
# 'MBHT') instead of being hard-coded, so the parsed option is no longer
# silently ignored; with the default arguments the result is unchanged.
config = Config(model=args.model, dataset=args.dataset, config_dict=config_dict)
# Manual override left by the author, presumably to force CPU execution:
# config['device']="cpu"
# Seed the run from the configured seed and reproducibility flag.
init_seed(config['seed'], config['reproducibility'])

In [7]:
# Logger initialization: log files are rooted at "log".
init_logger(config, log_root="log")
logger = getLogger()

# Record run provenance in order: process id, parsed options, full configuration.
for message in (f"PID: {os.getpid()}", args, config):
    logger.info(message)

01 Aug 19:49    INFO  PID: 13572
01 Aug 19:49    INFO  Namespace(batch_size=2048, dataset='tmall_beh', gpu_id=0, model='MBHT', valid_portion=0.1, validation=False)
01 Aug 19:49    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/tmall_beh
show_progress = True
save_dataset = False
save_dataloaders = False
benchmark_filename = ['train', 'test']

Training Hyper Parameters:
checkpoint_dir = saved
epochs = 300
train_batch_size = 64
learner = adam
learning_rate = 0.001
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'mode': 'full', 'order': 'TO', 'split': {'RS': [0.8, 0.1, 0.1]}, 'group_by': 'user'}
metrics = ['Recall', 'NDCG', 'MRR']
topk = [5, 10, 101]
valid_metric = NDCG@10
valid_metric_bigger = True
eval_batch_size = 128
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separator =  
USER_ID_FIE

In [8]:
# dataset filtering
# Build the dataset described by `config` and log its summary; the recorded
# output lists user/item/interaction counts, sparsity and the remaining fields.
dataset = create_dataset(config)
logger.info(dataset)

01 Aug 19:49    INFO  tmall_beh
The number of users: 437367
Average actions of users: 1.0813460579926195
The number of items: 99038
Average actions of items: 5.257094583328702
The number of inters: 472944
The sparsity of the dataset: 99.99890815284478%
Remain Fields: ['session_id', 'item_id_list', 'item_type_list', 'item_id', 'item_length']


In [9]:
# Show the device stored in the config. NOTE(review): the recorded run printed
# "cpu" although the logged configuration reports use_gpu = True; confirm that
# this is the intended device before training.
print(config['device'])

cpu


In [10]:
# Dataset splitting: build the train/test parts, then wrap each in a dataloader.
train_dataset, test_dataset = dataset.build()
train_sampler, test_sampler = create_samplers(config, dataset, [train_dataset, test_dataset])

if args.validation:
    # Validation mode: shuffle the training part and carve the evaluation
    # split out of it; no samplers are passed for these splits.
    train_dataset.shuffle()
    split_ratios = [1 - args.valid_portion, args.valid_portion]
    fit_dataset, eval_dataset = train_dataset.split_by_ratio(split_ratios)
    fit_sampler = eval_sampler = None
else:
    # Default mode: train on the train part and evaluate on the test part.
    fit_dataset, eval_dataset = train_dataset, test_dataset
    fit_sampler, eval_sampler = train_sampler, test_sampler

train_data = get_dataloader(config, 'train')(config, fit_dataset, fit_sampler, shuffle=True)
test_data = get_dataloader(config, 'test')(config, eval_dataset, eval_sampler, shuffle=False)

In [11]:
# Model loading and initialization.
# Resolve the model class once and reuse it for both construction and the
# printout, instead of looking it up twice via get_model.
model_class = get_model(config['model'])
# Instantiate on the training dataset and move the model to the configured device.
model = model_class(config, train_data.dataset).to(config['device'])
print(model_class)

<class 'recbole.model.sequential_recommender.mbht.MBHT'>


In [12]:
# Trainer loading and initialization: look up the trainer class for this
# model type and model name, then bind it to the config and the model.
trainer_class = get_trainer(config['MODEL_TYPE'], config['model'])
trainer = trainer_class(config, model)

In [13]:
# model training and evaluation
# Fit on train_data, passing test_data in the second positional slot;
# saved=True and the configured show_progress flag are forwarded unchanged.
# NOTE(review): presumably fit() returns the best evaluation score and its
# metric results; confirm against the trainer in use. In the recorded run this
# cell was interrupted (KeyboardInterrupt) during an "Evaluate" pass, so both
# names may be unset in that session.
test_score, test_result = trainer.fit(
    train_data, test_data, saved=True, show_progress=config['show_progress']
)

Evaluate   :   0%|                                                           | 0/79 [00:00<?, ?it/s]:   1%|▋                                                  | 1/79 [00:01<01:26,  1.11s/it]:   3%|█▎                                                 | 2/79 [00:02<01:23,  1.09s/it]:   4%|█▉                                                 | 3/79 [00:03<01:18,  1.03s/it]:   5%|██▌                                                | 4/79 [00:04<01:14,  1.01it/s]:   6%|███▏                                               | 5/79 [00:04<01:10,  1.04it/s]:   8%|███▊                                               | 6/79 [00:05<01:08,  1.07it/s]:   9%|████▌                                              | 7/79 [00:06<01:06,  1.09it/s]:  10%|█████▏                                             | 8/79 [00:07<01:05,  1.09it/s]:  11%|█████▊                                             | 9/79 [00:08<01:04,  1.09it/s]:  13%|██████▎                                           | 10/79 [00:09<01:05,  1.06it/s]:  14%|███

KeyboardInterrupt: 

# Model hyperparameters. NOTE(review): presumably the default properties of the
# model trained above; the owning file is not visible here, so confirm.
n_layers: 8
hidden_size: 64
hidden_dropout_prob: 0.5
hidden_act: 'gelu'
layer_norm_eps: 1e-12
initializer_range: 0.02
# NOTE(review): these feature names look movie-related, while the notebook run
# uses the tmall_beh dataset; confirm whether this entry is actually read.
selected_features: ['class', 'movie_title', 'release_year']
pooling_mode: 'mean'
loss_type: 'CE'