In [17]:
import os
import time

import pandas as pd
import numpy as np
from tqdm import tqdm
from recbole.quick_start import run_recbole

In [18]:
class BaseDataset(object):
    def __init__(self, input_path, output_path):
        super(BaseDataset, self).__init__()

        self.dataset_name = ''
        self.input_path = input_path
        self.output_path = output_path
        self.check_output_path()

        # input file
        self.inter_file = os.path.join(self.input_path, 'inters.dat')
        self.item_file = os.path.join(self.input_path, 'items.dat')
        self.user_file = os.path.join(self.input_path, 'users.dat')
        self.sep = '\t'

        # output file
        self.output_inter_file, self.output_item_file, self.output_user_file = self.get_output_files()

        # selected feature fields
        self.inter_fields = {}
        self.item_fields = {}
        self.user_fields = {}

    def check_output_path(self):
        if not os.path.isdir(self.output_path):
            os.makedirs(self.output_path)

    def get_output_files(self):
        output_inter_file = os.path.join(self.output_path, self.dataset_name + '.inter')
        output_item_file = os.path.join(self.output_path, self.dataset_name + '.item')
        output_user_file = os.path.join(self.output_path, self.dataset_name + '.user')
        return output_inter_file, output_item_file, output_user_file

    def load_inter_data(self) -> pd.DataFrame():
        raise NotImplementedError

    def load_item_data(self) -> pd.DataFrame():
        raise NotImplementedError

    def load_user_data(self) -> pd.DataFrame():
        raise NotImplementedError

    def convert_inter(self):
        try:
            input_inter_data = self.load_inter_data()
            self.convert(input_inter_data, self.inter_fields, self.output_inter_file)
        except NotImplementedError:
            print('This dataset can\'t be converted to inter file\n')

    def convert_item(self):
        try:
            input_item_data = self.load_item_data()
            self.convert(input_item_data, self.item_fields, self.output_item_file)
        except NotImplementedError:
            print('This dataset can\'t be converted to item file\n')

    def convert_user(self):
        try:
            input_user_data = self.load_user_data()
            self.convert(input_user_data, self.user_fields, self.output_user_file)
        except NotImplementedError:
            print('This dataset can\'t be converted to user file\n')

    @staticmethod
    def convert(input_data, selected_fields, output_file):
        output_data = pd.DataFrame()
        for column in selected_fields:
            output_data[column] = input_data.iloc[:, column]
        with open(output_file, 'w') as fp:
            fp.write('\t'.join([selected_fields[column] for column in output_data.columns]) + '\n')
            for i in tqdm(range(output_data.shape[0])):
                fp.write('\t'.join([str(output_data.iloc[i, j])
                                    for j in range(output_data.shape[1])]) + '\n')

    def parse_json(self, data_path):
        with open(data_path, 'rb') as g:
            for l in g:
                yield eval(l)

    def getDF(self, data_path):
        i = 0
        df = {}
        for d in self.parse_json(data_path):
            df[i] = d
            i += 1
        data = pd.DataFrame.from_dict(df, orient='index')
        
        return data

In [3]:
class BRDataset(BaseDataset):
    def __init__(self, input_path, output_path):
        super(BRDataset, self).__init__(input_path, output_path)
        self.dataset_name = "br"

        self.inter_file = os.path.join(self.input_path, "train_ratings.csv")
        self.item_file = os.path.join(self.input_path, "books.csv")
        self.user_file = os.path.join(self.input_path, "users.csv")

        self.sep = ","

        # output_path
        output_files = self.get_output_files()
        self.output_inter_file = output_files[0]
        self.output_item_file = output_files[1]
        self.output_user_file = output_files[2]

        # selected feature fields
        self.inter_fields = {
            0: "user_id:token",
            1: "isbn:token",
            2: "rating:float"
        }

        self.user_fields = {
            0: "user_id:token",
            1: "location:token_seq",
            2: "age:token"
        }
        
        self.item_fields = {
            0: "isbn:token",
            1: "book_title:token_seq",
            2: "book_author:token_seq",
            3: "year_of_publication:token",
            4: "publisher:token",
            5: "language:token",
            6: "category:token_seq",
            7: "summary:token_seq"
        }

    def load_inter_data(self):
        df = pd.read_csv(self.inter_file,
            dtype={"user_id": "object", "isbn": "object", "rating": "float"}
           )
        # approx. 1 month + 2 weeks
        df = df[-len(df)*3//48:].reset_index(drop=True)
        # Further downsampling to avoid OOM
        uus = df["user_id"].unique()
        sampled_users = np.random.choice(uus, len(uus)//6)
        df = df.query('user_id in @sampled_users')
        return df

    def load_user_data(self):
        return pd.read_csv(self.user_file,
                           dtype={"user_id": "object", "location": "object", "age": pd.Int64Dtype()},
                           delimiter=self.sep,
                           engine="python")
    
    def load_item_data(self):
        return pd.read_csv(self.item_file,
                           dtype={"isbn": "object", "book_title": "object", "book_author": "object", "year_of_publication": pd.Int64Dtype(), "publisher": "object", "img_url": "object", "language": "object", "category": "object", "summary": "object", "img_path": "object"},
                           delimiter=self.sep,
                           engine="python").drop(["img_url", "img_path"], axis = 1)

In [158]:
brds = BRDataset("../data", "./br")
brds.convert_inter()
brds.convert_user()
brds.convert_item()
del brds

100%|██████████| 2784/2784 [00:00<00:00, 12259.68it/s]
100%|██████████| 68092/68092 [00:06<00:00, 10902.00it/s]
100%|██████████| 149570/149570 [00:33<00:00, 4410.96it/s]


In [22]:
class BRTestDataset(BaseDataset):
    def __init__(self, input_path, output_path):
        super(BRTestDataset, self).__init__(input_path, output_path)
        self.dataset_name = "br_test"

        self.inter_file = os.path.join(self.input_path, "test_ratings.csv")

        self.sep = ","

        # output_path
        output_files = self.get_output_files()
        self.output_inter_file = output_files[0]

        # selected feature fields
        self.inter_fields = {
            0: "user_id:token",
            1: "isbn:token",
            2: "rating:float"
        }

    def load_inter_data(self):
        df = pd.read_csv(self.inter_file,
            dtype={"user_id": "object", "isbn": "object", "rating": "float"}
           )
        # approx. 1 month + 2 weeks
        df = df[-len(df)*3//48:].reset_index(drop=True)
        # Further downsampling to avoid OOM
        uus = df["user_id"].unique()
        sampled_users = np.random.choice(uus, len(uus)//6)
        df = df.query('user_id in @sampled_users')
        return df

In [24]:
brdsTest = BRTestDataset("../data", "./br")
brdsTest.convert_inter()
del brdsTest

100%|██████████| 698/698 [00:00<00:00, 13406.65it/s]


In [4]:
cfg_str = """
data_path: ./
dataset: br
field_separator: "\\t"
USER_ID_FIELD: user_id
ITEM_ID_FIELD: isbn
RATING_FIELD: rating
TIME_FIELD: ~
LABEL_FIELD: rating
show_progress: false

load_col:
    inter: [user_id, isbn, rating]
    user: [user_id, location, age]
    item: [isbn, book_title, book_author, year_of_publication, publisher, language, category, summary]

epochs: 5
learning_rate: 0.01
user_inter_num_interval: "[0,inf)"
item_inter_num_interval: "[0,inf)"
filter_inter_by_user_or_item: false
neg_sampling:
    uniform: 1
eval_args:
    split: {'RS': [6, 2, 2]}
    group_by: None
    mode: labeled
metrics: ['RMSE']
valid_metric: RMSE
"""


with open("br/config.yaml", "w") as f:
    f.write(cfg_str)

In [5]:
def run(model_name):
    if model_name in [
        "MultiVAE",
        "MultiDAE",
        "MacridVAE",
        "RecVAE",
        "GRU4Rec",
        "NARM",
        "STAMP",
        "NextItNet",
        "TransRec",
        "SASRec",
        "BERT4Rec",
        "SRGNN",
        "GCSAN",
        "GRU4RecF",
        "FOSSIL",
        "SHAN",
        "RepeatNet",
        "HRM",
        "NPE",
    ]:
        parameter_dict = {
            "neg_sampling": None
        }
        return run_recbole(
            model=model_name,
            dataset='br',
            config_file_list=['br/config.yaml'],
            config_dict=parameter_dict,
        )
    else:
        return run_recbole(
            model=model_name,
            dataset='br',
            config_file_list=['br/config.yaml'],
        )

In [None]:
model_list = ["FFM"]
# model_list = ["Pop", "ItemKNN", "BPR", "NeuMF", "RecVAE", "LightGCN"] # General
# model_list += ["FFM", "DeepFM"] # Context-aware
for model_name in model_list:
    print(f"running {model_name}...")
    start = time.time()
    result = run(model_name)
    t = time.time() - start
    print(f"It took {t/60:.2f} mins")
    print(result)

In [None]:
model_list = ["Pop", "ItemKNN", "BPR", "NeuMF", "RecVAE", "LightGCN"] # General
model_list += ["FFM", "DeepFM"] # Context-aware

In [1]:
from recbole.config.configurator import Config
from recbole.data.utils import create_dataset, data_preparation
from recbole.utils import get_model, get_trainer

def objective_function(config_dict=None, config_file_list=None):

    config = Config(config_dict=config_dict, config_file_list=config_file_list)
    dataset = create_dataset(config)
    train_data, valid_data, test_data = data_preparation(config, dataset)
    model_name = config['model']
    model = get_model(model_name)(config, train_data._dataset).to(config['device'])
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)
    best_valid_score, best_valid_result = trainer.fit(train_data, valid_data, verbose=False)
    test_result = trainer.evaluate(test_data)

    return {
        'model': model_name,
        'best_valid_score': best_valid_score,
        'valid_score_bigger': config['valid_metric_bigger'],
        'best_valid_result': best_valid_result,
        'test_result': test_result
    }

In [2]:
from ray import tune
from ray.tune.schedulers import ASHAScheduler

asha_scheduler = ASHAScheduler(
    time_attr='training_iteration',
    metric='best_valid_result/rmse',
    mode='min',
    max_t=100,
    grace_period=10,
    reduction_factor=3,
    brackets=1,
)

config = {
    "model": tune.grid_search(["FM"]),
    "epochs": tune.randint(10, 100),
    "train_batch_size": tune.grid_search([256, 512, 1024, 2048]),
    "learning_rate": tune.loguniform(1e-5, 1e-1)
}

result = tune.run(
    tune.with_parameters(objective_function, config_file_list=['/opt/ml/recbole/br/config.yaml']),
    config=config,
    num_samples=5,
    log_to_file='log',
    scheduler=asha_scheduler,
    local_dir='result',
    verbose=1
)
best_trial = result.get_best_trial('best_valid_result/rmse', 'min')
print("best params: ",best_trial.config)
print("best result: ",best_trial.last_result)

0,1
Current time:,2023-04-17 02:38:05
Running for:,00:43:00.09
Memory:,6.6/88.5 GiB

Trial name,status,loc,epoch,learning_rate,model,train_batch_size,iter,total time (s),best_valid_score
objective_function_dfed1_00000,TERMINATED,172.17.0.2:57053,25,1.24804e-05,FM,256,1,1039.16,6.6896
objective_function_dfed1_00001,TERMINATED,172.17.0.2:57125,76,0.0755485,FM,512,1,1049.27,6.47
objective_function_dfed1_00002,TERMINATED,172.17.0.2:57127,74,0.0868598,FM,1024,1,1047.44,6.494
objective_function_dfed1_00003,TERMINATED,172.17.0.2:57129,55,3.55554e-05,FM,2048,1,1041.41,6.7383
objective_function_dfed1_00004,TERMINATED,172.17.0.2:57135,96,2.01359e-05,FM,256,1,1050.56,6.6912
objective_function_dfed1_00005,TERMINATED,172.17.0.2:57137,14,0.00741768,FM,512,1,1049.32,6.4524
objective_function_dfed1_00006,TERMINATED,172.17.0.2:57139,79,0.0406584,FM,1024,1,1049.5,6.5004
objective_function_dfed1_00007,TERMINATED,172.17.0.2:57143,89,0.00397239,FM,2048,1,1036.38,6.3967
objective_function_dfed1_00008,TERMINATED,172.17.0.2:57053,12,0.00199372,FM,256,1,1022.77,6.3275
objective_function_dfed1_00009,TERMINATED,172.17.0.2:57143,13,0.00440251,FM,512,1,1020.59,6.5127


2023-04-17 02:38:05,813	INFO tune.py:798 -- Total run time: 2580.13 seconds (2580.08 seconds for the tuning loop).


best params:  {'model': 'FM', 'epoch': 30, 'train_batch_size': 512, 'learning_rate': 0.006364591701150409}
best result:  {'model': 'FM', 'best_valid_score': 6.273, 'valid_score_bigger': False, 'best_valid_result': OrderedDict([('rmse', 6.273)]), 'test_result': OrderedDict([('rmse', 6.5666)]), 'time_this_iter_s': 498.17821502685547, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 1, 'trial_id': 'dfed1_00017', 'experiment_id': '6c184032fc8249c396bd9badb3060271', 'date': '2023-04-17_02-37-49', 'timestamp': 1681699069, 'time_total_s': 498.17821502685547, 'pid': 57053, 'hostname': 'cf68c92cecb9', 'node_ip': '172.17.0.2', 'config': {'model': 'FM', 'epoch': 30, 'train_batch_size': 512, 'learning_rate': 0.006364591701150409}, 'time_since_restore': 498.17821502685547, 'timesteps_since_restore': 0, 'iterations_since_restore': 1, 'warmup_time': 0.003553628921508789, 'experiment_tag': '17_epoch=30,learning_rate=0.0064,model=FM,train_batch_size=512'}


In [10]:
from recbole.quick_start import run_recbole

result = run_recbole(
    model='FM',
    dataset='br',
    config_file_list=['br/config.yaml'],
    config_dict={'epochs': 30, 'train_batch_size': 512, 'learning_rate': 0.006364591701150409},
    saved=True
)

17 Apr 02:57    INFO  ['/opt/conda/envs/bc_recbole/lib/python3.8/site-packages/ipykernel_launcher.py', '-f', '/opt/ml/.local/share/jupyter/runtime/kernel-a88d0c04-6357-4a6c-a845-f7de34815b66.json']
17 Apr 02:57    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = /opt/ml/recbole/br
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 30
train_batch_size = 512
learner = adam
learning_rate = 0.006364591701150409
train_neg_sample_args = {'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 10

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [6, 2, 2]}, 'group_by': 'None', 'mode': 'labeled', 'order': 'RO'}
repeatable = False
me

In [12]:
config, model, dataset, train_data, valid_data, test_data = load_data_and_model(
    model_file='saved/FM-Apr-17-2023_02-57-45.pth',
)

17 Apr 02:58    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = /opt/ml/recbole/br
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 30
train_batch_size = 512
learner = adam
learning_rate = 0.006364591701150409
train_neg_sample_args = {'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 10

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [6, 2, 2]}, 'group_by': 'None', 'mode': 'labeled', 'order': 'RO'}
repeatable = False
metrics = ['RMSE']
topk = [10]
valid_metric = RMSE
valid_metric_bigger = False
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separator =  
USER_ID_

In [41]:
import torch

test = pd.read_csv('~/data/test_ratings.csv', dtype={"user_id": int, "isbn": int, "rating": float})

{columnName: torch.tensor(columnData.values) for (columnName, columnData) in test.items()}

# input_inter = Interaction({columnName: columnData for (columnName, columnData) in stu_df.iteritems()})

# input_inter = Interaction({
#         'user_id': torch.tensor([1, 2]),
#         'item_id_list': torch.tensor([[1, 2, 3, 0, 0],
#                                       [4, 5, 0, 0, 0]]),
#         'item_length': torch.tensor([3, 2]),
#     })

ValueError: invalid literal for int() with base 10: '042518630X'

In [39]:
torch.from_numpy(test['user_id'].values)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [25]:
input_inter = Interaction({
        'user_id': torch.tensor([1, 2]),
        'item_id_list': torch.tensor([[1, 2, 3, 0, 0],
                                      [4, 5, 0, 0, 0]]),
        'item_length': torch.tensor([3, 2]),
    })

model.predict('/br/br_test.inter')

TypeError: string indices must be integers

In [15]:
from ray import tune
from ray.tune.schedulers import ASHAScheduler

asha_scheduler = ASHAScheduler(
    time_attr='training_iteration',
    metric='best_valid_result/rmse',
    mode='min',
    max_t=100,
    grace_period=10,
    reduction_factor=3,
    brackets=1,
)

config = {
    "model": tune.grid_search(["FM"]),
    "epoch": tune.randint(10, 100),
    "train_batch_size": tune.grid_search([256, 512, 1024, 2048]),
    "learning_rate": tune.loguniform(1e-5, 1e-1)
}

result = tune.run(
    tune.with_parameters(objective_function, config_file_list=['/opt/ml/recbole/br/config.yaml']),
    config=config,
    num_samples=5,
    log_to_file='log',
    scheduler=asha_scheduler,
    local_dir='result',
    verbose=1
)
best_trial = result.get_best_trial('best_valid_result/rmse', 'min')
print("best params: ",best_trial.config)
print("best result: ",best_trial.last_result)

best params:  {}
best result:  {'model': 'FM', 'best_valid_score': 6.6316, 'valid_score_bigger': False, 'best_valid_result': OrderedDict([('rmse', 6.6316)]), 'test_result': OrderedDict([('rmse', 6.6092)]), 'time_this_iter_s': 35.80374240875244, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 1, 'trial_id': '1c16f_00002', 'experiment_id': '373b290cca96456fa965af312cfb0946', 'date': '2023-04-16_18-19-22', 'timestamp': 1681669162, 'time_total_s': 35.80374240875244, 'pid': 46048, 'hostname': 'cf68c92cecb9', 'node_ip': '172.17.0.2', 'config': {}, 'time_since_restore': 35.80374240875244, 'timesteps_since_restore': 0, 'iterations_since_restore': 1, 'warmup_time': 0.003397703170776367, 'experiment_tag': '2'}


In [15]:
Config(config_dict=None, config_file_list=['/opt/ml/recbole/br/config.yaml'])


[1;35mGeneral Hyper Parameters:
[0m[1;36mgpu_id[0m =[1;33m 0[0m
[1;36muse_gpu[0m =[1;33m True[0m
[1;36mseed[0m =[1;33m 2020[0m
[1;36mstate[0m =[1;33m INFO[0m
[1;36mreproducibility[0m =[1;33m True[0m
[1;36mdata_path[0m =[1;33m /opt/ml/recbole/br[0m
[1;36mcheckpoint_dir[0m =[1;33m saved[0m
[1;36mshow_progress[0m =[1;33m False[0m
[1;36msave_dataset[0m =[1;33m False[0m
[1;36mdataset_save_path[0m =[1;33m None[0m
[1;36msave_dataloaders[0m =[1;33m False[0m
[1;36mdataloaders_save_path[0m =[1;33m None[0m
[1;36mlog_wandb[0m =[1;33m False[0m

[1;35mTraining Hyper Parameters:
[0m[1;36mepochs[0m =[1;33m 5[0m
[1;36mtrain_batch_size[0m =[1;33m 2048[0m
[1;36mlearner[0m =[1;33m adam[0m
[1;36mlearning_rate[0m =[1;33m 0.01[0m
[1;36mtrain_neg_sample_args[0m =[1;33m {'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}[0m
[1;36meval_step[0m =[1;33m 1[0m
[1;36mstopping_step[0m =

In [None]:
from ray import tune
from ray.tune.schedulers import ASHAScheduler

asha_scheduler = ASHAScheduler(
    time_attr='training_iteration',
    metric='best_valid_result/rmse',
    mode='min',
    max_t=100,
    grace_period=10,
    reduction_factor=3,
    brackets=1,
)

config = {
    "learning_rate": tune.grid_search([0.001, 0.01]),
    "momentum": tune.grid_search([0.5, 0.9])
}

result = tune.run(
    tune.with_parameters(objective_function, config_file_list=['/opt/ml/recbole/br/config.yaml']),
    config=config,
    num_samples=5,
    log_to_file='log',
    scheduler=asha_scheduler,
    local_dir='result',
    verbose=1
)
best_trial = result.get_best_trial('best_valid_result/rmse', 'min')
print("best params: ",best_trial.config)
print("best result: ",best_trial.last_result)