In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import os
import logging
import csv
from collections import OrderedDict


def create_log_id(dir_path):
    log_count = 0
    file_path = os.path.join(dir_path, 'log{:d}.log'.format(log_count))
    while os.path.exists(file_path):
        log_count += 1
        file_path = os.path.join(dir_path, 'log{:d}.log'.format(log_count))
    return log_count


def logging_config(folder=None, name=None,
                   level=logging.DEBUG,
                   console_level=logging.DEBUG,
                   no_console=True):

    if not os.path.exists(folder):
        os.makedirs(folder)
    for handler in logging.root.handlers:
        logging.root.removeHandler(handler)
    logging.root.handlers = []
    logpath = os.path.join(folder, name + ".log")
    print("All logs will be saved to %s" %logpath)

    logging.root.setLevel(level)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logfile = logging.FileHandler(logpath)
    logfile.setLevel(level)
    logfile.setFormatter(formatter)
    logging.root.addHandler(logfile)

    if not no_console:
        logconsole = logging.StreamHandler()
        logconsole.setLevel(console_level)
        logconsole.setFormatter(formatter)
        logging.root.addHandler(logconsole)
    return folder

In [3]:
import os
from collections import OrderedDict

import torch

def early_stopping(recall_list, stopping_steps):
    best_recall = max(recall_list)
    best_step = recall_list.index(best_recall)
    if len(recall_list) - best_step - 1 >= stopping_steps:
        should_stop = True
    else:
        should_stop = False
    return best_recall, should_stop


def save_model(model, model_dir, current_epoch, last_best_epoch=None):
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    model_state_file = os.path.join(model_dir, 'model_epoch{}.pth'.format(current_epoch))
    torch.save({'model_state_dict': model.state_dict(), 'epoch': current_epoch}, model_state_file)

    if last_best_epoch is not None and current_epoch != last_best_epoch:
        old_model_state_file = os.path.join(model_dir, 'model_epoch{}.pth'.format(last_best_epoch))
        if os.path.exists(old_model_state_file):
            os.system('rm {}'.format(old_model_state_file))
            
def save_checkpoint(model_dir, model, optimizer, current_epoch, best_recall, best_epoch, metrics_list, epoch_list):
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    checkpoint_file = os.path.join(model_dir, 'checkpoint_epoch{}.pth'.format(current_epoch))
    torch.save({'model_state_dict': model.state_dict(), 
                'optimizer_state_dict': optimizer.state_dict(), 
                'epoch': current_epoch,
                'best_recall': best_recall,
                'best_epoch': best_epoch,
                'metrics_list': metrics_list,
                'epoch_list': epoch_list
               }, checkpoint_file)

def load_model(model, model_path):
    checkpoint = torch.load(model_path, map_location=torch.device('cpu'), weights_only=False)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    return model

In [4]:
import torch
import numpy as np
from sklearn.metrics import roc_auc_score, log_loss, mean_squared_error


def calc_recall(rank, ground_truth, k):
    """
    calculate recall of one example
    """
    return len(set(rank[:k]) & set(ground_truth)) / float(len(set(ground_truth)))


def precision_at_k(hit, k):
    """
    calculate Precision@k
    hit: list, element is binary (0 / 1)
    """
    hit = np.asarray(hit)[:k]
    return np.mean(hit)


def precision_at_k_batch(hits, k):
    """
    calculate Precision@k
    hits: array, element is binary (0 / 1), 2-dim
    """
    res = hits[:, :k].mean(axis=1)
    return res


def average_precision(hit, cut):
    """
    calculate average precision (area under PR curve)
    hit: list, element is binary (0 / 1)
    """
    hit = np.asarray(hit)
    precisions = [precision_at_k(hit, k + 1) for k in range(cut) if len(hit) >= k]
    if not precisions:
        return 0.
    return np.sum(precisions) / float(min(cut, np.sum(hit)))


def dcg_at_k(rel, k):
    """
    calculate discounted cumulative gain (dcg)
    rel: list, element is positive real values, can be binary
    """
    rel = np.asfarray(rel)[:k]
    dcg = np.sum((2 ** rel - 1) / np.log2(np.arange(2, rel.size + 2)))
    return dcg


def ndcg_at_k(rel, k):
    """
    calculate normalized discounted cumulative gain (ndcg)
    rel: list, element is positive real values, can be binary
    """
    idcg = dcg_at_k(sorted(rel, reverse=True), k)
    if not idcg:
        return 0.
    return dcg_at_k(rel, k) / idcg


def ndcg_at_k_batch(hits, k):
    """
    calculate NDCG@k
    hits: array, element is binary (0 / 1), 2-dim
    """
    hits_k = hits[:, :k]
    dcg = np.sum((2 ** hits_k - 1) / np.log2(np.arange(2, k + 2)), axis=1)

    sorted_hits_k = np.flip(np.sort(hits), axis=1)[:, :k]
    idcg = np.sum((2 ** sorted_hits_k - 1) / np.log2(np.arange(2, k + 2)), axis=1)

    idcg[idcg == 0] = np.inf
    ndcg = (dcg / idcg)
    return ndcg


def recall_at_k(hit, k, all_pos_num):
    """
    calculate Recall@k
    hit: list, element is binary (0 / 1)
    """
    hit = np.asfarray(hit)[:k]
    return np.sum(hit) / all_pos_num


def recall_at_k_batch(hits, k):
    """
    calculate Recall@k
    hits: array, element is binary (0 / 1), 2-dim
    """
    res = (hits[:, :k].sum(axis=1) / hits.sum(axis=1))
    return res


def F1(pre, rec):
    if pre + rec > 0:
        return (2.0 * pre * rec) / (pre + rec)
    else:
        return 0.


def calc_auc(ground_truth, prediction):
    try:
        res = roc_auc_score(y_true=ground_truth, y_score=prediction)
    except Exception:
        res = 0.
    return res


def logloss(ground_truth, prediction):
    logloss = log_loss(np.asarray(ground_truth), np.asarray(prediction))
    return logloss


def calc_metrics_at_k(cf_scores, train_user_dict, test_user_dict, user_ids, item_ids, Ks, num_negatives=100):
    '''
    Calculate precision, recall, and NDCG at K for each user with negative sampling.
    Negative sampling selects 100 items that are neither the ground truth item nor in the user's train set.

    cf_scores: (n_users, n_items)
    '''

    binary_hit = []
    temp_cf_scores = []
    test_indices = []

    test_pos_item_binary = np.concatenate((
        np.ones((len(user_ids), 1)),
        np.zeros((len(user_ids), num_negatives))
    ), axis=1)


    for idx, user in enumerate(user_ids):
        # Ground truth items for the user
        test_item = set(test_user_dict[user])

        # Items in the training set to be excluded
        train_items = set(train_user_dict[user])

        # Negative samples: items not in the test items and not in the train items
        possible_negatives = [item for item in item_ids if item not in train_items and item not in test_item]
        negative_samples = np.random.choice(possible_negatives, num_negatives, replace=False)

        # Selected items for testing: ground truth + negative samples
        test_set = list(test_item) + list(negative_samples)
        # test_indices.append(test_set)

        # Get the corresponding scores of these items from the cf_scores matrix
        temp_cf_scores.append(cf_scores[idx][test_set].tolist())

    try:
        _, rank_indices = torch.sort(torch.LongTensor(temp_cf_scores).cuda(), descending=True)    # try to speed up the sorting process
    except:
        _, rank_indices = torch.sort(torch.LongTensor(temp_cf_scores), descending=True)

    rank_indices = rank_indices.cpu()

    # binary_hit = [] # shape (n_users, num_negatives+1)
    # test_indices = np.asarray(test_indices)

    for i in range(len(user_ids)):
        binary_hit.append(test_pos_item_binary[i][rank_indices[i]])
    binary_hit = np.array(binary_hit, dtype=np.float32)

    metrics_dict = {}
    for k in Ks:
        metrics_dict[k] = {}
        metrics_dict[k]['precision'] = precision_at_k_batch(binary_hit, k)
        metrics_dict[k]['recall']    = recall_at_k_batch(binary_hit, k)
        metrics_dict[k]['ndcg']      = ndcg_at_k_batch(binary_hit, k)

    return metrics_dict

In [5]:
import torch
import numpy as np
import pandas as pd
import os
import random
import scipy.sparse as sp
import ast
from torch.utils.data import Dataset, DataLoader

class DataBuilderFM(object):
    def __init__(self, args, logging):
        self.args = args
        self.data_dir = args.data_dir
        self.train_file = os.path.join(self.data_dir, 'train_df.csv')
        self.test_file = os.path.join(
            self.data_dir,
            'val_df.csv' if self.args.mode == 'train' else 'test_df.csv'
        )
        self.kg_file = os.path.join(self.data_dir, "kg_final.txt")
        self.user_file = os.path.join(self.data_dir, "user_list.txt")

        self.train_batch_size = args.train_batch_size
        self.test_batch_size = args.test_batch_size

        kg_data = self.load_kg(self.kg_file)
        users_info = self.load_user_info(self.user_file)
        self.cf_train_data, self.train_user_dict = self.load_train_cf(self.train_file)
        self.cf_test_data, self.test_user_dict = self.load_test_cf(self.test_file)
        self.statistic_cf()

        self.construct_data(kg_data, users_info)
        self.print_info(logging)

    def load_train_cf(self, filename):
        user = []
        item = []
        user_dict = dict()
    
        df = pd.read_csv(filename)
    
        for _, row in df.iterrows():
            user_id = int(row['user'])
    
            item_ids = list(set(ast.literal_eval(row['feature'])))
    
            for item_id in item_ids:
                user.append(user_id)
                item.append(item_id)
            user_dict[user_id] = item_ids
    
        user = np.array(user, dtype=np.int32)
        item = np.array(item, dtype=np.int32)
        return (user, item), user_dict

    def load_test_cf(self, filename):
        user = []
        item = []
        user_dict = dict()
    
        df = pd.read_csv(filename, header=0, names=['user', 'label', 'time'])
    
        for _, row in df.iterrows():
            user_id = int(row['user'])
            item_id = int(row['label'])

            user.append(user_id)
            item.append(item_id)
            if user_id not in user_dict:
                user_dict[user_id] = []
            user_dict[user_id].append(item_id)
    
        user = np.array(user, dtype=np.int32)
        item = np.array(item, dtype=np.int32)
        return (user, item), user_dict


    def statistic_cf(self):
        self.n_users = max(max(self.cf_train_data[0]), max(self.cf_test_data[0])) + 1
        self.n_items = max(max(self.cf_train_data[1]), max(self.cf_test_data[1])) + 1
        self.n_cf_train = len(self.cf_train_data[0])
        self.n_cf_test = len(self.cf_test_data[0])


    def load_kg(self, filename):
        kg_data = pd.read_csv(filename, sep='\t', names=['h', 'r', 't'], engine='python')
        kg_data = kg_data.drop_duplicates()
        return kg_data

    def load_user_info(self, filename):
        user_data = pd.read_csv(filename, sep=' ')
        user_data = user_data.drop_duplicates()
        return user_data

    def construct_data(self, kg_data, users_info):
        # construct user matrix
        feat_rows = list(range(self.n_users))
        feat_cols = list(range(self.n_users))
        feat_data = [1] * self.n_users

        self.n_user_attr = self.n_users

        if users_info is not None:
            user_cols = [col for col in users_info.columns
                             if col not in ['id', 'remap_id']]
            
            for col in user_cols:
                feat_rows += list(range(self.n_users))
                feat_cols += (users_info[col] + self.n_user_attr).to_list()
                feat_data += [1] * users_info.shape[0]
                self.n_user_attr += max(users_info[col]) + 1

        self.user_matrix = sp.coo_matrix((feat_data, (feat_rows, feat_cols)), shape=(self.n_users, self.n_user_attr)).tocsr()

        # construct feature matrix
        self.n_entities = max(max(kg_data['h']), max(kg_data['t'])) + 1

        feat_rows = list(range(self.n_items))
        feat_cols = list(range(self.n_items))
        feat_data = [1] * self.n_items

        filtered_kg_data = kg_data[kg_data['h'] < self.n_items]
        feat_rows += filtered_kg_data['h'].tolist()
        feat_cols += filtered_kg_data['t'].tolist()
        feat_data += [1] * filtered_kg_data.shape[0]

        self.feat_matrix = sp.coo_matrix((feat_data, (feat_rows, feat_cols)), shape=(self.n_items, self.n_entities)).tocsr()

        self.n_users_entities = self.n_user_attr + self.n_entities

    def print_info(self, logging):
        logging.info('n_users:              %d' % self.n_users)
        logging.info('n_items:              %d' % self.n_items)
        logging.info('n_entities:           %d' % self.n_entities)
        logging.info('n_user_attr:           %d' % self.n_user_attr)
        logging.info('n_users_entities:     %d' % self.n_users_entities)

        logging.info('n_cf_train:           %d' % self.n_cf_train)
        logging.info('n_cf_test:            %d' % self.n_cf_test)

        logging.info('shape of user_matrix: {}'.format(self.user_matrix.shape))
        logging.info('shape of feat_matrix: {}'.format(self.feat_matrix.shape))


class TrainDatasetFM(Dataset):
    def __init__(self, user_dict):
        self.all_users = list(user_dict.keys())

    def __len__(self):
        return len(self.all_users)

    def __getitem__(self, idx):
        user = self.all_users[idx]
        return user


def process_user_batch(batch_user, user_dict, user_matrix, feat_matrix):
    def sample_pos_items_for_u(user_dict, u, num_samples=1):
        pos_items = user_dict[u]
        return random.sample(pos_items, num_samples)
    
    def sample_neg_items_for_u(user_dict, u, num_samples=1, all_item_ids=None):
        pos_items = set(user_dict[u])
        neg_items = []
        while len(neg_items) < num_samples:
            item = random.choice(all_item_ids)
            if item not in pos_items:
                neg_items.append(item)
        return neg_items

    batch_user = batch_user.tolist()  # Tensor to list if needed

    # To get all item IDs once
    all_item_ids = list(range(feat_matrix.shape[0]))

    pos_items, neg_items = [], []

    for u in batch_user:
        pos_items += sample_pos_items_for_u(user_dict, u, 1)
        neg_items += sample_neg_items_for_u(user_dict, u, 1, all_item_ids=all_item_ids)

    batch_user_np = np.array(batch_user)
    batch_pos_item_np = np.array(pos_items)
    batch_neg_item_np = np.array(neg_items)


    user_features = user_matrix[batch_user_np]             # shape: (B, user_feat_dim)
    pos_item_features = feat_matrix[batch_pos_item_np]     # shape: (B, item_feat_dim)
    neg_item_features = feat_matrix[batch_neg_item_np]     # shape: (B, item_feat_dim)

    pos_feature_values = sp.hstack([user_features, pos_item_features])
    neg_feature_values = sp.hstack([user_features, neg_item_features])

    return pos_feature_values, neg_feature_values



def generate_test_batch(batch_user, n_items, user_matrix, feat_matrix):
    rep_batch_user = np.repeat(batch_user, n_items)
    batch_user_sp = user_matrix[rep_batch_user]

    batch_item_sp = sp.vstack([feat_matrix] * len(batch_user))

    feature_values = sp.hstack([batch_user_sp, batch_item_sp])
    return  feature_values

In [6]:
import torch
from torch import nn
from torch.nn import functional as F
import numpy as np
import scipy.sparse as sp

class FM(nn.Module):

    def __init__(self, args,
                 n_users, n_items, n_entities, n_user_attr,
                 user_pre_embed=None, item_pre_embed=None):

        super(FM, self).__init__()
        self.preload = args.preload
        self.n_users = n_users
        self.n_items = n_items
        self.n_user_attr = n_user_attr
        self.n_entities = n_entities
        self.n_features = n_user_attr + n_entities

        self.embed_dim = args.embed_dim
        self.l2loss_lambda = args.l2loss_lambda

        self.linear = nn.Linear(self.n_features, 1)
        nn.init.xavier_uniform_(self.linear.weight)

        self.feature_embed = nn.Parameter(torch.Tensor(self.n_features, self.embed_dim))
        nn.init.xavier_uniform_(self.feature_embed)

        self.h = nn.Linear(self.embed_dim, 1, bias=False)
        with torch.no_grad():
            self.h.weight.copy_(torch.ones([1, self.embed_dim]))
        for param in self.h.parameters():
            param.requires_grad = False

    def convert_coo2tensor(self, coo):
        values = coo.data
        indices = np.vstack((coo.row, coo.col))

        i = torch.LongTensor(indices)
        v = torch.FloatTensor(values)
        shape = coo.shape
        return torch.sparse_coo_tensor(i, v, torch.Size(shape)).coalesce()

    def calc_score(self, feature_values):
        """
        feature_values:  (batch_size, n_features), n_features = n_users + n_entities, torch.sparse.FloatTensor
        """
        # Bi-Interaction layer
        # Equation (4) / (3)
        feature_values = self.convert_coo2tensor(feature_values.tocoo())
        sum_square_embed = torch.mm(feature_values, self.feature_embed).pow(2)           # (batch_size, embed_dim)
        square_sum_embed = torch.mm(feature_values.pow(2), self.feature_embed.pow(2))    # (batch_size, embed_dim)
        z = 0.5 * (sum_square_embed - square_sum_embed)                                  # (batch_size, embed_dim)

        # Prediction layer
        # Equation (6)
        y = self.h(z)                                       # (batch_size, 1)
        # Equation (2) / (7) / (8)
        y = self.linear(feature_values) + y                 # (batch_size, 1)
        return y.squeeze()

    def calc_loss(self, pos_feature_values, neg_feature_values):
        """
        pos_feature_values:  (batch_size, n_features), torch.sparse.FloatTensor
        neg_feature_values:  (batch_size, n_features), torch.sparse.FloatTensor
        """
        pos_scores = self.calc_score(pos_feature_values)            # (batch_size)
        neg_scores = self.calc_score(neg_feature_values)            # (batch_size)

        loss = (-1.0) * torch.log(1e-10 + F.sigmoid(pos_scores - neg_scores))
        loss = torch.mean(loss)

        l2_loss = torch.norm(self.h.weight, 2).pow(2) / 2
        loss += self.l2loss_lambda * l2_loss
        return loss


    def forward(self, *input, is_train):
        if is_train:
            return self.calc_loss(*input)
        else:
            return self.calc_score(*input)

In [7]:
import sys
import random
import itertools
from time import time
import datetime
import pandas as pd
from tqdm import tqdm
import scipy.sparse as sp
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler

def evaluate(args, model, databuilder, Ks):
    test_batch_size = databuilder.test_batch_size
    train_user_dict = databuilder.train_user_dict
    test_user_dict = databuilder.test_user_dict

    model.eval()

    user_ids = list(test_user_dict.keys())
    user_ids_batches = [user_ids[i: i + test_batch_size] for i in range(0, len(user_ids), test_batch_size)]

    n_users = len(user_ids)
    n_items = databuilder.n_items
    item_ids = list(range(n_items))
    user_idx_map = dict(zip(user_ids, range(n_users)))

    cf_users = []
    cf_items = []
    cf_scores = []

    with tqdm(total=len(user_ids_batches), desc='Evaluating Iteration') as pbar:
        for batch_user in user_ids_batches:
            feature_values = generate_test_batch(batch_user, databuilder.n_items, databuilder.user_matrix, databuilder.feat_matrix)

            with torch.no_grad():
                batch_scores = model(feature_values, is_train=False)            # (batch_size)

            cf_users.extend(np.repeat(batch_user, n_items).tolist())
            cf_items.extend(item_ids * len(batch_user))
            cf_scores.append(batch_scores.cpu())
            pbar.update(1)

    rows = [user_idx_map[u] for u in cf_users]
    cols = cf_items
    cf_scores = torch.cat(cf_scores)
    cf_score_matrix = torch.Tensor(sp.coo_matrix((cf_scores, (rows, cols)), shape=(n_users, n_items)).todense())

    user_ids = np.array(user_ids)
    item_ids = np.array(item_ids)
    metrics_dict = calc_metrics_at_k(cf_score_matrix, train_user_dict, test_user_dict, user_ids, item_ids, Ks)

    cf_score_matrix = cf_score_matrix.numpy()
    for k in Ks:
        for m in ['precision', 'recall', 'ndcg']:
            metrics_dict[k][m] = metrics_dict[k][m].mean()
    return cf_score_matrix, metrics_dict

def train(args):
    dist.init_process_group("gloo", timeout=datetime.timedelta(seconds=7200))
    rank = dist.get_rank()

    # Seed đồng bộ trên tất cả process
    random.seed(args.seed)
    np.random.seed(args.seed)
    
    torch.manual_seed(args.seed)

    # Chỉ rank 0 log thông tin
    if rank == 0:
        log_save_id = create_log_id(args.save_dir)
        logging_config(folder=args.save_dir, name=f'log{log_save_id}', no_console=False)
        logging.info(args)

    # Load data + DistributedSampler
    data_builder = DataBuilderFM(args, logging)
    train_dataset = TrainDatasetFM(data_builder.train_user_dict)
    
    sampler = DistributedSampler(
        train_dataset,
        shuffle=True
    )
    dataloader = DataLoader(
        train_dataset,
        batch_size=data_builder.train_batch_size // dist.get_world_size(),
        sampler=sampler,
        num_workers=0,
        drop_last=True
    )

    model = FM(args, data_builder.n_users, data_builder.n_items, data_builder.n_entities, data_builder.n_user_attr)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    initial_epoch = 1
    
    if args.preload == 1:
        checkpoint = torch.load(args.checkpoint_path, weights_only=False)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        initial_epoch = checkpoint['epoch'] + 1

    model = DDP(model)

    if rank == 0:
        logging.info(model)

    if rank == 0:
        best_epoch = -1
        best_recall = 0
        Ks = eval(args.Ks)
        k_min = min(Ks)
        k_max = max(Ks)
        epoch_list = []
        metrics_list = {k: {'precision': [], 'recall': [], 'ndcg': []} for k in Ks}

        if args.preload == 1:
            best_epoch = checkpoint['best_epoch']
            best_recall = checkpoint['best_recall']
            epoch_list = checkpoint['epoch_list']
            metrics_list = checkpoint['metrics_list']

    # Huấn luyện
    steps_per_epoch = data_builder.n_cf_train // data_builder.train_batch_size
    
    for epoch in range(initial_epoch, args.n_epoch + 1):
        model.train()
        dataloader.sampler.set_epoch(epoch)  

        total_loss = 0.0
        dataloader_iter = iter(dataloader)

        for step in range(steps_per_epoch):
            try:
                batch_user = next(dataloader_iter)
            except StopIteration:
                dataloader_iter = iter(dataloader)
                batch_user = next(dataloader_iter)
    
            pos_feature_values, neg_feature_values = process_user_batch(
                batch_user=batch_user,
                user_dict=data_builder.train_user_dict,
                user_matrix=data_builder.user_matrix,
                feat_matrix=data_builder.feat_matrix
            )
            
            batch_loss = model(pos_feature_values, neg_feature_values, is_train=True)
            
            if torch.isnan(batch_loss).any():
                logging.error(f'ERROR: Epoch {epoch} Loss is nan.')
                sys.exit()
            batch_loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += batch_loss.item()

        average_loss = total_loss / steps_per_epoch
        
        average_loss = torch.tensor(average_loss).to(args.device)
        dist.all_reduce(average_loss, op=dist.ReduceOp.SUM)
        average_loss = average_loss.item() / dist.get_world_size()

        if rank == 0:
            logging.info(f'Epoch {epoch:04d} | Average Loss: {average_loss:.4f}')

        dist.barrier()
        # Đánh giá (chỉ rank 0)
        if rank == 0 and (epoch % args.evaluate_every == 0 or epoch == args.n_epoch):
            _, metrics_dict = evaluate(args, model.module, data_builder, Ks)

            # Log và lưu metrics
            logging.info('CF Evaluation: Epoch {:04d} | Precision [{:.4f}, {:.4f}], Recall [{:.4f}, {:.4f}], NDCG [{:.4f}, {:.4f}]'.format(
                epoch, metrics_dict[k_min]['precision'], metrics_dict[k_max]['precision'], metrics_dict[k_min]['recall'], metrics_dict[k_max]['recall'], metrics_dict[k_min]['ndcg'], metrics_dict[k_max]['ndcg']))

            epoch_list.append(epoch)
            for k in Ks:
                for m in ['precision', 'recall', 'ndcg']:
                    metrics_list[k][m].append(metrics_dict[k][m])

            # Early stopping
            best_recall, should_stop = early_stopping(metrics_list[k_max]['recall'], args.stopping_steps)
            if should_stop:
                break

            if metrics_list[Ks[-1]]['recall'][-1] == best_recall:
                save_model(model.module, args.save_dir, epoch, best_epoch)
                logging.info(f'Save model at epoch {epoch:04d}!')
                best_epoch = epoch
                
        if rank == 0 and (epoch % args.checkpoint_every == 0 or epoch == args.n_epoch):
            save_checkpoint(args.save_dir, model.module, optimizer, epoch, best_recall, best_epoch, metrics_list, epoch_list)
        dist.barrier()
    # Lưu kết quả cuối cùng (rank 0)
    if rank == 0:
        metrics_df = [epoch_list]
        metrics_cols = ['epoch_idx']
        for k in Ks:
            for m in ['precision', 'recall', 'ndcg']:
                metrics_df.append(metrics_list[k][m])
                metrics_cols.append('{}@{}'.format(m, k))
        metrics_df = pd.DataFrame(metrics_df).transpose()
        metrics_df.columns = metrics_cols
        metrics_df.to_csv(args.save_dir + '/metrics.csv', index=False)

        best_metrics = metrics_df.loc[metrics_df['epoch_idx'] == best_epoch].iloc[0].to_dict()
        logging.info('Best CF Evaluation: Epoch {:04d} | Precision [{:.4f}, {:.4f}], Recall [{:.4f}, {:.4f}], NDCG [{:.4f}, {:.4f}]'.format(
            int(best_metrics['epoch_idx']), best_metrics['precision@{}'.format(k_min)], best_metrics['precision@{}'.format(k_max)], best_metrics['recall@{}'.format(k_min)], best_metrics['recall@{}'.format(k_max)], best_metrics['ndcg@{}'.format(k_min)], best_metrics['ndcg@{}'.format(k_max)]))

    dist.destroy_process_group()


In [9]:
from types import SimpleNamespace
args = SimpleNamespace(
    mode='train',
    seed=2024,
    data_dir='/kaggle/input/mooc-fm',
    preload=0,
    embed_dim=64,
    l2loss_lambda=1e-5,
    train_batch_size=1024,
    test_batch_size=1024,
    lr=0.0001,
    n_epoch=50,
    stopping_steps=2,
    checkpoint_every=5,
    evaluate_every=50,
    Ks='[1, 5, 10]',
    save_dir='/kaggle/working/',
    device='cpu',
    checkpoint_path='/kaggle/input/fm-distributed-model/pytorch/default/2/checkpoint_epoch30.pth'
)

In [8]:
from pyspark.ml.torch.distributor import TorchDistributor
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("DistributedTorchTrain") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/21 06:42:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [12]:
distributor = TorchDistributor(num_processes=2, local_mode=True, use_gpu=False)
distributor.run(train, args)

[W513 14:34:50.298867606 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W513 14:35:00.309617832 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W513 14:35:13.228267404 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W513 14:35:13.294432797 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W513 14:35:23.239109062 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W513 14:35:33.249772110 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3
All logs will be saved to /kaggle/working/log0.log
2025-05-13 14:35:33,362 - root - INFO - namespace(mode='train', seed=2024, data_dir='/kaggle/input/mooc-fm', preload=0, embed_dim=64, l2loss_lambda=1e-05, train_batch_size=1024, test_batch_size=1024, lr=0.0001, n_epoch=50, stopping_steps=2, checkpoint_every=5, evaluate_every=50, Ks='[1, 5

In [16]:
from types import SimpleNamespace
args = SimpleNamespace(
    mode='train',
    seed=2024,
    data_dir='/kaggle/input/mooc-fm',
    preload=1,
    embed_dim=64,
    l2loss_lambda=1e-5,
    train_batch_size=1024,
    test_batch_size=1024,
    lr=0.0001,
    n_epoch=150,
    stopping_steps=2,
    checkpoint_every=5,
    evaluate_every=50,
    Ks='[1, 5, 10]',
    save_dir='/kaggle/working/',
    device='cpu',
    checkpoint_path='/kaggle/input/new_fm_model/pytorch/default/4/checkpoint_epoch100.pth'
)

In [12]:
from pyspark.ml.torch.distributor import TorchDistributor
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("DistributedTorchTrain") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/20 15:47:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [17]:
distributor = TorchDistributor(num_processes=2, local_mode=True, use_gpu=False)
distributor.run(train, args)

[W519 23:36:49.117494470 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W519 23:36:59.127250923 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W519 23:37:11.921680215 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W519 23:37:11.938077352 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W519 23:37:21.932414797 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W519 23:37:31.942955141 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3
All logs will be saved to /kaggle/working/log1.log
2025-05-19 23:37:31,958 - root - INFO - namespace(mode='train', seed=2024, data_dir='/kaggle/input/mooc-fm', preload=1, embed_dim=64, l2loss_lambda=1e-05, train_batch_size=1024, test_batch_size=1024, lr=0.0001, n_epoch=150, stopping_steps=2, checkpoint_every=5, evaluate_every=50, Ks='[1, 

In [10]:
args.pretrain_model_path = "/kaggle/input/new_fm_model/pytorch/default/6/checkpoint_epoch190.pth"
args.mode = "test"

In [11]:
def predict(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    data_builder = DataBuilderFM(args, logging)
     
    model = FM(args, data_builder.n_users, data_builder.n_items, data_builder.n_entities, data_builder.n_user_attr)
    model = load_model(model, args.pretrain_model_path)
    model.to(device)

    Ks = eval(args.Ks)
    k_min = min(Ks)
    k_max = max(Ks)
    
    cf_scores, metrics_dict = evaluate(args, model, data_builder, Ks)
    for k in Ks:
        print(f'*** CF Evaluation @{k} ***')
        print(f'Precision@{k}   : ', metrics_dict[k]['precision'])
        print(f'Recall@{k}      : ', metrics_dict[k]['recall'])
        print(f'NDCG@{k}        : ', metrics_dict[k]['ndcg'])

predict(args)

Evaluating Iteration: 100%|██████████| 98/98 [55:49<00:00, 34.18s/it]


*** CF Evaluation @1 ***
Precision@1   :  0.19940983
Recall@1      :  0.19940983
NDCG@1        :  0.19940982294688406
*** CF Evaluation @5 ***
Precision@5   :  0.094080225
Recall@5      :  0.4704011
NDCG@5        :  0.3398102717534796
*** CF Evaluation @10 ***
Precision@10   :  0.06079024
Recall@10      :  0.60790235
NDCG@10        :  0.38425794189657836
