In [None]:
!pip install tensorboardX

In [3]:
import os
import time
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.backends.cudnn as cudnn
from tensorboardX import SummaryWriter

import pandas as pd
import scipy.sparse as sp

### config, args

In [6]:
config = {
    "model": "MLP",
    "model_path": "./models/",
    "train_rating": "/content/drive/MyDrive/Bitamin/컨퍼런스/Data/ml-1m.train.rating",
    "test_negative": "/content/drive/MyDrive/Bitamin/컨퍼런스/Data/ml-1m.test.negative",
}

args = {
    "batch_size": 256,
    "dropout": 0.0,
    "epochs": 20,
    "factor_num": 32, 
    "gpu": "0",
    "lr": 0.001,
    "num_layers": 3,
    "num_ng": 4,
    "out": True,
    "test_num_ng": 99,
    "top_k": 10,
}
os.environ["CUDA_VISIBLE_DEVICES"] = args["gpu"]
cudnn.benchmark = True

### Import data

In [7]:
train_data = pd.read_csv(config["train_rating"])
with open(config["test_negative"], "r") as fd:
    lines = fd.readlines()

print(train_data.shape, len(lines))
print(train_data.head(10))
print(lines[:2])

(994168, 1) 6040
   0\t32\t4\t978824330
0  0\t34\t4\t978824330
1   0\t4\t5\t978824291
2  0\t35\t4\t978824291
3  0\t30\t4\t978824291
4  0\t29\t3\t978824268
5  0\t33\t4\t978824268
6  0\t40\t5\t978824268
7  0\t10\t5\t978824268
8  0\t16\t3\t978824268
9  0\t23\t5\t978824195
['(0,25)\t1064\t174\t2791\t3373\t269\t2678\t1902\t3641\t1216\t915\t3672\t2803\t2344\t986\t3217\t2824\t2598\t464\t2340\t1952\t1855\t1353\t1547\t3487\t3293\t1541\t2414\t2728\t340\t1421\t1963\t2545\t972\t487\t3463\t2727\t1135\t3135\t128\t175\t2423\t1974\t2515\t3278\t3079\t1527\t2182\t1018\t2800\t1830\t1539\t617\t247\t3448\t1699\t1420\t2487\t198\t811\t1010\t1423\t2840\t1770\t881\t1913\t1803\t1734\t3326\t1617\t224\t3352\t1869\t1182\t1331\t336\t2517\t1721\t3512\t3656\t273\t1026\t1991\t2190\t998\t3386\t3369\t185\t2822\t864\t2854\t3067\t58\t2551\t2333\t2688\t3703\t1300\t1924\t3118\n', '(1,133)\t1072\t3154\t3368\t3644\t549\t1810\t937\t1514\t1713\t2186\t660\t2303\t2416\t670\t1176\t788\t889\t3120\t2344\t2525\t3301\t2055\t1436\t2630

train data

In [16]:
train_data = pd.read_csv(
    config["train_rating"],
    sep="\t",
    header=None,
    names=["user", "item"],
    usecols=[0, 1],
    dtype={0: np.int32, 1: np.int32},
)

In [17]:
train_data

Unnamed: 0,user,item
0,0,32
1,0,34
2,0,4
3,0,35
4,0,30
...,...,...
994164,6039,1092
994165,6039,41
994166,6039,128
994167,6039,323


test data

In [21]:
test_data = []
with open(config["test_negative"], "r") as fd:
    line = fd.readline()
    while line != None and line != "":
        arr = line.split("\t")
        u = eval(arr[0])[0]
        test_data.append([u, eval(arr[0])[1]])
        for i in arr[1:]:
            test_data.append([u, int(i)])
        line = fd.readline()

In [31]:
test_data

[[0, 25],
 [0, 1064],
 [0, 174],
 [0, 2791],
 [0, 3373],
 [0, 269],
 [0, 2678],
 [0, 1902],
 [0, 3641],
 [0, 1216],
 [0, 915],
 [0, 3672],
 [0, 2803],
 [0, 2344],
 [0, 986],
 [0, 3217],
 [0, 2824],
 [0, 2598],
 [0, 464],
 [0, 2340],
 [0, 1952],
 [0, 1855],
 [0, 1353],
 [0, 1547],
 [0, 3487],
 [0, 3293],
 [0, 1541],
 [0, 2414],
 [0, 2728],
 [0, 340],
 [0, 1421],
 [0, 1963],
 [0, 2545],
 [0, 972],
 [0, 487],
 [0, 3463],
 [0, 2727],
 [0, 1135],
 [0, 3135],
 [0, 128],
 [0, 175],
 [0, 2423],
 [0, 1974],
 [0, 2515],
 [0, 3278],
 [0, 3079],
 [0, 1527],
 [0, 2182],
 [0, 1018],
 [0, 2800],
 [0, 1830],
 [0, 1539],
 [0, 617],
 [0, 247],
 [0, 3448],
 [0, 1699],
 [0, 1420],
 [0, 2487],
 [0, 198],
 [0, 811],
 [0, 1010],
 [0, 1423],
 [0, 2840],
 [0, 1770],
 [0, 881],
 [0, 1913],
 [0, 1803],
 [0, 1734],
 [0, 3326],
 [0, 1617],
 [0, 224],
 [0, 3352],
 [0, 1869],
 [0, 1182],
 [0, 1331],
 [0, 336],
 [0, 2517],
 [0, 1721],
 [0, 3512],
 [0, 3656],
 [0, 273],
 [0, 1026],
 [0, 1991],
 [0, 2190],
 [0, 998],
 

In [32]:
def load_all():
    """ We load all the three file here to save time in each epoch. """
    train_data = pd.read_csv(
        config["train_rating"],
        sep="\t",
        header=None,
        names=["user", "item"],
        usecols=[0, 1],
        dtype={0: np.int32, 1: np.int32},
    )

    user_num = train_data["user"].max() + 1
    item_num = train_data["item"].max() + 1

    # dok matrix 형식으로 저장하기
    train_data = train_data.values.tolist()

    train_mat = sp.dok_matrix((user_num, item_num), dtype=np.float32)
    for x in train_data:
        train_mat[x[0], x[1]] = 1.0

    test_data = []
    with open(config["test_negative"], "r") as fd:
        line = fd.readline()
        while line != None and line != "":
            arr = line.split("\t")
            u = eval(arr[0])[0]
            test_data.append([u, eval(arr[0])[1]])
            for i in arr[1:]:
                test_data.append([u, int(i)])
            line = fd.readline()
    return train_data, test_data, user_num, item_num, train_mat

# prepare dataset
train_data, test_data, user_num, item_num, train_mat = load_all()

### Data Loader

'train_data' only contains positive sample, therefore add negative samples to train a model

In [33]:
class NCFData(data.Dataset):
    def __init__(self, features, num_item, train_mat=None, num_ng=0, is_training=None):
        super(NCFData, self).__init__()
        """ Note that the labels are only useful when training, we thus 
			add them in the ng_sample() function.
		"""
        # self.features_ps = [[0, 121], [0, 199], [1, 456],...]
        self.features_ps = features
        self.num_item = num_item
        self.train_mat = train_mat
        self.num_ng = num_ng
        self.is_training = is_training
        self.labels = [0] * len(features)

    def set_ng_sample(self):
        assert self.is_training, "no need to sampling when testing"

        # negative sample 더하기
        self.features_ng = []
        for x in self.features_ps:
            # user
            u = x[0]
            for _ in range(self.num_ng):
                j = np.random.randint(self.num_item)
                # train set에 있는 경우 다시 뽑기
                while (u, j) in self.train_mat:
                    j = np.random.randint(self.num_item)
                self.features_ng.append([u, j])

        labels_ps = [1] * len(self.features_ps)
        labels_ng = [0] * len(self.features_ng)

        self.features_fill = self.features_ps + self.features_ng
        self.labels_fill = labels_ps + labels_ng

    def __len__(self):
        return (self.num_ng + 1) * len(self.labels)

    def __getitem__(self, idx):
        features = self.features_fill if self.is_training else self.features_ps
        labels = self.labels_fill if self.is_training else self.labels

        user = features[idx][0]
        item = features[idx][1]
        label = labels[idx]
        return user, item, label

def prepare_data(train_data, test_data, item_num, train_mat):

    # construct the train and test datasets
    # args = (features, num_item, train_mat=None, num_ng=0, is_training=None)
    train_dataset = NCFData(train_data, item_num, train_mat, args["num_ng"], True)
    test_dataset = NCFData(test_data, item_num, train_mat, 0, False)
    train_loader = data.DataLoader(
        train_dataset, batch_size=args["batch_size"], shuffle=True, num_workers=4
    )
    test_loader = data.DataLoader(
        test_dataset, batch_size=args["test_num_ng"] + 1, shuffle=False, num_workers=0
    )

    return train_loader, test_loader

train_loader, test_loader = prepare_data(train_data, test_data, item_num, train_mat)

### Model

In [34]:
class NCF(nn.Module):
    def __init__(
        self, user_num, item_num, factor_num, num_layers, dropout, model,
    ):
        super(NCF, self).__init__()
        """
		user_num: number of users;
		item_num: number of items;
		factor_num: number of predictive factors;
		num_layers: the number of layers in MLP model;
		dropout: dropout rate between fully connected layers;
		model: 'MLP', 'GMF', 'NeuMF-end', and 'NeuMF-pre';
		"""
        self.dropout = dropout
        self.model = model

        # Embeddings; (num_embeddings, embedding_dim)
        self.embed_user_MLP = nn.Embedding(
            user_num, factor_num * (2 ** (num_layers - 1))
        )
        self.embed_item_MLP = nn.Embedding(
            item_num, factor_num * (2 ** (num_layers - 1))
        )

        MLP_modules = []
        for i in range(num_layers):
            input_size = factor_num * (2 ** (num_layers - i))
            MLP_modules.append(nn.Dropout(p=self.dropout))
            MLP_modules.append(nn.Linear(input_size, input_size // 2))
            MLP_modules.append(nn.ReLU())
        self.MLP_layers = nn.Sequential(*MLP_modules)
        predict_size = factor_num
        self.predict_layer = nn.Linear(predict_size, 1)
        self._init_weight_()

    def _init_weight_(self):
        # weight
        nn.init.normal_(self.embed_user_MLP.weight, std=0.01)
        nn.init.normal_(self.embed_item_MLP.weight, std=0.01)
        for m in self.MLP_layers:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
        nn.init.kaiming_uniform_(self.predict_layer.weight, a=1, nonlinearity="sigmoid")

        # bias
        for m in self.modules():
            if isinstance(m, nn.Linear) and m.bias is not None:
                m.bias.data.zero_()

    def forward(self, user, item):
        embed_user_MLP = self.embed_user_MLP(user)
        embed_item_MLP = self.embed_item_MLP(item)
        # Embedding vectors
        interaction = torch.cat((embed_user_MLP, embed_item_MLP), -1)
        output_MLP = self.MLP_layers(interaction)
        concat = output_MLP

        # Prediction
        prediction = self.predict_layer(concat)
        return prediction.view(-1)

def create_model(user_num, item_num, args):
    model = NCF(
        user_num,
        item_num,
        args["factor_num"],
        args["num_layers"],
        args["dropout"],
        config["model"],
    )
    model.cuda()
    loss_function = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=args["lr"])
    return model, loss_function, optimizer

# Model
model, loss_function, optimizer = create_model(user_num, item_num, args)

### Metrics

In [35]:
def hit(gt_item, pred_items):
    if gt_item in pred_items:
        return 1
    return 0


def ndcg(gt_item, pred_items):
    if gt_item in pred_items:
        index = pred_items.index(gt_item)
        return np.reciprocal(np.log2(index + 2))
    return 0


def metrics(model, test_loader, top_k):
    HR, NDCG = [], []

    for user, item, _ in test_loader:
        user = user.cuda()
        item = item.cuda()

        predictions = model(user, item)
        # 가장 높은 top_k개 선택
        _, indices = torch.topk(predictions, top_k)
        # 해당 상품 index 선택
        recommends = torch.take(item, indices).cpu().numpy().tolist()
        # 정답값 선택
        gt_item = item[0].item()
        HR.append(hit(gt_item, recommends))
        NDCG.append(ndcg(gt_item, recommends))

    return np.mean(HR), np.mean(NDCG)

### Train

* `writer` = `SummaryWriter()`
* `writer.add_scalar("data/loss", loss.item(), count)`: record loss for each count
* `writer.add_scalar("test/HR", np.mean(HR), epoch)`: record HR average for each epoch
* `writer.add_scalar("test/NDCG", np.mean(NDCG), epoch)`: record NDCG average for each epoch

In [None]:
if __name__ == "__main__":
    count, best_hr = 0, 0
    writer = SummaryWriter()  # for visualization
    for epoch in range(args["epochs"]):
        model.train()  # Enable dropout (if have).
        start_time = time.time()
        train_loader.dataset.set_ng_sample()

        for user, item, label in train_loader:
            user = user.cuda()
            item = item.cuda()
            label = label.float().cuda()

            # gradient 초기화
            model.zero_grad()
            prediction = model(user, item)
            loss = loss_function(prediction, label)
            loss.backward()
            optimizer.step()
            writer.add_scalar("data/loss", loss.item(), count)
            count += 1

        model.eval()
        HR, NDCG = metrics(model, test_loader, args["top_k"])

        elapsed_time = time.time() - start_time
        print(
            "The time elapse of epoch {:03d}".format(epoch)
            + " is: "
            + time.strftime("%H: %M: %S", time.gmtime(elapsed_time))
        )
        print("HR: {:.3f}\tNDCG: {:.3f}".format(np.mean(HR), np.mean(NDCG)))

        if HR > best_hr:
            best_hr, best_ndcg, best_epoch = HR, NDCG, epoch
            if args["out"]:
                if not os.path.exists(config["model_path"]):
                    os.mkdir(config["model_path"])
                torch.save(
                    model, "{}{}.pth".format(config["model_path"], config["model"])
                )

    print(
        "End. Best epoch {:03d}: HR = {:.3f}, NDCG = {:.3f}".format(
            best_epoch, best_hr, best_ndcg
        )
    )