In [None]:
# Detect Google Colab and mount Google Drive there; on any other runtime,
# just report that we are not on Colab.
try:
    from google.colab import drive
except ImportError:
    # Only a missing `google.colab` package means "not Colab"; any other
    # error should surface instead of being silently swallowed.
    print('Not Colab.')
else:
    # Importing `drive` alone does not mount anything. Mount explicitly so the
    # '/content/drive/MyDrive/...' paths used by later cells actually exist.
    drive.mount('/content/drive')
    print('Google Drive is mounted successfully for Colab.')

Google Drive is mounted successfully for Colab.


In [None]:
%%capture
import os
import sys
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from scipy.sparse import csr_matrix
from tqdm import tqdm
import scipy.sparse as sp
from time import time
from pprint import pprint

In [None]:
# CAUTION - Set your options #
# Project root on Google Drive. The yelp2018 raw data is read from the
# `yelp2018/` sub-directory of this folder, so point it at the parent folder.
path: str = '/content/drive/MyDrive/기초추천시스템/model'
# Keep Python from writing bytecode (__pycache__) files for imported modules.
sys.dont_write_bytecode = True

In [None]:
# Your Environment Setting #
# Make the project root the working directory (presumably so that local
# modules such as `data_utils` can be imported by later cells — confirm).
os.chdir(path)

# Build the data file locations with os.path.join so that a trailing slash on
# `path` (or a different path separator) cannot produce malformed paths.
train_file   = os.path.join(path, 'yelp2018', 'train.txt')
test_file    = os.path.join(path, 'yelp2018', 'test.txt')
adj_mat_file = os.path.join(path, 'yelp2018', 's_pre_adj_mat.npz')

# Compare by filesystem identity rather than by string: os.getcwd() returns a
# resolved path, so a trailing slash or a symlink in `path` would make a plain
# string comparison fail even though the chdir succeeded.
assert os.path.samefile(os.getcwd(), path), (
    f'Working directory is {os.getcwd()!r}, expected {path!r}'
)

In [None]:
# Training Resource Check #
# Use CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
GPU = torch.cuda.is_available()
device = torch.device('cuda') if GPU else torch.device('cpu')

print("Your Device: " + ('GPU' if GPU else 'CPU'))

Your Device: CPU


# **Part1.** Data Pipeline

**`class Yelp2018`**  
> **Initialize** `(train_file, test_file)`  

`*.txt` 확장자인 Yelp2018 raw 데이터를 학습이 용이하도록 전처리

\\

**`class AdjacencyMatrix`** \\
> **Initialize** `(train_user, train_item, num_user, num_item, device)`

전처리한 학습 데이터를 기반으로 Normalized Adjacency Matrix를 구축  
특히, `get_sparse_graph(adj_mat_file)`의 반환값은 Normalized Adjacency Matrix임에 유의하며  
이는 논문에서 $\tilde{\mathbf{A}} := \mathbf{D}^{-\frac{1}{2}} \mathbf{A} \mathbf{D}^{-\frac{1}{2}}$ 이다.

\\

**`class PairwiseTrainData(torch.utils.data.Dataset)`** \\
> **Initialize** `(train_user, train_item, num_user, num_item)`  

전처리한 학습 데이터를 기반으로 BPR Loss 학습을 위한 Negative Sampling 과정을 구현

\\

**`class TestData(torch.utils.data.Dataset)`** \\
> **Initialize** `(train_user, train_item, test_user, test_item)`  

전처리한 학습 데이터와 테스트 데이터를 기반으로 구현  
특히, 추천시스템의 Metric의 특성 때문에 학습 데이터에서 본 아이템은 랭킹에서 제외해야한다.  
따라서 학습 데이터도 사용해야한다.

In [None]:
from data_utils import Yelp2018

# Build the dataset object from the raw train/test files, then copy the
# attributes the rest of the notebook uses into plain module-level names.
yelp2018 = Yelp2018(train_file, test_file)

# Dataset dimensions.
num_user, num_item = yelp2018.num_user, yelp2018.num_item

# Training split.
train_user, train_item, train_interaction = (
    yelp2018.train_user,
    yelp2018.train_item,
    yelp2018.train_interaction,
)

# Test split.
test_user, test_item, test_interaction = (
    yelp2018.test_user,
    yelp2018.test_item,
    yelp2018.test_interaction,
)

In [None]:
# Yelp2018 Statistics Check #
# Print the dataset size and how densely the user-item matrix is filled.
total_interaction = train_interaction + test_interaction

# Fraction of all (user, item) pairs that have an observed interaction.
# This ratio is the *density* of the interaction matrix; sparsity would be
# 1 - density, so the value is labelled "Density" below.
interaction_density = total_interaction / (num_user * num_item)

print('Yelp2018')
print(f"""
#user = {num_user}
#item = {num_item}

#interactions
    (train) {train_interaction}
    (test)  {test_interaction}
    (total) {total_interaction}

Density = {interaction_density}
""")

Yelp2018

#user = 31668
#item = 38048

#interactions
    (train) 1237259
    (test)  324147
    (total) 1561406

Sparsity = 0.0012958757851778645



In [None]:
from data_utils import AdjacencyMatrix

# Wrap the training interactions in the adjacency-matrix helper.
adjacency_matrix = AdjacencyMatrix(
    train_user,
    train_item,
    num_user,
    num_item,
    device,
)

# `graph` is the *normalized* adjacency matrix (see the Part1 notes),
# obtained via the file at `adj_mat_file`.
graph = adjacency_matrix.get_sparse_graph(adj_mat_file)

loading adjacency matrix
successfully loaded...


In [None]:
from data_utils import PairwiseTrainData

# Pairwise training dataset built from the training interactions.
train_dataset = PairwiseTrainData(train_user, train_item, num_user, num_item)

# A batch size of 5 keeps the batches printed by the inspection cell short.
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=5,
    num_workers=0,
)

In [None]:
# Print the first few training minibatches to sanity-check the
# (user, positive item, negative item) triplets produced by the dataloader.
#
# Sketch of where the real training step will go inside this loop:
#     optimizer.zero_grad()
#     pos_scores, neg_scores = model(user, pos_item, neg_item)
#     loss = bpr_loss(pos_scores, neg_scores)
#     loss_sum += loss.item()
#     loss.backward()
#     optimizer.step()
num_debug_batches = 5  # how many minibatches to print before stopping
debug_counter = 0

for minibatch in train_dataloader:
    # `torch.Tensor` is the tensor type; `torch.tensor` is only the factory
    # function and is not a meaningful annotation.
    user    : torch.Tensor = minibatch[0].to(device)
    pos_item: torch.Tensor = minibatch[1].to(device)
    neg_item: torch.Tensor = minibatch[2].to(device)

    print(f'user:     {user}')
    print(f'pos_item: {pos_item}')
    print(f'neg_item: {neg_item}')
    print()

    debug_counter += 1
    if debug_counter >= num_debug_batches:
        break

user:     tensor([16310, 13263,  7473,  6816,  9978])
pos_item: tensor([ 3064,  2481, 10700, 12776,  9611])
neg_item: tensor([  313, 26141,  2034,  4341, 28874])

user:     tensor([ 1164, 11166, 11406,  1411,  5051])
pos_item: tensor([ 4074, 19486,  4685, 17385,  2072])
neg_item: tensor([33372, 30135, 35141, 25222, 13206])

user:     tensor([14795, 12536,   458,  1291, 23448])
pos_item: tensor([27093,  1935,   605,  2807,  1275])
neg_item: tensor([ 7327, 36024,  8891, 17487, 13351])

user:     tensor([27245, 31018,  2159, 15277, 24395])
pos_item: tensor([33121, 14296,  4459,  7595,  5196])
neg_item: tensor([31629, 29869, 25356, 31230, 20561])

user:     tensor([29617, 19028, 12663,  3708,  1984])
pos_item: tensor([   87, 27883,  9168, 16103,  9588])
neg_item: tensor([24457, 21598, 15206, 23181,  7910])



In [None]:
from data_utils import TestData, collate_fn, remove_padding

# The test dataset receives both splits (train and test interactions).
test_dataset = TestData(train_user, train_item, test_user, test_item)

# Keep the original order (no shuffling); batches are assembled by the
# project's own `collate_fn`.
test_dataloader = DataLoader(
    test_dataset,
    collate_fn=collate_fn,
    shuffle=False,
    batch_size=3,
)

In [None]:
# Show the contents of the first test minibatch only.
debug_counter = 0

for minibatch in test_dataloader:
    user: list = minibatch[0]
    # Items each user has in the training split.
    # (remove_padding presumably strips padding added by collate_fn — confirm in data_utils)
    history: list = remove_padding(minibatch[1])
    # Items each user has in the test split.
    label: list = remove_padding(minibatch[2])

    for section in (user, history):
        pprint(section)
        print()
    pprint(label)

    # One batch is enough for inspection, so stop right after the first one.
    debug_counter += 1
    break


[tensor(0), tensor(1), tensor(2)]

[tensor([ 0, 17, 16, 14, 13, 12, 11, 10,  9, 15,  7,  6,  5,  4,  3,  2,  1,  8]),
 tensor([372, 373, 374, 375, 376, 377, 381, 379, 380, 382, 383, 371, 378, 370,
        361, 368, 367, 366, 365, 364, 363, 362, 384, 360, 359, 358, 357, 356,
        355, 369, 385, 394, 387, 417, 416, 415, 414, 413, 412, 411, 410, 409,
        408, 407, 406, 405, 404, 386, 403, 401, 400, 399, 398, 397, 396, 395,
        354, 393, 392, 391, 390, 389, 388, 402, 353, 344, 351, 315, 314, 313,
        312, 311, 310, 309, 308, 307, 306, 305, 304, 303, 302, 301, 300, 299,
        298, 297, 296, 295, 294, 293, 292, 291, 290, 289, 288, 287, 316, 317,
        318, 319, 350, 349, 348, 347, 346, 345, 418, 343, 342, 341, 340, 339,
        338, 337, 352, 336, 334, 333, 332, 331, 330, 329, 328, 327, 326, 325,
        324, 322, 321, 320, 335, 419, 428, 421, 518, 517, 516, 515, 514, 513,
        512, 511, 510, 509, 508, 507, 506, 505, 504, 503, 502, 501, 500, 499,
        498, 497, 496, 