In [1]:
import os
import numpy as np
import random

import torch
import torch.nn as nn

from model import *

import arguments

In [3]:
# !pip install cppimport

Defaulting to user installation because normal site-packages is not writeable
Collecting cppimport
  Downloading cppimport-22.8.2.tar.gz (26 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting pybind11 (from cppimport)
  Downloading pybind11-2.11.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.7/227.7 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
Building wheels for collected packages: cppimport
  Building wheel for cppimport (pyproject.toml) ... [?25ldone
[?25h  Created wheel for cppimport: filename=cppimport-22.8.2-py3-none-any.whl size=17499 sha256=05a7f084c0884377aecc26befa281c475d0cc136a10e65fcf1644c06c41fce06
  Stored in directory: /home/dable/.cache/pip/wheels/38/0d/cd/d3a3135529beaf8c8fde3957e56ff89c25cd14ab18358bd724
Successfully built cppimport
Installing collected packages: pybind11, cppimpor

In [4]:
import utils.load_dataset
import utils.data_loader
import utils.metrics
from utils.early_stop import EarlyStopping, Stop_args

In [5]:
from utils.metrics import auc

In [6]:
dataset = 'yahooR3'
base_model_args = {'emb_dim': 10, 'learning_rate': 0.01, 'imputaion_lambda': 0.01, 'weight_decay': 1}
weight1_model_args ={'learning_rate': 0.1, 'weight_decay': 0.001}
weight2_model_args =  {'learning_rate': 1e-3, 'weight_decay': 1e-2}
imputation_model_args = {'learning_rate': 1e-1, 'weight_decay': 1e-4}
training_args =  {'batch_size': 1024, 'epochs': 100, 'patience': 20, 'block_batch': [1000, 100]}
uniform_ratio = 0.05
seed = 0

In [7]:
from train_implicit import setup_seed
setup_seed(seed)

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)


cuda


## Load Dataset

### Indices: 
`When working with sparse tensors, the indices refer to the positions of non-zero elements in the tensor. It is a tensor of shape (N, D), where N represents the number of non-zero elements, and D corresponds to the number of dimensions or axes of the tensor. Each row in the indices tensor represents the coordinates of a non-zero element in the sparse tensor.`

### Value:
`The value refers to the actual non-zero values associated with the indices in a sparse tensor. It is a tensor of shape (N,), where N is the number of non-zero elements. Each element in the value tensor corresponds to the value of a non-zero element in the sparse tensor.`

### nnz:
`nnz stands for "number of non-zero elements." It represents the count of non-zero elements present in a sparse tensor. In other words, it denotes the length of the indices and value tensors.`

### Layout:
`The layout of a sparse tensor defines how the indices and values are stored in memory. Torch supports different sparse tensor layouts, such as "torch.sparse_coo", "torch.sparse_csr", and "torch.sparse_csc". Each layout has its own advantages and is suited for specific operations and computations. For example, the "torch.sparse_coo" layout stores the indices and values as separate tensors, while the "torch.sparse_csr" and "torch.sparse_csc" layouts store them in a compressed format.`

In [51]:
dataset='yahooR3'
training_args = {'batch_size': 1024,
                 'epochs': 500, 
                 'patience': 60, 
                 'block_batch': [6000, 500]}

base_model_args = {'emb_dim': 10, 
                   'learning_rate': 0.0001, 
                   'imputaion_lambda': 1, 
                   'weight_decay': 1}
weight1_model_args = {'learning_rate': 0.01, 'weight_decay': 0.01}
weight2_model_args = {'learning_rate': 1e-3, 'weight_decay': 1e-2}
imputation_model_args = {'emb_dim': 10, 'learning_rate': 1e-1, 'weight_decay': 1e-4}

In [16]:
# args = arguments.parse_args()
# para(args)
setup_seed(seed)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [17]:
train, unif_train, validation, test = utils.load_dataset.load_dataset(data_name=dataset, type = 'implicit', seed = seed, device=device)


In [18]:
train.shape, unif_train.shape, validation.shape, test.shape

(torch.Size([15400, 1000]),
 torch.Size([15400, 1000]),
 torch.Size([15400, 1000]),
 torch.Size([15400, 1000]))

In [33]:
train

tensor(indices=tensor([[    0,     0,     0,  ..., 15399, 15399, 15399],
                       [   13,   152,   169,  ...,   563,   636,   948]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       device='cuda:0', size=(15400, 1000), nnz=125077, layout=torch.sparse_coo)

In [66]:
train._indices().shape, train._values().shape, train._nnz()

(torch.Size([2, 125077]), torch.Size([125077]), 125077)

In [19]:
from train_implicit import train_and_eval

# train_and_eval(train, 
#                unif_train, 
#                validation, 
#                test,
#                device,
#                base_model_args = base_model_args, 
#                weight1_model_args = weight1_model_args,
#                weight2_model_args = weight2_model_args,
#                imputation_model_args = imputation_model_args, 
#                training_args = training_args)

## 1. Baseline Model - DR

In [20]:
train_data, unif_train_data, val_data, test_data = train, unif_train, validation, test

In [22]:
train_loader = utils.data_loader.Block(train_data, u_batch_size=training_args['block_batch'][0], i_batch_size=training_args['block_batch'][1], device=device)
val_loader = utils.data_loader.DataLoader(utils.data_loader.Interactions(val_data), batch_size=training_args['batch_size'], shuffle=False, num_workers=0)
test_loader = utils.data_loader.DataLoader(utils.data_loader.Interactions(test_data), batch_size=training_args['batch_size'], shuffle=False, num_workers=0)

In [23]:
type(train_loader)

utils.data_loader.Block

In [67]:
for u_batch_idx, users in enumerate(train_loader.User_loader):
  print(users.shape)
  break

torch.Size([6000])


### Naive Bayes propensity estimator & Functions used in building model

In [38]:
P_Oeq1 = train_data._nnz() / (train.size()[0] * train.size()[1])
P_Oeq1 # ratio of non-zero

0.008121883116883118

In [39]:
train_data._nnz() # number of non-zero elements

125077

In [40]:
y_unique = torch.unique(train_data._values())
y_unique

tensor([1.], device='cuda:0')

In [41]:
P_y_givenO = torch.zeros(y_unique.shape).to(device)
P_y = torch.zeros(y_unique.shape).to(device)
print(P_y_givenO, P_y)

tensor([0.], device='cuda:0') tensor([0.], device='cuda:0')


In [44]:
for i in range(len(y_unique)): 
    P_y_givenO[i] = torch.sum(train._values() == y_unique[i]) / torch.sum(torch.ones(train._values().shape).to(device))
    P_y[i] = torch.sum(unif_train_data._values() == y_unique[i]) / torch.sum(torch.ones(unif_train_data._values().shape).to(device))

Propensity = P_y_givenO / P_y * P_Oeq1

In [46]:
Propensity

tensor([0.0949], device='cuda:0')

In [47]:
InvP = torch.reciprocal(Propensity)
InvP

tensor([10.5340], device='cuda:0')

In [46]:
# number of total and non-zero elements in the tensor
print(train.coalesce().numel(), train.coalesce()._nnz())

15400000 125077


### Train start

In [48]:
n_user, n_item = train_data.shape
print(n_user, n_item)

15400 1000


In [49]:
base_model = MF(n_user, n_item, dim=base_model_args['emb_dim'], dropout=0).to(device)
base_optimizer = torch.optim.SGD(base_model.parameters(), lr=base_model_args['learning_rate'],  weight_decay=0)

In [52]:
imputation_model = MF(n_user, n_item, dim=imputation_model_args['emb_dim'], dropout=0).to(device)
imputation_optimizer = torch.optim.SGD(imputation_model.parameters(), lr=imputation_model_args['learning_rate'], weight_decay=0)

In [104]:
# Loss criterion
none_criterion = nn.MSELoss(reduction='none')
sum_criterion = nn.MSELoss(reduction='sum')

In [54]:
# Early stopping
stopping_args = Stop_args(patience=training_args['patience'], max_epochs=training_args['epochs'])
early_stopping = EarlyStopping(base_model, **stopping_args)

In [80]:
users, items = next(iter(train_loader.User_loader)), next(iter(train_loader.Item_loader))

In [89]:
print(users) # dim: 6000

tensor([  834,  5411, 14072,  ..., 13086,  2660,  1126], device='cuda:0')


In [82]:
print(items) # dim: 500

tensor([118, 531, 702, 489, 266, 221, 428, 738, 768, 136,  43, 217,  72,  24,
        123, 639, 558, 324, 560, 933, 829, 206, 274,  45, 389, 815, 632, 660,
        941, 200,  12, 239, 830, 855, 251,  51, 496, 485, 803, 762, 513, 947,
        969, 108, 466, 767, 688,  86, 986, 901, 848,  78, 763, 238,  38, 593,
        480, 538, 843, 349,  53, 938, 521, 409,  71,  19, 550, 888, 605, 935,
        852, 802, 407, 840, 920, 533, 116, 371, 326, 696, 783, 147, 429, 784,
        149,  40,  62,  17, 602, 177, 474, 786, 915, 872, 670, 163, 443, 215,
        205, 704, 346, 561, 191, 773,  13, 749, 892, 877, 173, 328, 765,  73,
        470, 823, 808, 778, 425, 599, 449, 383, 874, 922, 822, 949, 397, 824,
        582, 931, 107, 247, 897, 614, 904, 995, 755, 800, 942, 788,  95, 168,
        501,  52, 601, 336, 741, 894, 707, 456, 388, 504, 871, 146, 684,  58,
        135, 844, 928, 179, 955, 950, 629, 566, 754,  57, 211, 451, 340, 699,
        333, 774, 961, 325,  79, 372, 276, 732,  90,  82, 780, 8

In [83]:
# y_train: all 1's
# dim of users_train, items_train: 25142 (train._nnz() * 6000 * 500)
users_train, items_train, y_train = train_loader.get_batch(users, items, device)

In [93]:
weight = torch.ones(y_train.shape).to(device)

In [94]:
for i in range(len(y_unique)):
    weight[y_train==y_unique[i]] = InvP[i]

### Step 1: update imputation error model on observed data

In [96]:
imputation_model.train()

MF(
  (user_latent): Embedding(15400, 10)
  (item_latent): Embedding(1000, 10)
  (user_bias): Embedding(15400, 1)
  (item_bias): Embedding(1000, 1)
  (dropout): Dropout(p=0, inplace=False)
)

In [99]:
e_hat = imputation_model(users_train, items_train) # imputation error, dim: 25142

In [100]:
e_hat

tensor([-0.0430,  0.0629,  0.0201,  ..., -0.0500, -0.0386,  0.0917],
       device='cuda:0', grad_fn=<SqueezeBackward1>)

In [101]:
e = y_train - base_model(users_train, items_train) # prediction error, dim: 25142

In [110]:
e

tensor([0.9703, 1.0003, 0.9692,  ..., 1.0331, 0.9851, 0.9773], device='cuda:0',
       grad_fn=<SubBackward0>)

In [105]:
cost_e = none_criterion(e_hat, e) # the cost of error, MSE between imputation and prediction error

In [108]:
cost_e # (e[0].item() - e_hat[0].item()) ** 2 == cost_e[0].item()

tensor([1.0268, 0.8787, 0.9008,  ..., 1.1731, 1.0479, 0.7844], device='cuda:0',
       grad_fn=<MseLossBackward0>)

In [None]:
loss_imp = torch.sum(weight * cost_e) + imputation_model_args['weight_decay'] * imputation_model.l2_norm(users_train, items_train)

### Full training code

In [None]:
for epoch in range(early_stopping.max_epochs):
    for u_batch_idx, users in enumerate(train_loader.User_loader):
        for i_batch_idx, items in enumerate(train_loader.Item_loader):
            # observation data in this batch
            users_train, items_train, y_train = train_loader.get_batch(users, items, device)

## Train & Eval

In [48]:
# train_and_eval(train, 
#               unif_train,
#               validation,
#               test,
#               device, 
#               base_model_args=base_model_args, 
#               weight1_model_args=weight1_model_args, 
#               weight2_model_args=weight2_model_args, 
#               imputation_model_args=imputation_model_args, 
#               training_args=training_args)

In [49]:
# transform sparse to dense matrix
train_data = train
train_dense = train_data.to_dense()
train_dense

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [50]:
unif_train_data = unif_train
users_unif = unif_train_data._indices()[0]
items_unif = unif_train_data._indices()[1]
y_unif = unif_train_data._values()

In [51]:
# build data_loader. (block matrix data loader)

train_loader = utils.data_loader.Block(train_data,
                                       u_batch_size=training_args['block_batch'][0],
                                       i_batch_size=training_args['block_batch'][1],
                                       device=device)

In [52]:
val_data, test_data = validation, test
val_loader = utils.data_loader.DataLoader(utils.data_loader.Interactions(val_data), batch_size=training_args['batch_size'], shuffle=False, num_workers=0)
test_loader = utils.data_loader.DataLoader(utils.data_loader.Interactions(test_data), batch_size=training_args['batch_size'], shuffle=False, num_workers=0)

In [53]:
# data shape
n_user, n_item = train_data.shape
print(n_user, n_item)

15400 1000


## Models

In [54]:
class MF(nn.Module):
  """
  Base module for matrix factoriazation
  """
  def __init__(self, n_user, n_item, dim=40, dropout=0, init=None):
    super().__init__()
    
    self.user_latent = nn.Embedding(n_user, dim)
    self.item_latent = nn.Embedding(n_item, dim)
    
    self.user_bias = nn.Embedding(n_user, 1)
    self.item_bias = nn.Embedding(n_item, 1)
    
    self.dropout_p = dropout
    self.dropout = nn.Dropout(p=self.dropout_p)
    if not init:
      self.init_embedding(init)
    else:
      self.init_embedding(0)
    
  def init_embedding(self, init):
    nn.init.kaiming_normal_(self.user_latent.weight.data, mode='fan_out', a=init)
    nn.init.kaiming_normal_(self.item_latent.weight.data, mode='fan_out', a=init)
    nn.init.kaiming_normal_(self.user_bias.weight.data, mode='fan_out', a=init)
    nn.init.kaiming_normal_(self.item_bias.weight.data, mode='fan_out', a=init)
    
  def forward(self, users, items):
    u_latent = self.dropout(self.user_latent(users))
    i_latent = self.dropout(self.item_latent(items))
    
    u_bias = self.user_bias(users)
    i_bias = self.user_bias(items)
    
    preds = torch.sum(u_latent * i_latent, dim=1, keepdim=True) + u_bias + i_bias
    
    return preds.squeeze(dim=-1)
  
  def l2_norm(self, users, items):
    users = torch.unique(users)
    items = torch.unique(items)
    
    l2_loss = (torch.sum(self.user_latent(users)**2) + torch.sum(self.item_latent(items)**2)) / 2
    return l2_loss


In [55]:
from model import MetaModule, MetaEmbed
class MetaEmbed(MetaModule):
  """
  Base module for matrix factorization
  """
  def __init__(self, dim_1, dim_2):
    super().__init__()
    ignore = nn.Embedding(dim_1, dim_2)
    
    self.register_buffer('weight', to_var(ignore.weight.data, requires_grad=True))
    self.register_buffer('bias', None)
    
  def forward(self):
    return self.weight
  
  def named_leaves(self):
    return [('weight', self.weight), ('bias', self.bias)]

In [56]:
class MetaMF(MetaModule):
    """
    Base module for matrix factorization.
    """
    def __init__(self, n_user, n_item, dim=40, dropout=0, init = None):
        super().__init__()
        
        self.user_latent = MetaEmbed(n_user, dim)
        self.item_latent = MetaEmbed(n_item, dim)
        self.user_bias = MetaEmbed(n_user, 1)
        self.item_bias = MetaEmbed(n_item, 1)
        self.dropout_p = dropout
        self.dropout = nn.Dropout(p=self.dropout_p)
        if init is not None:
            self.init_embedding(init)
        else: 
            self.init_embedding(0)
        
    def init_embedding(self, init): 

        nn.init.kaiming_normal_(self.user_latent.weight, mode='fan_out', a = init)
        nn.init.kaiming_normal_(self.item_latent.weight, mode='fan_out', a = init)
        nn.init.kaiming_normal_(self.user_bias.weight, mode='fan_out', a = init)
        nn.init.kaiming_normal_(self.item_bias.weight, mode='fan_out', a = init)
          
    def forward(self, users, items):
        u_latent = self.dropout(self.user_latent.weight[users])
        i_latent = self.dropout(self.item_latent.weight[items])
        u_bias = self.user_bias.weight[users]
        i_bias = self.item_bias.weight[items]

        preds = torch.sum(u_latent * i_latent, dim=1, keepdim=True) + u_bias + i_bias
        return preds.squeeze(dim=-1)

    def l2_norm(self, users, items, unique = True): 

        users = torch.unique(users)
        items = torch.unique(items)
        
        l2_loss = (torch.sum(self.user_latent.weight[users]**2) + torch.sum(self.item_latent.weight[items]**2)) / 2
        return l2_loss

In [57]:
# import torch
# import torch.nn as nn

# # Define the embedding layer
# embedding_dim = 10
# vocab_size = 100
# embedding_layer = nn.Embedding(vocab_size, embedding_dim)

# # Example input indices
# indices = torch.tensor(list(range(4)))

# # Apply the embedding layer
# embedded_output = embedding_layer(indices)

# print(embedded_output.shape)

In [58]:
base_model = MetaMF(n_user, n_item, dim=base_model_args['emb_dim'], dropout=0).to(device)
base_optimizer = torch.optim.SGD(base_model.params(), lr=base_model_args['learning_rate'], weight_decay=0) # todo: other optimizer SGD

register_buffer 로 layer를 등록하면 어떤 특징이 있는가?

1. optimizer가 업데이트하지 않는다.

2. 그러나 값은 존재한다(하나의 layer로써 작용한다고 보면 된다.)

3. state_dict()로 확인이 가능하다.

4. GPU연산이 가능하다.

 

따라서 네트워크를 구성함에 있어서 네트워크를 end2end로 학습시키고 싶은데 중간에 업데이트를 하지않는 일반 layer를 넣고 싶을 때 사용할 수 있다.

In [59]:
def to_var(x, requires_grad=True):
  x = x.cuda() if torch.cuda.is_available() else x
  return Variable(x, requires_grad=requires_grad) # 현재는 모든 tensor에서 required_grad 옵션을 통해 gradient를 추적할 수 있기 때문에 따로 위와 같이 Variable로 감싸줄 필요가 없다. (현재는 쓸 필요 없음)


In [60]:
class OneLinear(nn.Module):
  """
  linear model: r
  """
  def __init__(self, n):
    super().__init__()

    self.bias = nn.Embedding(n,1)
    self.init_embedding()

  def init_embedding(self):
    self.bias.weight.data *= 0.01

  def forward(self, values):
    d_bias = self.bias(values)
    return d_bias.squeeze()


In [61]:
class TwoLinear(nn.Module):
  def __init__(self, n_user, n_item):
    super().__init__()

    self.user_bias = nn.Embedding(n_user, 1)
    self.item_bias = nn.Embedding(n_item, 1)

    self.init_embedding(0)
  
  def init_embedding(self, init):
    nn.init.kaiming_normal_(self.user_bias.weight, mode='fan_out', a=init)
    nn.init.kaiming_normal_(self.item_bias.weight, mode='fan_out', a=init)

  def forward(self, users, items):
    u_bias = self.user_bias(users)
    i_bias = self.item_bias(items)
    preds = u_bias + i_bias
    
    return preds.squeeze()

In [62]:
class ThreeLinear(nn.Module):
  """
  linear model: u + i + r / o
  """
  def __init__(self, n_user, n_item, n):
    super().__init__()
    
    self.user_bias = nn.Embedding(n_user, 1)
    self.item_bias = nn.Embedding(n_item, 1)
    self.data_bias= nn.Embedding(n, 1)
    self.init_embedding(0)
      
  def init_embedding(self, init): 
    nn.init.kaiming_normal_(self.user_bias.weight, mode='fan_out', a = init)
    nn.init.kaiming_normal_(self.item_bias.weight, mode='fan_out', a = init)
    nn.init.kaiming_normal_(self.data_bias.weight, mode='fan_out', a = init)
    self.data_bias.weight.data *= 0.001

  def forward(self, users, items, values):

    u_bias = self.user_bias(users)
    i_bias = self.item_bias(items)
    d_bias = self.data_bias(values)

    preds = u_bias + i_bias + d_bias
    return preds.squeeze()

In [63]:
weight1_model = TwoLinear(n_user, n_item).to(device)
weight1_optimizer = torch.optim.Adam(weight1_model.parameters(), 
                                     lr=base_model_args['learning_rate'],
                                     weight_decay=0)

In [64]:
weight2_model = ThreeLinear(n_user, n_item, 2).to(device)
weight2_optimizer = torch.optim.Adam(weight2_model.parameters(), 
                                     lr=weight2_model_args['learning_rate'],
                                     weight_decay=weight2_model_args['weight_decay'])

In [65]:
imputation_model = OneLinear(2).to(device)
imputation_optimizer = torch.optim.Adam(imputation_model.parameters(),
                                        lr=imputation_model_args['learning_rate'],
                                        weight_decay=imputation_model_args['weight_decay'])

In [66]:
# loss criterion
sum_criterion = nn.MSELoss(reduction='sum')
none_criterion = nn.MSELoss(reduction='none')

## Training

In [67]:
train_dense.shape

torch.Size([15400, 1000])

In [68]:
stopping_args = Stop_args(patience=training_args['patience'], max_epochs=training_args['epochs'])
early_stopping = EarlyStopping(base_model, **stopping_args)

In [76]:
from train_implicit import train_and_eval

# setting that works well
train_and_eval(train, 
               unif_train, 
               validation, 
               test,
               device,
               base_model_args = base_model_args, 
               weight1_model_args = weight1_model_args,
               weight2_model_args = weight2_model_args,
               imputation_model_args = imputation_model_args, 
               training_args = training_args)

In [33]:
# # users and items have shape of 1000 and 100, respectively
# for epoch in range(training_args['epochs']):
#   training_loss = 0
#   for u_batch_idx, users in enumerate(train_loader.User_loader):
#     for i_batch_idx, items in enumerate(train_loader.Item_loader):
#       # training set: 1. update parameters one_step (assumed update); 2. update parameters (real update) 
#       # uniform set: update hyper_parameters using gradient descent.
      
#       # print(users.shape, items.shape) # torch.Size([1000]) torch.Size([100])
      
#       # index_row = np.isin(train_data._indices()[0].cpu().numpy(), users.cpu().numpy())
#       # index_col = np.isin(train_data._indices()[1].cpu().numpy(), items.cpu().numpy())
#       # index = torch.tensor(np.where(index_row * index_col)[0]).to(device)
#       # print(index_row.shape, index_col.shape, index.shape) # (125077,) (125077,) <- number of observed elements / torch.Size([1023])
      
#       # y_train consists of 1s only
#       users_train, items_train, y_train = train_loader.get_batch(users, items, device)
#       # print(users_train.shape, items_train.shape) # ex. torch.Size([1023]) torch.Size([1023])
    
#       # calculate weight 1
#       weight1_model.train()
#       weight1 = weight1_model(users_train, items_train)
#       weight1 = torch.exp(weight1/5)
      
#       # all pair
#       all_pair = torch.cartesian_prod(users, items)
#       users_all, items_all = all_pair[:, 0], all_pair[:, 1]
#       # print(users_all.shape) # torch.Size([1000 * 100])
      
#       # calculate weight 2
#       weight2_model.train()
#       weight2 = weight2_model(users_all, items_all, (train_dense[users_all,items_all]!=0)*1) # *1: bool -> int
#       weight2 = torch.exp(weight2/5)
      
#       # caclculate imputation values
#       imputation_model.train()
#       impu_f_all = torch.tanh(imputation_model((train_dense[users_all,items_all]).long()))
      
#       # print(weight1.shape, weight2.shape, impu_f_all.shape) # torch.Size([788]) torch.Size([100000]) torch.Size([100000])
      
#       ######################################
#       ## 1. Assumed Update of theta (Black Arrows) #
#       ######################################
#       # one_step_model: assumed model, just update one step on base model. it is for updating weight parameters
#       one_step_model = MetaMF(n_user, n_item, dim=base_model_args['emb_dim'], dropout=0)
#       one_step_model.load_state_dict(base_model.state_dict()) # state_dict 는 간단히 말해 각 계층을 매개변수 텐서로 매핑되는 Python 사전(dict) 객체
      
#       # formal parameter: using training set to update parameters
#       one_step_model.train()
#       # all pair data in this block
#       y_hat_f_all = one_step_model(users_all, items_all)
#       cost_f_all = none_criterion(y_hat_f_all, impu_f_all)
#       loss_f_all = torch.sum(cost_f_all * weight2)

#       # observation data
#       y_hat_f_obs = one_step_model(users_train, items_train)
#       cost_f_obs = none_criterion(y_hat_f_obs, y_train)
#       loss_f_obs = torch.sum(cost_f_obs * weight1)
#       loss_f = loss_f_obs + base_model_args['imputaion_lambda'] * loss_f_all + base_model_args['weight_decay'] * one_step_model.l2_norm(users_all, items_all)
      
#       # assumed update (not a real update)
#       one_step_model.zero_grad()
#       grads = torch.autograd.grad(loss_f, (one_step_model.params()), create_graph=True)
#       one_step_model.update_params(base_model_args['learning_rate'], source_params=grads)
      
#       ######################################
#       ### 2. Update of pi (Blue Arrows) ####
#       ######################################
#       # latter hyper_parameter: Using uniform set to update hyper_parameters
#       y_hat_l = one_step_model(users_unif, items_unif)
#       loss_l = sum_criterion(y_hat_l, y_unif)
      
#       # update hyper-parameters
#       weight1_optimizer.zero_grad()
#       weight2_optimizer.zero_grad()
#       imputation_optimizer.zero_grad()
#       loss_l.backward()
#       if epoch >= 20:
#         weight1_optimizer.step()
#         weight2_optimizer.step()
#       imputation_optimizer.step()
      
      
#       ######################################
#       ### 3. Update of theta (Black Arrows) ####
#       ######################################
#       # use new weights to update parameters (real update)
#       weight1_model.train()
#       weight1 = weight1_model(users_train, items_train)
#       weight1 = torch.exp(weight1/5)
      
#       # calculate weight2
#       weight2_model.train()
#       weight2 = weight2_model(users_all, items_all,(train_dense[users_all,items_all]!=0)*1)
#       weight2 = torch.exp(weight2/5) # for stable training
      
#       # use new imputation to update parameters
#       imputation_model.train()
#       impu_all = torch.tanh(imputation_model((train_dense[users_all,items_all]).long()))

#       # loss of training set
#       base_model.train()
#       # all pair
#       y_hat_all = base_model(users_all, items_all)
#       cost_all = none_criterion(y_hat_all, impu_all)
#       loss_all = torch.sum(cost_all * weight2)
#       # observation
#       y_hat_obs = base_model(users_train, items_train)
#       cost_obs = none_criterion(y_hat_obs, y_train)
#       loss_obs = torch.sum(cost_obs * weight1)
#       loss = loss_obs + base_model_args['imputaion_lambda'] * loss_all + base_model_args['weight_decay'] * base_model.l2_norm(users_all, items_all)
      
#       base_optimizer.zero_grad()
#       loss.backward()
#       base_optimizer.step()

#       training_loss += loss.item()
      
#   base_model.eval()
#   with torch.no_grad():
#       # training metrics
#       train_pre_ratings = torch.empty(0).to(device)
#       train_ratings = torch.empty(0).to(device)
#       for u_batch_idx, users in enumerate(train_loader.User_loader): 
#           for i_batch_idx, items in enumerate(train_loader.Item_loader): 
#               users_train, items_train, y_train = train_loader.get_batch(users, items, device)
#               pre_ratings = base_model(users_train, items_train)
#               train_pre_ratings = torch.cat((train_pre_ratings, pre_ratings))
#               train_ratings = torch.cat((train_ratings, y_train))

#       # validation metrics
#       val_pre_ratings = torch.empty(0).to(device)
#       val_ratings = torch.empty(0).to(device)
#       for batch_idx, (users, items, ratings) in enumerate(val_loader):
#           pre_ratings = base_model(users, items)
#           val_pre_ratings = torch.cat((val_pre_ratings, pre_ratings))
#           val_ratings = torch.cat((val_ratings, ratings))

#   train_results = utils.metrics.evaluate(train_pre_ratings, train_ratings, ['MSE'], device)
#   val_results = utils.metrics.evaluate(val_pre_ratings, val_ratings, ['MSE', 'NLL', 'AUC'], device)

#   print('Epoch: {0:2d} / {1}, Validation: {2}'.
#           format(epoch, training_args['epochs'], 
#               ' '.join([key+':'+'%.3f'%val_results[key] for key in val_results])))

#   if epoch >= 50 and early_stopping.check([val_results['AUC']], epoch):
#       break

## Baseline Models

### 1. IPS

In [73]:
from baselines.IPS import train_and_eval as train_and_eval_ips

In [75]:
ips_training_args = {'batch_size': 1024, 'epochs': 500, 'patience': 60, 'block_batch': [6000, 500]}
ips_base_model_args = {'emb_dim': 10, 'learning_rate': 0.00001, 'weight_decay': 0}

train_and_eval_ips(train_data,
                   unif_train_data,
                   val_data,
                   test_data,
                   device=device, 
                   model_args=ips_base_model_args,
                   training_args=ips_training_args)

### 2. DR

In [111]:
dr_training_args = {'batch_size': 1024, 'epochs': 500, 'patience': 60, 'block_batch': [6000, 500]}
dr_base_model_args = {'emb_dim': 10, 'learning_rate': 0.00001, 'weight_decay': 0}
dr_imputation_model_args = {'emb_dim': 10, 'learning_rate': 0.00001, 'weight_decay': 0}


In [107]:
def Naive_Bayes_Propensity(train, unif): 
    # the implementation of naive bayes propensity
    P_Oeq1 = train._nnz() / (train.size()[0] * train.size()[1])

    y_unique = torch.unique(train._values())
    P_y_givenO = torch.zeros(y_unique.shape).to(device)
    P_y = torch.zeros(y_unique.shape).to(device)

    for i in range(len(y_unique)): 
        P_y_givenO[i] = torch.sum(train._values() == y_unique[i]) / torch.sum(torch.ones(train._values().shape).to(device))
        P_y[i] = torch.sum(unif._values() == y_unique[i]) / torch.sum(torch.ones(unif._values().shape).to(device))

    Propensity = P_y_givenO / P_y * P_Oeq1

    return y_unique, Propensity

In [108]:
y_unique, Propensity = Naive_Bayes_Propensity(train_data, unif_train_data)
InvP = torch.reciprocal(Propensity)

In [109]:
y_unique, Propensity, InvP

(tensor([1.]), tensor([0.0949]), tensor([10.5340]))

In [80]:
train, unif = train_data, unif_train_data

In [104]:
P_Oeq1 = train._nnz() / (train.size()[0] * train.size()[1])
P_Oeq1

0.008121883116883118

In [89]:
y_unique = torch.unique(train._values())
y_unique

tensor([1.])

In [93]:
P_y_givenO = torch.zeros(y_unique.shape).to(device)
P_y = torch.zeros(y_unique.shape).to(device)

In [94]:
P_y_givenO, P_y

(tensor([0.]), tensor([0.]))

In [101]:
for i in range(len(y_unique)):
  print(i)
  
  P_y_givenO[i] = torch.sum(train._values() == y_unique[i]) / torch.sum(torch.ones(train._values().shape).to(device))
  P_y[i] = torch.sum(unif._values() == y_unique[i]) / torch.sum(torch.ones(unif._values().shape).to(device))

0


In [102]:
P_y_givenO, P_y

(tensor([1.]), tensor([0.0856]))

In [106]:
Propensity = P_y_givenO / P_y * P_Oeq1
Propensity

tensor([0.0949])

In [121]:
# data shape
n_user, n_item = train_data.shape

# model and its optimizer. 
base_model = MF(n_user, n_item, dim=dr_base_model_args['emb_dim'], dropout=0).to(device)
base_optimizer = torch.optim.SGD(base_model.parameters(), lr=dr_base_model_args['learning_rate'], weight_decay=0)

imputation_model = MF(n_user, n_item, dim=dr_imputation_model_args['emb_dim'], dropout=0).to(device)
imputation_optimizer = torch.optim.SGD(imputation_model.parameters(), lr=dr_imputation_model_args['learning_rate'], weight_decay=0)

# loss_criterion
none_criterion = nn.MSELoss(reduction='none')
sum_criterion = nn.MSELoss(reduction='sum')

In [122]:
stopping_args = Stop_args(patience=training_args['patience'], max_epochs=training_args['epochs'])
early_stopping = EarlyStopping(base_model, **stopping_args)

In [125]:
for epo in range(early_stopping.max_epochs):
        for u_batch_idx, users in enumerate(train_loader.User_loader): 
            for i_batch_idx, items in enumerate(train_loader.Item_loader): 
                # observation data in this batch
                users_train, items_train, y_train = train_loader.get_batch(users, items)
                weight = torch.ones(y_train.shape).to(device)
                for i in range(len(y_unique)): 
                    weight[y_train == y_unique[i]] = InvP[i]

                # step 1: update imptation error model
                imputation_model.train()

                e_hat = imputation_model(users_train, items_train) # imputation error
                e = y_train - base_model(users_train, items_train) # prediction error
                cost_e = none_criterion(e_hat, e) # the cost of error, i.e., the difference between imputaiton error and prediction error

                loss_imp = torch.sum(weight * cost_e) + imputation_model_args['weight_decay'] * imputation_model.l2_norm(users_train, items_train)
                
                imputation_optimizer.zero_grad()
                loss_imp.backward()
                imputation_optimizer.step()

                # step 2: update predition model
                base_model.train()

                # all pair data in this block
                all_pair = torch.cartesian_prod(users, items)
                users_all, items_all = all_pair[:,0], all_pair[:,1]

                y_hat_all = base_model(users_all, items_all)
                y_hat_all_detach = torch.detach(y_hat_all)
                g_all = imputation_model(users_all, items_all)

                loss_all = sum_criterion(y_hat_all, g_all + y_hat_all_detach) # sum(e_hat)

                # observation data
                y_hat_obs = base_model(users_train, items_train)
                y_hat_obs_detach = torch.detach(y_hat_obs)
                g_obs = imputation_model(users_train, items_train)

                e_obs = none_criterion(y_hat_obs, y_train)
                e_hat_obs = none_criterion(y_hat_obs, g_obs + y_hat_obs_detach)

                cost_obs = e_obs - e_hat_obs
                loss_obs = torch.sum(weight * cost_obs)

                loss_base = loss_all + loss_obs + base_model_args['weight_decay'] * base_model.l2_norm(users_all, items_all)

                base_optimizer.zero_grad()
                loss_base.backward()
                base_optimizer.step()
        
        base_model.eval()
        with torch.no_grad():
            # training metrics
            train_pre_ratings = torch.empty(0).to(device)
            train_ratings = torch.empty(0).to(device)
            for u_batch_idx, users in enumerate(train_loader.User_loader): 
                for i_batch_idx, items in enumerate(train_loader.Item_loader): 
                    users_train, items_train, y_train = train_loader.get_batch(users, items)
                    pre_ratings = base_model(users_train, items_train)
                    train_pre_ratings = torch.cat((train_pre_ratings, pre_ratings))
                    train_ratings = torch.cat((train_ratings, y_train))

            # validation metrics
            val_pre_ratings = torch.empty(0).to(device)
            val_ratings = torch.empty(0).to(device)
            for batch_idx, (users, items, ratings) in enumerate(val_loader):
                pre_ratings = base_model(users, items)
                val_pre_ratings = torch.cat((val_pre_ratings, pre_ratings))
                val_ratings = torch.cat((val_ratings, ratings))
            
        train_results = utils.metrics.evaluate(train_pre_ratings, train_ratings, ['MSE', 'NLL'])
        val_results = utils.metrics.evaluate(val_pre_ratings, val_ratings, ['MSE', 'NLL', 'AUC'])

        print('Epoch: {0:2d} / {1}, Traning: {2}, Validation: {3}'.
                format(epo, training_args['epochs'], ' '.join([key+':'+'%.3f'%train_results[key] for key in train_results]), 
                ' '.join([key+':'+'%.3f'%val_results[key] for key in val_results])))

        if early_stopping.check([val_results['AUC']], epo):
            break

# testing loss
print('Loading {}th epoch'.format(early_stopping.best_epoch))
base_model.load_state_dict(early_stopping.best_state)

# validation metrics
val_pre_ratings = torch.empty(0).to(device)
val_ratings = torch.empty(0).to(device)
for batch_idx, (users, items, ratings) in enumerate(val_loader):
    pre_ratings = base_model(users, items)
    val_pre_ratings = torch.cat((val_pre_ratings, pre_ratings))
    val_ratings = torch.cat((val_ratings, ratings))

# test metrics
test_users = torch.empty(0, dtype=torch.int64).to(device)
test_items = torch.empty(0, dtype=torch.int64).to(device)
test_pre_ratings = torch.empty(0).to(device)
test_ratings = torch.empty(0).to(device)
for batch_idx, (users, items, ratings) in enumerate(test_loader):
    pre_ratings = base_model(users, items)
    test_users = torch.cat((test_users, users))
    test_items = torch.cat((test_items, items))
    test_pre_ratings = torch.cat((test_pre_ratings, pre_ratings))
    test_ratings = torch.cat((test_ratings, ratings))

val_results = utils.metrics.evaluate(val_pre_ratings, val_ratings, ['MSE', 'NLL', 'AUC'])
test_results = utils.metrics.evaluate(test_pre_ratings, test_ratings, ['MSE', 'NLL', 'AUC', 'Recall_Precision_NDCG@'], users=test_users, items=test_items)
print('-'*30)
print('The performance of validation set: {}'.format(' '.join([key+':'+'%.3f'%val_results[key] for key in val_results])))
print('The performance of testing set: {}'.format(' '.join([key+':'+'%.3f'%test_results[key] for key in test_results])))
print('-'*30)

Epoch:  0 / 500, Traning: MSE:nan NLL:nan, Validation: MSE:nan NLL:nan AUC:0.494
Epoch:  1 / 500, Traning: MSE:nan NLL:nan, Validation: MSE:nan NLL:nan AUC:0.494


KeyboardInterrupt: 