In [1]:
import os
import numpy as np
import random

import torch
import torch.nn as nn

from model import *

import arguments

In [2]:
import utils.load_dataset
import utils.data_loader
import utils.metrics
from utils.early_stop import EarlyStopping, Stop_args

In [3]:
# ---- Experiment configuration -------------------------------------------
# Name of the dataset handed to utils.load_dataset.load_dataset.
dataset = 'yahooR3'

# Hyper-parameters of the base recommender (MetaMF).
base_model_args = {
    'emb_dim': 10,
    'learning_rate': 0.01,
    # NOTE: the key is spelled 'imputaion_lambda' (sic); the training cell reads
    # it under exactly this spelling, so it must not be "corrected" here alone.
    'imputaion_lambda': 0.01,
    'weight_decay': 1,
}

# Hyper-parameters for the two weighting models and the imputation model.
weight1_model_args = {
    'learning_rate': 0.1,
    'weight_decay': 0.001,
}
weight2_model_args = {
    'learning_rate': 0.001,
    'weight_decay': 0.01,
}
imputation_model_args = {
    'learning_rate': 0.1,
    'weight_decay': 0.0001,
}

# Loop / loader settings: 'block_batch' is [user block size, item block size].
training_args = {
    'batch_size': 1024,
    'epochs': 100,
    'patience': 20,
    'block_batch': [1000, 100],
}

# uniform_ratio is not referenced by any cell visible in this notebook.
uniform_ratio = 0.05
seed = 0

In [4]:
# Fix the random seed before any model, loader or dataset split is created.
# setup_seed is defined in train_implicit (not shown in this notebook);
# presumably it seeds the python / numpy / torch RNGs -- confirm there.
from train_implicit import setup_seed
setup_seed(seed)

In [5]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)


cpu


In [6]:
# Load the four interaction matrices for `dataset` in implicit-feedback mode.
# The outputs of the following cells show each one is a 15400 x 1000 tensor and
# that `train` uses the sparse COO layout.
# NOTE(review): `unif_train` is presumably the small uniformly-collected split
# used for the meta (hyper-parameter) update -- confirm in utils.load_dataset.
train, unif_train, validation, test = utils.load_dataset.load_dataset(data_name=dataset, type='implicit', seed=seed, device=device)

## Load Dataset

### Indices: 
`When working with sparse tensors, the indices refer to the positions of the non-zero elements in the tensor. In PyTorch's COO layout the indices form a tensor of shape (D, N), where D is the number of dimensions (axes) of the tensor and N is the number of stored non-zero elements. Each column of the indices tensor holds the coordinates of one non-zero element; for the 2-D matrix printed below, row 0 holds the user (row) index and row 1 holds the item (column) index.`

### Value:
`The value refers to the actual non-zero values associated with the indices in a sparse tensor. It is a tensor of shape (N,), where N is the number of non-zero elements. Each element in the value tensor corresponds to the value of a non-zero element in the sparse tensor.`

### nnz:
`nnz stands for "number of non-zero elements." It is the count of elements explicitly stored in a sparse tensor. In other words, it equals the number of entries in the values tensor and the number of coordinate tuples stored in the indices tensor.`

### Layout:
`The layout of a sparse tensor defines how the indices and values are stored in memory. Torch supports different sparse tensor layouts, such as "torch.sparse_coo", "torch.sparse_csr", and "torch.sparse_csc". Each layout has its own advantages and is suited for specific operations and computations. For example, the "torch.sparse_coo" layout stores the indices and values as separate tensors, while the "torch.sparse_csr" and "torch.sparse_csc" layouts store them in a compressed format.`

In [7]:
# Display the sparse training matrix; its repr lists indices, values, size, nnz and layout.
# Alternative inspection of the uniform split is kept in the trailing comment.
train # print(unif_train._indices(), unif_train._values())

tensor(indices=tensor([[    0,     0,     0,  ..., 15399, 15399, 15399],
                       [   13,   152,   169,  ...,   563,   636,   948]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       size=(15400, 1000), nnz=125077, layout=torch.sparse_coo)

In [8]:
# Total number of cells in the matrix vs. number of explicitly stored entries.
# Coalesce once and reuse the result for both numbers.
train_coalesced = train.coalesce()
print(train_coalesced.numel(), train_coalesced._nnz())

15400000 125077


In [9]:
# All four splits are expected to share the same (n_user, n_item) shape.
print(*(split.shape for split in (train, unif_train, validation, test)))

torch.Size([15400, 1000]) torch.Size([15400, 1000]) torch.Size([15400, 1000]) torch.Size([15400, 1000])


## Train & Eval

In [10]:
# train_and_eval(train, 
#               unif_train,
#               validation,
#               test,
#               device, 
#               base_model_args=base_model_args, 
#               weight1_model_args=weight1_model_args, 
#               weight2_model_args=weight2_model_args, 
#               imputation_model_args=imputation_model_args, 
#               training_args=training_args)

In [11]:
# `train_data` is just another name for the sparse training matrix; the dense
# copy is what the training loop indexes with [users, items] pairs.
train_data = train
train_dense = train.to_dense()
train_dense

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [12]:
# Split the uniform set into parallel 1-D tensors: user ids, item ids, labels.
unif_train_data = unif_train
# The COO index tensor of this 2-D matrix has two rows (row ids, column ids).
users_unif, items_unif = unif_train_data._indices()
y_unif = unif_train_data._values()

In [13]:
# build data_loader. (block matrix data loader)

# 'block_batch' is [user block size, item block size]; each training step works
# on one (user block x item block) sub-matrix.
user_block_size, item_block_size = training_args['block_batch']
train_loader = utils.data_loader.Block(
    train_data,
    u_batch_size=user_block_size,
    i_batch_size=item_block_size,
    device=device,
)

In [14]:
val_data, test_data = validation, test


def _make_eval_loader(interactions):
    """Wrap one evaluation split in a non-shuffling, single-process DataLoader."""
    return utils.data_loader.DataLoader(
        utils.data_loader.Interactions(interactions),
        batch_size=training_args['batch_size'],
        shuffle=False,
        num_workers=0,
    )


val_loader = _make_eval_loader(val_data)
test_loader = _make_eval_loader(test_data)

In [15]:
# Rows of the interaction matrix are users, columns are items.
n_user, n_item = train_data.size(0), train_data.size(1)
print(n_user, n_item)

15400 1000


## Models

In [16]:
class MF(nn.Module):
  """
  Base module for matrix factorization.

  Scores a (user, item) pair as
  ``dot(user_latent[u], item_latent[i]) + user_bias[u] + item_bias[i]``.

  Args:
    n_user: number of rows in the user embedding tables.
    n_item: number of rows in the item embedding tables.
    dim: size of the latent factor vectors.
    dropout: dropout probability applied to the latent vectors (not the biases).
    init: negative-slope argument ``a`` forwarded to ``kaiming_normal_``;
      ``None`` means 0.
  """
  def __init__(self, n_user, n_item, dim=40, dropout=0, init=None):
    super().__init__()

    self.user_latent = nn.Embedding(n_user, dim)
    self.item_latent = nn.Embedding(n_item, dim)

    self.user_bias = nn.Embedding(n_user, 1)
    self.item_bias = nn.Embedding(n_item, 1)

    self.dropout_p = dropout
    self.dropout = nn.Dropout(p=self.dropout_p)

    # Fixed: the original test (`if not init`) was inverted -- it forwarded
    # None when no value was given and silently replaced any supplied non-zero
    # value with 0. This now matches MetaMF: use `init` when provided, else 0.
    self.init_embedding(0 if init is None else init)

  def init_embedding(self, init):
    """Re-initialise all four embedding tables with Kaiming-normal (fan_out) noise."""
    # Order is kept (user latent, item latent, user bias, item bias) so the
    # sequence of random draws is unchanged for a given seed.
    for table in (self.user_latent, self.item_latent, self.user_bias, self.item_bias):
      nn.init.kaiming_normal_(table.weight, mode='fan_out', a=init)

  def forward(self, users, items):
    """
    Predict one score per (users[k], items[k]) pair.

    Args:
      users: 1-D integer tensor of user ids.
      items: 1-D integer tensor of item ids, same length as ``users``.

    Returns:
      1-D tensor with one prediction per pair.
    """
    u_latent = self.dropout(self.user_latent(users))
    i_latent = self.dropout(self.item_latent(items))

    u_bias = self.user_bias(users)
    # Fixed: the item bias was previously looked up in `self.user_bias`, which
    # returned the wrong bias and raised IndexError for item ids >= n_user.
    i_bias = self.item_bias(items)

    preds = torch.sum(u_latent * i_latent, dim=1, keepdim=True) + u_bias + i_bias

    # Drop only the trailing singleton axis so a batch of one stays 1-D.
    return preds.squeeze(dim=-1)

  def l2_norm(self, users, items):
    """
    Half the squared L2 norm of the latent vectors of the distinct users and
    items in the batch (bias tables are not included).
    """
    users = torch.unique(users)
    items = torch.unique(items)

    l2_loss = (torch.sum(self.user_latent(users)**2) + torch.sum(self.item_latent(items)**2)) / 2
    return l2_loss


In [17]:
from model import MetaModule, MetaEmbed
class MetaEmbed(MetaModule):
  """
  Embedding table held as a module buffer rather than as an nn.Parameter.

  The table is created with the shape and initial values of
  ``nn.Embedding(dim_1, dim_2).weight`` and registered under the buffer name
  ``weight`` after being passed through ``to_var(..., requires_grad=True)``.
  A second buffer, ``bias``, is registered as ``None``.

  Note: this definition shadows the ``MetaEmbed`` imported from ``model``.
  """
  def __init__(self, dim_1, dim_2):
    super().__init__()
    # nn.Embedding is instantiated only to borrow its freshly initialised weights.
    initial_weight = nn.Embedding(dim_1, dim_2).weight.data

    self.register_buffer('weight', to_var(initial_weight, requires_grad=True))
    self.register_buffer('bias', None)

  def forward(self):
    """Return the whole embedding table; callers index it themselves."""
    return self.weight

  def named_leaves(self):
    """Return the (name, tensor) pairs for ``weight`` and ``bias``, in that order."""
    leaves = [('weight', self.weight)]
    leaves.append(('bias', self.bias))
    return leaves

In [18]:
class MetaMF(MetaModule):
    """
    Matrix-factorization scorer built from MetaEmbed tables.

    A (user, item) pair is scored as the dot product of the two latent vectors
    plus the user bias plus the item bias.

    Args:
        n_user: number of rows of the user tables.
        n_item: number of rows of the item tables.
        dim: latent dimensionality.
        dropout: dropout probability applied to the latent vectors only.
        init: ``a`` argument for ``kaiming_normal_``; ``None`` is treated as 0.
    """
    def __init__(self, n_user, n_item, dim=40, dropout=0, init = None):
        super().__init__()

        # Latent factors and scalar biases, one row per user / item.
        self.user_latent = MetaEmbed(n_user, dim)
        self.item_latent = MetaEmbed(n_item, dim)
        self.user_bias = MetaEmbed(n_user, 1)
        self.item_bias = MetaEmbed(n_item, 1)

        self.dropout_p = dropout
        self.dropout = nn.Dropout(p=self.dropout_p)

        self.init_embedding(0 if init is None else init)

    def init_embedding(self, init):
        """Fill every table in place with Kaiming-normal (fan_out) values."""
        tables = (self.user_latent, self.item_latent, self.user_bias, self.item_bias)
        for table in tables:
            nn.init.kaiming_normal_(table.weight, mode='fan_out', a = init)

    def forward(self, users, items):
        """Return one prediction per (users[k], items[k]) pair as a 1-D tensor."""
        user_vec = self.dropout(self.user_latent.weight[users])
        item_vec = self.dropout(self.item_latent.weight[items])

        interaction = torch.sum(user_vec * item_vec, dim=1, keepdim=True)
        scores = interaction + self.user_bias.weight[users] + self.item_bias.weight[items]
        return scores.squeeze(dim=-1)

    def l2_norm(self, users, items, unique = True):
        """
        Half the sum of squared latent entries over the distinct users and
        items given. The ``unique`` argument is accepted but not consulted;
        ids are always de-duplicated.
        """
        distinct_users = torch.unique(users)
        distinct_items = torch.unique(items)

        user_sq = torch.sum(self.user_latent.weight[distinct_users]**2)
        item_sq = torch.sum(self.item_latent.weight[distinct_items]**2)
        return (user_sq + item_sq) / 2

In [19]:
# import torch
# import torch.nn as nn

# # Define the embedding layer
# embedding_dim = 10
# vocab_size = 100
# embedding_layer = nn.Embedding(vocab_size, embedding_dim)

# # Example input indices
# indices = torch.tensor(list(range(4)))

# # Apply the embedding layer
# embedded_output = embedding_layer(indices)

# print(embedded_output.shape)

In [20]:
# Base recommender whose parameters are trained by the meta-learning loop.
base_model = MetaMF(
    n_user,
    n_item,
    dim=base_model_args['emb_dim'],
    dropout=0,
).to(device)

# weight_decay is 0 here because the training cell adds
# base_model_args['weight_decay'] * base_model.l2_norm(...) to the loss itself.
base_optimizer = torch.optim.SGD(
    base_model.params(),
    lr=base_model_args['learning_rate'],
    weight_decay=0,
)  # todo: other optimizer SGD

register_buffer 로 layer를 등록하면 어떤 특징이 있는가?

1. optimizer가 업데이트하지 않는다.

2. 그러나 값은 존재한다(하나의 layer로써 작용한다고 보면 된다.)

3. state_dict()로 확인이 가능하다.

4. GPU연산이 가능하다.

 

따라서 네트워크를 구성함에 있어서 네트워크를 end2end로 학습시키고 싶은데 중간에 업데이트를 하지않는 일반 layer를 넣고 싶을 때 사용할 수 있다.

In [21]:
def to_var(x, requires_grad=True):
  """
  Return a leaf tensor holding the values of ``x`` with the requested
  ``requires_grad`` flag, moved to the GPU when CUDA is available.

  The legacy ``torch.autograd.Variable`` wrapper is no longer needed: every
  tensor can track gradients through ``requires_grad``. ``Variable`` is also
  never imported in this notebook, so the previous body only worked if the
  name leaked in through a star import.

  Args:
    x: source tensor.
    requires_grad: whether the returned tensor should track gradients.

  Returns:
    A tensor detached from any autograd history of ``x`` (it shares storage
    with ``x`` when no device move happens).
  """
  if torch.cuda.is_available():
    x = x.cuda()
  # detach() cuts the result off from x's graph so it is a leaf, which is what
  # wrapping in Variable(...) used to produce.
  return x.detach().requires_grad_(requires_grad)


In [22]:
class OneLinear(nn.Module):
  """
  linear model: r

  A lookup table with one learnable scalar per discrete value.

  Args:
    n: number of distinct values (rows of the table); inputs to ``forward``
      must be integer ids in ``[0, n)``.
  """
  def __init__(self, n):
    super().__init__()

    self.bias = nn.Embedding(n, 1)
    self.init_embedding()

  def init_embedding(self):
    """Shrink the default embedding initialisation by a factor of 0.01."""
    with torch.no_grad():
      self.bias.weight.mul_(0.01)

  def forward(self, values):
    """
    Look up the scalar for every id in ``values``.

    Returns a tensor with the same shape as ``values``. Only the trailing
    embedding axis is squeezed: a bare ``squeeze()`` would also collapse a
    batch of one element to a 0-d tensor, which then mismatches the 1-D
    targets it is compared against.
    """
    d_bias = self.bias(values)
    return d_bias.squeeze(dim=-1)


In [23]:
class TwoLinear(nn.Module):
  """
  linear model: u + i

  Produces ``user_bias[u] + item_bias[i]`` for each (user, item) pair.

  Args:
    n_user: number of users (rows of the user bias table).
    n_item: number of items (rows of the item bias table).
  """
  def __init__(self, n_user, n_item):
    super().__init__()

    self.user_bias = nn.Embedding(n_user, 1)
    self.item_bias = nn.Embedding(n_item, 1)

    self.init_embedding(0)

  def init_embedding(self, init):
    """Kaiming-normal (fan_out) initialisation of both bias tables."""
    nn.init.kaiming_normal_(self.user_bias.weight, mode='fan_out', a=init)
    nn.init.kaiming_normal_(self.item_bias.weight, mode='fan_out', a=init)

  def forward(self, users, items):
    """
    Return one value per (users[k], items[k]) pair.

    Only the trailing embedding axis is squeezed so that a batch containing a
    single pair still yields a 1-D tensor (a bare ``squeeze()`` returned a 0-d
    tensor in that case).
    """
    u_bias = self.user_bias(users)
    i_bias = self.item_bias(items)
    preds = u_bias + i_bias

    return preds.squeeze(dim=-1)

In [24]:
class ThreeLinear(nn.Module):
  """
  linear model: u + i + r / o

  Produces ``user_bias[u] + item_bias[i] + data_bias[v]`` for each
  (user, item, value) triple.

  Args:
    n_user: number of users (rows of the user bias table).
    n_item: number of items (rows of the item bias table).
    n: number of distinct ``values`` ids (rows of the data bias table).
  """
  def __init__(self, n_user, n_item, n):
    super().__init__()

    self.user_bias = nn.Embedding(n_user, 1)
    self.item_bias = nn.Embedding(n_item, 1)
    self.data_bias = nn.Embedding(n, 1)
    self.init_embedding(0)

  def init_embedding(self, init):
    """Kaiming-normal (fan_out) init of all tables; the data bias is then scaled by 0.001."""
    nn.init.kaiming_normal_(self.user_bias.weight, mode='fan_out', a = init)
    nn.init.kaiming_normal_(self.item_bias.weight, mode='fan_out', a = init)
    nn.init.kaiming_normal_(self.data_bias.weight, mode='fan_out', a = init)
    with torch.no_grad():
      self.data_bias.weight.mul_(0.001)

  def forward(self, users, items, values):
    """
    Return one value per (users[k], items[k], values[k]) triple.

    Only the trailing embedding axis is squeezed so that a batch containing a
    single triple still yields a 1-D tensor (a bare ``squeeze()`` returned a
    0-d tensor in that case).
    """
    u_bias = self.user_bias(users)
    i_bias = self.item_bias(items)
    d_bias = self.data_bias(values)

    preds = u_bias + i_bias + d_bias
    return preds.squeeze(dim=-1)

In [25]:
# Weight model for the observed (user, item) pairs.
weight1_model = TwoLinear(n_user, n_item).to(device)
# Fixed: this optimizer was configured from base_model_args with weight_decay=0,
# leaving weight1_model_args defined but unused. It now reads its own
# hyper-parameters, like the weight2 and imputation optimizers do.
weight1_optimizer = torch.optim.Adam(weight1_model.parameters(),
                                     lr=weight1_model_args['learning_rate'],
                                     weight_decay=weight1_model_args['weight_decay'])

In [26]:
# Weight model for every (user, item) cell of a block. The third table has two
# rows because the training loop feeds it a 0/1 flag (entry non-zero in
# train_dense or not).
weight2_model = ThreeLinear(n_user, n_item, 2).to(device)
weight2_optimizer = torch.optim.Adam(
    weight2_model.parameters(),
    lr=weight2_model_args['learning_rate'],
    weight_decay=weight2_model_args['weight_decay'],
)

In [27]:
# Imputation model: one learnable scalar per value id; two rows because the
# training loop indexes it with train_dense entries cast to long.
imputation_model = OneLinear(2).to(device)
imputation_optimizer = torch.optim.Adam(
    imputation_model.parameters(),
    lr=imputation_model_args['learning_rate'],
    weight_decay=imputation_model_args['weight_decay'],
)

In [28]:
# Loss criteria (both squared error):
#   none_criterion keeps one loss value per element so it can be re-weighted,
#   sum_criterion adds all element losses into a single scalar.
none_criterion = nn.MSELoss(reduction='none')
sum_criterion = nn.MSELoss(reduction='sum')

## Training

In [29]:
# Sanity check: the dense training matrix is (n_user, n_item).
train_dense.size()

torch.Size([15400, 1000])

In [30]:
# Early-stopping helper watching base_model; patience and epoch budget come
# from training_args.
stopping_args = Stop_args(
    patience=training_args['patience'],
    max_epochs=training_args['epochs'],
)
early_stopping = EarlyStopping(base_model, **stopping_args)

In [34]:
# Meta-learning training loop. For every (user block, item block) pair it:
#   1. takes one *assumed* gradient step on a throw-away copy of the base model,
#   2. evaluates that copy on the uniform data and back-propagates the loss into
#      the weight1 / weight2 / imputation models (the hyper-parameters),
#   3. takes the *real* optimizer step on the base model with the refreshed
#      weights and imputed labels.
# After each epoch the base model is scored on the training and validation data.
#
# users and items have shape of 1000 and 100, respectively
#
# NOTE(review): the saved output of this cell is a RuntimeError ("Found no
# NVIDIA driver on your system"), so on a CPU-only machine some code reached
# from this loop evidently issues a CUDA call; the call site is not visible in
# this notebook.
for epoch in range(training_args['epochs']):
  # Accumulated below but not printed or stored anywhere in this cell.
  training_loss = 0
  for u_batch_idx, users in enumerate(train_loader.User_loader):
    for i_batch_idx, items in enumerate(train_loader.Item_loader):
      # training set: 1. update parameters one_step (assumed update); 2. update parameters (real update) 
      # uniform set: update hyper_parameters using gradient descent.

      # print(users.shape, items.shape) # torch.Size([1000]) torch.Size([100])

      # index_row = np.isin(train_data._indices()[0].cpu().numpy(), users.cpu().numpy())
      # index_col = np.isin(train_data._indices()[1].cpu().numpy(), items.cpu().numpy())
      # index = torch.tensor(np.where(index_row * index_col)[0]).to(device)
      # print(index_row.shape, index_col.shape, index.shape) # (125077,) (125077,) <- number of observed elements / torch.Size([1023])

      # Observed interactions that fall inside this block.
      # y_train consists of 1s only
      users_train, items_train, y_train = train_loader.get_batch(users, items, device)
      # print(users_train.shape, items_train.shape) # ex. torch.Size([1023]) torch.Size([1023])

      # calculate weight 1 (one weight per observed pair); exp(w / 5) keeps it positive
      weight1_model.train()
      weight1 = weight1_model(users_train, items_train)
      weight1 = torch.exp(weight1/5)

      # all pair: every (user, item) combination of the block, observed or not
      all_pair = torch.cartesian_prod(users, items)
      users_all, items_all = all_pair[:, 0], all_pair[:, 1]
      # print(users_all.shape) # torch.Size([1000 * 100])

      # calculate weight 2 (one weight per cell of the block); the third input is
      # a 0/1 flag telling whether the cell is non-zero in train_dense
      weight2_model.train()
      weight2 = weight2_model(users_all, items_all, (train_dense[users_all,items_all]!=0)*1) # *1: bool -> int
      weight2 = torch.exp(weight2/5)

      # calculate imputation values: tanh bounds the imputed label to (-1, 1);
      # the lookup id is the train_dense entry cast to long
      imputation_model.train()
      impu_f_all = torch.tanh(imputation_model((train_dense[users_all,items_all]).long()))

      # print(weight1.shape, weight2.shape, impu_f_all.shape) # torch.Size([788]) torch.Size([100000]) torch.Size([100000])

      ######################################
      ## 1. Assumed Update of theta (Black Arrows) #
      ######################################
      # one_step_model: assumed model, just update one step on base model. it is for updating weight parameters
      # NOTE(review): unlike base_model, this copy is not moved with .to(device) here.
      one_step_model = MetaMF(n_user, n_item, dim=base_model_args['emb_dim'], dropout=0)
      one_step_model.load_state_dict(base_model.state_dict()) # state_dict is, simply put, a Python dict that maps each layer to its parameter tensors

      # formal parameter: using training set to update parameters
      one_step_model.train()
      # all pair data in this block: weighted squared error against the imputed labels
      y_hat_f_all = one_step_model(users_all, items_all)
      cost_f_all = none_criterion(y_hat_f_all, impu_f_all)
      loss_f_all = torch.sum(cost_f_all * weight2)

      # observation data: weighted squared error against the observed labels
      y_hat_f_obs = one_step_model(users_train, items_train)
      cost_f_obs = none_criterion(y_hat_f_obs, y_train)
      loss_f_obs = torch.sum(cost_f_obs * weight1)
      # total = observed loss + lambda * all-pair loss + weight_decay * L2 penalty
      loss_f = loss_f_obs + base_model_args['imputaion_lambda'] * loss_f_all + base_model_args['weight_decay'] * one_step_model.l2_norm(users_all, items_all)

      # assumed update (not a real update)
      # create_graph=True keeps the gradient computation differentiable so the
      # uniform-set loss below can be back-propagated into the weight /
      # imputation models.
      one_step_model.zero_grad()
      grads = torch.autograd.grad(loss_f, (one_step_model.params()), create_graph=True)
      # update_params is defined on the project's MetaModule (not shown here);
      # presumably it applies params - lr * grads -- confirm in model.py.
      one_step_model.update_params(base_model_args['learning_rate'], source_params=grads)

      ######################################
      ### 2. Update of pi (Blue Arrows) ####
      ######################################
      # latter hyper_parameter: Using uniform set to update hyper_parameters
      # The full uniform set is used at every step, not a mini-batch of it.
      y_hat_l = one_step_model(users_unif, items_unif)
      loss_l = sum_criterion(y_hat_l, y_unif)

      # update hyper-parameters
      weight1_optimizer.zero_grad()
      weight2_optimizer.zero_grad()
      imputation_optimizer.zero_grad()
      loss_l.backward()
      # Warm-up: the two weight models are only stepped from epoch 20 onwards,
      # while the imputation model is stepped on every iteration.
      if epoch >= 20:
        weight1_optimizer.step()
        weight2_optimizer.step()
      imputation_optimizer.step()


      ######################################
      ### 3. Update of theta (Black Arrows) ####
      ######################################
      # use new weights to update parameters (real update)
      # Weights and imputations are recomputed because the models producing
      # them may just have been updated.
      weight1_model.train()
      weight1 = weight1_model(users_train, items_train)
      weight1 = torch.exp(weight1/5)

      # calculate weight2
      weight2_model.train()
      weight2 = weight2_model(users_all, items_all,(train_dense[users_all,items_all]!=0)*1)
      weight2 = torch.exp(weight2/5) # for stable training

      # use new imputation to update parameters
      imputation_model.train()
      impu_all = torch.tanh(imputation_model((train_dense[users_all,items_all]).long()))

      # loss of training set
      base_model.train()
      # all pair
      y_hat_all = base_model(users_all, items_all)
      cost_all = none_criterion(y_hat_all, impu_all)
      loss_all = torch.sum(cost_all * weight2)
      # observation
      y_hat_obs = base_model(users_train, items_train)
      cost_obs = none_criterion(y_hat_obs, y_train)
      loss_obs = torch.sum(cost_obs * weight1)
      # Same composition as loss_f above, but on the real base model.
      loss = loss_obs + base_model_args['imputaion_lambda'] * loss_all + base_model_args['weight_decay'] * base_model.l2_norm(users_all, items_all)

      base_optimizer.zero_grad()
      loss.backward()
      base_optimizer.step()

      training_loss += loss.item()

  # ---- end of epoch: evaluate the base model without tracking gradients ----
  base_model.eval()
  with torch.no_grad():
      # training metrics: predictions for the observed pairs of every block
      train_pre_ratings = torch.empty(0).to(device)
      train_ratings = torch.empty(0).to(device)
      for u_batch_idx, users in enumerate(train_loader.User_loader): 
          for i_batch_idx, items in enumerate(train_loader.Item_loader): 
              users_train, items_train, y_train = train_loader.get_batch(users, items, device)
              pre_ratings = base_model(users_train, items_train)
              train_pre_ratings = torch.cat((train_pre_ratings, pre_ratings))
              train_ratings = torch.cat((train_ratings, y_train))

      # validation metrics
      val_pre_ratings = torch.empty(0).to(device)
      val_ratings = torch.empty(0).to(device)
      for batch_idx, (users, items, ratings) in enumerate(val_loader):
          pre_ratings = base_model(users, items)
          val_pre_ratings = torch.cat((val_pre_ratings, pre_ratings))
          val_ratings = torch.cat((val_ratings, ratings))

  # NOTE(review): train_results is computed but not included in the printed
  # line below, and test_loader is not used anywhere in this cell.
  train_results = utils.metrics.evaluate(train_pre_ratings, train_ratings, ['MSE'], device)
  val_results = utils.metrics.evaluate(val_pre_ratings, val_ratings, ['MSE', 'NLL', 'AUC'], device)

  print('Epoch: {0:2d} / {1}, Validation: {2}'.
          format(epoch, training_args['epochs'], 
              ' '.join([key+':'+'%.3f'%val_results[key] for key in val_results])))

  # Early stopping on validation AUC is only consulted from epoch 50 onwards.
  if epoch >= 50 and early_stopping.check([val_results['AUC']], epoch):
      break

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx