In [1]:
import os
import numpy as np
import random

import torch
import torch.nn as nn

from model import *

import arguments

In [2]:
import utils.load_dataset
import utils.data_loader
import utils.metrics
from utils.early_stop import EarlyStopping, Stop_args

In [3]:
from utils.metrics import auc

In [4]:
# Initial hyper-parameter set. The model/training dictionaries and `dataset`
# are assigned again in the configuration cell under "Load Dataset"; when the
# notebook is run top to bottom those later values are the ones in effect.
dataset = 'yahooR3'
seed = 0
uniform_ratio = 0.05

# Base matrix-factorisation model. The key 'imputaion_lambda' keeps its
# original spelling because the loss cells read exactly this key.
base_model_args = {
    'emb_dim': 10,
    'learning_rate': 0.01,
    'imputaion_lambda': 0.01,
    'weight_decay': 1,
}

# Optimiser settings for the two weight models and the imputation model.
weight1_model_args = {
    'learning_rate': 0.1,
    'weight_decay': 0.001,
}
weight2_model_args = {
    'learning_rate': 1e-3,
    'weight_decay': 1e-2,
}
imputation_model_args = {
    'learning_rate': 1e-1,
    'weight_decay': 1e-4,
}

# 'block_batch' is [user batch size, item batch size] for the block loader.
training_args = {
    'batch_size': 1024,
    'epochs': 100,
    'patience': 20,
    'block_batch': [1000, 100],
}

In [5]:
# Seed through the project's helper before any model or loader is created.
# NOTE(review): presumably seeds random / numpy / torch — confirm in
# train_implicit.setup_seed.
from train_implicit import setup_seed
setup_seed(seed)

In [6]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)


cuda


## Load Dataset

In [7]:
# Hyper-parameters actually used for the experiment; these assignments replace
# the values from the first configuration cell.
dataset = 'yahooR3'

# 'block_batch' is [user batch size, item batch size] for the block loader.
training_args = {'batch_size': 1024, 'epochs': 500, 'patience': 60, 'block_batch': [6000, 500]}

# The key 'imputaion_lambda' keeps its original spelling because the loss
# cells read exactly this key.
base_model_args = {'emb_dim': 10, 'learning_rate': 0.0001, 'imputaion_lambda': 1, 'weight_decay': 1}

# Optimiser settings for the two weight models and the imputation model.
weight1_model_args = {'learning_rate': 0.01, 'weight_decay': 0.01}
weight2_model_args = {'learning_rate': 1e-3, 'weight_decay': 1e-2}
imputation_model_args = {'emb_dim': 10, 'learning_rate': 1e-1, 'weight_decay': 1e-4}

In [8]:
# Command-line argument parsing is disabled here; the configuration
# dictionaries defined in the notebook are used instead.
# args = arguments.parse_args()
# para(args)
# Re-seed and re-select the device so this section can be run on its own.
setup_seed(seed)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [9]:
# To generate implicit feedback data, we also use the aforementioned
# datasets, but abandon the negative feedback of the training data.
# The loader returns four splits: biased training data, a uniform training
# split, validation and test.
# NOTE(review): how the uniform split is sampled is decided inside
# utils.load_dataset.load_dataset — confirm there.
train, unif_train, validation, test = utils.load_dataset.load_dataset(data_name=dataset, type = 'implicit', seed = seed, device=device)


In [10]:
# Display the shape of each of the four splits.
train.shape, unif_train.shape, validation.shape, test.shape

(torch.Size([15400, 1000]),
 torch.Size([15400, 1000]),
 torch.Size([15400, 1000]),
 torch.Size([15400, 1000]))

In [11]:
# Inspect the training tensor (its repr shows indices, values, device and layout).
train

tensor(indices=tensor([[    0,     0,     0,  ..., 15399, 15399, 15399],
                       [   13,   152,   169,  ...,   563,   636,   948]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       device='cuda:0', size=(15400, 1000), nnz=125077, layout=torch.sparse_coo)

In [12]:
# Index matrix shape, value vector shape and number of stored entries of the sparse training tensor.
train._indices().shape, train._values().shape, train._nnz()

(torch.Size([2, 125077]), torch.Size([125077]), 125077)

In [13]:
from train_implicit import train_and_eval

# train_and_eval(train, 
#                unif_train, 
#                validation, 
#                test,
#                device,
#                base_model_args = base_model_args, 
#                weight1_model_args = weight1_model_args,
#                weight2_model_args = weight2_model_args,
#                imputation_model_args = imputation_model_args, 
#                training_args = training_args)

## Proposed Model - AutoDebias

In [14]:
# Bind the four splits to the names used by the step-by-step cells below.
train_data, unif_train_data, val_data, test_data = train, unif_train, validation, test

In [15]:
# Dense copy of the sparse training matrix; later cells index it by
# (user, item) pairs to look up the training value of every pair in a block.
train_dense = train_data.to_dense()

In [18]:
# Shape of the dense training matrix.
train_dense.shape

torch.Size([15400, 1000])

In [19]:
# Unpack the sparse uniform split: row 0 of the index matrix holds user ids,
# row 1 holds item ids, and the stored values are the labels.
users_unif, items_unif = unif_train_data._indices()[0], unif_train_data._indices()[1]
y_unif = unif_train_data._values()

In [25]:
# Training uses the project's block loader, configured with a user batch size
# and an item batch size taken from training_args['block_batch'].
# Validation and test use a regular, unshuffled DataLoader over Interactions
# built from the corresponding split.
train_loader = utils.data_loader.Block(train_data, u_batch_size=training_args['block_batch'][0], i_batch_size=training_args['block_batch'][1], device=device)
val_loader = utils.data_loader.DataLoader(utils.data_loader.Interactions(val_data), batch_size=training_args['batch_size'], shuffle=False, num_workers=0)
test_loader = utils.data_loader.DataLoader(utils.data_loader.Interactions(test_data), batch_size=training_args['batch_size'], shuffle=False, num_workers=0)

In [17]:
# Check which loader class is used for training.
type(train_loader)

utils.data_loader.Block

In [26]:
# Number of users and items, taken from the shape of the training matrix.
n_user, n_item = train_data.shape

# Base model and its optimizer. This optimizer performs the real update of the
# base-model parameters using the updated weights. weight_decay is 0 here
# because the L2 term is added to the loss explicitly via l2_norm().
base_model = MetaMF(n_user, n_item, dim=base_model_args['emb_dim'], dropout=0).to(device)
base_optimizer = torch.optim.SGD(base_model.params(), lr=base_model_args['learning_rate'], weight_decay=0) # todo: try an optimizer other than SGD

# Weight model for the observed training entries, and the optimizer that
# updates its parameters.
weight1_model = TwoLinear(n_user, n_item).to(device)
weight1_optimizer = torch.optim.Adam(weight1_model.parameters(), lr=weight1_model_args['learning_rate'], weight_decay=weight1_model_args['weight_decay'])

# Weight model for all (user, item) pairs of a block; the third argument (2)
# is the number of rows of its data-bias table.
weight2_model = ThreeLinear(n_user, n_item, 2).to(device)
weight2_optimizer = torch.optim.Adam(weight2_model.parameters(), lr=weight2_model_args['learning_rate'], weight_decay=weight2_model_args['weight_decay'])

# Imputation model: a two-row bias table.
imputation_model = OneLinear(2).to(device)
imputation_optimizer = torch.optim.Adam(imputation_model.parameters(), lr=imputation_model_args['learning_rate'], weight_decay=imputation_model_args['weight_decay'])


In [27]:
# Loss criteria: the summed MSE is used for the uniform-data loss, while the
# element-wise MSE is used where each sample is multiplied by its own weight.
sum_criterion = nn.MSELoss(reduction='sum')
none_criterion = nn.MSELoss(reduction='none')

### Train start

In [93]:
# Running sum of the Step 3 loss; incremented after the real base-model update.
training_loss = 0

In [28]:
# Number of users and items, read from the training matrix shape.
n_user, n_item = train_data.shape
print(n_user, n_item)

15400 1000


In [29]:
# Early stopping on the base model, configured from training_args
# ('patience' and 'epochs').
# NOTE(review): which validation metric is monitored is decided inside
# utils.early_stop — confirm there.
stopping_args = Stop_args(patience=training_args['patience'], max_epochs=training_args['epochs'])
early_stopping = EarlyStopping(base_model, **stopping_args)

In [30]:
# Take the first user batch and the first item batch from the block loader so
# that a single training step can be walked through by hand.
users, items = next(iter(train_loader.User_loader)), next(iter(train_loader.Item_loader))

In [31]:
print(users) # user-id batch; length 6000 = training_args['block_batch'][0]

tensor([14527, 14920,  4534,  ..., 13982, 14032, 10839], device='cuda:0')


In [32]:
print(items) # item-id batch; length 500 = training_args['block_batch'][1]

tensor([ 33, 435, 864, 289, 797, 373,  48,  98, 809,  37, 521, 148, 123, 718,
        460, 239, 847, 401, 359, 352, 314, 326, 881, 657, 727, 120, 346, 960,
        926, 292, 848, 988, 423, 919, 223, 992, 201, 243, 885,  59, 158, 922,
         65, 135, 211, 272, 277,  42, 920,  77, 739, 230, 756,  86,  20, 819,
          9, 244, 296,   1, 448, 366, 554, 312, 653, 498, 508, 642, 154,  66,
        196, 227, 379, 476,  92, 478, 311, 853, 496, 802,   8, 345, 984, 640,
        697, 468, 481,  74, 271, 130, 495, 562, 193, 764, 806, 221, 100, 324,
        118, 578, 691,   2, 592, 736,  41, 690, 663, 350, 596, 128, 119, 371,
        873, 461, 660, 671, 713, 816, 446, 798, 963, 731, 870, 807, 765, 146,
        419, 755,  69, 958, 523, 259, 437, 956, 237, 504, 182, 980, 538, 441,
        964, 805, 574, 712, 273, 524, 205,  68, 833, 903, 145, 948, 175, 197,
         58, 852, 288, 876, 821, 336, 792, 631,  60, 934, 780, 693, 716, 830,
         15, 998, 724, 329, 333, 630, 577, 191, 200, 676, 814, 3

In [33]:
# y_train: all 1's (the implicit-feedback training data keeps only positive entries).
# Expected length of users_train / items_train is roughly
# train._nnz() * (6000 * 500) / (n_user * n_item) ~= 125077 * 0.195 ~= 24,400.
# NOTE(review): presumably get_batch returns only the training entries that
# fall inside the users x items block — confirm in utils.data_loader.Block.
users_train, items_train, y_train = train_loader.get_batch(users, items, device)

In [64]:
# Sizes of the three tensors returned for the current block.
users_train.size(), items_train.size(), y_train.size()

(torch.Size([24496]), torch.Size([24496]), torch.Size([24496]))

### Step 1: Assumed update of $\theta$ (one-step update of the base-model parameters)

In [40]:
# calculate weight 1: w_k^{(1)}, only for training data (Eq 17.)
# The raw output (user bias + item bias) is divided by 5 and exponentiated,
# so every weight is strictly positive.
weight1_model.train()
weight1 = weight1_model(users_train, items_train)
weight1 = torch.exp(weight1/5) # for stable training

In [54]:
# All (user, item) pairs of the current block: the Cartesian product of the
# user batch and the item batch gives len(users) * len(items) rows, with
# column 0 the user id and column 1 the item id.
all_pair = torch.cartesian_prod(users, items)
users_all, items_all = all_pair[:,0], all_pair[:,1]

In [52]:
# calculate weight2: w_{ui}^{(2)} (Eq 17.)
# ((train_dense[users_all, items_all]!=0)*1) is 1 where the training matrix
# has a non-zero entry for the pair (observed) and 0 otherwise (unobserved).
# This flag indexes the two-row data-bias table of weight2_model.
weight2_model.train()
weight2 = weight2_model(users_all, items_all, ((train_dense[users_all, items_all]!=0)*1))
weight2 = torch.exp(weight2/5) # for stable training

In [67]:
# calculate imputation values
# The dense training value of each pair is cast to long and used as the index
# into the imputation model's bias table; tanh bounds the result to (-1, 1).
imputation_model.train()
impu_all = torch.tanh(imputation_model((train_dense[users_all, items_all]).long()))

### Step 1 (continued): Assumed update of $\theta$ (black arrow)

In [59]:
# one_step_model: the "assumed" model. It starts as a copy of the base model
# and receives a single gradient step, which is then used to update the
# weight and imputation models.
# There is no .to(device) here, unlike base_model.
# NOTE(review): device placement appears to rely on to_var() inside MetaEmbed — confirm.
one_step_model = MetaMF(n_user, n_item, dim=base_model_args['emb_dim'], dropout=0)

In [63]:
# Copy the current base-model state into the one-step model, then put it in
# training mode.
one_step_model.load_state_dict(base_model.state_dict())

one_step_model.train()


MetaMF(
  (user_latent): MetaEmbed()
  (item_latent): MetaEmbed()
  (user_bias): MetaEmbed()
  (item_bias): MetaEmbed()
  (dropout): Dropout(p=0, inplace=False)
)

In [76]:
# Formal parameters: the training set is used to update the model parameters.
# 1) observation data: per-sample squared error on the block's training
#    entries, each multiplied by its weight1 value and summed.
y_hat_obs = one_step_model(users_train, items_train)
cost_obs = none_criterion(y_hat_obs, y_train)
loss_obs = torch.sum(weight1 * cost_obs)

In [77]:
# 2) all pair data: per-pair squared error between the prediction and the
#    imputed value, each multiplied by its weight2 value and summed.
y_hat_all = one_step_model(users_all, items_all)
cost_all = none_criterion(y_hat_all, impu_all)
loss_all = torch.sum(weight2 * cost_all)

In [78]:
# loss: \hat{L}_{T}(f | \phi)
# = weighted observed loss + imputaion_lambda * weighted all-pair loss
#   + weight_decay * L2 norm of the latent rows touched by this block.
loss = loss_obs + base_model_args['imputaion_lambda'] * loss_all + base_model_args['weight_decay'] * one_step_model.l2_norm(users_all, items_all)

In [86]:
# update parameters of one_step_model
# create_graph=True keeps the graph of the gradient computation so the
# updated parameters remain differentiable with respect to the weight and
# imputation models that produced `loss`.
# NOTE(review): update_params presumably applies param - lr * grad — confirm
# in MetaModule.update_params.
one_step_model.zero_grad()
grads = torch.autograd.grad(loss, (one_step_model.params()), create_graph=True)
one_step_model.update_params(base_model_args['learning_rate'], source_params=grads)

### Step 2. Update of $\phi(\psi)$

In [87]:
# Latter (hyper-)parameters: the uniform set is used to update them.
# The one-step model is evaluated on the whole uniform split and the summed
# squared error against the uniform labels is taken.
y_hat_l = one_step_model(users_unif, items_unif)
loss_l = sum_criterion(y_hat_l, y_unif)

In [88]:
# update hyper-parameters
# Clear stale gradients on the three hyper models, back-propagate the
# uniform-data loss, then take one optimizer step on each of them.
weight1_optimizer.zero_grad()
weight2_optimizer.zero_grad()
imputation_optimizer.zero_grad()

loss_l.backward()

weight1_optimizer.step()
weight2_optimizer.step()
imputation_optimizer.step()

In [91]:
# use new weights to update parameters (real update)
# The weight and imputation models were just stepped, so their outputs are
# recomputed for the same block before the real base-model update.
weight1_model.train()
weight1 = weight1_model(users_train, items_train)
weight1 = torch.exp(weight1/5)

# calculate weight2 (the flag is 1 for pairs with a non-zero training entry, 0 otherwise)
weight2_model.train()
weight2 = weight2_model(users_all, items_all,(train_dense[users_all,items_all]!=0)*1)
weight2 = torch.exp(weight2/5) # for stable training

# use new imputation to update parameters
imputation_model.train()
impu_all = torch.tanh(imputation_model((train_dense[users_all,items_all]).long()))

### Step 3. Update of $\theta$

In [None]:
# Put the base model in training mode before the real update.
base_model.train()

# all_pair

In [94]:
# Real update of the base model, using the same loss form as the one-step
# model but with the refreshed weights and imputed values.
# all pairs: weighted squared error against the imputed values
y_hat_all = base_model(users_all, items_all)
cost_all = none_criterion(y_hat_all, impu_all)
loss_all = torch.sum(cost_all * weight2)
# observation: weighted squared error on the block's training entries
y_hat_obs = base_model(users_train, items_train)
cost_obs = none_criterion(y_hat_obs, y_train)
loss_obs = torch.sum(cost_obs * weight1)
loss = loss_obs + base_model_args['imputaion_lambda'] * loss_all + base_model_args['weight_decay'] * base_model.l2_norm(users_all, items_all)

# One SGD step on the base model only.
base_optimizer.zero_grad()
loss.backward()
base_optimizer.step()

# Accumulate the scalar loss value for reporting.
training_loss += loss.item()

### Validation

In [69]:
# Evaluate the base model on the training blocks and on the validation split.
# Gradients are disabled; predictions and targets are concatenated batch by
# batch into flat tensors.
# NOTE: the loops rebind the notebook-level names `users`, `items`,
# `users_train`, `items_train`, `y_train` and `pre_ratings`, so the
# single-step cells above see different values if re-run after this cell.
base_model.eval()
with torch.no_grad():
    # training metrics: iterate over every (user batch, item batch) block
    train_pre_ratings = torch.empty(0).to(device)
    train_ratings = torch.empty(0).to(device)
    for u_batch_idx, users in enumerate(train_loader.User_loader):
        for i_batch_idx, items in enumerate(train_loader.Item_loader):
            users_train, items_train, y_train = train_loader.get_batch(users, items, device)
            pre_ratings = base_model(users_train, items_train)
            train_pre_ratings = torch.cat((train_pre_ratings, pre_ratings))
            train_ratings = torch.cat((train_ratings, y_train))

    # validation metrics: iterate over the unshuffled validation loader
    val_pre_ratings = torch.empty(0).to(device)
    val_ratings = torch.empty(0).to(device)
    for batch_idx, (users, items, ratings) in enumerate(val_loader):
        pre_ratings = base_model(users, items)
        val_pre_ratings = torch.cat((val_pre_ratings, pre_ratings))
        val_ratings = torch.cat((val_ratings, ratings))

# MSE only on the training data; MSE, NLL and AUC on the validation data.
train_results = utils.metrics.evaluate(train_pre_ratings, train_ratings, ['MSE'], device)
val_results = utils.metrics.evaluate(val_pre_ratings, val_ratings, ['MSE', 'NLL', 'AUC'], device)


### Full training code

In [None]:
# Skeleton of the full training loop: for every epoch, visit every
# (user batch, item batch) block.
# NOTE(review): the loop body stops after fetching the block's training
# entries; Steps 1-3 and the validation / early-stopping logic walked through
# above are not repeated here, so this cell does not train anything as written.
for epoch in range(early_stopping.max_epochs):
    for u_batch_idx, users in enumerate(train_loader.User_loader):
        for i_batch_idx, items in enumerate(train_loader.Item_loader):
            # observation data in this batch
            users_train, items_train, y_train = train_loader.get_batch(users, items, device)

## Train & Eval

In [48]:
# train_and_eval(train, 
#               unif_train,
#               validation,
#               test,
#               device, 
#               base_model_args=base_model_args, 
#               weight1_model_args=weight1_model_args, 
#               weight2_model_args=weight2_model_args, 
#               imputation_model_args=imputation_model_args, 
#               training_args=training_args)

In [49]:
# Transform the sparse training matrix into a dense one and display it.
# This repeats the conversion done in the step-by-step section above.
train_data = train
train_dense = train_data.to_dense()
train_dense

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [50]:
# Unpack the sparse uniform split: row 0 of the index matrix holds user ids,
# row 1 holds item ids, and the stored values are the labels.
unif_train_data = unif_train
users_unif = unif_train_data._indices()[0]
items_unif = unif_train_data._indices()[1]
y_unif = unif_train_data._values()

In [51]:
# Build the block-matrix data loader for training. The user and item batch
# sizes come from training_args['block_batch'] ([user batch, item batch]).

train_loader = utils.data_loader.Block(train_data,
                                       u_batch_size=training_args['block_batch'][0],
                                       i_batch_size=training_args['block_batch'][1],
                                       device=device)

In [52]:
# Validation and test loaders: unshuffled DataLoaders over Interactions built
# from the corresponding split, using training_args['batch_size'].
val_data, test_data = validation, test
val_loader = utils.data_loader.DataLoader(utils.data_loader.Interactions(val_data), batch_size=training_args['batch_size'], shuffle=False, num_workers=0)
test_loader = utils.data_loader.DataLoader(utils.data_loader.Interactions(test_data), batch_size=training_args['batch_size'], shuffle=False, num_workers=0)

In [53]:
# Number of users and items, read from the training matrix shape.
n_user, n_item = train_data.shape
print(n_user, n_item)

15400 1000


## Models

In [54]:
class MF(nn.Module):
  """
  Matrix-factorization model with per-user and per-item bias terms.

  The score of a (user, item) pair is the dot product of the user and item
  latent vectors plus the user bias and the item bias.

  Args:
    n_user: number of rows of the user embedding tables.
    n_item: number of rows of the item embedding tables.
    dim: size of the latent vectors.
    dropout: dropout probability applied to the latent vectors in forward().
    init: value passed as ``a`` to ``nn.init.kaiming_normal_``; ``None``
      falls back to 0.
  """
  def __init__(self, n_user, n_item, dim=40, dropout=0, init=None):
    super().__init__()

    self.user_latent = nn.Embedding(n_user, dim)
    self.item_latent = nn.Embedding(n_item, dim)

    self.user_bias = nn.Embedding(n_user, 1)
    self.item_bias = nn.Embedding(n_item, 1)

    self.dropout_p = dropout
    self.dropout = nn.Dropout(p=self.dropout_p)
    # Same rule as MetaMF: an explicit init value is honoured and only None
    # falls back to 0 (the previous test was inverted).
    self.init_embedding(init if init is not None else 0)

  def init_embedding(self, init):
    """Re-initialise all four embedding tables with Kaiming-normal values."""
    nn.init.kaiming_normal_(self.user_latent.weight.data, mode='fan_out', a=init)
    nn.init.kaiming_normal_(self.item_latent.weight.data, mode='fan_out', a=init)
    nn.init.kaiming_normal_(self.user_bias.weight.data, mode='fan_out', a=init)
    nn.init.kaiming_normal_(self.item_bias.weight.data, mode='fan_out', a=init)

  def forward(self, users, items):
    """
    Score each (users[k], items[k]) pair.

    Args:
      users: 1-D integer tensor of user indices.
      items: 1-D integer tensor of item indices, same length as ``users``.

    Returns:
      1-D tensor with one score per pair.
    """
    u_latent = self.dropout(self.user_latent(users))
    i_latent = self.dropout(self.item_latent(items))

    u_bias = self.user_bias(users)
    # The item bias must come from the item table; looking items up in the
    # user table gives wrong values or an out-of-range index.
    i_bias = self.item_bias(items)

    preds = torch.sum(u_latent * i_latent, dim=1, keepdim=True) + u_bias + i_bias

    return preds.squeeze(dim=-1)

  def l2_norm(self, users, items):
    """
    Half the squared L2 norm of the latent rows used by a batch.

    Duplicate indices are removed first, so each user / item row is counted
    once. Bias tables are not included.
    """
    users = torch.unique(users)
    items = torch.unique(items)

    l2_loss = (torch.sum(self.user_latent(users)**2) + torch.sum(self.item_latent(items)**2)) / 2
    return l2_loss


In [55]:
from model import MetaModule, MetaEmbed
class MetaEmbed(MetaModule):
  """
  Embedding-style weight table stored as a module buffer.

  The table has shape (dim_1, dim_2) and is exposed as ``self.weight``;
  ``self.bias`` is registered as an empty (None) buffer. Callers index
  ``weight`` directly instead of calling the module with indices.
  """
  def __init__(self, dim_1, dim_2):
    super().__init__()
    # A throw-away nn.Embedding is created only to borrow its initial weights.
    initial_weight = nn.Embedding(dim_1, dim_2).weight.data

    # Registered as buffers (not nn.Parameters), but with gradient tracking on.
    self.register_buffer('weight', to_var(initial_weight, requires_grad=True))
    self.register_buffer('bias', None)

  def forward(self):
    """Return the whole weight table."""
    return self.weight

  def named_leaves(self):
    """Return the (name, tensor) pairs this module exposes as leaves."""
    leaves = [('weight', self.weight)]
    leaves.append(('bias', self.bias))
    return leaves

In [56]:
class MetaMF(MetaModule):
    """
    Matrix-factorization model whose tables are MetaEmbed modules.

    The score of a (user, item) pair is the dot product of the user and item
    latent rows plus the user bias and the item bias. Rows are read by
    indexing the ``weight`` tensor of each MetaEmbed.

    Args:
        n_user: number of rows of the user tables.
        n_item: number of rows of the item tables.
        dim: size of the latent vectors.
        dropout: dropout probability applied to the latent rows in forward().
        init: value passed as ``a`` to ``nn.init.kaiming_normal_``; ``None``
            falls back to 0.
    """
    def __init__(self, n_user, n_item, dim=40, dropout=0, init = None):
        super().__init__()

        self.user_latent = MetaEmbed(n_user, dim)
        self.item_latent = MetaEmbed(n_item, dim)
        self.user_bias = MetaEmbed(n_user, 1)
        self.item_bias = MetaEmbed(n_item, 1)
        self.dropout_p = dropout
        self.dropout = nn.Dropout(p=self.dropout_p)
        if init is not None:
            self.init_embedding(init)
        else:
            self.init_embedding(0)

    def init_embedding(self, init):
        """Re-initialise all four weight tables with Kaiming-normal values."""
        nn.init.kaiming_normal_(self.user_latent.weight, mode='fan_out', a = init)
        nn.init.kaiming_normal_(self.item_latent.weight, mode='fan_out', a = init)
        nn.init.kaiming_normal_(self.user_bias.weight, mode='fan_out', a = init)
        nn.init.kaiming_normal_(self.item_bias.weight, mode='fan_out', a = init)

    def forward(self, users, items):
        """
        Score each (users[k], items[k]) pair.

        Args:
            users: 1-D integer tensor of user indices.
            items: 1-D integer tensor of item indices, same length as ``users``.

        Returns:
            1-D tensor with one score per pair.
        """
        u_latent = self.dropout(self.user_latent.weight[users])
        i_latent = self.dropout(self.item_latent.weight[items])
        u_bias = self.user_bias.weight[users]
        i_bias = self.item_bias.weight[items]

        preds = torch.sum(u_latent * i_latent, dim=1, keepdim=True) + u_bias + i_bias
        return preds.squeeze(dim=-1)

    def l2_norm(self, users, items, unique = True):
        """
        Half the squared L2 norm of the latent rows used by a batch.

        Args:
            users: integer tensor of user indices.
            items: integer tensor of item indices.
            unique: when True (default) duplicate indices are removed first,
                so each row is counted once; when False every occurrence
                contributes. The flag was previously accepted but ignored.

        Returns:
            Scalar tensor. Bias tables are not included.
        """
        if unique:
            users = torch.unique(users)
            items = torch.unique(items)

        l2_loss = (torch.sum(self.user_latent.weight[users]**2) + torch.sum(self.item_latent.weight[items]**2)) / 2
        return l2_loss

In [57]:
# import torch
# import torch.nn as nn

# # Define the embedding layer
# embedding_dim = 10
# vocab_size = 100
# embedding_layer = nn.Embedding(vocab_size, embedding_dim)

# # Example input indices
# indices = torch.tensor(list(range(4)))

# # Apply the embedding layer
# embedded_output = embedding_layer(indices)

# print(embedded_output.shape)

In [58]:
# Base model and the SGD optimizer that performs its real update.
# weight_decay is 0 because the L2 term is added to the loss via l2_norm().
base_model = MetaMF(n_user, n_item, dim=base_model_args['emb_dim'], dropout=0).to(device)
base_optimizer = torch.optim.SGD(base_model.params(), lr=base_model_args['learning_rate'], weight_decay=0) # todo: try an optimizer other than SGD

register_buffer 로 layer를 등록하면 어떤 특징이 있는가?

1. optimizer가 업데이트하지 않는다.

2. 그러나 값은 존재한다(하나의 layer로써 작용한다고 보면 된다.)

3. state_dict()로 확인이 가능하다.

4. GPU연산이 가능하다.

 

따라서 네트워크를 end-to-end로 학습시키면서도, optimizer가 업데이트하지 않는 텐서(일반 layer처럼 동작하는 값)를 네트워크 중간에 두고 싶을 때 사용할 수 있다.

In [59]:
def to_var(x, requires_grad=True):
  """
  Move a tensor to the GPU when one is available and set its gradient flag.

  Args:
    x: input tensor.
    requires_grad: whether the returned tensor should track gradients.

  Returns:
    A leaf tensor sharing storage with ``x`` (after the optional move to the
    GPU), detached from any existing graph, with ``requires_grad`` set as
    requested.
  """
  if torch.cuda.is_available():
    x = x.cuda()
  # Every tensor can track gradients through ``requires_grad``, so wrapping in
  # the deprecated ``Variable`` class (which no visible cell imports) is not
  # needed. detach() + requires_grad_() gives the same result as
  # ``Variable(x, requires_grad=...)``.
  return x.detach().requires_grad_(requires_grad)


In [60]:
class OneLinear(nn.Module):
  """
  linear model: r

  A single bias table with ``n`` rows; the output for an input index is the
  scalar stored in that row.
  """
  def __init__(self, n):
    super().__init__()

    self.bias = nn.Embedding(n,1)
    self.init_embedding()

  def init_embedding(self):
    """Shrink the default embedding initialisation by a factor of 0.01."""
    self.bias.weight.data *= 0.01

  def forward(self, values):
    """
    Look up the bias of every index in ``values``.

    Args:
      values: integer tensor of indices in ``[0, n)``.

    Returns:
      Tensor with the same shape as ``values``.
    """
    d_bias = self.bias(values)
    # Drop only the trailing embedding dimension so that a batch of size 1
    # keeps its batch axis (a bare squeeze() would return a 0-d tensor).
    return d_bias.squeeze(dim=-1)


In [61]:
class TwoLinear(nn.Module):
  """
  linear model: u + i

  The output for a (user, item) pair is the sum of a per-user scalar and a
  per-item scalar.
  """
  def __init__(self, n_user, n_item):
    super().__init__()

    self.user_bias = nn.Embedding(n_user, 1)
    self.item_bias = nn.Embedding(n_item, 1)

    self.init_embedding(0)

  def init_embedding(self, init):
    """Initialise both bias tables with Kaiming-normal values (``a=init``)."""
    nn.init.kaiming_normal_(self.user_bias.weight, mode='fan_out', a=init)
    nn.init.kaiming_normal_(self.item_bias.weight, mode='fan_out', a=init)

  def forward(self, users, items):
    """
    Args:
      users: integer tensor of user indices.
      items: integer tensor of item indices, same shape as ``users``.

    Returns:
      Tensor with the same shape as ``users``.
    """
    u_bias = self.user_bias(users)
    i_bias = self.item_bias(items)
    preds = u_bias + i_bias

    # Drop only the trailing embedding dimension so that a batch of size 1
    # keeps its batch axis (a bare squeeze() would return a 0-d tensor).
    return preds.squeeze(dim=-1)

In [62]:
class ThreeLinear(nn.Module):
  """
  linear model: u + i + r / o

  The output for a (user, item, value) triple is the sum of a per-user
  scalar, a per-item scalar and a scalar looked up by ``value`` in a table
  with ``n`` rows.
  """
  def __init__(self, n_user, n_item, n):
    super().__init__()

    self.user_bias = nn.Embedding(n_user, 1)
    self.item_bias = nn.Embedding(n_item, 1)
    self.data_bias= nn.Embedding(n, 1)
    self.init_embedding(0)

  def init_embedding(self, init):
    """Kaiming-normal initialisation; the data bias is then scaled by 0.001."""
    nn.init.kaiming_normal_(self.user_bias.weight, mode='fan_out', a = init)
    nn.init.kaiming_normal_(self.item_bias.weight, mode='fan_out', a = init)
    nn.init.kaiming_normal_(self.data_bias.weight, mode='fan_out', a = init)
    self.data_bias.weight.data *= 0.001

  def forward(self, users, items, values):
    """
    Args:
      users: integer tensor of user indices.
      items: integer tensor of item indices, same shape as ``users``.
      values: integer tensor of indices in ``[0, n)``, same shape as ``users``.

    Returns:
      Tensor with the same shape as ``users``.
    """
    u_bias = self.user_bias(users)
    i_bias = self.item_bias(items)
    d_bias = self.data_bias(values)

    preds = u_bias + i_bias + d_bias
    # Drop only the trailing embedding dimension so that a batch of size 1
    # keeps its batch axis (a bare squeeze() would return a 0-d tensor).
    return preds.squeeze(dim=-1)

In [63]:
# Weight model for the observed training entries and its optimizer.
# The optimizer reads weight1_model_args, like the earlier construction of
# this optimizer in the notebook and like the weight2 / imputation optimizers.
# The previous version used the base model's learning rate and weight_decay=0.
weight1_model = TwoLinear(n_user, n_item).to(device)
weight1_optimizer = torch.optim.Adam(weight1_model.parameters(),
                                     lr=weight1_model_args['learning_rate'],
                                     weight_decay=weight1_model_args['weight_decay'])

In [64]:
# Weight model for all (user, item) pairs of a block; the third argument (2)
# is the number of rows of its data-bias table. Adam settings come from
# weight2_model_args.
weight2_model = ThreeLinear(n_user, n_item, 2).to(device)
weight2_optimizer = torch.optim.Adam(weight2_model.parameters(), 
                                     lr=weight2_model_args['learning_rate'],
                                     weight_decay=weight2_model_args['weight_decay'])

In [65]:
# Imputation model: a two-row bias table. Adam settings come from
# imputation_model_args.
imputation_model = OneLinear(2).to(device)
imputation_optimizer = torch.optim.Adam(imputation_model.parameters(),
                                        lr=imputation_model_args['learning_rate'],
                                        weight_decay=imputation_model_args['weight_decay'])

In [66]:
# Loss criteria: summed MSE and element-wise MSE (the latter lets each sample
# be multiplied by its own weight before summing).
sum_criterion = nn.MSELoss(reduction='sum')
none_criterion = nn.MSELoss(reduction='none')

## Training

In [67]:
# Shape of the dense training matrix.
train_dense.shape

torch.Size([15400, 1000])

In [68]:
# Early stopping on the base model, configured from training_args
# ('patience' and 'epochs').
stopping_args = Stop_args(patience=training_args['patience'], max_epochs=training_args['epochs'])
early_stopping = EarlyStopping(base_model, **stopping_args)

In [76]:
from train_implicit import train_and_eval

# Run the project's end-to-end training and evaluation with the configuration
# dictionaries defined in this notebook (the author notes this setting works
# well). The models, optimizers and loaders built by hand in the cells above
# are not passed in; only the four data splits, the device and the
# hyper-parameter dictionaries are.
train_and_eval(train, 
               unif_train, 
               validation, 
               test,
               device,
               base_model_args = base_model_args, 
               weight1_model_args = weight1_model_args,
               weight2_model_args = weight2_model_args,
               imputation_model_args = imputation_model_args, 
               training_args = training_args)