In [17]:
import numpy as np
import pandas as pd
import torch
import torch.nn
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchvision import transforms

Implementation of **``Training Deep AutoEncoders for Collaborative Filtering``**

## Structure of Model
* **Loss**: Define loss function for model
* **TrainTestDataset**: Define ``Dataset`` class by overwriting ``__len__()`` and ``__getitem__()``
* **Model**: Construct an AutoEncoder model
* **Main**: Train & Test the model
---

## Loss.py

보편적인 loss함수와는 다르게, 본 model에서는 actual rating이 없는 경우에 대해서는 loss를 평가 할 수가 없다(약간 unsupervised learning 느낌? 사용자가 현실에서 평가를 하기 전까지는 정답을 알 수 없으니까).

따라서 ``torch.nn``에 있는 loss function이 아닌, 본 상황에 맞는 loss function을 새롭게 정의하여야 한다. 

특히, 본 Paper에서는 **Masked Mean Squared Error**를 사용한다.

$$ MMSE = \frac{\sum_{i=0}^{n} m_i (r_i - y_i)^2}{\sum_{i=0}^{n} m_i} $$

### Code

In [None]:
class MSEloss_with_Mask(nn.Module):
    """Masked mean squared error for sparse rating vectors.

    Entries of ``targets`` equal to 0 are treated as "no rating": they
    contribute neither to the squared-error sum nor to the count the sum is
    divided by.
    """

    def __init__(self):
        super(MSEloss_with_Mask, self).__init__()

    def forward(self, inputs, targets):
        """Return the masked MSE as a 0-dim tensor.

        Args:
            inputs: predicted ratings; must broadcast against ``targets``.
            targets: reference ratings, where 0 marks a missing rating.

        Returns:
            Sum of squared errors over the rated entries divided by the
            number of rated entries (0 when nothing is rated).
        """
        # 1.0 where a rating exists, 0.0 elsewhere.
        mask = (targets != 0).float()

        # Number of observed ratings, floored at 1 to avoid division by zero
        # while calculating the loss. clamp keeps the value on the same
        # device as the inputs, unlike a freshly built CPU tensor.
        number_ratings = torch.clamp(torch.sum(mask), min=1.0)

        residual = targets - inputs
        error = torch.sum(mask * (residual * residual))
        return error / number_ratings

---
### Example
#### Mask

In [13]:
# Ratings of a single user; 0 stands for "not rated".
targets_ = torch.Tensor([5, 3, 0, 4, 0, 5])
# Boolean mask that is True wherever a rating exists.
mask_ = (targets_ != 0)
print(mask_)

tensor([ True,  True, False,  True, False,  True])


---
---
## TrainTestDataset.py

Transformation을 적용 할 수 있는 ``Dataset`` class를 정의한다.

### Code

In [None]:
class TrainTestDataset(Dataset):
    """Dataset over a matrix stored as CSV; one CSV row is one sample.

    The first CSV column is always dropped (presumably an index/ID column —
    verify against the data files). The remaining columns are loaded once
    and kept in ``self.data`` with a leading singleton axis, so a sample is
    ``self.data[0][row]``.
    """

    def __init__(self, file, transform=None):
        """Load ``file`` and optionally transform the whole matrix once.

        Args:
            file: anything ``pandas.read_csv`` accepts.
            transform: optional callable applied to the full 2-D ``numpy``
                matrix. Its result must be indexable as ``result[0][row]``
                (assumed to hold for ``transforms.ToTensor()``, which is
                what the rest of this file passes — TODO confirm).
        """
        frame = pd.read_csv(file).iloc[:, 1:]
        matrix = np.array(frame)
        self.transform = transform

        if transform is not None:
            self.data = transform(matrix)
        else:
            # Give the raw matrix the same [0][row] layout as the transformed
            # one so __len__ and __getitem__ work without a transform too.
            self.data = matrix[np.newaxis, ...]

    def __len__(self):
        """Return the number of rows (samples) in the matrix."""
        return len(self.data[0])

    def __getitem__(self, ind):
        """Return the row at position ``ind``."""
        user_vector = self.data[0][ind]
        return user_vector

---
---
## Model.py

#### AutoEncoder Class
**``AutoEncoder``**를 정의한다. AutoEncoder의 input으로는 다음과 같은 parameter가 들어간다.
* layer_size: size of each layer in the AE model
        ex) [10000, 1024, 512] result in
            - encoder 2 layers: 10000 x 1024, 1024 x 512
            - representation layer z: 512
            - decoder 2 layers: 512 x 1024, 1024 x 10000
* nl_type: non-linearity activation function type
* is_constrained: if True, the weights of encoder and decoder are tied
* dp_drop_prob: dropout probability. if > 0, dropout process proceeds
* last_layer_activations: whether to apply activation on last decode layer


#### Weights and Biases
**``nn.ParameterList``** class와 ``layer_size`` parameter를 통해 **encoder/decoder의 weight/bias**를 직접 초기화해준다.


#### Stacked AutoEncoder
**``is_constrained``**의 값에 따라 decoder를 정의하고 학습하는 방법이 달라진다.


#### Activation Function
paper에서는 ``selu``를 사용하였지만, 다양한 activation function의 사용을 위해 함수를 하나 만들어준다.

### Code


In [None]:
def activation(input, type):
    """Apply the non-linearity named by ``type`` to ``input``.

    Args:
        input: tensor to transform.
        type: activation name, matched case-insensitively. One of
            'selu', 'elu', 'relu', 'relu6', 'tanh', 'sigmoid', 'swish'
            or 'identity'.

    Returns:
        The activated tensor; for 'identity' the input object itself.

    Raises:
        ValueError: if ``type`` names none of the supported activations.
    """
    # Name -> callable lookup; only the selected entry is ever invoked.
    dispatch = {
        'selu': F.selu,
        'elu': F.elu,
        'relu': F.relu,
        'relu6': F.relu6,
        'tanh': F.tanh,
        'sigmoid': F.sigmoid,
        'swish': lambda t: F.sigmoid(t) * t,
        'identity': lambda t: t,
    }

    chosen = dispatch.get(type.lower())
    if chosen is None:
        raise ValueError("Unknown non-Linearity activation function")
    return chosen(input)

In [None]:
class AutoEncoder(nn.Module):
    """Stacked autoencoder built from explicit weight and bias parameters.

    Args:
        layer_size: size of each layer, input first.
            ex) [10000, 1024, 512] will result in
                - encoder 2 layers: 10000 x 1024 & 1024 x 512
                - representation layer z: 512
                - decoder 2 layers: 512 x 1024 & 1024 x 10000
        nl_type: name of the non-linearity, forwarded to ``activation``.
        is_constrained: if truthy, the decoder reuses the transposed encoder
            weights (tied weights) and owns no weight matrices of its own.
        dp_drop_prob: dropout probability; dropout is applied to the
            encoder output only when this is > 0.
        last_layer_activations: whether to apply the activation on the last
            decoder layer.

    Raises:
        ValueError: if ``layer_size`` has fewer than two entries, since no
            layer could be built from it.
    """

    def __init__(self, layer_size, nl_type='selu', is_constrained=True, dp_drop_prob= 0, last_layer_activations=True):
        super(AutoEncoder, self).__init__()

        if len(layer_size) < 2:
            raise ValueError("layer_size must contain at least two sizes (input and one hidden layer)")

        self.layer_sizes = layer_size
        self.nl_type = nl_type
        self.is_constrained = is_constrained
        self.dp_drop_prob = dp_drop_prob
        self.last_layer_activations = last_layer_activations

        if dp_drop_prob > 0:
            self.drop = nn.Dropout(dp_drop_prob)

        # Index of the final decoder layer (the decoder has len - 1 layers).
        self._last = len(layer_size) - 2

        # Encoder: one (out, in) matrix per consecutive pair of sizes,
        # Xavier-uniform initialised, with zero biases.
        self.encoder_weights = nn.ParameterList(
            [nn.Parameter(torch.rand(n_out, n_in))
             for n_in, n_out in zip(layer_size[:-1], layer_size[1:])]
        )
        for weights in self.encoder_weights:
            init.xavier_uniform_(weights)

        self.encoder_bias = nn.ParameterList(
            [nn.Parameter(torch.zeros(n_out)) for n_out in layer_size[1:]]
        )

        reverse_layer_sizes = list(reversed(layer_size))

        # Decoder weights exist only when they are not tied to the encoder.
        if not is_constrained:
            self.decoder_weights = nn.ParameterList(
                [nn.Parameter(torch.rand(n_out, n_in))
                 for n_in, n_out in zip(reverse_layer_sizes[:-1], reverse_layer_sizes[1:])]
            )
            for weights in self.decoder_weights:
                init.xavier_uniform_(weights)

        # Decoder biases are always separate parameters, tied weights or not.
        self.decoder_bias = nn.ParameterList(
            [nn.Parameter(torch.zeros(n_out)) for n_out in reverse_layer_sizes[1:]]
        )

    def _decoder_activation(self, index):
        """Return the activation name to use after decoder layer ``index``."""
        if index == self._last and not self.last_layer_activations:
            return 'identity'
        return self.nl_type

    def encode(self, x):
        """Run ``x`` through every encoder layer, then dropout if enabled."""
        for i, w in enumerate(self.encoder_weights):
            x = F.linear(input=x, weight=w, bias=self.encoder_bias[i])
            x = activation(input=x, type=self.nl_type)

        if self.dp_drop_prob > 0:
            x = self.drop(x)

        return x

    def decode(self, x):
        """Map a representation ``x`` back through the decoder layers."""
        if self.is_constrained:
            # Tied weights: encoder matrices in reverse order, transposed.
            weights = [w.t() for w in reversed(self.encoder_weights)]
        else:
            weights = self.decoder_weights

        for i, w in enumerate(weights):
            x = F.linear(input=x, weight=w, bias=self.decoder_bias[i])
            x = activation(input=x, type=self._decoder_activation(i))

        return x

    def forward(self, x):
        """Return ``decode(encode(x))``."""
        return self.decode(self.encode(x))


---
---
## Main.py
Model을 위해 data를 준비하고 학습하기 위해 다음과 같은 과정이 필요하다.
1. Data 준비 - DataLoader
2. Model 설계 - layer 수, loss function, optimizer
3. Training
4. Validation

### DataLoader
앞서 정의한 TrainTestDataset을 이용하자! data에 가해줄 transformation을 정의하고, DataLoader를 만든다.

In [None]:
# The same transform pipeline is handed to both datasets.
transformations = transforms.Compose([transforms.ToTensor()])

train_dat = TrainTestDataset('./data/train.csv', transformations)
test_dat = TrainTestDataset('./data/test.csv', transformations)

# Training batches are shuffled; evaluation batches keep dataset order.
train_dl = DataLoader(
    dataset=train_dat,
    batch_size=128,
    shuffle=True,
    num_workers=0,
)
test_dl = DataLoader(
    dataset=test_dat,
    batch_size=512,
    shuffle=False,
    num_workers=0,
)

### Modeling
layer의 size를 정하고 model을 불러오며, loss function과 optimizer등 training에 필요한 요소들을 준비한다.

In [None]:
# Layer widths, input first: 9559 -> 512 -> 512 -> 1024.
layer_sizes = [9559, 512, 512, 1024]

# Tied-weight SELU autoencoder, dropout disabled, no activation on the
# final decoder layer.
model = AutoEncoder(
    layer_size=layer_sizes,
    nl_type='selu',
    is_constrained=True,
    dp_drop_prob=0.0,
    last_layer_activations=False,
)

criterion = MSEloss_with_Mask()
optimizer = optim.Adam(model.parameters(), lr=0.001)

### Training & Validation
DataLoader도 준비되었고, model도 정의되었으니 이제 학습과 validation을 진행 할 수 있다.

매 epoch마다 training과 validation을 반복한다. 이 과정은 딱히 특별할게 없다. batch input을 model에 통과시키고, loss를 계산하고, backward pass를 수행한 이후 optimizer의 step function을 이용해 weight을 update한다.

In [None]:
def _make_live_plot(enabled):
    """Return a livelossplot ``PlotLosses`` instance, or ``None`` if plotting is off or unavailable."""
    if not enabled:
        return None
    try:
        from livelossplot import PlotLosses
    except ImportError:
        import warnings
        warnings.warn("livelossplot is not installed; live loss plotting is disabled")
        return None
    return PlotLosses()


def train(model, criterion, optimizer, train_dl, test_dl, num_epochs= 40, refeed_steps=3, plot=True):
    """Train ``model`` as an autoencoder and validate it after every epoch.

    Each batch is used as both input and target. After the regular update,
    the model output is fed back in ``refeed_steps`` times (iterative dense
    output re-feeding), with one optimizer step per pass.

    Args:
        model: module mapping a float batch to a reconstruction.
        criterion: callable ``criterion(outputs, labels)`` returning a
            scalar tensor.
        optimizer: optimizer over ``model``'s parameters.
        train_dl: iterable of training batches (tensors).
        test_dl: iterable of validation batches (tensors).
        num_epochs: number of passes over ``train_dl``.
        refeed_steps: number of re-feeding passes per training batch.
        plot: draw a live loss plot after each epoch when livelossplot is
            installed.

    Returns:
        ``(train_losses, valid_losses)``: per-epoch mean losses, two lists
        of length ``num_epochs`` (both empty when ``num_epochs`` is 0).
        The training figure is the loss of the last pass performed on each
        batch, i.e. after re-feeding when ``refeed_steps`` > 0.
    """
    liveloss = _make_live_plot(plot)
    lr2_tr_loss, lr2_val_loss = [], []

    for epoch in range(num_epochs):
        train_loss, valid_loss = [], []

        model.train()
        for data in train_dl:
            # Autoencoder setup: the batch is its own target.
            inputs = labels = data.float()

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Iterative Dense Output Re-Feeding: the detached output becomes
            # the next input.
            # NOTE(review): these passes are scored against the original
            # labels; confirm this is the intended target, as opposed to the
            # re-fed dense output itself.
            for _ in range(refeed_steps):
                optimizer.zero_grad()
                outputs = model(outputs.detach())
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

            train_loss.append(loss.item())

        # Evaluation mode and no_grad are set once for the whole pass.
        model.eval()
        with torch.no_grad():
            for data in test_dl:
                inputs = labels = data.float()
                outputs = model(inputs)
                valid_loss.append(criterion(outputs, labels).item())

        epoch_train = np.mean(train_loss)
        epoch_valid = np.mean(valid_loss)
        lr2_tr_loss.append(epoch_train)
        lr2_val_loss.append(epoch_valid)

        if liveloss is not None:
            # Same metric name with and without the 'val_' prefix, so the two
            # curves share a name.
            logs = {}
            if train_loss:
                logs["MMSE loss"] = epoch_train
            if valid_loss:
                logs["val_MMSE loss"] = epoch_valid
            liveloss.update(logs)
            liveloss.draw()

        print("Epoch:", epoch+1, "Training Loss: ", epoch_train, "Valid Loss: ", epoch_valid)

    # Returned after the loop so that num_epochs == 0 yields empty lists
    # instead of None.
    return lr2_tr_loss, lr2_val_loss