# KL Divergence loss

The current mlpack implementation is incorrect. <br>
Also, unlike PyTorch, the current implementation doesn't assume that the ```input``` matrix contains log-probabilities. <br>

This notebook fixes these issues and also introduces the different reductions that PyTorch offers. <br>

### Imports and installation of mlpack

In [None]:
%%capture
!sudo apt-get install libmlpack-dev 
import torch
import torch.nn as nn

### PyTorch

The input matrix has to have log-probabilities.

#### Input generation with fixed random seeds

In [None]:
import random
import os
import numpy as np

def fix_seeds(seed=0):
  """Seed every random-number source used by this notebook.

  Seeds Python's `random`, NumPy and PyTorch, exports PYTHONHASHSEED, puts
  cuDNN into deterministic / non-benchmark mode, and additionally calls
  `torch.cuda.manual_seed` when CUDA is available.

  Args:
    seed: Integer seed applied to all generators (default 0).
  """
  os.environ['PYTHONHASHSEED'] = str(seed)

  # Each library keeps its own generator state, so each one is seeded in turn.
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)

  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

  if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)


fix_seeds()

In [None]:
# Draw two random 4x3 matrices. The loss input is the element-wise log of the
# first one; the target is the second one as-is.
x, y = torch.rand(4, 3), torch.rand(4, 3)
xlog = x.log()
print('Input : ', xlog, sep='\n')
print('Target : ', y, sep='\n')

Input : 
tensor([[-0.7007, -0.2637, -2.4250],
        [-2.0247, -1.1795, -0.4556],
        [-0.7132, -0.1093, -0.7861],
        [-0.4584, -1.0530, -0.9120]])
Target : 
tensor([[0.0223, 0.1689, 0.2939],
        [0.5185, 0.6977, 0.8000],
        [0.1610, 0.2823, 0.6816],
        [0.9152, 0.3971, 0.8742]])


#### None Reduction


In [None]:
# Element-wise KL divergence: with reduction='none' the loss keeps the shape
# of the input instead of collapsing to a scalar.
loss = torch.nn.KLDivLoss(reduction='none')
input = torch.tensor(
    [[-0.7007, -0.2637, -2.4250],
     [-2.0247, -1.1795, -0.4556],
     [-0.7132, -0.1093, -0.7861],
     [-0.4584, -1.0530, -0.9120]],
    requires_grad=True,
)
target = torch.tensor(
    [[0.0223, 0.1689, 0.2939],
     [0.5185, 0.6977, 0.8000],
     [0.1610, 0.2823, 0.6816],
     [0.9152, 0.3971, 0.8742]],
)
output = loss(input, target)
# The output is a matrix, so backward() needs an explicit upstream gradient;
# all-ones weights every element equally.
output.backward(torch.ones_like(input))

print("Input : ", input, sep="\n")
print("Target : ", target, sep="\n")
print("FORWARD : ", "Loss : ", output, sep="\n")
print("BACKWARD : ", input.grad, sep="\n")
print("Sum of all values in this matrix for Backward: ", input.grad.sum(), sep="\n")

Input : 
tensor([[-0.7007, -0.2637, -2.4250],
        [-2.0247, -1.1795, -0.4556],
        [-0.7132, -0.1093, -0.7861],
        [-0.4584, -1.0530, -0.9120]], requires_grad=True)
Target : 
tensor([[0.0223, 0.1689, 0.2939],
        [0.5185, 0.6977, 0.8000],
        [0.1610, 0.2823, 0.6816],
        [0.9152, 0.3971, 0.8742]])
FORWARD : 
Loss : 
tensor([[-0.0692, -0.2558,  0.3528],
        [ 0.7092,  0.5718,  0.1860],
        [-0.1792, -0.3262,  0.2745],
        [ 0.3384,  0.0514,  0.6797]], grad_fn=<KlDivBackward>)
BACKWARD : 
tensor([[-0.0223, -0.1689, -0.2939],
        [-0.5185, -0.6977, -0.8000],
        [-0.1610, -0.2823, -0.6816],
        [-0.9152, -0.3971, -0.8742]])
Sum of all values in this matrix for Backward: 
tensor(-5.8127)


#### Sum Reduction

In [None]:
# KL divergence with reduction='sum': the element-wise losses are added up
# into a single scalar.
loss = torch.nn.KLDivLoss(reduction='sum')
input = torch.tensor(
    [[-0.7007, -0.2637, -2.4250],
     [-2.0247, -1.1795, -0.4556],
     [-0.7132, -0.1093, -0.7861],
     [-0.4584, -1.0530, -0.9120]],
    requires_grad=True,
)
target = torch.tensor(
    [[0.0223, 0.1689, 0.2939],
     [0.5185, 0.6977, 0.8000],
     [0.1610, 0.2823, 0.6816],
     [0.9152, 0.3971, 0.8742]],
)
output = loss(input, target)
# Scalar output, so backward() needs no explicit upstream gradient.
output.backward()

print("Input : ", input, sep="\n")
print("Target : ", target, sep="\n")
print("FORWARD : ", "Loss : ", output, sep="\n")
print("BACKWARD : ", input.grad, sep="\n")
print("Sum of all values in this matrix for Backward: ", input.grad.sum(), sep="\n")

Input : 
tensor([[-0.7007, -0.2637, -2.4250],
        [-2.0247, -1.1795, -0.4556],
        [-0.7132, -0.1093, -0.7861],
        [-0.4584, -1.0530, -0.9120]], requires_grad=True)
Target : 
tensor([[0.0223, 0.1689, 0.2939],
        [0.5185, 0.6977, 0.8000],
        [0.1610, 0.2823, 0.6816],
        [0.9152, 0.3971, 0.8742]])
FORWARD : 
Loss : 
tensor(2.3335, grad_fn=<KlDivBackward>)
BACKWARD : 
tensor([[-0.0223, -0.1689, -0.2939],
        [-0.5185, -0.6977, -0.8000],
        [-0.1610, -0.2823, -0.6816],
        [-0.9152, -0.3971, -0.8742]])
Sum of all values in this matrix for Backward: 
tensor(-5.8127)


#### Mean reduction - Divide result of sum reduction by number of elements (PyTorch warns that `mean` will change to behave like `batchmean` in the next major release)

In [None]:
# PyTorch emits a UserWarning for reduction='mean': it divides the total loss
# by both the batch size and the support size, whereas 'batchmean' divides only
# by the batch size and aligns with the KL div math definition. The warning
# adds that 'mean' will be changed to behave the same as 'batchmean' in the
# next major release.

loss = torch.nn.KLDivLoss(reduction='mean')
input = torch.tensor(
    [[-0.7007, -0.2637, -2.4250],
     [-2.0247, -1.1795, -0.4556],
     [-0.7132, -0.1093, -0.7861],
     [-0.4584, -1.0530, -0.9120]],
    requires_grad=True,
)
target = torch.tensor(
    [[0.0223, 0.1689, 0.2939],
     [0.5185, 0.6977, 0.8000],
     [0.1610, 0.2823, 0.6816],
     [0.9152, 0.3971, 0.8742]],
)
output = loss(input, target)
# Scalar output, so backward() needs no explicit upstream gradient.
output.backward()

print("Input : ", input, sep="\n")
print("Target : ", target, sep="\n")
print("FORWARD : ", "Loss : ", output, sep="\n")
print("BACKWARD : ", input.grad, sep="\n")
print("Sum of all values in this matrix for Backward: ", input.grad.sum(), sep="\n")

Input : 
tensor([[-0.7007, -0.2637, -2.4250],
        [-2.0247, -1.1795, -0.4556],
        [-0.7132, -0.1093, -0.7861],
        [-0.4584, -1.0530, -0.9120]], requires_grad=True)
Target : 
tensor([[0.0223, 0.1689, 0.2939],
        [0.5185, 0.6977, 0.8000],
        [0.1610, 0.2823, 0.6816],
        [0.9152, 0.3971, 0.8742]])
FORWARD : 
Loss : 
tensor(0.1945, grad_fn=<KlDivBackward>)
BACKWARD : 
tensor([[-0.0019, -0.0141, -0.0245],
        [-0.0432, -0.0581, -0.0667],
        [-0.0134, -0.0235, -0.0568],
        [-0.0763, -0.0331, -0.0728]])
Sum of all values in this matrix for Backward: 
tensor(-0.4844)




#### Batch-Mean reduction - Divide result of sum reduction by number of rows (use this instead of mean)

In [None]:
# KL divergence with reduction='batchmean': the summed loss is divided by the
# number of rows (4 here) rather than by the number of elements.
loss = torch.nn.KLDivLoss(reduction='batchmean')
input = torch.tensor(
    [[-0.7007, -0.2637, -2.4250],
     [-2.0247, -1.1795, -0.4556],
     [-0.7132, -0.1093, -0.7861],
     [-0.4584, -1.0530, -0.9120]],
    requires_grad=True,
)
target = torch.tensor(
    [[0.0223, 0.1689, 0.2939],
     [0.5185, 0.6977, 0.8000],
     [0.1610, 0.2823, 0.6816],
     [0.9152, 0.3971, 0.8742]],
)
output = loss(input, target)
# Scalar output, so backward() needs no explicit upstream gradient.
output.backward()

print("Input : ", input, sep="\n")
print("Target : ", target, sep="\n")
print("FORWARD : ", "Loss : ", output, sep="\n")
print("BACKWARD : ", input.grad, sep="\n")
print("Sum of all values in this matrix for Backward: ", input.grad.sum(), sep="\n")

Input : 
tensor([[-0.7007, -0.2637, -2.4250],
        [-2.0247, -1.1795, -0.4556],
        [-0.7132, -0.1093, -0.7861],
        [-0.4584, -1.0530, -0.9120]], requires_grad=True)
Target : 
tensor([[0.0223, 0.1689, 0.2939],
        [0.5185, 0.6977, 0.8000],
        [0.1610, 0.2823, 0.6816],
        [0.9152, 0.3971, 0.8742]])
FORWARD : 
Loss : 
tensor(0.5834, grad_fn=<DivBackward0>)
BACKWARD : 
tensor([[-0.0056, -0.0422, -0.0735],
        [-0.1296, -0.1744, -0.2000],
        [-0.0402, -0.0706, -0.1704],
        [-0.2288, -0.0993, -0.2185]])
Sum of all values in this matrix for Backward: 
tensor(-1.4532)


### mlpack


#### CURRENT IMPLEMENTATION - incorrect.

In [None]:
%%capture
%%writefile test.cpp  

#include <iostream>
#include <armadillo>

using namespace std;
using namespace arma;

// Reproduces the formulas this notebook labels as the current (incorrect)
// mlpack KL-divergence, using the same input/target values as the PyTorch
// cells, and prints the forward loss and backward output for the sum and mean
// variants. The code is intentionally left as-is so the faulty output is visible.
int main()
{
  // Input (x) and target (y) matrices; `weight` is declared but never used below.
  arma::mat x,y;
  arma::mat weight;
 
  // x holds the log-probabilities fed to the PyTorch cells: every entry is negative.
  x << -0.7007 << -0.2637 << -2.4250 << endr
    << -2.0247 << -1.1795 << -0.4556 << endr
    << -0.7132 << -0.1093 << -0.7861 << endr
    << -0.4584 << -1.0530 << -0.9120 << endr;

  y << 0.0223 << 0.1689 << 0.2939 << endr
    << 0.5185 << 0.6977 << 0.8000 << endr
    << 0.1610 << 0.2823 << 0.6816 << endr
    << 0.9152 << 0.3971 << 0.8742 << endr;

  // Forward: computes x * (log(x) - log(y)), i.e. it takes the log of x again
  // even though x already holds logs. All entries of x are negative, so the
  // logarithm is undefined and both losses print as nan in the recorded output.
  double loss_sum = arma::accu(x % (arma::log(x) - arma::log(y)));
  double loss_mean = arma::as_scalar(arma::mean(arma::mean(x % (arma::log(x) - arma::log(y)))));

  // Backward: accu() collapses everything to one scalar, so `output` is a 1x1
  // matrix (see "Output shape : 1 1" in the recorded output) rather than one
  // gradient value per input element.
  arma::mat output;
  output = arma::accu(arma::log(x) - arma::log(y) + 1);

  // Display
  cout << "------------------------------------------------------------------" << endl;
  cout << "USER-PROVIDED MATRICES : " << endl;
  cout << "------------------------------------------------------------------" << endl;
  cout << "Input shape : "<< x.n_rows << " " << x.n_cols << endl;
  cout << "Input : " << endl << x << endl;
  cout << "Target shape : "<< y.n_rows << " " << y.n_cols << endl;
  cout << "Target : " << endl << y << endl;
  cout << "------------------------------------------------------------------" << endl;
  cout << "SUM " << endl;
  cout << "------------------------------------------------------------------" << endl;
  cout << "FORWARD : " << endl;
  cout << "Loss (sum):\n" << loss_sum << '\n';
  cout << "BACKWARD : " << endl;
  cout << "Output shape : "<< output.n_rows << " " << output.n_cols << endl;
  cout << "Output (sum) : " << endl << output << endl;
  cout << "Sum of all values in this matrix : " << arma::as_scalar(arma::accu(output)) << endl;
  cout << "------------------------------------------------------------------" << endl;
  cout << "MEAN " << endl;
  cout << "------------------------------------------------------------------" << endl;
  cout << "FORWARD : " << endl;
  cout << "Loss (mean):\n" << loss_mean << '\n';
  cout << "BACKWARD : " << endl;
  // The shape printed here is still that of the sum-variant `output` above; the
  // mean variant is computed inline on the next line and never stored.
  cout << "Output shape : "<< output.n_rows << " " << output.n_cols << endl;
  cout << "Output (mean) : " << endl << arma::mean(arma::mean(arma::log(x) - arma::log(y) + 1)) << endl;
  cout << "Sum of all values in this matrix : " << arma::as_scalar(arma::accu(output / x.n_elem)) << endl;
  cout << "------------------------------------------------------------------" << endl;
  return 0;
}

In [None]:
%%script bash
# Compile test.cpp against Armadillo, then run the binary only if the build succeeds.
g++ -o test test.cpp -larmadillo \
  && ./test

------------------------------------------------------------------
USER-PROVIDED MATRICES : 
------------------------------------------------------------------
Input shape : 4 3
Input : 
  -0.7007  -0.2637  -2.4250
  -2.0247  -1.1795  -0.4556
  -0.7132  -0.1093  -0.7861
  -0.4584  -1.0530  -0.9120

Target shape : 4 3
Target : 
   0.0223   0.1689   0.2939
   0.5185   0.6977   0.8000
   0.1610   0.2823   0.6816
   0.9152   0.3971   0.8742

------------------------------------------------------------------
SUM 
------------------------------------------------------------------
FORWARD : 
Loss (sum):
nan
BACKWARD : 
Output shape : 1 1
Output (sum) : 
      nan

Sum of all values in this matrix : nan
------------------------------------------------------------------
MEAN 
------------------------------------------------------------------
FORWARD : 
Loss (mean):
nan
BACKWARD : 
Output shape : 1 1
Output (mean) : 
nan
Sum of all values in this matrix : nan
------------------------------------

#### NEW IMPLEMENTATION

The formula used in the Forward function, `loss = y * (log(y) - x)` with `x` holding log-probabilities, matches the PyTorch and TensorFlow implementations.

In [None]:
%%capture
%%writefile test.cpp  

#include <iostream>
#include <armadillo>

using namespace std;
using namespace arma;

// Reference KL-divergence computation that treats `x` as log-probabilities
// and `y` as the target, using the same values as the PyTorch cells. Prints
// the forward loss and the backward gradient for the sum, mean and batch-mean
// reductions so they can be compared with the PyTorch output.
int main()
{
  // Input (log-probabilities) and target matrices.
  arma::mat x,y;

  x << -0.7007 << -0.2637 << -2.4250 << endr
    << -2.0247 << -1.1795 << -0.4556 << endr
    << -0.7132 << -0.1093 << -0.7861 << endr
    << -0.4584 << -1.0530 << -0.9120 << endr;


  y << 0.0223 << 0.1689 << 0.2939 << endr
    << 0.5185 << 0.6977 << 0.8000 << endr
    << 0.1610 << 0.2823 << 0.6816 << endr
    << 0.9152 << 0.3971 << 0.8742 << endr;

  // Forward: element-wise loss y * (log(y) - x). x is used as-is because it
  // already holds logs.
  // NOTE(review): a target entry equal to 0 would evaluate 0 * log(0) here;
  // confirm how zero targets should be handled before porting this to mlpack.
  arma::mat loss_none = y % (arma::log(y) - x);
  double loss_sum = arma::accu(loss_none);
  double loss_mean = loss_sum / x.n_elem;        // sum divided by number of elements
  double loss_batch_mean = loss_sum / x.n_rows;  // sum divided by number of rows

  // Backward: the derivative of y * (log(y) - x) with respect to x is -y.
  // The mean / batch-mean gradients are this matrix scaled by the same divisor
  // as the corresponding loss.
  arma::mat output;
  output = -y ;

  // Display
  cout << "------------------------------------------------------------------" << endl;
  cout << "USER-PROVIDED MATRICES : " << endl;
  cout << "------------------------------------------------------------------" << endl;
  cout << "Input shape : "<< x.n_rows << " " << x.n_cols << endl;
  cout << "Input : " << endl << x << endl;
  cout << "Target shape : "<< y.n_rows << " " << y.n_cols << endl;
  cout << "Target : " << endl << y << endl;
  cout << "------------------------------------------------------------------" << endl;
  cout << "SUM " << endl;
  cout << "------------------------------------------------------------------" << endl;
  cout << "FORWARD : " << endl;
  cout << "Loss : \n" << loss_none << '\n';
  cout << "Loss (sum):\n" << loss_sum << '\n';
  cout << "BACKWARD : " << endl;
  cout << "Output shape : "<< output.n_rows << " " << output.n_cols << endl;
  cout << "Output (sum) : " << endl << output << endl;
  cout << "Sum of all values in this matrix : " << arma::as_scalar(arma::accu(output)) << endl;
  cout << "------------------------------------------------------------------" << endl;
  cout << "MEAN " << endl;
  cout << "------------------------------------------------------------------" << endl;
  cout << "FORWARD : " << endl;
  cout << "Loss (mean):\n" << loss_mean << '\n';
  cout << "BACKWARD : " << endl;
  cout << "Output shape : "<< output.n_rows << " " << output.n_cols << endl;
  cout << "Output (mean) : " << endl << output / x.n_elem << endl;
  cout << "Sum of all values in this matrix : " << arma::as_scalar(arma::accu(output / x.n_elem)) << endl;
  cout << "------------------------------------------------------------------" << endl;
  cout << "BATCH - MEAN " << endl;
  cout << "------------------------------------------------------------------" << endl;
  cout << "FORWARD : " << endl;
  // Label fixed: this section reports the batch-mean loss, not the mean loss.
  cout << "Loss (batchmean):\n" << loss_batch_mean << '\n';
  cout << "BACKWARD : " << endl;
  cout << "Output shape : "<< output.n_rows << " " << output.n_cols << endl;
  cout << "Output (batchmean) : " << endl << output / x.n_rows << endl;
  cout << "Sum of all values in this matrix : " << arma::as_scalar(arma::accu(output / x.n_rows)) << endl;
  cout << "------------------------------------------------------------------" << endl;
  return 0;
}

In [None]:
%%script bash
# Compile test.cpp against Armadillo, then run the binary only if the build succeeds.
g++ -o test test.cpp -larmadillo \
  && ./test

------------------------------------------------------------------
USER-PROVIDED MATRICES : 
------------------------------------------------------------------
Input shape : 4 3
Input : 
  -0.7007  -0.2637  -2.4250
  -2.0247  -1.1795  -0.4556
  -0.7132  -0.1093  -0.7861
  -0.4584  -1.0530  -0.9120

Target shape : 4 3
Target : 
   0.0223   0.1689   0.2939
   0.5185   0.6977   0.8000
   0.1610   0.2823   0.6816
   0.9152   0.3971   0.8742

------------------------------------------------------------------
SUM 
------------------------------------------------------------------
FORWARD : 
Loss : 
  -0.0692  -0.2558   0.3528
   0.7092   0.5718   0.1860
  -0.1792  -0.3262   0.2745
   0.3384   0.0514   0.6797

Loss (sum):
2.33349
BACKWARD : 
Output shape : 4 3
Output (sum) : 
  -0.0223  -0.1689  -0.2939
  -0.5185  -0.6977  -0.8000
  -0.1610  -0.2823  -0.6816
  -0.9152  -0.3971  -0.8742

Sum of all values in this matrix : -5.8127
----------------------------------------------------------------