In [1]:
import torch

# Multi-class classification
Suppose we have a 5-way multi-class classification task.
Here are several equivalent ways to compute the same cross-entropy loss from the raw logits output by the network.

In [2]:
# Toy batch: 3 samples (rows) scored against 5 classes (columns).
logit = torch.tensor(
    [
        [0.0400, -0.0112, 0.0376, 0.0343, 0.6097],
        [-1.0835, -0.7568, 0.3562, -0.5957, 0.9419],
        [1.6908, 1.4120, 0.5687, -0.0944, -1.3327],
    ],
    dtype=torch.float32,
)
# Ground-truth class index of each sample.
targets = torch.tensor([0, 1, 4], dtype=torch.long)
# The same ground truth written as one row of 0/1 indicators per sample.
one_hot_vector = torch.tensor(
    [
        [1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 0, 0, 1],
    ],
    dtype=torch.long,
)

In [3]:
# Show the class-index targets followed by their one-hot encoding.
print(targets, one_hot_vector, sep="\n")

tensor([0, 1, 4])
tensor([[1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 0, 0, 1]])


## Version 1: CrossEntropyLoss

In [4]:
# Functional form of CrossEntropyLoss: consumes the raw logits and the
# class-index targets directly.
ce_loss = torch.nn.functional.cross_entropy(logit, targets)
print(ce_loss)

tensor(2.6774)


## Version 2: LogSoftmax + NLL

In [5]:
# Step 1: turn the logits into log-probabilities along the class dimension.
log_prob = torch.nn.functional.log_softmax(logit, dim=1)
# Step 2: negative log-likelihood of the target class of each sample.
logsoftmax_nll_loss = torch.nn.functional.nll_loss(log_prob, targets)
print(logsoftmax_nll_loss)

tensor(2.6774)


## Version 3: Softmax + Log + NLL
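Note that this is discouraged by PyTorch: taking the log of the Softmax output is numerically unstable, because a probability that underflows to 0 turns into -inf (and then NaN in the loss or its gradients). Use LogSoftmax instead of Softmax followed by Log whenever possible.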

In [6]:
# Probabilities first, then an explicit log, then the NLL of the targets.
prob = logit.softmax(dim=1)
log_prob = prob.log()
softmax_log_nll_loss = torch.nn.functional.nll_loss(log_prob, targets)
print(softmax_log_nll_loss)

tensor(2.6774)


## Version 4: LogSoftmax + my NLL with hot vector

In [7]:
log_prob = logit.log_softmax(dim=1)
# Multiplying by the one-hot mask keeps only the target-class log-probability
# in each row; summing the row and averaging over the batch gives the NLL.
logsoftmax_nll_one_hot_loss = -torch.mean(torch.sum(log_prob * one_hot_vector, dim=1))
print(logsoftmax_nll_one_hot_loss)

tensor(2.6774)


# Partial Feedback classification
Now suppose we are in the LECO setup.
In Time 0, we have a binary classification task.
In Time 1, we have a 5-way classification task: classes [0,1,2] belong to class 0 in Time 0, and classes [3,4] belong to class 1 in Time 0.

In [8]:
# Each row holds [time-0 label, time-1 label] for one sample.
# NOTE: this rebinds `targets`, which earlier cells used as a 1-D tensor.
targets = torch.tensor([[0, 0], [0, 1], [1, 4]], dtype=torch.long)
print("Logit is the same:", logit, sep="\n")
print("Each target now has ground truth for both time 0 and time 1:", targets, sep="\n")

Logit is the same:
tensor([[ 0.0400, -0.0112,  0.0376,  0.0343,  0.6097],
        [-1.0835, -0.7568,  0.3562, -0.5957,  0.9419],
        [ 1.6908,  1.4120,  0.5687, -0.0944, -1.3327]])
Each target now has ground truth for both time 0 and time 1:
tensor([[0, 0],
        [0, 1],
        [1, 4]])


## (Please skip the below cell) Helper functions I implemented to create one hot vector from targets

In [9]:
# leaf_idx_to_all_class_idx[leaf_idx] = [time-0 label, time-1 label]
# Built from the grouping of leaf classes under each time-0 superclass:
# superclass 0 covers leaves 0, 1, 2 and superclass 1 covers leaves 3, 4.
leaf_idx_to_all_class_idx = {
    leaf_idx: [superclass_idx, leaf_idx]
    for superclass_idx, leaf_indices in enumerate([(0, 1, 2), (3, 4)])
    for leaf_idx in leaf_indices
}

def get_superclass_to_subclass(leaf_idx_to_all_class_idx):
    """Build the superclass -> subclass lookup for every pair of time steps.

    Args:
        leaf_idx_to_all_class_idx: dict mapping each leaf index to the list of
            its class indices, one entry per time step. The number of time
            steps is taken from the first entry.

    Returns:
        Nested dict such that
        result[sub_class_time][super_class_time][super_class_idx] is the list
        of class indices at sub_class_time that fall under that superclass,
        in order of first appearance and without duplicates. Only
        super_class_time <= sub_class_time is populated.
    """
    all_labels = list(leaf_idx_to_all_class_idx.values())
    num_of_levels = len(all_labels[0])

    superclass_to_subclass = {}
    # Walk from the finest time step down to the coarsest one.
    for sub_time in reversed(range(num_of_levels)):
        per_super_time = {}
        superclass_to_subclass[sub_time] = per_super_time
        for super_time in range(sub_time + 1):
            members = {}
            per_super_time[super_time] = members
            for class_indices in all_labels:
                sub_idx = class_indices[sub_time]
                bucket = members.setdefault(class_indices[super_time], [])
                # Several leaves can share a subclass; record it only once.
                if sub_idx not in bucket:
                    bucket.append(sub_idx)
    return superclass_to_subclass

# Number of classes at each time step (index 0 -> time 0, index 1 -> time 1).
num_of_classes = [2, 5]

# Lookup derived from the hierarchy:
# superclass_to_subclass[sub_class_time][super_class_time][super_class_idx]
# lists the indices at sub_class_time that belong to the given superclass.
superclass_to_subclass = get_superclass_to_subclass(
    leaf_idx_to_all_class_idx=leaf_idx_to_all_class_idx)

def get_make_hot_vector_func(superclass_to_subclass,
                             num_of_classes,
                             tp_idx):
    """Create a function that turns (possibly coarse) labels into hot vectors.

    Args:
        superclass_to_subclass: nested dict where
            superclass_to_subclass[tp_idx][super_class_time][super_class_idx]
            is the list of class indices (at time tp_idx) covered by that
            superclass.
        num_of_classes: sequence giving the number of classes per time step.
        tp_idx: the time step whose classes form the columns of the hot vector.

    Returns:
        make_hot_vector(time_indices, labels, device='cuda'), described below.
    """
    width = num_of_classes[tp_idx]

    def make_hot_vector(time_indices, labels, device='cuda'):
        """Build one hot-vector row per sample.

        Args:
            time_indices: 1-D tensor; entry i is the time step at which
                sample i is labelled.
            labels: labels[time][i] is the label of sample i at that time step.
            device: device the result is moved to.

        Returns:
            Tensor of shape (len(time_indices), width) that holds 1 in every
            column covered by the sample's label and 0 elsewhere.
        """
        batch_size = time_indices.shape[0]
        hot_vector = torch.zeros((batch_size, width)).to(device)
        for row, time_idx in enumerate(time_indices):
            time_idx = int(time_idx)
            observed_label = int(labels[time_idx][row])
            # A coarse label switches on every class it covers; a label given
            # at time tp_idx itself switches on a single column.
            active_columns = superclass_to_subclass[tp_idx][time_idx][observed_label]
            hot_vector[row, active_columns] = 1
        return hot_vector

    return make_hot_vector

# Hot-vector builder whose columns are the classes of time step 1.
hot_vector_func = get_make_hot_vector_func(
    superclass_to_subclass=superclass_to_subclass,
    num_of_classes=num_of_classes,
    tp_idx=1,
)

def get_labels(targets):
    """Regroup per-sample targets into one list of labels per time step.

    Args:
        targets: indexable sequence of samples, each an indexable sequence
            with one label per time step. The number of time steps is taken
            from the first sample.

    Returns:
        List where entry t is the list of every sample's label at time step t.
    """
    num_of_levels = len(targets[0])
    return [[sample[level] for sample in targets] for level in range(num_of_levels)]
labels = get_labels(targets)

# Every sample is labelled at time 1 here, so each row has a single 1.
hot_vector = hot_vector_func(
    time_indices=torch.LongTensor([1, 1, 1]), labels=labels, device='cpu')
print("Hot vector for targets is:", hot_vector, sep="\n")

Hot vector for targets is:
tensor([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1.]])


## Version 5: LogSoftmax + my NLL with hot vector generated by helper

In [10]:
log_prob = torch.log_softmax(logit, dim=1)
# Same masked-sum NLL as before, now using the helper-generated hot vector.
logsoftmax_nll_one_hot_loss = (log_prob * hot_vector).sum(dim=1).mean().neg()
print(logsoftmax_nll_one_hot_loss)

tensor(2.6774)


# Partial Feedback Loss
Now suppose we are working with partial feedback with history samples. In the above example, let's now assume the 0th item is a history sample. Here is its hot vector:

In [11]:
# Sample 0 is labelled at time 0 only; samples 1 and 2 are labelled at time 1.
hot_vector = hot_vector_func(
    time_indices=torch.LongTensor([0, 1, 1]), labels=labels, device='cpu')
print("Hot vector for targets is:", hot_vector, sep="\n")

Hot vector for targets is:
tensor([[1., 1., 1., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1.]])


# Partial Feedback (My loss, not Peiyun's):
My partial feedback loss first calculates the log-probabilities (via log softmax), then sums the log-probabilities of all classes marked in the hot vector and negates the sum (negative log-likelihood).
My loss is an upper bound of Peiyun's loss and therefore should serve as a better loss surrogate.

In [12]:
log_prob = torch.nn.functional.log_softmax(logit, dim=1)
# Sum the log-probabilities of every class allowed by the hot vector,
# then average the negated sums over the batch.
my_loss = -torch.mean(torch.sum(log_prob * hot_vector, dim=1))
print(my_loss)

tensor(3.8567)


Equivalently, we could use Softmax followed by log instead of LogSoftmax, though this is numerically less stable and can produce -inf/NaN in practice.

In [13]:
# Explicit softmax followed by log instead of the fused log-softmax.
log_prob = logit.softmax(dim=1).log()
# Per-sample loss: negated sum of the log-probabilities selected by the mask.
loss = (log_prob * hot_vector).sum(dim=1).neg()
my_loss = loss.mean()
print(my_loss)

tensor(3.8567)


# Partial Feedback (Peiyun's loss):
Peiyun's loss first calculates the probabilities (via softmax), then sums the probabilities of the classes marked in the hot vector, and finally takes the log of that sum.

In [14]:
prob = torch.softmax(logit, dim=1)
# Total probability mass assigned to the classes allowed by the hot vector.
prob_mask = torch.sum(prob * hot_vector, dim=1)
# Per-sample loss is the negative log of that mass.
loss = torch.log(prob_mask).neg()
peiyun_loss = torch.mean(loss)
print(peiyun_loss)

tensor(2.3171)


Below is my alternative instantiation of Peiyun's loss. It computes the same loss as above, but shifts the logits (by their per-row max value) before sending them into Softmax, and is therefore more numerically stable:

In [15]:
# Subtract each row's largest logit so the biggest shifted value is 0.
# NOTE(review): softmax is shift-invariant, so the probabilities below are
# the same as those obtained from the unshifted logits.
max_logit = logit.max(dim=1, keepdim=True).values
shifted_logit = logit - max_logit
prob = shifted_logit.softmax(dim=1)
# Probability mass on the allowed classes, then its negative log per sample.
prob_mask = (prob * hot_vector).sum(dim=1)
loss = -prob_mask.log()
peiyun_loss = loss.mean()
print(peiyun_loss)

tensor(2.3171)
