In [34]:
import torch
from torch import nn, optim
import torch.nn.functional as F

In [35]:
X = torch.Tensor([[1, 2, 1, 1],
                  [2, 1, 3, 2],
                  [3, 1, 3, 4],
                  [4, 1, 5, 5],
                  [1, 7, 5, 5],
                  [1, 2, 5, 6],
                  [1, 6, 6, 6],
                  [1, 7, 7, 7]])
_Y = torch.LongTensor([2, 2, 2, 1, 1, 1, 0, 0])
Y = torch.zeros(8, 3, dtype=_Y.dtype).scatter_(dim=1, index=_Y.unsqueeze(dim=1), value=1)
print('one hot vector of Y:', *Y.tolist(), sep='\n')

one hot vector of Y:
[0, 0, 1]
[0, 0, 1]
[0, 0, 1]
[0, 1, 0]
[0, 1, 0]
[0, 1, 0]
[1, 0, 0]
[1, 0, 0]


$\text{Softmax}:$
The softmax function, also known as softargmax or normalized exponential function, is a generalization of the logistic function to multiple dimensions. It is used in multinomial logistic regression and is often used as the last activation function of a neural network to normalize the output of a network to a probability distribution over predicted output classes, based on Luce's choice axiom.
$$p_i = \sigma(\bold{z})_{i} = \frac{e^{z_i}}{\sum^K_{j=1}e^{z_j}} $$
$\text{Cross-Entropy}:$
$$ \text{Cost}(W) = -\frac{1}{n}\sum^n_{i=1}\sum^k_{j = 1} y^{(i)}_j \log(p^{(i)}_j) $$

In [36]:
W = torch.zeros((4, 3), requires_grad=True)
b = torch.zeros(1, requires_grad=True)
optimizer = optim.SGD([W, b], lr=1e-1)

n_epochs = 10000
for epoch in range(1, n_epochs + 1):
    hypothesis = F.softmax(X.matmul(W) + b, dim=1)
    loss = (Y * -torch.log(hypothesis)).sum(dim=1).mean()

    optimizer.zero_grad()

    loss.backward()
    optimizer.step()

    if epoch % 1000 == 0:
        print(f'epoch: {epoch:05d}/{n_epochs}\tloss: {loss.item():.6f}')

epoch: 01000/10000	loss: 0.385798
epoch: 02000/10000	loss: 0.246166
epoch: 03000/10000	loss: 0.192469
epoch: 04000/10000	loss: 0.157261
epoch: 05000/10000	loss: 0.132464
epoch: 06000/10000	loss: 0.114132
epoch: 07000/10000	loss: 0.100080
epoch: 08000/10000	loss: 0.088997
epoch: 09000/10000	loss: 0.080050
epoch: 10000/10000	loss: 0.072688


Using $\text{PyTorch}$ modules

In [37]:
model = nn.Linear(4, 3)
optimizer = optim.SGD(model.parameters(), lr=1e-1)

n_epochs = 10000
for epoch in range(1, n_epochs + 1):
    loss = F.cross_entropy(model(X), _Y)

    optimizer.zero_grad()

    loss.backward()
    optimizer.step()

    if epoch % 1000 == 0:
        print(f'epoch: {epoch:05d}/{n_epochs}\tloss: {loss.item():.6f}')

epoch: 01000/10000	loss: 0.241418
epoch: 02000/10000	loss: 0.156788
epoch: 03000/10000	loss: 0.115383
epoch: 04000/10000	loss: 0.090977
epoch: 05000/10000	loss: 0.074963
epoch: 06000/10000	loss: 0.063676
epoch: 07000/10000	loss: 0.055307
epoch: 08000/10000	loss: 0.048861
epoch: 09000/10000	loss: 0.043746
epoch: 10000/10000	loss: 0.039591


References:$\newline$
[소프트맥스 회귀(Softmax Regression) 이해하기](https://wikidocs.net/59427)$\newline$
[소프트맥스 회귀 구현하기](https://wikidocs.net/60575)$\newline$
[Softmax function](https://en.wikipedia.org/wiki/Softmax_function)