In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dataset
from torch.autograd import Variable
from torch.nn import Parameter
from torch import Tensor
import torch.nn.functional as F
from torch.utils.data import DataLoader
import math

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
cuda = True if torch.cuda.is_available() else False
    
Tensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor    

torch.manual_seed(125)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(125)

In [2]:
mnist_transform = transforms.Compose([
    transforms.ToTensor(), 
    transforms.Normalize((0.5,), (1.0,))
])

In [3]:
from torchvision.datasets import MNIST

download_root = '../data/MNIST_DATASET/'

train_dataset = MNIST(download_root, transform=mnist_transform, train=True, download=False)
valid_dataset = MNIST(download_root, transform=mnist_transform, train=False, download=False)
test_dataset = MNIST(download_root, transform=mnist_transform, train=False, download=False)

In [4]:
batch_size = 64
train_loader = DataLoader(dataset=train_dataset, 
                         batch_size=batch_size,
                         shuffle=True)
valid_loader = DataLoader(dataset=test_dataset, 
                         batch_size=batch_size,
                         shuffle=True)
test_loader = DataLoader(dataset=test_dataset, 
                         batch_size=batch_size,
                         shuffle=True)

In [5]:
batch_size = 100
n_iters = 6000
num_epochs = n_iters / (len(train_dataset) / batch_size)
num_epochs = int(num_epochs)

In [6]:
inputs, labels = next(iter(train_loader))

In [7]:
labels.size()

torch.Size([64])

In [8]:
inputs.size()

torch.Size([64, 1, 28, 28])

In [9]:
inputs =inputs.view(-1,28,28)

In [10]:
seq = inputs[:,1,:]

In [11]:
seq.size()

torch.Size([64, 28])

In [15]:
class LSTMCell(nn.Module):
    def __init__(self,input_size,hidden_size,bias=True):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        self.x2h = nn.Linear(input_size, 4 *hidden_size, bias=bias)
        self.h2h = nn.Linear(hidden_size, 4 * hidden_size, bias=bias)
        self.reset_parameters()

    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        for w in self.parameters():
            w.data.uniform_(-std,std)
    
    def forward(self,x,hidden): # (64,28)의 sequence 하나를 입력받는다
        hx, cx = hidden
        x = x.view(-1,x.size(1)) # (64,28)로 이쁘게 핀다

        gates = self.x2h(x) + self.h2h(hx) #(64,4 * 128)
        gates = gates.squeeze()
        ingate, forgetgate,cellgate,outgate = gates.chunk(4,1) #(64,28)

        ingate = F.sigmoid(ingate)
        forgetgate = F.sigmoid(forgetgate)
        cellgate = F.sigmoid(cellgate)
        outgate = F.sigmoid(outgate)

        cy = torch.mul(cx, forgetgate) + torch.mul(ingate,cellgate)
        hy = torch.mul(outgate, F.tanh(cy))
        return (hy,cy)

MNIST는 손으로 쓴 숫자 이미지 데이터입니다. 하나의 이미지는 가로 28개, 세로 28개, 총 784개의 값으로 이루어져 있습니다.

Many-to-One model는 여러 시퀀스를 넣었을 때 나오는 최종 결과물만을 이용하는 모델입니다. 이를 이용하여 784개의 input으로 1개의 output값(A) 을 도출합니다. 이 A를 하나의 층에 통과시켜 10개의 숫자 label중 하나를 할당합니다.

784개의 입력값을 사이즈가 28인 벡터가 28번 이어지는 시퀀스(time step)로 보고, input의 크기를 28, 시퀀스 길이를 28로 각각 설정합니다. 28개의 input은 C라고 표현되어 있는 LSTM 셀로 순차적으로 들어가게 됩니다.

output의 크기는 셀의 크기와 같으며, 64로 설정하였습니다. 셀크기가 너무 작으면 많은 정보를 담지 못하기 때문에 적당히 큰 값으로 설정합니다. 전체 output은 64개의 값을 가지고 있는 벡터 28개의 집합이 되고, 마지막 벡터만 사용합니다.

1층의 fully connected layer를 이용하여 64차원 벡터를 10차원으로 줄이고 softmax를 이용하여 0부터 9까지 중 하나의 값을 예측합니다.

https://pozalabs.github.io/lstm/

In [16]:

class LSTMModel(nn.Module):
    def __init__(self,input_dim,hidden_dim,layer_dim,output_dim, bias=True):
        super().__init__()
        self.hidden_dim = hidden_dim

        self.layer_dim = layer_dim
        self.lstm = LSTMCell(input_dim,hidden_dim,layer_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self,x): # x: torch.Size([64, 28, 28]) (batch,seq_dim,input_dim)
        # 
        h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).cuda()) # (1,64,128)
        c0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).cuda()) # (1,64,128)

        outs = []
        cn = c0[0,:,:]
        hn = h0[0,:,:]

        for seq in range(x.size(1)): # range(28)
            hn,cn = self.lstm(x[:,seq,:],(hn,cn)) # (64,1,28)로 sequence를 하나씩 전달
            outs.append(hn)
        
        out = outs[-1].squeeze()
        out = self.fc(out)
        return out

In [17]:
input_dim = 28
hidden_dim = 128
layer_dim = 1  
output_dim = 10
 
model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
if torch.cuda.is_available():
    model.cuda()
criterion = nn.CrossEntropyLoss()
learning_rate = 0.1 
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [18]:
seq_dim = 28
loss_list = []
iter = 0
for epoch in range(num_epochs):
    for i, (images,labels) in enumerate(train_loader):
        images = Variable(images.view(-1,seq_dim,input_dim).cuda()) #torch.Size([64, 1, 28, 28]) -> torch.Size([64, 28, 28])으로 변경
        labels = Variable(labels.cuda())

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)

        loss.cuda()

        loss.backward()
        optimizer.step()
        loss_list.append(loss.item())
        iter +=1

        if iter % 500 == 0:
            correct = 0
            total = 0

            for images, labels in valid_loader:
                images = Variable(images.view(-1,seq_dim,input_dim).cuda())
                outputs = model(images)
                _,predicted = torch.max(outputs.data,1)

                total += labels.size(0)
                correct += (predicted.cpu() == labels.cpu()).sum()
            
            accuracy = 100 * correct / total
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.item(), accuracy))



Iteration: 500. Loss: 2.301367998123169. Accuracy: 10.09000015258789
Iteration: 1000. Loss: 2.300041675567627. Accuracy: 11.350000381469727
Iteration: 1500. Loss: 2.3201262950897217. Accuracy: 10.229999542236328
Iteration: 2000. Loss: 2.2970528602600098. Accuracy: 10.279999732971191
Iteration: 2500. Loss: 2.2516183853149414. Accuracy: 11.760000228881836
Iteration: 3000. Loss: 2.2531661987304688. Accuracy: 16.610000610351562
Iteration: 3500. Loss: 2.166072368621826. Accuracy: 20.280000686645508
Iteration: 4000. Loss: 1.7078289985656738. Accuracy: 36.529998779296875
Iteration: 4500. Loss: 1.221008539199829. Accuracy: 59.56999969482422
Iteration: 5000. Loss: 0.8756988644599915. Accuracy: 72.26000213623047
Iteration: 5500. Loss: 0.687919557094574. Accuracy: 74.58999633789062
Iteration: 6000. Loss: 0.6276847720146179. Accuracy: 75.47000122070312
Iteration: 6500. Loss: 0.49487772583961487. Accuracy: 84.80000305175781
Iteration: 7000. Loss: 0.24680334329605103. Accuracy: 86.86000061035156
Ite

In [19]:
def evaluate(model, val_iter):
    corrects, total, total_loss = 0,0,0
    model.eval()

    for images, labels in val_iter:
        images = Variable(images.view(-1,seq_dim,input_dim).cuda())
        
        labels = Variable(labels.cuda())
        
        logit = model(images).to(device)
        loss = F.cross_entropy(logit,labels,reduction='sum')
        
        _, pred = torch.max(logit.data, 1)
        total += labels.size(0)
        total_loss += loss.item()
        
        corrects += (pred == labels).sum()
    
    avg_loss = total_loss / len(val_iter)
    avg_accuracy = corrects / total
    return avg_loss, avg_accuracy


In [20]:
test_loss, test_acc = evaluate(model, test_loader)
print("Test Loss: %5.2f | Test Accuracy: %5.2f" % (test_loss, test_acc))

Test Loss: 21.03 | Test Accuracy:  0.90
