# Pytorch로 딥러닝 제대로 배우기-중급
## Part7-4: RNN 실습
18개 언어로 구성된 단어 사전을 학습하여 주어진 단어가 어떤 언어인지 예측하는 모델 개발

### 목차
1. 데이터
2. 모델
3. 학습
4. 모델 변경

### (1) 데이터



#### 데이터 다운로드

본 데이터는 PyTorch 내부에서 다운로드가 불가능하기 때문에 외부에서 **다운**받아서 활용

1. 다운로드 경로: [HERE](https://download.pytorch.org/tutorial/data.zip)
2. 다운 받은 데이터를 압축 해제하여 Colab 디렉토리에 추가

In [7]:
# from google.colab import drive
# drive.mount('/content/drive')

#### 데이터 준비
1. 데이터를 파일로부터 읽어와서 Unicode string --> ASCII 변경
2. Category(label)와 line(data)를 학습 데이터로 구조화

In [8]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os

def findFiles(path):
    """Return the list of filesystem paths matching the glob pattern ``path``."""
    matches = glob.glob(path)
    return matches

print(findFiles('./dataset/names/*.txt'))

import unicodedata
import string

all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return ``s`` reduced to the characters listed in ``all_letters``.

    The string is NFD-normalised first, so an accented character is split
    into its base character followed by combining marks; the marks
    (Unicode category 'Mn') are dropped, as is every character that is not
    in ``all_letters``.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            # Combining mark left over from the decomposition.
            continue
        if ch in all_letters:
            kept.append(ch)
    return ''.join(kept)

print(unicodeToAscii('Ślusàrski'))

# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

# Read a file and split into lines
def readLines(filename):
    """Read ``filename`` as UTF-8 and return its lines converted to ASCII.

    Leading/trailing whitespace of the whole file content is stripped before
    it is split on newlines; each resulting line is passed through
    ``unicodeToAscii``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one converted string per line of the file.
    """
    # The context manager closes the handle as soon as the content is read;
    # previously the file object was left open until garbage collection.
    with open(filename, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

# Iterate in sorted order: glob returns matches in arbitrary order, and a
# category's label is its position in all_categories, so an unsorted listing
# could assign different label indices on different machines or runs.
for filename in sorted(findFiles('./dataset/names/*.txt')):
    # The file name without its extension is used as the category name.
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines      # To make dataset

n_categories = len(all_categories)

[]
Slusarski


#### Word to Vector Helper functions

1. 알파벳을 one-hot 형태로 벡터화
2. 단어를 텐서화

In [9]:
import torch

# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    """Return the position of ``letter`` in ``all_letters`` (-1 if absent)."""
    position = all_letters.find(letter)
    return position

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    """Return a <1 x n_letters> tensor holding a 1 at the letter's index."""
    one_hot = torch.zeros(1, n_letters)
    one_hot[0, letterToIndex(letter)] = 1
    return one_hot

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def lineToTensor(line):
    """Encode ``line`` as a <line_length x 1 x n_letters> one-hot tensor.

    Row ``k`` holds a single 1 at the index of the k-th character of
    ``line`` as given by ``letterToIndex``.
    """
    encoded = torch.zeros(len(line), 1, n_letters)
    for position, ch in enumerate(line):
        encoded[position, 0, letterToIndex(ch)] = 1
    return encoded

print(letterToTensor('J'))

print(lineToTensor('Jones').size())

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])
torch.Size([5, 1, 57])


In [10]:
print(all_categories)
print(category_lines)

[]
{}


In [11]:
from torch.utils.data import Dataset, DataLoader

class WordDataset(Dataset):
    """Map-style dataset of (one-hot encoded line, category index) pairs.

    Args:
        category_lines: Mapping from category name to a list of lines.
        categories: Ordered category names; a sample's label is the position
            of its category in this sequence. Defaults to the module-level
            ``all_categories``.
        letters: String of known characters; a character's one-hot column is
            its index in this string. Defaults to the module-level
            ``all_letters``.

    Raises:
        ValueError: If a category that has at least one non-empty line is
            missing from ``categories``.
    """

    def __init__(self, category_lines, *, categories=None, letters=None):
        # Fall back to the module-level tables so the original one-argument
        # call keeps working unchanged.
        self.categories = all_categories if categories is None else categories
        self.letters = all_letters if letters is None else letters
        self.x = []
        self.y = []
        for category, lines in category_lines.items():
            # An empty line has no time steps to encode (stacking zero
            # vectors used to raise), so such lines are skipped.
            names = [line for line in lines if line]
            if not names:
                continue
            # Look the label up once per category instead of once per line.
            label = self.categories.index(category)
            for name in names:
                self.x.append(self.lineToTensor(name))
                self.y.append(torch.tensor(label, dtype=torch.long))

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(encoded_line, label)`` pair stored at ``idx``."""
        return self.x[idx], self.y[idx]

    # Turn a line into a <line_length x n_letters> tensor,
    # i.e. one one-hot letter vector per character
    def lineToTensor(self, line):
        """Encode ``line`` as a <line_length x n_letters> float tensor.

        A character that is not in the letter table is left as an all-zero
        row rather than being marked as the last letter of the table.
        """
        encoded = torch.zeros(len(line), len(self.letters))
        for position, letter in enumerate(line):
            index = self.letterToIndex(letter)
            # str.find returns -1 for an unknown character; used as an index
            # that would wrongly select the last column.
            if index >= 0:
                encoded[position, index] = 1
        return encoded

    # Find letter index from the letter table, e.g. "a" = 0
    def letterToIndex(self, letter):
        """Return the index of ``letter`` in the letter table (-1 if absent)."""
        return self.letters.find(letter)

In [12]:
dataset = WordDataset(category_lines)
data_loader = DataLoader(dataset, batch_size=1, shuffle=True)

ValueError: num_samples should be a positive integer value, but got num_samples=0

### (2) 모델

In [None]:
import torch.nn as nn

# CPU 또는 GPU 설정
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

class Net(nn.Module):
    """Single-layer ``nn.RNN`` followed by a linear classifier.

    Args:
        input_size: Number of features per time step (the one-hot width).
        hidden_size: Size of the recurrent hidden state.
        output_size: Number of output classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(Net, self).__init__()
        self.hidden_size = hidden_size
        # Honour the constructor argument; the width used to be fixed at 57,
        # which silently ignored input_size.
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Run the sequence through the RNN and classify its last output.

        Args:
            input: Tensor whose first two axes are swapped before the RNN;
                with the default (sequence-first) ``nn.RNN`` layout this
                means it is given as (batch, seq_len, input_size).
            hidden: Initial hidden state passed to the RNN.

        Returns:
            Tuple ``(scores, h)``: the linear layer applied to the output of
            the last time step, and the final hidden state.
        """
        # nn.RNN is created without batch_first, so it consumes
        # (seq_len, batch, features).
        x = input.permute(1, 0, 2).contiguous()
        out, h = self.rnn(x, hidden)
        x = self.fc(out[-1])
        return x, h

    def initHidden(self):
        """Return an all-ones hidden state of shape (1, 1, hidden_size).

        The middle axis is fixed at 1, so this state only fits a batch of one.
        """
        return torch.ones(1, 1, self.hidden_size)

n_hidden = 128
rnn = Net(n_letters, n_hidden, n_categories).to(device)

Using cuda device


### (3) 학습

#### Helper functions

In [None]:
criterion = nn.CrossEntropyLoss()
optim = torch.optim.Adam(rnn.parameters(), lr=0.01)

def train(line_tensor, category_tensor):
    """Run one optimisation step of the module-level ``rnn``.

    Args:
        line_tensor: Input passed to ``rnn``.
        category_tensor: Target passed to ``criterion``.

    Returns:
        Tuple ``(output, loss)``: the model output and the loss as a float.
    """
    initial_hidden = rnn.initHidden().to(device)

    optim.zero_grad()
    scores, _ = rnn(line_tensor, initial_hidden)
    step_loss = criterion(scores, category_tensor)
    step_loss.backward()
    optim.step()

    return scores, step_loss.item()

In [None]:
import time
import math

epochs = 5
plot_every = 5000

# Keep track of losses for plotting
current_loss = 0
all_losses = []

def timeSince(since):
    """Format the time elapsed since ``since`` (a ``time.time()`` stamp) as 'Xm Ys'."""
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

start = time.time()

for i in range(epochs):
    step_counter = 0
    num_correct = 0
    # Start every epoch with an empty loss window: step_counter restarts at
    # 0 here, so losses left over from the previous epoch's partial window
    # would otherwise be folded into this epoch's first logged average.
    current_loss = 0

    for x, y in data_loader:
        x = x.to(device)
        y = y.to(device)
        output, loss = train(x, y)
        current_loss += loss

        # Index of the highest score along dim 1 is the predicted category.
        _, hat_y = output.max(1)
        # .item() keeps the running count a plain Python number instead of
        # an ever-updated tensor on the training device.
        num_correct += (hat_y == y).sum().item()
        step_counter += 1

        if step_counter % plot_every == 0:
            all_losses.append(current_loss / plot_every)
            current_loss = 0
            print("Epochs:{} / Steps: {} / Accuracy: {:.3f}".format(
                i, step_counter, num_correct / step_counter))

print("Time required: {}".format(timeSince(start)))

Epochs:0 / Steps: 5000 / Accuracy: 0.359
Epochs:0 / Steps: 10000 / Accuracy: 0.378
Epochs:0 / Steps: 15000 / Accuracy: 0.381
Epochs:0 / Steps: 20000 / Accuracy: 0.375
Epochs:1 / Steps: 5000 / Accuracy: 0.356
Epochs:1 / Steps: 10000 / Accuracy: 0.353
Epochs:1 / Steps: 15000 / Accuracy: 0.349
Epochs:1 / Steps: 20000 / Accuracy: 0.347
Epochs:2 / Steps: 5000 / Accuracy: 0.365
Epochs:2 / Steps: 10000 / Accuracy: 0.348
Epochs:2 / Steps: 15000 / Accuracy: 0.348
Epochs:2 / Steps: 20000 / Accuracy: 0.343


#### Plotting run

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

plt.figure()
plt.plot(all_losses)

### (4) LSTM

In [None]:
class LSTM_Net(nn.Module):
    """Single-layer ``nn.LSTM`` followed by a linear classifier.

    Args:
        input_size: Number of features per time step (the one-hot width).
        hidden_size: Size of the hidden and cell states.
        output_size: Number of output classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(LSTM_Net, self).__init__()
        self.hidden_size = hidden_size
        # Honour the constructor argument; the width used to be fixed at 57,
        # which silently ignored input_size.
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=self.hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, cell):
        """Run the sequence through the LSTM and classify its last output.

        Args:
            input: Tensor whose first two axes are swapped before the LSTM;
                with the default (sequence-first) ``nn.LSTM`` layout this
                means it is given as (batch, seq_len, input_size).
            hidden: Initial hidden state passed to the LSTM.
            cell: Initial cell state passed to the LSTM.

        Returns:
            Tuple ``(scores, h, c)``: the linear layer applied to the output
            of the last time step, and the final hidden and cell states.
        """
        # nn.LSTM is created without batch_first, so it consumes
        # (seq_len, batch, features).
        x = input.permute(1, 0, 2).contiguous()
        out, (h, c) = self.lstm(x, (hidden, cell))
        x = self.fc(out[-1])
        return x, h, c

    def initHidden(self):
        """Return an all-ones hidden state of shape (1, 1, hidden_size)."""
        return torch.ones(1, 1, self.hidden_size)

    def initCell(self):
        """Return an all-ones cell state of shape (1, 1, hidden_size)."""
        return torch.ones(1, 1, self.hidden_size)

n_hidden = 128
lstm = LSTM_Net(n_letters, n_hidden, n_categories).to(device)

In [None]:
criterion = nn.CrossEntropyLoss().to(device)
optim = torch.optim.Adam(lstm.parameters(), lr=0.01)

def train(line_tensor, category_tensor):
    """Run one optimisation step of the module-level ``lstm``.

    Args:
        line_tensor: Input passed to ``lstm``.
        category_tensor: Target passed to ``criterion``.

    Returns:
        Tuple ``(output, loss)``: the model output and the loss as a float.
    """
    initial_hidden = lstm.initHidden().to(device)
    initial_cell = lstm.initCell().to(device)

    optim.zero_grad()
    scores, _, _ = lstm(line_tensor, initial_hidden, initial_cell)
    step_loss = criterion(scores, category_tensor)
    step_loss.backward()
    optim.step()

    return scores, step_loss.item()

In [None]:
import time
import math

epochs = 5
plot_every = 5000

# Keep track of losses for plotting
current_loss = 0
all_losses = []

def timeSince(since):
    """Format the time elapsed since ``since`` (a ``time.time()`` stamp) as 'Xm Ys'."""
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

start = time.time()

for i in range(epochs):
    step_counter = 0
    num_correct = 0
    # Start every epoch with an empty loss window: step_counter restarts at
    # 0 here, so losses left over from the previous epoch's partial window
    # would otherwise be folded into this epoch's first logged average.
    current_loss = 0

    for x, y in data_loader:
        x = x.to(device)
        y = y.to(device)
        output, loss = train(x, y)
        current_loss += loss

        # Index of the highest score along dim 1 is the predicted category.
        _, hat_y = output.max(1)
        # .item() keeps the running count a plain Python number instead of
        # an ever-updated tensor on the training device.
        num_correct += (hat_y == y).sum().item()
        step_counter += 1

        if step_counter % plot_every == 0:
            all_losses.append(current_loss / plot_every)
            current_loss = 0
            print("Epochs:{} / Steps: {} / Accuracy: {:.3f}".format(
                i, step_counter, num_correct / step_counter))

print("Time required: {}".format(timeSince(start)))

In [None]:
plt.figure()
plt.plot(all_losses)

### (5) GRU

In [None]:
class GRU_Net(nn.Module):
    """Single-layer ``nn.GRU`` followed by a linear classifier."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Return ``(scores, h)`` for ``input`` and the initial state ``hidden``.

        The first two axes of ``input`` are swapped before the GRU runs;
        ``scores`` is the linear layer applied to the GRU output at the last
        time step and ``h`` is the final hidden state.
        """
        seq_first = input.permute(1, 0, 2).contiguous()
        outputs, final_hidden = self.gru(seq_first, hidden)
        last_step = outputs[-1]
        return self.fc(last_step), final_hidden

    def initHidden(self):
        """Return an all-ones hidden state of shape (1, 1, hidden_size)."""
        shape = (1, 1, self.hidden_size)
        return torch.ones(*shape)

n_hidden = 128
gru = GRU_Net(n_letters, n_hidden, n_categories).to(device)

In [None]:
criterion = nn.CrossEntropyLoss()
optim = torch.optim.Adam(gru.parameters(), lr=0.01)

def train(line_tensor, category_tensor):
    """Run one optimisation step of the module-level ``gru``.

    Args:
        line_tensor: Input passed to ``gru``.
        category_tensor: Target passed to ``criterion``.

    Returns:
        Tuple ``(output, loss)``: the model output and the loss as a float.
    """
    initial_hidden = gru.initHidden().to(device)

    optim.zero_grad()
    scores, _ = gru(line_tensor, initial_hidden)
    step_loss = criterion(scores, category_tensor)
    step_loss.backward()
    optim.step()

    return scores, step_loss.item()

In [None]:
import time
import math

epochs = 5
plot_every = 5000

# Keep track of losses for plotting
current_loss = 0
all_losses = []

def timeSince(since):
    """Format the time elapsed since ``since`` (a ``time.time()`` stamp) as 'Xm Ys'."""
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

start = time.time()

for i in range(epochs):
    step_counter = 0
    num_correct = 0
    # Start every epoch with an empty loss window: step_counter restarts at
    # 0 here, so losses left over from the previous epoch's partial window
    # would otherwise be folded into this epoch's first logged average.
    current_loss = 0

    for x, y in data_loader:
        x = x.to(device)
        y = y.to(device)
        output, loss = train(x, y)
        current_loss += loss

        # Index of the highest score along dim 1 is the predicted category.
        _, hat_y = output.max(1)
        # .item() keeps the running count a plain Python number instead of
        # an ever-updated tensor on the training device.
        num_correct += (hat_y == y).sum().item()
        step_counter += 1

        if step_counter % plot_every == 0:
            all_losses.append(current_loss / plot_every)
            current_loss = 0
            print("Epochs:{} / Steps: {} / Accuracy: {:.3f}".format(
                i, step_counter, num_correct / step_counter))

print("Time required: {}".format(timeSince(start)))

In [None]:
plt.figure()
plt.plot(all_losses)