# Seq2Seq with Attention for Korean-English Neural Machine Translation
- Network architecture based on this [paper](https://arxiv.org/abs/1409.0473)
- Fit to run on Google Colaboratory

In [0]:
import os
import io
import tarfile

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchtext

from torchtext.data import Dataset
from torchtext.data import Example
from torchtext.data import Field
from torchtext.data import BucketIterator

# 1. Upload Data to Colab Workspace

로컬에 존재하는 다음 3개의 데이터를 가상 머신에 업로드. 파일의 원본은 [여기](https://github.com/jungyeul/korean-parallel-corpora/tree/master/korean-english-news-v1/)에서도 확인

- korean-english-park.train.tar.gz
- korean-english-park.dev.tar.gz
- korean.english-park.test.tar.gz



In [2]:
# 현재 작업경로를 확인 & 'data' 폴더 생성
!echo 'Current working directory:' ${PWD}
!mkdir -p data/
!ls -al

Current working directory: /content
total 20
drwxr-xr-x 1 root root 4096 Aug  1 00:22 .
drwxr-xr-x 1 root root 4096 Aug  1 00:21 ..
drwxr-xr-x 1 root root 4096 Jul 19 16:14 .config
drwxr-xr-x 2 root root 4096 Aug  1 00:22 data
drwxr-xr-x 1 root root 4096 Jul 19 16:14 sample_data


In [4]:
# 로컬의 데이터 업로드
from google.colab import files
uploaded = files.upload()

Saving korean-english-park.dev.tar.gz to korean-english-park.dev.tar.gz
Saving korean-english-park.test.tar.gz to korean-english-park.test.tar.gz
Saving korean-english-park.train.tar.gz to korean-english-park.train.tar.gz


In [5]:
# 'data' 폴더 하위로 이동, 잘 옮겨졌는지 확인
!mv *.tar.gz data/
!ls -al data/

total 8864
drwxr-xr-x 2 root root    4096 Aug  1 00:25 .
drwxr-xr-x 1 root root    4096 Aug  1 00:25 ..
-rw-r--r-- 1 root root  113461 Aug  1 00:23 korean-english-park.dev.tar.gz
-rw-r--r-- 1 root root  229831 Aug  1 00:23 korean-english-park.test.tar.gz
-rw-r--r-- 1 root root 8718893 Aug  1 00:24 korean-english-park.train.tar.gz


# 2. Check Packages

## KoNLPy (설치 필요)

In [6]:
# Java 1.8 & KoNLPy 설치
!apt-get update
!apt-get install g++ openjdk-8-jdk python-dev python3-dev
!pip3 install JPype1-py3
!pip3 install konlpy

0% [Working]            Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (91.189.88.24)] [Waiting for headers] [Wai                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com (91.189.88.24)] [Waiting for headers] [2 I0% [Connecting to archive.ubuntu.com (91.189.88.24)] [Waiting for headers] [Wai                                                                               Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Connecting to archive.ubuntu.com (91.189.88.24)] [3 InRelease 2,586 B/88.7 0% [2 InRelease gpgv 3,626 B] [Waiting for headers] [3 InRelease 2,586 B/88.7 k                                                                               Get:4 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease 

In [7]:
from konlpy.tag import Okt
ko_tokens = Okt().pos('트위터 데이터로 학습한 형태소 분석기가 잘 실행이 되는지 확인해볼까요?')  # list of (word, POS TAG) tuples
ko_tokens = [t[0] for t in ko_tokens]  # Only get words
print(ko_tokens)

del ko_tokens  # 필요 없으니까 삭제

-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False.  The legacy value of True was assumed for
please file a ticket with the developer.
-------------------------------------------------------------------------------

  """)


['트위터', '데이터', '로', '학습', '한', '형태소', '분석', '기', '가', '잘', '실행', '이', '되는지', '확인', '해볼까', '요', '?']


## Spacy (이미 설치되어 있음)

In [8]:
# 설치가 되어있는지 확인
!pip show spacy

Name: spacy
Version: 2.1.6
Summary: Industrial-strength Natural Language Processing (NLP) with Python and Cython
Home-page: https://spacy.io
Author: Explosion AI
Author-email: contact@explosion.ai
License: MIT
Location: /usr/local/lib/python3.6/dist-packages
Requires: preshed, thinc, requests, plac, wasabi, blis, murmurhash, numpy, srsly, cymem
Required-by: fastai, en-core-web-sm


In [9]:
# 설치가 되어있는지 확인 (없다면 자동설치됨)
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [10]:
import spacy
spacy_en = spacy.load('en_core_web_sm')
en_tokens = [t.text for t in spacy_en.tokenizer('Check that spacy tokenizer works.')]
print(en_tokens)

del en_tokens  # 필요 없으니까 삭제

['Check', 'that', 'spacy', 'tokenizer', 'works', '.']


# 3. Define Tokenizing Functions
문장을 받아 그보다 작은 어절 혹은 형태소 단위의 리스트로 반환해주는 함수를 각 언어에 대해 작성
- Korean: konlpy.tag.Okt() <- Twitter()에서 명칭변경
- English: spacy.tokenizer

## Korean Tokenizer

In [0]:
#from konlpy.tag import Okt

class KoTokenizer(object):
    """For Korean."""
    def __init__(self):
        self.tokenizer = Okt()
        
    def tokenize(self, text):
        tokens = self.tokenizer.pos(text)
        tokens = [t[0] for t in tokens]
        return tokens

In [12]:
# Usage example
print(KoTokenizer().tokenize('전처리는 언제나 지겨워요.'))

['전', '처리', '는', '언제나', '지겨워요', '.']


## English Tokenizer

In [0]:
#import spacy

class EnTokenizer(object):
    """For English."""
    def __init__(self):
        self.spacy_en = spacy.load('en_core_web_sm')
        
    def tokenize(self, text):
        tokens = [t.text for t in self.spacy_en.tokenizer(text)]
        return tokens

In [14]:
# Usage example
print(EnTokenizer().tokenize("What I cannot create, I don't understand."))

['What', 'I', 'can', 'not', 'create', ',', 'I', 'do', "n't", 'understand', '.']


# 4. Data Preprocessing

## Load data

In [15]:
# Current working directory & list of files
!echo 'Current working directory:' ${PWD}
!ls -al

Current working directory: /content
total 20
drwxr-xr-x 1 root root 4096 Aug  1 00:25 .
drwxr-xr-x 1 root root 4096 Aug  1 00:21 ..
drwxr-xr-x 1 root root 4096 Jul 19 16:14 .config
drwxr-xr-x 2 root root 4096 Aug  1 00:25 data
drwxr-xr-x 1 root root 4096 Jul 19 16:14 sample_data


In [16]:
DATA_DIR = './data/'
print('Data directory exists:', os.path.isdir(DATA_DIR))
print('List of files:')
print(*os.listdir(DATA_DIR), sep='\n')

Data directory exists: True
List of files:
korean-english-park.test.tar.gz
korean-english-park.train.tar.gz
korean-english-park.dev.tar.gz


In [0]:
def get_data_from_tar_gz(filename):
    """
    Retrieve contents from a `tar.gz` file without extraction.
    Arguments:
        filename: path to `tar.gz` file.
    Returns:
        dict, (name, content) pairs
    """
    
    assert os.path.exists(filename)
    
    out = {}
    with tarfile.open(filename, 'r:gz') as tar:
        for member in tar.getmembers():
            lang = member.name.split('.')[-1]  # ex) korean-english-park.train.ko -> ko
            f = tar.extractfile(member)
            if f is not None:
                content = f.read().decode('utf-8')
                content = content.splitlines()
                out[lang] = content
    
    assert isinstance(out, dict)
    
    return out

In [0]:
# Each 'xxx_data' is a dictionary with keys; 'ko', 'en'
train_dict= get_data_from_tar_gz(os.path.join(DATA_DIR, 'korean-english-park.train.tar.gz'))  # train
dev_dict = get_data_from_tar_gz(os.path.join(DATA_DIR, 'korean-english-park.dev.tar.gz'))     # dev
test_dict = get_data_from_tar_gz(os.path.join(DATA_DIR, 'korean-english-park.test.tar.gz'))   # test

In [19]:
# Some samples (ko)
train_dict['ko'][100:105]

['제 23차 연례 컴덱스 박람회의 개회사를 한 케이츠는 2년여전 기술 산업의 거품이 붕괴된 이후에 첨단 기술에 대해 부정적인 인식이 있다고 말했다.',
 '1,960만갤론 이상의 중유를 운반하던 손상된 유조선이 스페인 북서부 연안 130마일 해역에서 침몰하여, 스페인 당국이 어업이 주요 산업인 이 지역에서 해안선 보호를 위해 황급히 서두르고 있다.',
 '싣고 있는 기름이 모두 유출된다면, 알라스카州 프린스 윌리암 사운드에서 1989년에 발생했던 엑손 발데즈 기름 유출 사건의 규모보다 두 배 이상이 되는 사상 최대 규모의 기름 유출 사건 중의 하나가 될 것이라고 세계 야생 생물 기금이 경고하고 있다.',
 '대도시를 완전히 파괴할 만큼 커다란 우주의 암석 덩어리들이 지구에 충돌하는 것은 이전에 생각했던 것보다 빈도가 훨씬 적은, 대략 1천년에 한 번 정도이다.',
 '핵폭발의 근거지를 찾아내는 美 국방성의 인공위성으로부터 얻은 수년간의 자료에 대한 전례 없는 조사에서 이러한 수정된 계산 결과가 나오게 되었다.']

In [20]:
# Some samples (en)
train_dict['en'][100:105]

['Gates, who opened the 23rd annual Comdex trade show, said there was a negative perception of high tech following the collapse of the tech bubble about two years ago.',
 "carrying more than 19.6 million gallons of heavy fuel oil sank 130 miles off Spain's northwest coast, leaving Spanish authorities scrambling to protect the coastline in the region where fishing is the primary industry.",
 "The World Wildlife Fund warns that if all of the oil leaked, it would be one of the largest oil spills ever more than twice the size of the 1989 Exxon Valdez spill in Alaska's Prince William Sound.",
 'Space boulders big enough to wipe out a major city slam into the Earth about once every 1,000 years, much less frequently than previously thought.',
 'The revised calculations come from an unprecedented examination of years of data that scour the ground for nuclear explosions.']

## Define Datasets

In [0]:
#from torchtext.data import Dataset
#from torchtext.data import Example

class KoEnTranslationDataset(Dataset):
    """A dataset for Korean-English Neural Machine Translation."""
    
    @staticmethod
    def sort_key(ex):
        return torchtext.data.interleave_keys(len(ex.src), len(ex.trg))
    
    def __init__(self, data_dict, field_dict, source_lang='ko', max_samples=None, **kwargs):
        """
        Only 'ko' and 'en' supported for `language`
        Arguments: 
            data_dict: dict of (`language`, text) pairs.
            field_dict: dict of (`language`, Field instance) pairs.
            source_lang: str, default 'ko'.
            Other kwargs are passed to the constructor of `torchtext.data.Dataset`.
        """
        
        if not all(k in ['ko', 'en'] for k in data_dict.keys()):
            raise KeyError("Check data keys.")
            
        if not all(k in ['ko', 'en'] for k in field_dict.keys()):
            raise KeyError("Check field keys.")
            
        if source_lang == 'ko':
            fields = [('src', field_dict['ko']), ('trg', field_dict['en'])]
            src_data = data_dict['ko']
            trg_data = data_dict['en']
        elif source_lang == 'en':
            fields = [('src', field_dict['en']), ('trg', field_dict['ko'])]
            src_data = data_dict['en']
            trg_data = data_dict['ko']
        else:
            raise NotImplementedError
        

        if not len(src_data) == len(trg_data):
            raise ValueError('Inconsistent number of instances between two languages.')
        
        examples = []
        for i, (src_line, trg_line) in enumerate(zip(src_data, trg_data)):
            src_line = src_line.strip()
            trg_line = trg_line.strip()
            if src_line != '' and trg_line != '':
                
                examples.append(
                    torchtext.data.Example.fromlist(
                        [src_line, trg_line], fields
                    )
                )
                
                i += 1
                if max_samples is not None:
                    if i >= max_samples:
                        break
                    
        super(KoEnTranslationDataset, self).__init__(examples, fields, **kwargs)
        

## Define Fields
- Instantiate tokenizers; one for each language.
- The 'tokenize' argument of `Field` requires a tokenizing function.

In [0]:
#from torchtext.data import Field

ko_tokenizer = KoTokenizer()  # korean tokenizer
en_tokenizer = EnTokenizer()  # english tokenizer

# Field instance for korean
KOREAN = Field(
    init_token='<sos>',
    eos_token='<eos>',
    tokenize=ko_tokenizer.tokenize,
    batch_first=True,
    lower=False
)

# Field instance for english
ENGLISH = Field(
    init_token='<sos>',
    eos_token='<eos>',
    tokenize=en_tokenizer.tokenize,
    batch_first=True,
    lower=True
)

# Store Field instances in a dictionary
field_dict = {
    'ko': KOREAN,
    'en': ENGLISH,
}

## Instantiate datasets
- one for each set (train, dev, test)

In [0]:
# 학습시간 단축을 위해 학습 데이터 줄이기
MAX_TRAIN_SAMPLES = 10000

In [24]:
# Instantiate with data
train_set = KoEnTranslationDataset(train_dict, field_dict, max_samples=MAX_TRAIN_SAMPLES)
print('Train set ready.')
print('#. examples:', len(train_set.examples)) 

dev_set = KoEnTranslationDataset(dev_dict, field_dict)
print('Dev set ready...')
print('#. examples:', len(dev_set.examples))

test_set = KoEnTranslationDataset(test_dict, field_dict)
print('Test set ready...')
print('#. examples:', len(test_set.examples))

Train set ready.
#. examples: 10000
Dev set ready...
#. examples: 1000
Test set ready...
#. examples: 2000


In [25]:
# Training example (KO, source language)
train_set.examples[50].src

['신랑',
 '들러리',
 '가',
 '포도주',
 '잔',
 '을',
 '들어',
 '올린다',
 '그리고는',
 '술',
 '주정',
 '을',
 '늘어놓는다',
 '.']

In [26]:
# Training example (EN, target language)
train_set.examples[50].trg

['the',
 'best',
 'man',
 'raises',
 'his',
 'wine',
 'glass',
 'and',
 'out',
 'comes',
 'a',
 'drunken',
 'diatribe',
 '.']

## Build Vocabulary
- 각 언어별 생성: `Field`의 인스턴스를 활용
- 최소 빈도수(`MIN_FREQ`) 값을 작게 하면 vocabulary의 크기가 커짐.
- 최소 빈도수(`MIN_FREQ`) 값을 크게 하면 vocabulary의 크기가 작아짐.


In [0]:
MIN_FREQ = 2  # TODO: try different values

In [177]:
# Build vocab for Korean
KOREAN.build_vocab(train_set, dev_set, test_set, min_freq=MIN_FREQ)  # ko
print('Size of source vocab (ko):', len(KOREAN.vocab))

Size of source vocab (ko): 14308


In [178]:
# Check indices of some important tokens
tokens = ['<unk>', '<pad>', '<sos>', '<eos>']
for token in tokens:
    print(f"{token} -> {KOREAN.vocab.stoi[token]}")

<unk> -> 0
<pad> -> 1
<sos> -> 2
<eos> -> 3


In [179]:
# Build vocab for English
ENGLISH.build_vocab(train_set, dev_set, test_set, min_freq=MIN_FREQ)  # en
print('Size of target vocab (en):', len(ENGLISH.vocab))

Size of target vocab (en): 12430


In [180]:
# Check indices of some important tokens
tokens = ['<unk>', '<pad>', '<sos>', '<eos>']
for token in tokens:
    print(f"{token} -> {KOREAN.vocab.stoi[token]}")

<unk> -> 0
<pad> -> 1
<sos> -> 2
<eos> -> 3


## Configure Device
- *'런타임'  -> '런타임 유형변경'* 에서 하드웨어 가속기로 **GPU** 선택

In [181]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device to use:', device)

Device to use: cuda


## Create Data Iterators
- 데이터를 미니배치(mini-batch) 단위로 반환해주는 역할
- `train_set`, `dev_set`, `test_set`에 대해 개별적으로 정의해야 함
- `BATCH_SIZE`를 정의해주어야 함
- `torchtext.data.BucketIterator`는 하나의 미니배치를 서로 비슷한 길이의 관측치들로 구성함
- [Bucketing](https://medium.com/@rashmi.margani/how-to-speed-up-the-training-of-the-sequence-model-using-bucketing-techniques-9e302b0fd976)의 효과: 하나의 미니배치 내 padding을 최소화하여 연산의 낭비를 줄여줌


In [0]:
BATCH_SIZE = 128

In [183]:
#from torchtext.data import BucketIterator

# Train iterator
train_iterator = BucketIterator(
    train_set,
    batch_size=BATCH_SIZE,
    train=True,
    shuffle=True,
    device=device
)

print(f'Number of minibatches per epoch: {len(train_iterator)}')

Number of minibatches per epoch: 79


In [184]:
#from torchtext.data import BucketIterator

# Dev iterator
dev_iterator = BucketIterator(
    dev_set,
    batch_size=100,
    train=False,
    shuffle=False,
    device=device
)

print(f'Number of minibatches per epoch: {len(dev_iterator)}')

Number of minibatches per epoch: 10


In [185]:
#from torchtext.data import BucketIterator

# Test iterator
test_iterator = BucketIterator(
    test_set,
    batch_size=200,
    train=False,
    shuffle=False,
    device=device
)

print(f'Number of minibatches per epoch: {len(test_iterator)}')

Number of minibatches per epoch: 10


In [186]:
train_batch = next(iter(train_iterator))
print('a batch of source examples has shape:', train_batch.src.size())  # (b, s)
print('a batch of target examples has shape:', train_batch.trg.size())  # (b, s)

a batch of source examples has shape: torch.Size([128, 71])
a batch of target examples has shape: torch.Size([128, 51])


In [187]:
# Checking first sample in mini-batch (KO, source lang)
ko_indices = train_batch.src[0]
ko_tokens = [KOREAN.vocab.itos[i] for i in ko_indices]
for t, i in zip(ko_tokens, ko_indices):
    print(f"{t} ({i})")
    
del ko_indices, ko_tokens

<sos> (2)
창기 (1486)
라이 (1211)
는 (12)
“ (33)
지금 (299)
필요한 (579)
것 (16)
은 (9)
새로운 (155)
선거 (278)
” (34)
라며 (465)
“ (33)
선거 (278)
를 (11)
다시 (272)
치러야 (13744)
폭력 (643)
사태 (432)
와 (35)
혼란 (1865)
을 (6)
막 (2294)
을 (6)
수 (36)
있다 (26)
” (34)
고 (32)
지적 (641)
했다 (15)
. (4)
<eos> (3)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)


In [188]:
# Checking first sample in mini-batch (EN, target lang)
en_indices = train_batch.trg[0]
en_tokens = [ENGLISH.vocab.itos[i] for i in en_indices]
for t, i in zip(en_tokens, en_indices):
    print(f"{t} ({i})")
    
del en_indices, en_tokens

<sos> (2)
" (12)
the (4)
reality (3089)
is (18)
that (14)
a (9)
new (53)
election (372)
, (5)
<unk> (0)
of (7)
violence (563)
and (10)
intimidation (3272)
, (5)
is (18)
the (4)
only (113)
way (180)
to (8)
put (392)
zimbabwe (737)
right (297)
. (6)
<eos> (3)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)
<pad> (1)


In [0]:
del train_batch  # 더 이상 필요 없으니까 삭제

# 5. Building Seq2Seq Model

## Hyperparameters

In [0]:
# Hyperparameters
INPUT_DIM = len(KOREAN.vocab)
OUTPUT_DIM = len(ENGLISH.vocab)
ENC_EMB_DIM = DEC_EMB_DIM = 100
ENC_HID_DIM = DEC_HID_DIM = 60
USE_BIDIRECTIONAL = False

## Encoder

In [0]:
class Encoder(nn.Module):
    """
    Learns an embedding for the source text.
        Arguments:
        input_dim: int, size of input language vocabulary.
        emb_dim: int, size of embedding layer output.
        enc_hid_dim: int, size of encoder hidden state.
        dec_hid_dim: int, size of decoder hidden state.
        bidirectional: uses bidirectional RNNs if True. default is False.
    """
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, bidirectional=False):
        
        super(Encoder, self).__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.bidirectional = bidirectional
        
        self.embedding = nn.Embedding(
            num_embeddings=self.input_dim,
            embedding_dim=self.emb_dim
        )
        
        self.rnn = nn.GRU(
            input_size=self.emb_dim,
            hidden_size=self.enc_hid_dim,
            bidirectional=self.bidirectional,
            batch_first=True
        )
        
        self.rnn_output_dim = self.enc_hid_dim
        if self.bidirectional:
            self.rnn_output_dim *= 2
        
        self.fc = nn.Linear(self.rnn_output_dim, self.dec_hid_dim)
        self.dropout = nn.Dropout(.2)
        
    def forward(self, src):
        """
        Arguments:
            src: 2d tensor of shape (batch_size, input_seq_len)
        Returns:
            outputs: 3d tensor of shape (batch_size, input_seq_len, num_directions * enc_h)
            hidden: 2d tensor of shape (b, dec_h). This tensor will be used as the initial
                hidden state value of the decoder (h0 of decoder).
        """
        
        assert len(src.size()) == 2, 'Input requires dimension (batch_size, seq_len).'
        
        # Shape: (b, s, h)
        embedded = self.embedding(src)
        embedded = self.dropout(embedded)
        
        outputs, hidden = self.rnn(embedded)
        
        if self.bidirectional:
            # (2, b, enc_h) -> (b, 2 * enc_h)
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # (1, b, enc_h) -> (b, enc_h)
            hidden = hidden.squeeze(0)
        
        # (b, num_directions * enc_h) -> (b, dec_h)
        hidden = self.fc(hidden)
        hidden = torch.tanh(hidden)
        
        return outputs, hidden
            

## Attention

In [0]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim, encoder_is_bidirectional=False):
        super(Attention, self).__init__()
        
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.encoder_is_bidirectional = encoder_is_bidirectional
        
        self.attention_input_dim = enc_hid_dim + dec_hid_dim
        if self.encoder_is_bidirectional:
            self.attention_input_dim += enc_hid_dim  # 2 * h_enc + h_dec
        
        self.linear = nn.Linear(self.attention_input_dim, dec_hid_dim)
        self.v = nn.Parameter(torch.rand(dec_hid_dim))
        
    def forward(self, hidden, encoder_outputs):
        """
        Arguments:
            hidden: 2d tensor with shape (batch_size, dec_hid_dim).
            encoder_outputs: 3d tensor with shape (batch_size, input_seq_len, enc_hid_dim).
                if encoder is bidirectional, expects (batch_size, input_seq_len, 2 * enc_hid_dim).
        """
        
        # Shape check
        assert hidden.dim() == 2
        assert encoder_outputs.dim() == 3
        
        batch_size, seq_len, _ = encoder_outputs.size()
        
        # (b, dec_h) -> (b, s, dec_h)
        hidden = hidden.unsqueeze(1).expand(-1, seq_len, -1)
        
        # concat; shape results in (b, s, enc_h + dec_h).
        # if encoder is bidirectional, (b, s, 2 * h_enc + h_dec).
        concat = torch.cat((hidden, encoder_outputs), dim=2)
        
        # concat; shape is (b, s, dec_h)
        concat = self.linear(concat)
        concat = torch.tanh(concat)
        
        # tile v; (dec_h, ) -> (b, dec_h, 1)
        v = self.v.repeat(batch_size, 1).unsqueeze(2)
        
        # attn; (b, s, dec_h) @ (b, dec_h, 1) -> (b, s, 1) -> (b, s)
        attn_scores = torch.bmm(concat, v).squeeze(-1)
        
        assert attn_scores.dim() == 2  # Final shape check: (b, s)
        
        return F.softmax(attn_scores, dim=1)
        

## Decoder

In [0]:
class Decoder(nn.Module):
    """
    Unlike the encoder, a single forward pass of
    a `Decoder` instance is defined for only a single timestep.
    Arguments:
        output_dim: int,
        emb_dim: int,
        enc_hid_dim: int,
        dec_hid_dim: int,
        attention_module: torch.nn.Module,
        encoder_is_bidirectional: False
    """
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, attention_module, encoder_is_bidirectional=False):
        super(Decoder, self).__init__()
        
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.encoder_is_bidirectional = encoder_is_bidirectional
        
        if isinstance(attention_module, nn.Module):
            self.attention_module = attention_module
        else:
            raise ValueError
        
        self.rnn_input_dim = enc_hid_dim + emb_dim  # enc_h + dec_emb_dim
        if self.encoder_is_bidirectional:
            self.rnn_input_dim += enc_hid_dim       # 2 * enc_h + dec_emb_dim
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.GRU(
            input_size=self.rnn_input_dim,
            hidden_size=dec_hid_dim,
            bidirectional=False,
            batch_first=True,
        )
        
        out_input_dim = 2 * dec_hid_dim + emb_dim  # hidden + dec_hidden_dim + dec_emb_dim
        self.out = nn.Linear(out_input_dim, output_dim)
        
        self.dropout = nn.Dropout(.2)
        
    def forward(self, inp, hidden, encoder_outputs):
        """
        Arguments:
            inp: 1d tensor with shape (batch_size, )
            hidden: 2d tensor with shape (batch_size, dec_hid_dim).
                This `hidden` tensor is the hidden state vector from the previous timestep.
            encoder_outputs: 3d tensor with shape (batch_size, seq_len, enc_hid_dim).
                If encoder_is_bidirectional is True, expects shape (batch_size, seq_len, 2 * enc_hid_dim).
        """
        
        assert inp.dim() == 1
        assert hidden.dim() == 2
        assert encoder_outputs.dim() == 3
        
        # (batch_size, ) -> (batch_size, 1)
        inp = inp.unsqueeze(1)
        
        # (batch_size, 1) -> (batch_size, 1, emb_dim)
        embedded = self.embedding(inp)
        embedded = self.dropout(embedded)
        
        # attention probabilities; (batch_size, seq_len)
        attn_probs = self.attention_module(hidden, encoder_outputs)
        
        # (batch_size, 1, seq_len)
        attn_probs = attn_probs.unsqueeze(1)
        
        # (b, 1, s) @ (b, s, enc_hid_dim) -> (b, 1, enc_hid_dim)
        weighted = torch.bmm(attn_probs, encoder_outputs)
        
        # (batch_size, 1, emb_dim + enc_hid_dim)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        
        # output; (batch_size, 1, dec_hid_dim)
        # new_hidden; (1, batch_size, dec_hid_dim)
        output, new_hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        
        embedded = embedded.squeeze(1)  # (b, 1, emb) -> (b, emb)
        output = output.squeeze(1)      # (b, 1, dec_h) -> (b, dec_h)
        weighted = weighted.squeeze(1)  # (b, 1, dec_h) -> (b, dec_h)
        
        # output; (batch_size, emb + 2 * dec_h) -> (batch_size, output_dim)
        output = self.out(torch.cat((output, weighted, embedded), dim=1))
        
        return output, new_hidden.squeeze(0)

## Seq2Seq

In [0]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio=.5):
        
        batch_size, max_seq_len = trg.size()
        trg_vocab_size = self.decoder.output_dim
        
        # An empty tesnor to store decoder outputs (time index first for indexing)
        outputs_shape = (max_seq_len, batch_size, trg_vocab_size)
        outputs = torch.zeros(outputs_shape).to(self.device)
        
        encoder_outputs, hidden = self.encoder(src)
        
        # first input to the decoder is '<sos>'
        # trg; shape (batch_size, seq_len)
        initial_dec_input = output = trg[:, 0]  # get first timestep token
        
        for t in range(1, max_seq_len):
            
            output, hidden = self.decoder(output, hidden, encoder_outputs)
            outputs[t] = output  # Save output for timestep t, for 1 <= t <= max_len
            
            top1_val, top1_idx = output.max(dim=1)
            teacher_force = torch.rand(1).item() >= teacher_forcing_ratio
            
            output = trg[:, t] if teacher_force else top1_idx
        
        # Switch batch and time dimensions for consistency (batch_first=True)
        outputs = outputs.permute(1, 0, 2)  # (s, b, trg_vocab) -> (b, s, trg_vocab)
            
        return outputs

## Build Model

In [195]:
# Define encoder
enc = Encoder(
    input_dim=INPUT_DIM,
    emb_dim=ENC_EMB_DIM,
    enc_hid_dim=ENC_HID_DIM,
    dec_hid_dim=DEC_HID_DIM,
    bidirectional=USE_BIDIRECTIONAL
)

print(enc)

Encoder(
  (embedding): Embedding(14308, 100)
  (rnn): GRU(100, 60, batch_first=True)
  (fc): Linear(in_features=60, out_features=60, bias=True)
  (dropout): Dropout(p=0.2)
)


In [196]:
# Define attention layer
attn = Attention(
    enc_hid_dim=ENC_HID_DIM,
    dec_hid_dim=DEC_HID_DIM,
    encoder_is_bidirectional=USE_BIDIRECTIONAL
)

print(attn)

Attention(
  (linear): Linear(in_features=120, out_features=60, bias=True)
)


In [197]:
# Define decoder
dec = Decoder(
    output_dim=OUTPUT_DIM,
    emb_dim=DEC_EMB_DIM,
    enc_hid_dim=ENC_HID_DIM,
    dec_hid_dim=DEC_HID_DIM,
    attention_module=attn,
    encoder_is_bidirectional=USE_BIDIRECTIONAL
)

print(dec)

Decoder(
  (attention_module): Attention(
    (linear): Linear(in_features=120, out_features=60, bias=True)
  )
  (embedding): Embedding(12430, 100)
  (rnn): GRU(160, 60, batch_first=True)
  (out): Linear(in_features=220, out_features=12430, bias=True)
  (dropout): Dropout(p=0.2)
)


In [198]:
model = Seq2Seq(enc, dec, device).to(device)
print(model)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(14308, 100)
    (rnn): GRU(100, 60, batch_first=True)
    (fc): Linear(in_features=60, out_features=60, bias=True)
    (dropout): Dropout(p=0.2)
  )
  (decoder): Decoder(
    (attention_module): Attention(
      (linear): Linear(in_features=120, out_features=60, bias=True)
    )
    (embedding): Embedding(12430, 100)
    (rnn): GRU(160, 60, batch_first=True)
    (out): Linear(in_features=220, out_features=12430, bias=True)
    (dropout): Dropout(p=0.2)
  )
)


In [199]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters.')

The model has 5,500,930 trainable parameters.


# 6. Train

## Optimizer
- Use `optim.Adam` or `optim.RMSprop`.

In [0]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
#optimizer = optim.RMSprop(model.parameters(), lr=0.01)

## Loss function

In [0]:
# Padding indices should not be considered when loss is calculated.
PAD_IDX = ENGLISH.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

## Train function

In [0]:
def train(seq2seq_model, iterator, optimizer, criterion, grad_clip=1.0):
    
    seq2seq_model.train()
    
    epoch_loss = .0
    
    for i, batch in enumerate(iterator):
        
        print('.', end='')
        
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        
        decoder_outputs = seq2seq_model(src, trg, teacher_forcing_ratio=.5)
        seq_len, batch_size, trg_vocab_size = decoder_outputs.size()  # (b, s, trg_vocab)
        
        # (b-1, s, trg_vocab)
        decoder_outputs = decoder_outputs[:, 1:, :]                 
        
        # ((b-1) * s, trg_vocab)
        decoder_outputs = decoder_outputs.contiguous().view(-1, trg_vocab_size)
        
        # ((b-1) * s, )
        trg = trg[:, 1:].contiguous().view(-1)                                       
        
        loss = criterion(decoder_outputs, trg)
        loss.backward()
        
        # Gradient clipping; remedy for exploding gradients
        torch.nn.utils.clip_grad_norm_(seq2seq_model.parameters(), grad_clip)
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

## Evaluate function

In [0]:
def evaluate(seq2seq_model, iterator, criterion):
    
    seq2seq_model.eval()
    
    epoch_loss = 0.
    
    with torch.no_grad():
        
        for i, batch in enumerate(iterator):
            
            print('.', end='')
            
            src = batch.src
            trg = batch.trg
            
            decoder_outputs = seq2seq_model(src, trg, teacher_forcing_ratio=0.)
            seq_len, batch_size, trg_vocab_size = decoder_outputs.size()  # (b, s, trg_vocab)
        
            # (b-1, s, trg_vocab)
            decoder_outputs = decoder_outputs[:, 1:, :]                 
        
            # ((b-1) * s, trg_vocab)
            decoder_outputs = decoder_outputs.contiguous().view(-1, trg_vocab_size)
        
            # ((b-1) * s, )
            trg = trg[:, 1:].contiguous().view(-1)
            
            loss = criterion(decoder_outputs, trg)
            
            epoch_loss += loss.item()
            
    return epoch_loss / len(iterator)

## Epoch time measure function

In [0]:
def epoch_time(start_time, end_time):
    """Returns elapsed time in mins & secs."""
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

## Train for multiple epochs

In [0]:
NUM_EPOCHS = 50

In [207]:
import time
import math

best_dev_loss = float('inf')

for epoch in range(NUM_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion)
    dev_loss = evaluate(model, dev_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if dev_loss < best_dev_loss:
        best_dev_loss = dev_loss
        torch.save(model.state_dict(), './best_model.pt')
    
    print("\n")
    print(f"Epoch: {epoch + 1:>02d} | Time: {epoch_mins}m {epoch_secs}s")
    print(f"Train Loss: {train_loss:>.4f} | Train Perplexity: {math.exp(train_loss):7.3f}")
    print(f"Dev Loss: {dev_loss:>.4f} | Dev Perplexity: {math.exp(dev_loss):7.3f}")
    

.........................................................................................

Epoch: 01 | Time: 1m 19s
Train Loss: 7.2537 | Train Perplexity: 1413.394
Dev Loss: 6.5596 | Dev Perplexity: 705.983
.........................................................................................

Epoch: 02 | Time: 1m 18s
Train Loss: 6.5319 | Train Perplexity: 686.695
Dev Loss: 6.4354 | Dev Perplexity: 623.532
.........................................................................................

Epoch: 03 | Time: 1m 18s
Train Loss: 6.4319 | Train Perplexity: 621.383
Dev Loss: 6.3587 | Dev Perplexity: 577.470
.........................................................................................

Epoch: 04 | Time: 1m 18s
Train Loss: 6.3550 | Train Perplexity: 575.378
Dev Loss: 6.2845 | Dev Perplexity: 536.183
.........................................................................................

Epoch: 05 | Time: 1m 19s
Train Loss: 6.2784 | Train Perplexity: 532.939
Dev Loss: 6.

## Save last model (overfitted)

In [0]:
torch.save(model.state_dict(), './last_model.pt')

# 7. Test

## Function to convert indices to original text strings

In [0]:
def indices_to_text(src_or_trg, lang_field):
    
    assert src_or_trg.dim() == 1, f'{src_or_trg.dim()}'  #(seq_len, )
    assert isinstance(lang_field, torchtext.data.Field)
    assert hasattr(lang_field, 'vocab')
    
    return [lang_field.vocab.itos[t] for t in src_or_trg]

## Function to make predictions
- Returns a list of examples, where each example is a (src, trg, prediction) tuple.

In [0]:
def predict(seq2seq_model, iterator):
    
    seq2seq_model.eval()
    
    out = []
    
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            
            src = batch.src
            trg = batch.trg
            
            decoder_outputs = seq2seq_model(src, trg, teacher_forcing_ratio=0.)
            seq_len, batch_size, trg_vocab_size = decoder_outputs.size()  # (b, s, trg_vocab)
            
            # Discard initial decoder input (index = 0)
            #decoder_outputs = decoder_outputs[:, 1:, :]
            
            decoder_predictions = decoder_outputs.argmax(dim=-1)  # (b, s)
            
            for i, pred in enumerate(decoder_predictions):
                out.append((src[i], trg[i], pred))
            
    return out

## Load best model

In [224]:
!ls -al

total 43004
drwxr-xr-x 1 root root     4096 Aug  1 04:40 .
drwxr-xr-x 1 root root     4096 Aug  1 00:21 ..
-rw-r--r-- 1 root root 22006424 Aug  1 04:10 best_model.pt
drwxr-xr-x 1 root root     4096 Jul 19 16:14 .config
drwxr-xr-x 2 root root     4096 Aug  1 00:25 data
-rw-r--r-- 1 root root 22006424 Aug  1 04:40 last_model.pt
drwxr-xr-x 1 root root     4096 Jul 19 16:14 sample_data


In [225]:
# Load model
model.load_state_dict(torch.load('./best_model.pt'))

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

## Make predictions

In [0]:
# Make prediction
test_predictions = predict(model, dev_iterator)

In [227]:
for i, prediction in enumerate(test_predictions):
    
    src, trg, pred = prediction
    
    src_text = indices_to_text(src, lang_field=KOREAN)
    trg_text = indices_to_text(trg, lang_field=ENGLISH)
    pred_text = indices_to_text(pred, lang_field=ENGLISH)
    
    print('source:\n', src_text)
    print('target:\n', trg_text)
    print('prediction:\n', pred_text)
    print('-' * 160)
    
    if i > 5:
        break

source:
 ['<sos>', '오랫동안', '이탈리아', '전역', '이', '곤혹', '스러웠다', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
target:
 ['<sos>', 'naples', ',', 'italy', '(', 'cnn', ')', 'for', 'years', ',', 'it', "'s", 'been', 'a', 'national', 'embarrassment', '.', '<eos>', '<pad>']
prediction:
 ['<unk>', 'the', 'was', 'the', '.', 'cnn', ')', '.', 'the', '.', '<eos>', '.', '.', '.', '.', '.', '.', '<eos>', '<eos>']
----------------------------------------------------------------------------------------------------------------------------------------------------------------
source:
 ['<sos>', 'bank', '-', '<unk>', 'company', '은행', '지주회사', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
target:
 ['<sos>', '“', 'gm', "'s", 'financing', 'arm', ',', 'gmac', ',', 'has', 'been', 'declared', 'a', 'bank', '-', 'holding', 'company', '.', '<eos>']
prediction:
 ['<unk>', 'the', 'the', 'to', '<unk>', 'to', '<unk>', '<unk>', '<unk>', '<un

# 8. Download Model

In [228]:
!ls -al

total 43004
drwxr-xr-x 1 root root     4096 Aug  1 04:40 .
drwxr-xr-x 1 root root     4096 Aug  1 00:21 ..
-rw-r--r-- 1 root root 22006424 Aug  1 04:10 best_model.pt
drwxr-xr-x 1 root root     4096 Jul 19 16:14 .config
drwxr-xr-x 2 root root     4096 Aug  1 00:25 data
-rw-r--r-- 1 root root 22006424 Aug  1 04:40 last_model.pt
drwxr-xr-x 1 root root     4096 Jul 19 16:14 sample_data


In [230]:
from google.colab import files
print('Downloading models...')  # Known bug; if using Firefox, a print statement in the same cell is necessary.
files.download('./best_model.pt')
files.download('./last_model.pt')

Downloading models...


# 9. Discussions