In [None]:
!pip install torchtext==0.6.0

Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from torchtext==0.6.0)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.15.2
    Uninstalling torchtext-0.15.2:
      Successfully uninstalled torchtext-0.15.2
Successfully installed sentencepiece-0.1.99 torchtext-0.6.0


In [None]:
import torch
import torchvision
import torchtext

# Report the versions of the libraries this notebook is running against.
print(f'torch version: {torch.__version__}')
print(f'torchvision version: {torchvision.__version__}')
print(f'torchtext version: {torchtext.__version__}')
# CUDA version reported by torch.version.cuda.
print("cuda version: {}".format(torch.version.cuda))

torch version: 2.0.1+cu118
torchvision version: 0.15.2+cu118
torchtext version: 0.6.0
cuda version: 11.8


In [None]:
import random
SEED = 1234  # single seed value shared by every RNG seeded below

random.seed(SEED)  # Python's built-in RNG
torch.manual_seed(SEED)  # PyTorch RNG
torch.backends.cudnn.deterministic = True  # ask cuDNN for deterministic kernels

In [None]:
%%capture
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

In [None]:
import spacy

spacy_en = spacy.load("en_core_web_sm") # English pipeline, used for tokenization
spacy_de = spacy.load("de_core_news_sm") # German pipeline, used for tokenization

In [None]:
# Quick try-out of the tokenizer on a sample English sentence

tokenized = spacy_en.tokenizer("I am a graduate student.")

# Print every token with its position (the printed label is Korean for "index").
for i, token in enumerate(tokenized):
    print(f"인덱스 {i}: {token.text}")

인덱스 0: I
인덱스 1: am
인덱스 2: a
인덱스 3: graduate
인덱스 4: student
인덱스 5: .


In [None]:
#@title 영어 및 독일어 토큰화 함수 정의


In [None]:
def tokenize_de(text):
    """Split German ``text`` into a list of token strings with the spaCy German tokenizer.

    Tokens are returned in their original order (the source sentence is not reversed).
    """
    tokens = []
    for tok in spacy_de.tokenizer(text):
        tokens.append(tok.text)
    return tokens

def tokenize_en(text):
    """Split English ``text`` into a list of token strings with the spaCy English tokenizer."""
    tokens = []
    for tok in spacy_en.tokenizer(text):
        tokens.append(tok.text)
    return tokens


In [None]:
# Special tokens shared by the source and target fields.
PAD_WORD = '<blank>' # padding token
UNK_WORD = '<unk>' # unknown token
BOS_WORD = '<s>' # start token
EOS_WORD = '</s>' # end token

# Source (German) field: lower-cased, wrapped in <s> ... </s>, padded with <blank>.
# UNK_WORD is not passed here, so the Field's own default unknown token is used.
# batch_first is not set; the batch printed further down has shape [33, 128],
# i.e. batches come out as (seq_len, batch).
SRC = torchtext.data.Field(
    tokenize = tokenize_de, lower=True,
    pad_token=PAD_WORD, init_token=BOS_WORD, eos_token=EOS_WORD)

# Target (English) field, configured the same way as SRC.
TRG = torchtext.data.Field(
    tokenize = tokenize_en, lower=True,
    pad_token=PAD_WORD, init_token=BOS_WORD, eos_token=EOS_WORD)

In [None]:
from torchtext.datasets import Multi30k
# Load the Multi30k German->English splits; SRC processes the '.de' side and TRG the '.en' side.
# No root is given, so the dataset's default download location is used.
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'),
                                                    fields = (SRC, TRG))

# Report how many sentence pairs each split contains.
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")


downloading training.tar.gz


KeyboardInterrupt: ignored

In [None]:
from torchtext.datasets import Multi30k
import os

# Cache the dataset under the current user's home directory instead of a
# machine-specific absolute path ('C:/Users/82109/.data/'), so this cell works
# on any OS and account. For that original Windows user it is the same folder.
DATA_ROOT = os.path.join(os.path.expanduser('~'), '.data')

# Load the Multi30k German->English splits; SRC processes '.de', TRG processes '.en'.
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(SRC, TRG),
    root=DATA_ROOT)

# Report how many sentence pairs each split contains.
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

downloading training.tar.gz


KeyboardInterrupt: ignored

In [None]:
# Show the 'src' and 'trg' entries of one training example (index 30).
print(vars(train_data.examples[30])['src'])
print(vars(train_data.examples[30])['trg'])

NameError: ignored

In [None]:
# Build both vocabularies from the training split only, passing min_freq=2.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

# Vocabulary sizes before the two vocabularies are merged in a later cell.
print(f"len(SRC): {len(SRC.vocab)}")
print(f"len(TRG): {len(TRG.vocab)}")
# TRG.vocab.stoi

len(SRC): 7853
len(TRG): 5893


In [None]:
# Look up a few token -> index mappings in the target vocabulary.
print(TRG.vocab.stoi["abcabc"])    # word not in the vocabulary: 0
print(TRG.vocab.stoi[TRG.pad_token])  # padding: 1
print(TRG.vocab.stoi["<s>"])        #  <sos>: 2
print(TRG.vocab.stoi["</s>"])       # <eos>: 3
print(TRG.vocab.stoi["water"])
print(TRG.vocab.stoi["world"])

0
1
2
3
47
1752


In [None]:
# BucketIterator 생성

from torchtext.data import Field, BucketIterator

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 128

# One BucketIterator per split; batches are created directly on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)

# NOTE(review): each next(iter(...)) call starts a fresh iteration, so the tensor
# and the shape printed below may come from two different batches.
print(next(iter(train_iterator)).src)
print(next(iter(train_iterator)).src.shape)

tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [   8,   54,    5,  ...,    8,    5,    5],
        [  16, 1551,  717,  ...,   36,   13,   13],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]], device='cuda:0')
torch.Size([33, 128])


In [None]:
# Merge the source vocabulary into the target vocabulary so both fields share
# one token <-> index mapping.
# Step 1: give every source-only word a new index at the end of TRG's stoi.
for w, _ in SRC.vocab.stoi.items():
    if w not in TRG.vocab.stoi:
        TRG.vocab.stoi[w] = len(TRG.vocab.stoi)
# Step 2: rebuild itos (index -> word) from the extended stoi.
TRG.vocab.itos = [None] * len(TRG.vocab.stoi)
for w, i in TRG.vocab.stoi.items():
    TRG.vocab.itos[i] = w
# Step 3: point SRC at the very same stoi / itos objects.
SRC.vocab.stoi = TRG.vocab.stoi
SRC.vocab.itos = TRG.vocab.itos
print('[Info] merged vocabulary size:', len(TRG.vocab))

[Info] merged vocabulary size: 13403


In [None]:
# Bundle the two fields (with their vocabularies) and the example lists of
# each split into a single dictionary.
data = {
    'vocab': {'src': SRC, 'trg': TRG},
    'train': train_data.examples,
    'valid': valid_data.examples,
    'test': test_data.examples}

In [None]:
#@title 1. Positional Encoding

In [None]:
import torch.nn as nn
# Stand-alone demonstration of an embedding table.
# NOTE(review): 13203 is hard-coded and differs from the merged vocabulary size
# printed above (13403) — confirm which value is intended.
vacab_len = 13203

# One 512-dimensional vector per vocabulary index; index 1 is the padding index.
embedding_layer = nn.Embedding(num_embeddings = vacab_len,
                               embedding_dim=512,
                               padding_idx=1)

print(f'TABLE SIZE:  {embedding_layer.weight.shape}')
# In the output above, row 1 (the padding row) is all zeros.
print(embedding_layer.weight)

TABLE SIZE:  torch.Size([13203, 512])
Parameter containing:
tensor([[-3.3850e-01, -1.5431e-01, -2.3130e-01,  ..., -2.8964e-01,
          1.7315e+00, -5.5616e-01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-1.4756e+00,  8.4137e-01,  6.9516e-01,  ..., -3.8743e-01,
         -4.7118e-01,  1.2454e+00],
        ...,
        [ 1.0026e-01, -2.9119e+00,  1.5184e-01,  ..., -1.3560e+00,
         -4.3024e-01,  6.4329e-01],
        [-8.8551e-01,  8.8354e-01,  3.6063e-02,  ...,  2.0745e+00,
         -2.3074e-01,  9.0141e-05],
        [ 8.8262e-01,  3.8370e-01, -4.7277e-01,  ..., -1.9764e+00,
          5.2690e-01,  1.3923e+00]], requires_grad=True)


In [None]:
import torch.nn as nn
import numpy as np

class Positional_Encoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch of embeddings.

    The table covers ``max_length`` positions and is stored as a buffer
    (``pos_table``), so it is part of the module state but not a trainable
    parameter.
    """

    def __init__(self, d_hid, max_length=200):
        super().__init__()
        table = self._get_sinusoid_encoding_table(max_length, d_hid)
        self.register_buffer('pos_table', table)

    def _get_sinusoid_encoding_table(self, max_length, d_hid):
        """Return a float tensor of shape ``(1, max_length, d_hid)``."""
        # One divisor per hidden dimension; dimensions 2k and 2k+1 share a frequency.
        divisors = [np.power(10000, 2 * (dim // 2) / d_hid) for dim in range(d_hid)]

        angles = np.array(
            [[position / divisor for divisor in divisors] for position in range(max_length)])
        angles[:, 0::2] = np.sin(angles[:, 0::2])     # even dimensions use sine
        angles[:, 1::2] = np.cos(angles[:, 1::2])     # odd dimensions use cosine

        return torch.FloatTensor(angles).unsqueeze(0)

    def forward(self, x):
        """Add the encodings of the first ``x.size(1)`` positions to ``x``."""
        seq_len = x.size(1)
        encoding = self.pos_table[:, :seq_len].clone().detach()
        return x + encoding


# 위치 인코딩을 입력 텐서 x에 추가

In [None]:
#@title 2. Scaled dot-product


In [None]:
import torch.nn.functional as F

class ScaledDotProduct_Attention(nn.Module):
    """Scaled dot-product attention over 4-D per-head tensors.

    ``q`` is divided by ``temperature`` before the dot product with ``k``.
    Positions where ``mask == 0`` receive a score of -1e9 before the softmax.
    Dropout is applied to the attention weights.
    """

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)

    def forward(self, q, k, v, mask=None):
        """Return ``(output, attn)``: the weighted values and the attention weights."""
        scores = torch.matmul(q / self.temperature, k.transpose(2, 3))

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        weights = F.softmax(scores, dim=-1)
        weights = self.dropout(weights)

        return torch.matmul(weights, v), weights


In [None]:
#@title 3. Multihead attention

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention followed by dropout, a residual connection and layer norm.

    Inputs ``q``, ``k``, ``v`` are indexed as (batch, length, d_model); the
    residual is taken from ``q``, so the output has the shape of ``q``.
    """

    def __init__(self, n_heads, d_model, d_k, d_v, device, dropout=0.1):
        super().__init__()

        assert d_model % n_heads == 0

        self.n_heads, self.d_model = n_heads, d_model
        self.d_k, self.d_v = d_k, d_v

        # Bias-free input projections; each one produces all heads at once.
        self.w_q = nn.Linear(d_model, n_heads * d_k, bias=False)
        self.w_k = nn.Linear(d_model, n_heads * d_k, bias=False)
        self.w_v = nn.Linear(d_model, n_heads * d_v, bias=False)

        # Output projection from the concatenated heads back to d_model.
        self.fc = nn.Linear(n_heads * d_v, d_model, bias=False)

        self.attention = ScaledDotProduct_Attention(temperature=d_k ** 0.5).to(device)

        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

    def forward(self, q, k, v, mask=None):
        """Return ``(output, attn)`` where ``attn`` holds the per-head attention weights."""
        n_heads = self.n_heads
        batch = q.size(0)
        len_q, len_k, len_v = q.size(1), k.size(1), v.size(1)

        residual = q

        def split_heads(projected, length, d_head):
            # (batch, length, n_heads * d_head) -> (batch, n_heads, length, d_head)
            return projected.view(batch, length, n_heads, d_head).transpose(1, 2)

        q_heads = split_heads(self.w_q(q), len_q, self.d_k)
        k_heads = split_heads(self.w_k(k), len_k, self.d_k)
        v_heads = split_heads(self.w_v(v), len_v, self.d_v)

        # Insert a head axis so the same mask broadcasts over every head.
        head_mask = mask.unsqueeze(1) if mask is not None else None

        context, attn = self.attention(q_heads, k_heads, v_heads, mask=head_mask)

        # (batch, n_heads, len_q, d_v) -> (batch, len_q, n_heads * d_v)
        merged = context.transpose(1, 2).contiguous().view(batch, len_q, -1)
        out = self.dropout(self.fc(merged))
        out += residual

        return self.layer_norm(out), attn


In [None]:
#@title 4.  Position wise Feed Forward

In [None]:
class Positionwise_FeedForward(nn.Module):
    """Two linear layers with a ReLU in between, plus a residual connection and layer norm.

    Dropout is applied to the hidden activation. The input and output both
    have ``d_in`` features.
    """

    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_in, d_hid)   # expand to the hidden width
        self.w_2 = nn.Linear(d_hid, d_in)   # project back to the input width
        self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``layer_norm(w_2(dropout(relu(w_1(x)))) + x)``."""
        hidden = F.relu(self.w_1(x))
        out = self.w_2(self.dropout(hidden))
        out += x  # residual connection; modifies the fresh w_2 output, not the caller's x
        return self.layer_norm(out)


In [None]:
#@title 5. Encoder Layer
# 인코더의 셀프 어텐션: Query = Key = Value


In [None]:
# 인코더 레이어는 입력과 출력 차원이 같음 -> 트랜스포머의 인코더는 인코더 레이어를 여러 번 중첩해서 사용 가능

class EncoderLayer(nn.Module):
    """One encoder block: multi-head self-attention, then a position-wise feed-forward net.

    Residual connections and layer norms live inside the two sub-modules.
    """

    def __init__(self, d_model, d_inner, n_heads, d_k, d_v, device, dropout=0.1):
        super().__init__()

        self.self_attn = MultiHeadAttention(n_heads, d_model, d_k, d_v, device, dropout=dropout)
        self.pos_ffn = Positionwise_FeedForward(d_model, d_inner, dropout=dropout)
        # Registered but never used in forward(); it still contributes parameters
        # and state_dict keys, so it is kept as-is.
        self.ff_layer_norm = nn.LayerNorm(d_model)

    def forward(self, enc_input, slf_attn_mask=None):
        """Return ``(enc_output, enc_slf_attn)`` for one layer."""
        # Self-attention: the same tensor serves as query, key and value.
        attended, enc_slf_attn = self.self_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask)

        return self.pos_ffn(attended), enc_slf_attn


In [None]:
# src_mask는 pad_idx에 대하여 mask값을 0으로 처리하는 함수로, 필요 없는 패딩 인덱스에 대해 attention 연산을 하지 않음

def get_pad_mask(seq, pad_idx):
    """Return a boolean mask that is False wherever ``seq`` equals ``pad_idx``.

    A singleton axis is inserted before the last dimension, so a (batch, len)
    input yields a (batch, 1, len) mask.
    """
    is_token = seq.ne(pad_idx)
    return is_token.unsqueeze(-2)


In [None]:
#@title 6.  Encoder Architecture
# Encoder의 구조 = token embedding + positional encoding -> a stack of N EncoderBlock -> layer norm

In [None]:
class Encoder(nn.Module):
    """Transformer encoder.

    Pipeline: token embedding (optionally scaled by sqrt(d_model)) ->
    positional encoding -> dropout -> layer norm -> ``n_layers`` encoder layers.

    Args:
        src_vocab: number of rows in the source embedding table.
        d_word_vec: embedding width.
        n_layers, n_heads, d_k, d_v, d_model, d_inner: forwarded to each EncoderLayer.
        pad_idx: padding index of the embedding table.
        device: device the encoder output is moved to in ``forward``.
        dropout: dropout probability used here and in every layer.
        max_length: number of positions in the positional-encoding table.
        scale_emb: when True, multiply the embeddings by ``d_model ** 0.5``.
    """

    def __init__(
            self, src_vocab, d_word_vec, n_layers, n_heads, d_k, d_v,
            d_model, d_inner, pad_idx, device, dropout=0.1, max_length=200, scale_emb=False):

        super().__init__()
        self.device = device

        # Embedding table: one d_word_vec-dimensional vector per vocabulary index.
        self.src_word_emb = nn.Embedding(src_vocab, d_word_vec, padding_idx=pad_idx)

        self.position_enc = Positional_Encoding(d_word_vec, max_length)
        self.dropout = nn.Dropout(p=dropout)

        # Stack of identical encoder layers.
        self.layer_stack = nn.ModuleList([
            EncoderLayer(d_model, d_inner, n_heads, d_k, d_v, device, dropout=dropout)
            for _ in range(n_layers)])

        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.scale_emb = scale_emb
        self.d_model = d_model

    def forward(self, src_seq, src_mask, return_attns=True):
        """Encode ``src_seq`` (indexed as (batch, seq_len)).

        Returns ``(enc_output, enc_slf_attn_list)`` when ``return_attns`` is
        True, otherwise just ``enc_output``.
        """
        enc_slf_attn_list = []

        # Token embedding.
        enc_output = self.src_word_emb(src_seq)
        if self.scale_emb:
            enc_output *= self.d_model ** 0.5

        # Positional encoding, then dropout.
        enc_output = self.dropout(self.position_enc(enc_output))

        # Layer norm. Use the device given to the constructor rather than a
        # notebook-level global named `device`, which may not exist (or may
        # differ) when the class is used outside this notebook.
        enc_output = self.layer_norm(enc_output).to(self.device)

        # Stack of encoder layers.
        for enc_layer in self.layer_stack:
            enc_output, enc_slf_attn = enc_layer(enc_output, slf_attn_mask=src_mask)
            if return_attns:
                enc_slf_attn_list.append(enc_slf_attn)

        if return_attns:
            return enc_output, enc_slf_attn_list
        return enc_output


In [None]:
#@title Decoder Layer


In [None]:
# 인코더 layer와 마찬가지로 입력과 출력의 차원이 같음 -> 트랜스포머의 디코더 또한 디코더 레이어를 여러 번 중첩해서 사용 가능
# 두 개의 multihead attetion이 사용

def get_subsequent_mask(seq):
    """Return a (1, len, len) boolean lower-triangular mask for a 2-D ``seq``.

    Entry (i, j) is True when j <= i, i.e. position i may only look at itself
    and earlier positions. The mask is created on ``seq``'s device.
    """
    _, len_s = seq.size()
    lower_triangle = torch.tril(torch.ones((1, len_s, len_s), device=seq.device))
    return lower_triangle.bool()

In [None]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, encoder-decoder attention, then a feed-forward net.

    Residual connections and layer norms live inside the three sub-modules.
    """

    def __init__(self, d_model, d_inner, n_heads, d_k, d_v, device, dropout=0.1):
        super().__init__()

        # Attention of the decoder input over itself.
        self.self_attn = MultiHeadAttention(n_heads, d_model, d_k, d_v, device, dropout=dropout)
        # Attention of the decoder (query) over the encoder output (key, value).
        self.enc_attn = MultiHeadAttention(n_heads, d_model, d_k, d_v, device, dropout=dropout)
        self.pos_ffn = Positionwise_FeedForward(d_model, d_inner, dropout=dropout)

    def forward(
            self, dec_input, enc_output,
            slf_attn_mask=None, dec_enc_attn_mask=None):
        """Return ``(dec_output, dec_slf_attn, dec_enc_attn)`` for one layer."""
        # Self-attention over the decoder input.
        self_attended, dec_slf_attn = self.self_attn(
            dec_input, dec_input, dec_input, mask=slf_attn_mask)

        # The self-attention result queries the encoder output.
        cross_attended, dec_enc_attn = self.enc_attn(
            self_attended, enc_output, enc_output, mask=dec_enc_attn_mask)

        return self.pos_ffn(cross_attended), dec_slf_attn, dec_enc_attn

In [None]:
#@title Decoder Architecture

In [None]:
from thinc.layers.dropout import Dropout
class Decoder(nn.Module):
    """Transformer decoder.

    Pipeline: target embedding (optionally scaled by sqrt(d_model)) ->
    positional encoding -> dropout -> layer norm -> ``n_layers`` decoder layers.

    Args:
        trg_vocab: number of rows in the target embedding table.
        d_word_vec: embedding width.
        n_layers, n_heads, d_k, d_v, d_model, d_inner: forwarded to each DecoderLayer.
        pad_idx: padding index of the embedding table.
        device: device the decoder input is moved to in ``forward``.
        dropout: dropout probability used here and in every layer.
        max_length: number of positions in the positional-encoding table.
        scale_emb: when True, multiply the embeddings by ``d_model ** 0.5``.
    """

    def __init__(
            self, trg_vocab, d_word_vec, n_layers, n_heads, d_k, d_v,
            d_model, d_inner, pad_idx,device, dropout=0.1, max_length=200, scale_emb=False):

        super().__init__()
        self.device = device

        # Target word embedding.
        self.trg_word_emb = nn.Embedding(trg_vocab, d_word_vec, padding_idx=pad_idx)

        # Positional encoding.
        self.position_enc = Positional_Encoding(d_word_vec, max_length=max_length)
        self.dropout = nn.Dropout(p=dropout)

        # Stack of identical decoder layers.
        self.layer_stack = nn.ModuleList([
            DecoderLayer(d_model, d_inner, n_heads, d_k, d_v, device, dropout=dropout)
            for _ in range(n_layers)])

        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.scale_emb = scale_emb
        self.d_model = d_model

    def forward(self, trg_seq, trg_mask, enc_output, src_mask, return_attns=True):
        """Decode ``trg_seq`` (indexed as (batch, seq_len)) against ``enc_output``.

        Returns ``(dec_output, dec_slf_attn_list, dec_enc_attn_list)`` when
        ``return_attns`` is True, otherwise just ``dec_output``.
        """
        dec_slf_attn_list, dec_enc_attn_list = [], []

        # Target word embedding.
        dec_output = self.trg_word_emb(trg_seq)
        if self.scale_emb:
            dec_output *= self.d_model ** 0.5

        # Positional encoding, dropout, layer norm. Use the device given to the
        # constructor rather than a notebook-level global named `device`.
        dec_output = self.dropout(self.position_enc(dec_output))
        dec_output = self.layer_norm(dec_output).to(self.device)

        # Stack of decoder layers.
        for dec_layer in self.layer_stack:
            dec_output, dec_slf_attn, dec_enc_attn = dec_layer(
                dec_output, enc_output, slf_attn_mask=trg_mask, dec_enc_attn_mask=src_mask)

            if return_attns:
                dec_slf_attn_list.append(dec_slf_attn)
                dec_enc_attn_list.append(dec_enc_attn)

        if return_attns:
            return dec_output, dec_slf_attn_list, dec_enc_attn_list
        return dec_output

In [None]:
#@title Transformer Model

In [None]:
class Transformer(nn.Module):
    """Sequence-to-sequence Transformer (encoder, decoder and output projection).

    Args:
        src_vocab, trg_vocab: source / target vocabulary sizes.
        src_pad_idx, trg_pad_idx: padding indices used for embeddings and masks.
        device: forwarded to the encoder and decoder.
        d_word_vec, d_model: embedding and model width (must be equal).
        d_inner: hidden width of the position-wise feed-forward nets.
        n_layers, n_heads, d_k, d_v: layer count and attention geometry.
        max_length: number of positions in the positional-encoding tables.
        trg_emb_prj_weight_sharing: share the weight matrix between the target
            embedding and the output projection.
        emb_src_trg_weight_sharing: share the weight matrix between the source
            and target embeddings (both vocabularies must then be the same size).
        scale_emb_or_prj: 'emb', 'prj' or 'none'; only takes effect when
            ``trg_emb_prj_weight_sharing`` is True. 'emb' multiplies embeddings
            by d_model ** 0.5, 'prj' multiplies logits by d_model ** -0.5.
        dropout: dropout probability for the encoder and decoder. Previously
            this was read from a notebook global named ``dropout``; it is now
            an explicit, backward-compatible keyword with the same value (0.1).
    """

    def __init__(
            self, src_vocab, trg_vocab, src_pad_idx, trg_pad_idx, device,
            d_word_vec=512, d_model=512, d_inner=2048,
            n_layers=6, n_heads=8, d_k=64, d_v=64, max_length=200,
            trg_emb_prj_weight_sharing=True, emb_src_trg_weight_sharing=True,
            scale_emb_or_prj='prj', dropout=0.1):

        super().__init__()
        self.device = device
        self.dropout = dropout

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx

        assert scale_emb_or_prj in ['emb', 'prj', 'none']
        scale_emb = (scale_emb_or_prj == 'emb') if trg_emb_prj_weight_sharing else False
        self.scale_prj = (scale_emb_or_prj == 'prj') if trg_emb_prj_weight_sharing else False
        self.d_model = d_model

        # Encoder.
        self.encoder = Encoder(
            src_vocab=src_vocab, max_length=max_length,
            d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
            n_layers=n_layers, n_heads=n_heads, d_k=d_k, d_v=d_v,
            pad_idx=src_pad_idx, dropout=dropout, scale_emb=scale_emb, device=device)

        # Decoder.
        self.decoder = Decoder(
            trg_vocab=trg_vocab, max_length=max_length,
            d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
            n_layers=n_layers, n_heads=n_heads, d_k=d_k, d_v=d_v,
            pad_idx=trg_pad_idx, dropout=dropout, scale_emb=scale_emb, device=device)

        # Final projection from model width to target-vocabulary logits.
        self.trg_word_prj = nn.Linear(d_model, trg_vocab, bias=False)

        # Xavier-initialise every parameter with more than one dimension.
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

        assert d_model == d_word_vec

        if trg_emb_prj_weight_sharing:
            # Output projection and target embedding use the same matrix.
            self.trg_word_prj.weight = self.decoder.trg_word_emb.weight

        if emb_src_trg_weight_sharing:
            # Source and target embeddings use the same matrix.
            self.encoder.src_word_emb.weight = self.decoder.trg_word_emb.weight


    def forward(self, src_seq, trg_seq):
        """Return logits of shape ``(batch * trg_len, trg_vocab)``.

        ``src_seq`` and ``trg_seq`` are indexed as (batch, seq_len).
        """
        # 1) Masks: padding mask for the source; padding AND causal mask for the target.
        src_mask = get_pad_mask(src_seq, self.src_pad_idx)
        trg_mask = get_pad_mask(trg_seq, self.trg_pad_idx) & get_subsequent_mask(trg_seq)

        # 2) Encoder (attention maps are discarded).
        enc_output, *_ = self.encoder(src_seq, src_mask)

        # 3) Decoder (attention maps are discarded).
        dec_output, *_ = self.decoder(trg_seq, trg_mask, enc_output, src_mask)

        # 4) Project to vocabulary logits.
        seq_logit = self.trg_word_prj(dec_output)
        if self.scale_prj:
            seq_logit *= self.d_model ** -0.5

        return seq_logit.view(-1, seq_logit.size(2))

In [None]:
#@title 모델 학습

In [None]:
class Constants:
    # Special-token strings; the values match PAD_WORD / UNK_WORD / BOS_WORD /
    # EOS_WORD defined at module level earlier in this notebook.
    PAD_WORD = '<blank>'  # padding token
    UNK_WORD = '<unk>'  # unknown token
    BOS_WORD = '<s>'  # start-of-sequence token
    EOS_WORD = '</s>'  # end-of-sequence token


In [None]:
# Vocabulary sizes and special-token indices, read from the fields' vocabularies.
src_vocab_size = len(SRC.vocab)
trg_vocab_size= len(TRG.vocab)
src_pad_idx = SRC.vocab.stoi[Constants.PAD_WORD]
trg_pad_idx = TRG.vocab.stoi[Constants.PAD_WORD]
trg_bos_idx = TRG.vocab.stoi[Constants.BOS_WORD]
trg_eos_idx = TRG.vocab.stoi[Constants.EOS_WORD]
# Model geometry; n_heads * d_k equals d_model (8 * 64 = 512).
d_k = 64
d_v = 64
d_model = 512
d_word_vec = 512
d_inner_hid = 2048
n_layers = 6
n_heads = 8
dropout = 0.1
# Weight-sharing switches for the projection / embedding matrices.
proj_share_weight = True
embs_share_weight = True
batch_size=128
# NOTE(review): the name is misspelled ("max_lenght"); kept because other cells may use it.
max_lenght = 200
# NOTE(review): `epoch` is rebound to 1000 in a later cell.
epoch = 10

In [None]:
# NOTE: this only binds a Python variable; it does not set the CUDA_VISIBLE_DEVICES
# environment variable and therefore has no effect on device selection.
CUDA_VISIBLE_DEVICES=-1

# Build the model with keyword arguments. Previously `epoch` (10) was passed
# positionally and landed in the `max_length` parameter, so the positional
# table had only 10 rows and any longer sequence failed on the addition.
transformer = Transformer(
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    trg_pad_idx,
    device,
    d_word_vec=d_word_vec,
    d_model=d_model,
    d_inner=d_inner_hid,
    n_layers=n_layers,
    n_heads=n_heads,
    d_k=d_k,
    d_v=d_v,
    max_length=max_lenght,
    trg_emb_prj_weight_sharing=proj_share_weight,
    emb_src_trg_weight_sharing=embs_share_weight).to(device)

In [None]:
def count_parameters(model):
    """Return the total number of elements across ``model``'s trainable parameters."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Print the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(transformer):,} trainable parameters')

The model has 50,869,760 trainable parameters


In [None]:
def initialize_weights(m):
    """Xavier-uniform initialise ``m.weight`` in place when it has more than one dimension.

    Intended for ``nn.Module.apply``. Modules without a ``weight`` attribute,
    with ``weight`` set to ``None`` (for example a LayerNorm built with
    ``elementwise_affine=False``), or with a 1-D weight are left untouched.
    """
    weight = getattr(m, 'weight', None)
    # The `None` check prevents an AttributeError on modules that register
    # `weight` as None.
    if weight is not None and weight.dim() > 1:
        nn.init.xavier_uniform_(weight.data)

# Run initialize_weights on every sub-module of the model. As the last expression
# of the cell, the returned module is also displayed.
transformer.apply(initialize_weights)

Transformer(
  (encoder): Encoder(
    (src_word_emb): Embedding(13203, 512, padding_idx=1)
    (position_enc): Positional_Encoding()
    (dropout): Dropout(p=0.1, inplace=False)
    (layer_stack): ModuleList(
      (0-5): 6 x EncoderLayer(
        (self_attn): MultiHeadAttention(
          (w_q): Linear(in_features=512, out_features=512, bias=False)
          (w_k): Linear(in_features=512, out_features=512, bias=False)
          (w_v): Linear(in_features=512, out_features=512, bias=False)
          (fc): Linear(in_features=512, out_features=512, bias=False)
          (attention): ScaledDotProduct_Attention(
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (layer_norm): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
        )
        (pos_ffn): Positionwise_FeedForward(
          (w_1): Linear(in_features=512, out_features=2048, bias=True)
          (w_2): Linear(in_features=2048, out_features=512, bias

In [None]:
class Scheduled_Optim():
    """Wraps an optimizer and sets its learning rate before every step.

    The learning rate at step ``n`` (counted from 1) is

        lr_mul * d_model ** -0.5 * min(n ** -0.5, n * n_warmup_steps ** -1.5)

    Args:
        optimizer: object exposing ``step()``, ``zero_grad()`` and ``param_groups``.
        lr_mul: multiplier applied to the schedule.
        d_model: model width used in the scale factor.
        n_warmup_steps: warm-up length used in the schedule.
    """

    def __init__(self, optimizer, lr_mul, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.lr_mul = lr_mul
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_steps = 0

    def step_and_update_lr(self):
        """Advance the schedule, write the new learning rate, then step the optimizer."""
        self._update_learning_rate()
        self._optimizer.step()

    def step(self):
        """Alias of ``step_and_update_lr``.

        Lets training loops that call ``optimizer.step()`` use this wrapper
        in place of a plain optimizer.
        """
        self.step_and_update_lr()

    def zero_grad(self):
        """Clear the gradients held by the wrapped optimizer."""
        self._optimizer.zero_grad()


    def _get_lr_scale(self):
        """Return the schedule factor for the current ``n_steps`` (must be >= 1)."""
        d_model = self.d_model
        n_steps, n_warmup_steps = self.n_steps, self.n_warmup_steps
        return (d_model ** -0.5) * min(n_steps ** (-0.5), n_steps * n_warmup_steps ** (-1.5))


    def _update_learning_rate(self):
        """Increment the step counter and set ``lr`` on every parameter group."""
        # Increment first so _get_lr_scale never sees n_steps == 0.
        self.n_steps += 1
        lr = self.lr_mul * self._get_lr_scale()

        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr

In [None]:
import torch.optim as optim
# Warm-up length and multiplier handed to the learning-rate schedule.
n_warmup_steps = 1000
lr_mul = 0.1
# Adam wrapped in Scheduled_Optim. No lr is passed to Adam here; the wrapper
# writes the learning rate into the parameter groups on each update.
optimizer = Scheduled_Optim(
    optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09),
    lr_mul, d_model, n_warmup_steps)

In [None]:
# Cross-entropy loss; positions whose target equals trg_pad_idx are ignored.
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)

# Performance evaluation

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    """Train ``model`` for one pass over ``iterator`` and return the mean batch loss.

    Each batch must expose ``src`` and ``trg`` tensors laid out as
    (seq_len, batch), which is the layout of the batch printed earlier in this
    notebook ([33, 128]). They are transposed to (batch, seq_len), the layout
    the model's masks and positional encoding index into.

    Args:
        model: callable ``model(src, trg_input)`` returning logits whose last
            dimension is the vocabulary size.
        iterator: sized iterable of batches.
        optimizer: either a ``Scheduled_Optim``-style wrapper (with
            ``step_and_update_lr``) or any optimizer with ``step``.
        criterion: loss taking ``(logits_2d, gold_1d)``.
        clip: maximum gradient norm.

    Returns:
        Sum of batch losses divided by ``len(iterator)`` (0.0 for an empty iterator).
    """
    model.train()
    # Works with both the scheduled wrapper and plain optimizers.
    optimizer_step = getattr(optimizer, 'step_and_update_lr', None) or optimizer.step

    epoch_loss = 0
    for i, batch in enumerate(iterator):
        # (seq_len, batch) -> (batch, seq_len)
        src = batch.src.transpose(0, 1)
        trg = batch.trg.transpose(0, 1)

        optimizer.zero_grad()
        # Teacher forcing: feed all but the last token, predict all but the first.
        output = model(src, trg[:, :-1])
        output_reshape = output.contiguous().view(-1, output.shape[-1])
        gold = trg[:, 1:].contiguous().view(-1)

        loss = criterion(output_reshape, gold)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer_step()

        epoch_loss += loss.item()
        print('step :', round((i / len(iterator)) * 100, 2), '% , loss :', loss.item())

    return epoch_loss / max(len(iterator), 1)

In [None]:
##

def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the average loss per batch.

    Batches are expected to carry ``src`` / ``trg`` tensors in
    (seq_len, batch) layout; they are transposed to (batch, seq_len) before
    being passed to the model. The model builds its own padding and causal
    masks, so none are computed here. The target is shifted along the time
    axis: ``trg[:, :-1]`` is the decoder input and ``trg[:, 1:]`` the gold labels.

    Args:
        model: callable ``model(src, trg_input)`` returning logits whose last
            dimension is the vocabulary size.
        iterator: sized iterable of batches.
        optimizer: object with ``zero_grad`` and either ``step_and_update_lr``
            (scheduled wrapper) or ``step`` (plain optimizer).
        criterion: loss taking ``(logits_2d, gold_1d)``.
        clip: maximum gradient norm.

    Returns:
        Sum of batch losses divided by ``len(iterator)`` (0.0 for an empty iterator).
    """
    model.train()
    # Prefer the scheduled step when the optimizer wrapper provides one.
    do_step = getattr(optimizer, 'step_and_update_lr', None) or optimizer.step

    epoch_loss = 0
    for i, batch in enumerate(iterator):
        # (seq_len, batch) -> (batch, seq_len)
        src = batch.src.transpose(0, 1)
        trg = batch.trg.transpose(0, 1)

        decoder_input = trg[:, :-1]                 # drop the last token
        gold = trg[:, 1:].contiguous().view(-1)     # drop the first token

        optimizer.zero_grad()
        output = model(src, decoder_input)
        output_reshape = output.contiguous().view(-1, output.shape[-1])

        loss = criterion(output_reshape, gold)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        do_step()

        epoch_loss += loss.item()
        print('step :', round((i / len(iterator)) * 100, 2), '% , loss :', loss.item())

    return epoch_loss / max(len(iterator), 1)

def evaluate(model, iterator, criterion):
    """Evaluate ``model`` on ``iterator``; return ``(mean_loss, mean_bleu)``.

    Batches carry ``src`` / ``trg`` tensors in (seq_len, batch) layout and are
    transposed to (batch, seq_len) for the model. BLEU is computed per batch
    from the teacher-forced argmax predictions (not from free-running
    decoding) against the gold tokens, skipping positions whose gold token is
    padding or end-of-sentence, and then averaged over batches.

    Uses the notebook-level ``TRG`` field for the vocabulary and the
    notebook-level ``bleu_score`` function.
    """
    model.eval()
    itos = TRG.vocab.itos
    skip_ids = (TRG.vocab.stoi[TRG.pad_token], TRG.vocab.stoi[TRG.eos_token])

    epoch_loss = 0
    batch_bleu = []
    with torch.no_grad():
        for batch in iterator:
            # (seq_len, batch) -> (batch, seq_len)
            src = batch.src.transpose(0, 1)
            trg = batch.trg.transpose(0, 1)
            gold = trg[:, 1:]  # labels: everything after the start token

            output = model(src, trg[:, :-1])
            output_reshape = output.contiguous().view(-1, output.shape[-1])

            loss = criterion(output_reshape, gold.contiguous().view(-1))
            epoch_loss += loss.item()

            # Most likely token at every position, reshaped back to (batch, len).
            pred = output_reshape.argmax(dim=-1).reshape(gold.shape)

            candidates, references = [], []
            for pred_row, gold_row in zip(pred.tolist(), gold.tolist()):
                keep = [pos for pos, idx in enumerate(gold_row) if idx not in skip_ids]
                candidates.append([itos[pred_row[pos]] for pos in keep])
                # bleu_score expects a list of reference sentences per candidate.
                references.append([[itos[gold_row[pos]] for pos in keep]])

            batch_bleu.append(bleu_score(candidates, references))

    mean_bleu = sum(batch_bleu) / max(len(batch_bleu), 1)
    return epoch_loss / max(len(iterator), 1), mean_bleu

In [None]:
from torchtext.data.metrics import bleu_score
# Minimal bleu_score example: candidates are a list of token lists; references
# are, for each candidate, a list of reference token lists.
candidate_corpus = [['My', 'full', 'pytorch', 'test'], ['Another', 'Sentence']]
references_corpus = [[['My', 'full', 'pytorch', 'test'], ['Completely', 'Different']], [['No', 'Match']]]
bleu_score(candidate_corpus, references_corpus)

0.8408964276313782

In [None]:
from torchtext.data.metrics import bleu_score

def evaluate(model, iterator, criterion):
    """Compute the average loss and average per-batch BLEU of ``model`` on ``iterator``.

    Batches expose ``src`` / ``trg`` in (seq_len, batch) layout; both are
    transposed to (batch, seq_len) before the forward pass. The decoder input
    is ``trg[:, :-1]`` and the labels are ``trg[:, 1:]``.

    BLEU is measured on teacher-forced argmax predictions (not free-running
    decoding). For each sentence only positions whose gold token is neither
    padding nor end-of-sentence are compared. Token strings come from the
    notebook-level ``TRG`` field's vocabulary.

    Returns:
        ``(epoch_loss / len(iterator), mean of the per-batch BLEU scores)``;
        both are 0 for an empty iterator.
    """
    model.eval()
    itos = TRG.vocab.itos
    pad_idx = TRG.vocab.stoi[TRG.pad_token]
    eos_idx = TRG.vocab.stoi[TRG.eos_token]

    epoch_loss = 0
    batch_bleu = []
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            # (seq_len, batch) -> (batch, seq_len)
            src = batch.src.transpose(0, 1)
            trg = batch.trg.transpose(0, 1)
            gold = trg[:, 1:]

            output = model(src, trg[:, :-1])
            output_reshape = output.contiguous().view(-1, output.shape[-1])

            loss = criterion(output_reshape, gold.contiguous().view(-1))
            epoch_loss += loss.item()

            # Predicted token ids, reshaped back to (batch, len) to align with gold.
            pred = output_reshape.argmax(dim=-1).reshape(gold.shape)

            output_words, trg_words = [], []
            for pred_row, gold_row in zip(pred.tolist(), gold.tolist()):
                positions = [p for p, idx in enumerate(gold_row)
                             if idx != pad_idx and idx != eos_idx]
                output_words.append([itos[pred_row[p]] for p in positions])
                # One reference per candidate, wrapped in a list as bleu_score requires.
                trg_words.append([[itos[gold_row[p]] for p in positions]])

            bleu = bleu_score(output_words, trg_words)
            batch_bleu.append(bleu)

    batch_bleu = sum(batch_bleu) / max(len(batch_bleu), 1)
    return epoch_loss / max(len(iterator), 1), batch_bleu

In [None]:
def epoch_time(start_time, end_time):
    """Split the interval ``end_time - start_time`` (seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, each truncated toward zero.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - (minutes * 60))
    return minutes, seconds

In [None]:
# Gradient-norm clipping threshold handed to train() by run() / run1().
clip = 1.0
import time, math

def run(total_epoch, best_loss):
    """Train and validate the notebook-level ``transformer`` for ``total_epoch`` epochs.

    Uses the notebook globals ``transformer``, ``train_iterator``,
    ``valid_iterator``, ``optimizer``, ``criterion`` and ``clip``.

    After every epoch:
      * the model is saved to ``saved/model-<valid_loss>.pt`` when the
        validation loss improves on ``best_loss``;
      * the loss / BLEU histories so far are rewritten under ``result/``;
      * a summary is printed.

    The former ``Scheduled_Optim.step(valid_loss)`` call was removed: no such
    method exists on the class, and learning-rate updates belong to the
    optimizer steps taken during training, not to a once-per-epoch call.

    Args:
        total_epoch: number of epochs to run.
        best_loss: validation loss a checkpoint has to beat to be saved.
    """
    import os  # local import: only needed to create the output directories

    # open() and torch.save() do not create missing parent directories.
    for out_dir in ('saved', 'result'):
        os.makedirs(out_dir, exist_ok=True)

    train_losses, test_losses, bleus = [], [], []
    for step in range(total_epoch):
        start_time = time.time()
        train_loss = train(transformer, train_iterator, optimizer, criterion, clip)
        valid_loss, bleu = evaluate(transformer, valid_iterator, criterion)
        end_time = time.time()

        train_losses.append(train_loss)
        test_losses.append(valid_loss)
        bleus.append(bleu)
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_loss:
            best_loss = valid_loss
            torch.save(transformer.state_dict(), 'saved/model-{0}.pt'.format(valid_loss))

        # Rewrite the full histories each epoch; `with` closes the files even on error.
        histories = (
            ('result/train_loss.txt', train_losses),
            ('result/bleu.txt', bleus),
            ('result/test_loss.txt', test_losses),
        )
        for path, values in histories:
            with open(path, 'w') as f:
                f.write(str(values))

        print(f'Epoch: {step + 1} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\tVal Loss: {valid_loss:.3f} |  Val PPL: {math.exp(valid_loss):7.3f}')
        print(f'\tBLEU Score: {bleu:.3f}')

In [None]:
def run1(total_epoch, best_loss):
    """Variant of the epoch loop: train, validate, checkpoint and log for ``total_epoch`` epochs.

    Uses the notebook globals ``transformer``, ``train_iterator``,
    ``valid_iterator``, ``optimizer``, ``criterion`` and ``clip``.

    The former per-epoch ``optimizer.step_and_update_lr()`` call was removed:
    it ran after validation, when the only gradients present were those left
    over from the last training batch, so it applied that update a second time.

    Side effects per epoch: saves ``saved/model-<valid_loss>.pt`` when the
    validation loss improves on ``best_loss``, rewrites the history files under
    ``result/`` and prints a summary.

    Args:
        total_epoch: number of epochs to run.
        best_loss: validation loss a checkpoint has to beat to be saved.
    """
    import os  # local import: only needed to create the output directories

    # Make sure the output directories exist before anything is written.
    os.makedirs('saved', exist_ok=True)
    os.makedirs('result', exist_ok=True)

    train_losses, test_losses, bleus = [], [], []
    for step in range(total_epoch):
        start_time = time.time()
        train_loss = train(transformer, train_iterator, optimizer, criterion, clip)
        valid_loss, bleu = evaluate(transformer, valid_iterator, criterion)
        end_time = time.time()

        train_losses.append(train_loss)
        test_losses.append(valid_loss)
        bleus.append(bleu)
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_loss:
            best_loss = valid_loss
            torch.save(transformer.state_dict(), 'saved/model-{0}.pt'.format(valid_loss))

        # Context managers guarantee the files are closed even if a write fails.
        with open('result/train_loss.txt', 'w') as f:
            f.write(str(train_losses))

        with open('result/bleu.txt', 'w') as f:
            f.write(str(bleus))

        with open('result/test_loss.txt', 'w') as f:
            f.write(str(test_losses))

        print(f'Epoch: {step + 1} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\tVal Loss: {valid_loss:.3f} |  Val PPL: {math.exp(valid_loss):7.3f}')
        print(f'\tBLEU Score: {bleu:.3f}')

In [None]:
# NOTE(review): factor, adam_eps, patience and weight_decay are not referenced
# by any code visible in this notebook section — confirm whether they are still needed.
factor = 0.9
adam_eps = 5e-9
patience = 10
epoch = 1000  # rebinds the earlier `epoch = 10`; passed to run() as total_epoch
weight_decay = 5e-4
inf = float('inf')  # passed to run() as the initial best_loss

In [None]:
# Start training; best_loss starts at infinity, so any validation loss improves on it.
run(total_epoch=epoch, best_loss=inf)

# 새 섹션

In [None]:
def cal_performance(pred, gold, trg_pad_idx, smoothing=False):
    """Return ``(loss, n_correct, n_word)`` for a batch of predictions.

    ``loss`` comes from ``cal_loss`` (label smoothing applied when
    ``smoothing`` is true). ``n_correct`` counts non-padding positions where
    the argmax over dimension 1 of ``pred`` equals the gold token; ``n_word``
    counts all non-padding positions.
    """
    loss = cal_loss(pred, gold, trg_pad_idx, smoothing=smoothing)

    predicted = pred.max(1)[1]
    flat_gold = gold.contiguous().view(-1)
    is_word = flat_gold.ne(trg_pad_idx)

    hits = predicted.eq(flat_gold).masked_select(is_word)
    n_correct = hits.sum().item()
    n_word = is_word.sum().item()

    return loss, n_correct, n_word

In [None]:
def cal_loss(pred, gold, trg_pad_idx, smoothing=False, eps=0.1):
    ''' Calculate cross entropy loss, apply label smoothing if needed.

    Args:
        pred: Score tensor whose dim 1 indexes the classes (softmax is taken
            over dim 1).
        gold: Target class indices; flattened to 1-D before use.
        trg_pad_idx: Padding index; positions whose target equals it do not
            contribute to the loss.
        smoothing: When true, use a label-smoothed target distribution
            instead of the one-hot target.
        eps: Probability mass moved away from the true class and spread
            evenly over the remaining ``n_class - 1`` classes. Only used when
            ``smoothing`` is true; the default matches the previously
            hard-coded value.

    Returns:
        Scalar tensor with the loss summed (not averaged) over all
        non-padding positions; callers divide by the word count later.
    '''
    gold = gold.contiguous().view(-1)

    if not smoothing:
        return F.cross_entropy(pred, gold, ignore_index=trg_pad_idx, reduction='sum')

    n_class = pred.size(1)

    # One-hot targets, then soften: (1 - eps) on the true class and
    # eps / (n_class - 1) on every other class.
    one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1)
    smoothed = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1)
    log_prb = F.log_softmax(pred, dim=1)

    # Per-position cross entropy against the smoothed targets; padding
    # positions are dropped before summing.
    non_pad_mask = gold.ne(trg_pad_idx)
    loss = -(smoothed * log_prb).sum(dim=1)
    return loss.masked_select(non_pad_mask).sum()  # average later


In [None]:
def patch_src(src, pad_idx):
    """Return ``src`` with its first two dimensions swapped.

    ``pad_idx`` is accepted for signature symmetry with ``patch_trg`` but is
    not used.
    """
    return src.transpose(0, 1)


def patch_trg(trg, pad_idx):
    """Split a target batch into decoder input and flattened gold labels.

    The first two dimensions of ``trg`` are swapped; the decoder input drops
    the last position along dim 1, while the gold labels drop the first
    position and are flattened to 1-D. ``pad_idx`` is not used.

    Returns:
        Tuple ``(trg_input, gold)``.
    """
    swapped = trg.transpose(0, 1)
    decoder_input = swapped[:, :-1]
    shifted_labels = swapped[:, 1:].contiguous().view(-1)
    return decoder_input, shifted_labels


In [None]:
!pip install tqdm



In [None]:
from tqdm import tqdm

def train_epoch(model, training_iterator, optimizer, device, smoothing):
    ''' Run one optimisation pass over ``training_iterator``.

    Args:
        model: Called as ``model(src_seq, trg_seq)``; put into train mode.
        training_iterator: Yields batches exposing ``.src`` and ``.trg``.
        optimizer: Wrapper providing ``zero_grad()`` and
            ``step_and_update_lr()``.
        device: Device the batch tensors are moved to.
        smoothing: Whether the loss uses label smoothing.

    Returns:
        Tuple ``(loss_per_word, accuracy)``, both normalised by the number of
        non-padding target words seen in the epoch.
    '''
    model.train()
    loss_sum = 0
    words_seen = 0
    words_right = 0

    progress = tqdm(training_iterator, mininterval=2, desc='  - (Training)   ', leave=False)
    for batch in progress:
        # Reshape the batch and move it onto the target device.
        src_seq = patch_src(batch.src, src_pad_idx).to(device)
        trg_seq, gold = patch_trg(batch.trg, trg_pad_idx)
        trg_seq = trg_seq.to(device)
        gold = gold.to(device)

        # Forward pass on fresh gradients.
        optimizer.zero_grad()
        pred = model(src_seq, trg_seq)

        # Backward pass, then parameter and learning-rate update.
        loss, n_correct, n_word = cal_performance(
            pred, gold, trg_pad_idx, smoothing=smoothing)
        loss.backward()
        optimizer.step_and_update_lr()

        # Running totals for the epoch summary.
        words_seen += n_word
        words_right += n_correct
        loss_sum += loss.item()

    return loss_sum / words_seen, words_right / words_seen

In [None]:
def eval_epoch(model, validation_iterator, device):
    ''' Run one evaluation pass over ``validation_iterator`` without gradients.

    Args:
        model: Called as ``model(src_seq, trg_seq)``; put into eval mode.
        validation_iterator: Yields batches exposing ``.src`` and ``.trg``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(loss_per_word, accuracy)``, both normalised by the number of
        non-padding target words seen. Label smoothing is always off here.
    '''
    model.eval()
    loss_sum = 0
    words_seen = 0
    words_right = 0

    with torch.no_grad():
        progress = tqdm(validation_iterator, mininterval=2, desc='  - (Validation) ', leave=False)
        for batch in progress:
            # Reshape the batch and move it onto the target device.
            src_seq = patch_src(batch.src, src_pad_idx).to(device)
            trg_seq, gold = patch_trg(batch.trg, trg_pad_idx)
            trg_seq = trg_seq.to(device)
            gold = gold.to(device)

            # Forward pass and unsmoothed loss.
            pred = model(src_seq, trg_seq)
            loss, n_correct, n_word = cal_performance(
                pred, gold, trg_pad_idx, smoothing=False)

            # Running totals for the epoch summary.
            words_seen += n_word
            words_right += n_correct
            loss_sum += loss.item()

    return loss_sum / words_seen, words_right / words_seen

In [None]:
import os
import time
# Settings read as globals by train() below.
# 'best': overwrite model.chkpt whenever the epoch's validation loss is the
# lowest so far; 'all': write a separately named checkpoint every epoch.
save_mode = 'best'
label_smoothing=True  # passed as `smoothing` to train_epoch
output_dir='output'  # holds train.log / valid.log and the 'best' checkpoint
use_tb=False  # when True, train() creates a tensorboard SummaryWriter

def train(model, train_iterator, valid_iterator, optimizer, device):
    ''' Start training

    Runs ``epoch`` (notebook-level) epochs of ``train_epoch`` followed by
    ``eval_epoch``. Each epoch prints perplexity/accuracy, appends one row to
    ``train.log`` and ``valid.log`` inside ``output_dir``, and saves a
    checkpoint according to ``save_mode``. When ``use_tb`` is true the same
    metrics are also written to tensorboard.

    Args:
        model: Model to optimise; its ``state_dict()`` goes into checkpoints.
        train_iterator: Batches for the training pass.
        valid_iterator: Batches for the validation pass.
        optimizer: Wrapper exposing ``_optimizer.param_groups`` (read here for
            the current learning rate) and used by ``train_epoch``.
        device: Device passed through to the epoch functions.

    Returns:
        None.
    '''
    # NOTE(review): `run1` earlier in this notebook calls
    # train(transformer, train_iterator, optimizer, criterion, clip); once this
    # cell has run, that call binds to this function with different parameter
    # meanings. Consider giving one of the two a distinct name.

    def print_performances(header, ppl, accu, start_time, lr):
        print('  - {header:12} ppl: {ppl: 8.5f}, accuracy: {accu:3.3f} %, lr: {lr:8.5f}, '\
              'elapse: {elapse:3.3f} min'.format(
                  header=f"({header})", ppl=ppl,
                  accu=100*accu, elapse=(time.time()-start_time)/60, lr=lr))

    # Use tensorboard to plot curves, e.g. perplexity, accuracy, learning rate
    tb_writer = None
    if use_tb:
        print("[Info] Use Tensorboard")
        from torch.utils.tensorboard import SummaryWriter
        tb_writer = SummaryWriter(log_dir=os.path.join(output_dir, 'tensorboard'))

    try:
        os.makedirs(output_dir, exist_ok=True)
        log_train_file = os.path.join(output_dir, 'train.log')
        log_valid_file = os.path.join(output_dir, 'valid.log')

        print('[Info] Training performance will be written to file: {} and {}'.format(
            log_train_file, log_valid_file))

        # Start both logs afresh with a CSV header; rows are appended per epoch.
        with open(log_train_file, 'w') as log_tf, open(log_valid_file, 'w') as log_vf:
            log_tf.write('epoch,loss,ppl,accuracy\n')
            log_vf.write('epoch,loss,ppl,accuracy\n')

        valid_losses = []
        for epoch_i in range(epoch):
            print('[ Epoch', epoch_i, ']')

            start = time.time()
            train_loss, train_accu = train_epoch(
                model, train_iterator, optimizer, device, smoothing=label_smoothing)
            # Clamp the exponent so a diverged loss cannot overflow math.exp.
            train_ppl = math.exp(min(train_loss, 100))
            # Current learning rate
            lr = optimizer._optimizer.param_groups[0]['lr']
            print_performances('Training', train_ppl, train_accu, start, lr)

            start = time.time()
            valid_loss, valid_accu = eval_epoch(model, valid_iterator, device)
            valid_ppl = math.exp(min(valid_loss, 100))
            print_performances('Validation', valid_ppl, valid_accu, start, lr)

            valid_losses.append(valid_loss)

            # NOTE(review): `data` is not defined in this part of the notebook;
            # confirm it is a picklable settings object.
            checkpoint = {'epoch': epoch_i, 'settings': data, 'model': model.state_dict()}

            if save_mode == 'all':
                # NOTE(review): this branch saves into the current working
                # directory, unlike the 'best' branch which uses output_dir.
                model_name = 'model_accu_{accu:3.3f}.chkpt'.format(accu=100*valid_accu)
                torch.save(checkpoint, model_name)
            elif save_mode == 'best':
                model_name = 'model.chkpt'
                # valid_loss was just appended, so this is true exactly when
                # the current epoch has the lowest validation loss so far.
                if valid_loss <= min(valid_losses):
                    torch.save(checkpoint, os.path.join(output_dir, model_name))
                    print('    - [Info] The checkpoint file has been updated.')

            with open(log_train_file, 'a') as log_tf, open(log_valid_file, 'a') as log_vf:
                log_tf.write('{epoch},{loss: 8.5f},{ppl: 8.5f},{accu:3.3f}\n'.format(
                    epoch=epoch_i, loss=train_loss,
                    ppl=train_ppl, accu=100*train_accu))
                log_vf.write('{epoch},{loss: 8.5f},{ppl: 8.5f},{accu:3.3f}\n'.format(
                    epoch=epoch_i, loss=valid_loss,
                    ppl=valid_ppl, accu=100*valid_accu))

            # Write the curves to tensorboard; previously the writer was
            # created but never received any data.
            if tb_writer is not None:
                tb_writer.add_scalars('ppl', {'train': train_ppl, 'val': valid_ppl}, epoch_i)
                tb_writer.add_scalars(
                    'accuracy', {'train': 100*train_accu, 'val': 100*valid_accu}, epoch_i)
                tb_writer.add_scalar('learning_rate', lr, epoch_i)
    finally:
        # Flush and release the event file even if training is interrupted.
        if tb_writer is not None:
            tb_writer.close()

In [None]:
# Start training with the notebook-level model, iterators, optimizer and device.
# NOTE(review): the output saved with this cell ends in a RuntimeError during
# epoch 0; re-run after fixing it before relying on any results below.
train(transformer, train_iterator, valid_iterator, optimizer, device)

[Info] Training performance will be written to file: output/train.log and output/valid.log
[ Epoch 0 ]




RuntimeError: ignored

In [None]:
# @title **

In [None]:
%debug

> <ipython-input-121-621954a10c70>(22)forward()
     20 
     21     def forward(self, x):
---> 22         return x + self.pos_table[:, :x.size(1)].clone().detach()
     23 
     24 

ipdb> u
> /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1501)_call_impl()
   1499                 or _global_backward_pre_hooks or _global_backward_hooks

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (only works inside Google Colab).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Switch the working directory to the project folder on the mounted Drive, so
# relative paths used afterwards resolve there.
# NOTE(review): hard-coded absolute path; it only exists once Drive is mounted.
os.chdir('/content/drive/MyDrive/Colab Notebooks/transformer')

In [None]:
def read_gzip_file(file_path):
    """Return the entire contents of a gzip-compressed file as UTF-8 text."""
    with gzip.open(file_path, mode='rt', encoding='utf-8') as gz_stream:
        contents = gz_stream.read()
    return contents

In [None]:
import os
import gzip

In [None]:
from torchtext.data import Field

# Source-side field: tokenized with tokenize_de, lowercased, and wrapped in
# the BOS/EOS markers with PAD_WORD used for padding.
SRC = torchtext.data.Field(
    tokenize=tokenize_de,
    lower=True,
    init_token=BOS_WORD,
    eos_token=EOS_WORD,
    pad_token=PAD_WORD,
)

# Target-side field: same settings, tokenized with tokenize_en.
TRG = torchtext.data.Field(
    tokenize=tokenize_en,
    lower=True,
    init_token=BOS_WORD,
    eos_token=EOS_WORD,
    pad_token=PAD_WORD,
)

In [None]:
train, valid =