In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

#DATA_PATH = "/content/drive/MyDrive/04_nlp/data/"
DATA_PATH = os.getcwd()
SEED = 42

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

# 번역: 한국어 -> 영어

In [2]:
train = pd.read_csv(f"{DATA_PATH}/data/en2ko/translate_en_ko.csv")
train.shape

(5794, 2)

# 텍스트 정제

In [3]:
train["ko"] = train["ko"].str.replace("[^가-힣 0-9,.!?\"\']","", regex=True)
train["en"] = train["en"].str.replace("[^a-zA-Z 0-9,.!?\"\']","", regex=True).str.lower()

In [6]:
!pip install kiwipiepy





# 한국어 토큰화

In [4]:
from kiwipiepy import Kiwi

kiwi = Kiwi()

In [5]:
result = kiwi.tokenize(train["ko"])
src_data = []
for tokens in tqdm(result):
    tokens = [ t.form for t in tokens]
    src_data.append(tokens)

0it [00:00, ?it/s]

In [6]:
src_data

[['이', '제품', '들', '은', '같', '은', '품질', '이', '다', '.'],
 ['미팅', '이', '취소', '되', '었', '다', '.'],
 ['그', '들', '이', '이것', '을', '찾', '었', '다', '.'],
 ['톰',
  '은',
  '메리',
  '에게',
  '왜',
  '그',
  '가',
  '늦',
  '게',
  '까지',
  '일',
  '하',
  '어야',
  '하',
  '었',
  '는지',
  '알리',
  '어',
  '주',
  '지',
  '않',
  '었',
  '다',
  '.'],
 ['먼저', '가', '세요', '.'],
 ['계속', '웃', '어', '.'],
 ['비', '가', '내리', '면', '집', '에', '있', '을', '것', '이', '다', '.'],
 ['나', '피', '나', '어'],
 ['톰', '이', '승낙', '하', '었', '어', '.'],
 ['방금', '나', 'ᆯ', '모욕', '하', '었', '어', '?'],
 ['나', 'ᆫ', '집중', '하', '려고', '애', '를', '쓰', '고', '있', '어', '.'],
 ['너', 'ᆫ', '왜', '그렇', '게', '바보', '같이', '굴', '어', '?'],
 ['끼', '어', '들', '어서', '죄송하', 'ᆸ니다', '.'],
 ['톰', '은', '기말', '시험', '공부', '를', '안', '하', '었', '어', '.'],
 ['그런', '일', '이', '생기', 'ᆫ다면', '어쩌', 'ᆯ', '거', '이', '야', '?'],
 ['톰', '은', '맥주', '를', '원', '하', 'ᆫ다', '.'],
 ['그거', 'ᆫ', '톰', '이', '이야기', '하', 'ᆫ', '것', '이', '아니', '다', '.'],
 ['메리',
  '는',
  '톰',
  '의',
  '자켓',
  '에서',
  '길',
  'ᆫ',
  

# 한국어 어휘집

In [7]:
from torchtext.vocab import build_vocab_from_iterator
vocab_src = build_vocab_from_iterator(src_data, specials=["<pad>", "<unk>"]) #이터러블한 객체 파라미터전달
vocab_src.set_default_index(vocab_src["<unk>"])
len(vocab_src)



3245

# 한국어 단어번호 부여

In [8]:
src_ko_data = [ vocab_src(tokens) for tokens in src_data]

In [9]:
src_ko_data

[[4, 2926, 32, 8, 50, 8, 3151, 4, 9, 2],
 [1112, 4, 1254, 38, 6, 9, 2],
 [22, 32, 4, 87, 12, 209, 6, 9, 2],
 [10,
  8,
  49,
  46,
  84,
  22,
  13,
  212,
  41,
  229,
  58,
  5,
  48,
  5,
  6,
  102,
  371,
  3,
  34,
  21,
  28,
  6,
  9,
  2],
 [893, 13, 62, 2],
 [166, 163, 3, 2],
 [246, 13, 414, 134, 85, 18, 16, 12, 25, 4, 9, 2],
 [11, 380, 11, 3],
 [10, 4, 2561, 5, 6, 3, 2],
 [1498, 11, 17, 2279, 5, 6, 3, 15],
 [11, 14, 434, 5, 176, 237, 19, 97, 20, 16, 3, 2],
 [26, 14, 84, 115, 41, 679, 165, 440, 3, 15],
 [587, 3, 32, 82, 1231, 92, 2],
 [10, 8, 2023, 459, 125, 19, 56, 5, 6, 3, 2],
 [305, 58, 4, 609, 410, 786, 17, 27, 4, 23, 15],
 [10, 8, 451, 19, 464, 5, 31, 2],
 [75, 14, 10, 4, 334, 5, 14, 25, 4, 83, 9, 2],
 [49,
  7,
  10,
  24,
  1214,
  43,
  306,
  14,
  1362,
  1098,
  91,
  2708,
  12,
  1492,
  5,
  6,
  9,
  2],
 [10, 46, 29, 5, 3, 2],
 [509, 137, 3, 34, 3, 2],
 [4, 1758, 123, 141, 38, 54, 15],
 [11, 14, 210, 118, 13, 48, 5, 3, 2],
 [10, 8, 1784, 47, 4, 23, 2],
 [11, 1

# 영어 토큰화

In [10]:
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer("basic_english")



- sos 토큰과 eos 토큰 정의

In [11]:
sos_token = "<sos>" # start of sentence
eos_token = "<eos>" # end of sentence

In [12]:
trg_data = []
for text in train["en"]:
    tokens = [sos_token] + tokenizer(text) + [eos_token]
    print(tokens)
    trg_data.append(tokens)

['<sos>', 'these', 'products', 'are', 'of', 'the', 'same', 'quality', '.', '<eos>']
['<sos>', 'the', 'meeting', 'was', 'cancelled', '.', '<eos>']
['<sos>', 'they', 'found', 'this', '.', '<eos>']
['<sos>', 'tom', 'didn', "'", 't', 'tell', 'mary', 'why', 'he', 'had', 'to', 'work', 'late', '.', '<eos>']
['<sos>', 'after', 'you', '.', '<eos>']
['<sos>', 'keep', 'smiling', '.', '<eos>']
['<sos>', 'if', 'it', 'rains', ',', 'i', "'", 'll', 'stay', 'at', 'home', '.', '<eos>']
['<sos>', 'i', "'", 'm', 'bleeding', '.', '<eos>']
['<sos>', 'tom', 'approved', '.', '<eos>']
['<sos>', 'did', 'you', 'just', 'insult', 'me', '?', '<eos>']
['<sos>', 'i', "'", 'm', 'having', 'a', 'hard', 'time', 'concentrating', '.', '<eos>']
['<sos>', 'why', 'are', 'you', 'acting', 'so', 'stupid', '?', '<eos>']
['<sos>', 'i', "'", 'm', 'sorry', 'for', 'interrupting', '.', '<eos>']
['<sos>', 'tom', 'didn', "'", 't', 'study', 'for', 'his', 'final', 'exam', '.', '<eos>']
['<sos>', 'what', 'would', 'you', 'do', 'if', 'that',

# 영어 어휘집

In [13]:
vocab_trg = build_vocab_from_iterator(trg_data, specials=["<pad>", "<unk>", sos_token, eos_token])
vocab_trg.set_default_index(vocab_trg["<unk>"])

In [14]:
len(vocab_trg)

3129

# 영어 단어번호 부여

In [15]:
trg_eng_data = [ vocab_trg(tokens) for tokens in trg_data ]

In [16]:
trg_eng_data

[[2, 210, 2696, 29, 27, 11, 291, 1584, 4, 3],
 [2, 11, 574, 23, 1314, 4, 3],
 [2, 68, 477, 21, 4, 3],
 [2, 7, 56, 6, 14, 93, 43, 62, 22, 84, 9, 111, 165, 4, 3],
 [2, 242, 8, 4, 3],
 [2, 117, 586, 4, 3],
 [2, 79, 17, 1589, 20, 5, 6, 61, 188, 51, 97, 4, 3],
 [2, 5, 6, 25, 1909, 4, 3],
 [2, 7, 1818, 4, 3],
 [2, 57, 8, 120, 2384, 30, 10, 3],
 [2, 5, 6, 25, 258, 13, 124, 74, 713, 4, 3],
 [2, 62, 29, 8, 461, 101, 455, 10, 3],
 [2, 5, 6, 25, 112, 37, 1480, 4, 3],
 [2, 7, 56, 6, 14, 209, 37, 44, 876, 728, 4, 3],
 [2, 34, 85, 8, 18, 79, 15, 301, 9, 8, 10, 3],
 [2, 7, 184, 13, 464, 4, 3],
 [2, 15, 6, 16, 42, 34, 7, 12, 425, 80, 4, 3],
 [2, 43, 477, 13, 172, 1298, 480, 58, 7, 6, 16, 1119, 4, 3],
 [2, 93, 7, 4, 3],
 [2, 71, 217, 4, 3],
 [2, 32, 5, 24, 21, 849, 10, 3],
 [2, 5, 24, 9, 54, 185, 4, 3],
 [2, 7, 12, 1766, 4, 3],
 [2, 5, 6, 25, 64, 9, 54, 41, 55, 7, 4, 3],
 [2, 7, 88, 22, 167, 9, 54, 424, 4, 3],
 [2, 50, 1022, 118, 11, 448, 4, 3],
 [2, 99, 527, 4, 3],
 [2, 5, 150, 746, 122, 15, 4, 3],
 [

# 데이터셋 클래스

In [17]:
class TranslateDataset(torch.utils.data.Dataset):
    def __init__(self, src, trg):
        self.src = src
        self.trg = trg
    def __len__(self):
        return len(self.src)
    def __getitem__(self, idx):
        item = {}
        item["src"] = torch.tensor( self.src[idx] )
        item["trg"] = torch.tensor( self.trg[idx] )
        return item

In [18]:
def collate_fn(lst):
    from torch.nn.utils.rnn import pad_sequence

    src = [ item["src"] for item in lst]
    src = pad_sequence(src, batch_first=True)

    trg = [ item["trg"] for item in lst]
    trg = pad_sequence(trg, batch_first=True)

    return {"src": src, "trg": trg}

In [19]:
dt = TranslateDataset(src_ko_data, trg_eng_data)
dl = torch.utils.data.DataLoader(dt, 2 , shuffle=False, collate_fn= collate_fn)
batch = next(iter(dl))
batch

{'src': tensor([[   4, 2926,   32,    8,   50,    8, 3151,    4,    9,    2],
         [1112,    4, 1254,   38,    6,    9,    2,    0,    0,    0]]),
 'trg': tensor([[   2,  210, 2696,   29,   27,   11,  291, 1584,    4,    3],
         [   2,   11,  574,   23, 1314,    4,    3,    0,    0,    0]])}

# Encoder layer

In [239]:
class Encoder(torch.nn.Module):
    def __init__(self, num_embeddings, embedding_dim):
        super().__init__()
        self.emb_layer = torch.nn.Embedding(num_embeddings, embedding_dim)
        self.rnn_layer = torch.nn.LSTM(embedding_dim, embedding_dim*2, batch_first=True, bidirectional=True)

    def forward(self, src):
        src = self.emb_layer(src) # batch,seq -> batch, seq, feature

        # output: batch, seq, feature
        # hn: nlayer, batch, features
        # cn: nlayer, batch, features
        outputs, (hn, cn) = self.rnn_layer(src)

        return hn, cn

In [240]:
encoder = Encoder(len(vocab_src), 64)
hn, cn = encoder(batch["src"])
hn.shape, cn.shape

(torch.Size([2, 2, 128]), torch.Size([2, 2, 128]))

In [241]:
# nlayer, batch, features -> batch, nlayer,  features -> batch, nlayer x features
tmp = hn.permute(1,0,2).flatten(1)
tmp.shape

torch.Size([2, 256])

In [242]:
tmp.unsqueeze(0).shape # batch, nlayer x features -> 1, batch, nlayer x features

torch.Size([1, 2, 256])

# Decoder layer

In [243]:
class Decoder(torch.nn.Module):
    def __init__(self, num_embeddings, embedding_dim):
        super().__init__()
        self.emb_layer = torch.nn.Embedding(num_embeddings, embedding_dim)
        self.rnn_layer = torch.nn.LSTM(embedding_dim, embedding_dim*4, batch_first=True)
        self.fc_layer = torch.nn.Linear(embedding_dim*4, num_embeddings) # 예측하는

    def forward(self,trg, hn, cn): # trg는 하나의 시점의 텐서
        trg = self.emb_layer(trg) # batch, 1 -> batch, 1, features

# nlayer, batch, features -> 
# batch, nlayer, features -> 
# batch, nlayer x features ->
# 1, batch, nlayer x features
        hn = hn.permute(1,0,2).flatten(1).unsqueeze(0)
        cn = cn.permute(1,0,2).flatten(1).unsqueeze(0)
        _, (hn, cn)= self.rnn_layer(trg, (hn, cn) )

        # hn: nlayer, batch, features
        pred = self.fc_layer(hn[-1]) # 인덱싱해서 다음과 같은 텐서가 입력으로 전달: batch, features
        return pred, hn, cn

In [244]:
batch["trg"][:,0].view(-1,1)

tensor([[2],
        [2]])

In [245]:
batch["trg"][:,1].view(-1,1)

tensor([[210],
        [ 11]])

In [246]:
hn.shape

torch.Size([2, 2, 128])

In [247]:
decoder = Decoder(len(vocab_trg), 64)
pred, hn, cn = decoder(batch["trg"][:,0].view(-1,1), hn, cn)

In [248]:
pred.shape

torch.Size([2, 3129])

# Seq2Seq 모델

In [249]:
class Net(torch.nn.Module):
    def __init__(self, vocab_size_src, vocab_size_trg, embedding_dim=64, device="cpu"):
        super().__init__()
        self.encoder = Encoder(vocab_size_src, embedding_dim)
        self.decoder = Decoder(vocab_size_trg, embedding_dim)
        self.device = device
        self.vocab_size_trg = vocab_size_trg # 타겟의 단어사전갯수 == 정답 클래스 개수
    def forward(self,src, trg, hn=None, cn=None):

        # trg: batch, seq
        batch_size = trg.shape[0]
        trg_len = trg.shape[1] 
        # prediction: batch, seq, n_class
        prediction = torch.zeros(batch_size, trg_len, self.vocab_size_trg).to(self.device)

        if hn is None:
            hn, cn = self.encoder(src)

        # 디코더에 전달 되는 첫번째 토큰데이터 == sos 토큰
        dec_input = trg[:, 0].view(-1,1)  # batch -> batch,seq

        # if len(trg_len[0][0])==1:
        #    print("디코더의 첫 인풋값이 잘못되었습니다.")

        for t in range(1, trg_len): # 맞춰야하는 문장 길이만큼 예측 시작
            
            # pred : batch, n_class
            pred, hn, cn = self.decoder(dec_input, hn, cn)

            prediction[:,t] = pred # t 시퀀스의 예측 단어를 t시점에 넣어줌

            dec_input = pred.argmax(1).view(-1,1) # batch -> batch, seq

            if random.random() < 0.5: # 교사강요
                dec_input = trg[:,t].view(-1,1) # batch -> batch,seq
        
        #-----> 추론 부분에선 하나하나 예측을 하는것이다.
    
        return prediction, hn, cn

In [250]:
model = Net(len(vocab_src), len(vocab_trg))
pred, _ , _  = model(batch["src"], batch["trg"])
pred.shape

torch.Size([2, 10, 3129])

In [251]:
len(vocab_trg)

3129

# 학습 loop 함수

In [36]:
def train_loop(dataloader, model, loss_fn, optimizer, device):
    epoch_loss = 0
    model.train()
    
    for batch in tqdm(dataloader):
        src = batch["src"].to(device)
        trg = batch["trg"].to(device)
        pred, _, _ = model(src,trg)

        # pred: batch, seq, n_class -> batch x seq, n_class
        n_class = pred.shape[-1] # 정답 클래스의 개수
        pred = pred.view(-1, n_class)

        # trg: batch, seq -> batch x seq
        trg = trg.flatten()

        # pad, unk, sos 토큰 제외하고 손실 계산하기 위해 마스킹
        mask = trg > 2
        trg = trg[mask]
        pred = pred[mask]
        loss = loss_fn(pred, trg)

        optimizer.zero_grad() # 경사값 0으로 초기화
        loss.backward() # 역전파
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1) # 기울기 폭주 현상을 개선
        optimizer.step() # 가중치 업데이트

        epoch_loss += loss.item()

    epoch_loss /= len(dataloader)

    return epoch_loss

In [37]:
loss_fn = torch.nn.CrossEntropyLoss()

# 학습

In [38]:
reset_seeds(SEED)
model = Net(len(vocab_src), len(vocab_trg), device=device).to(device)
optimizer = torch.optim.Adam( model.parameters() )

dt = TranslateDataset(src_ko_data, trg_eng_data)
dl = torch.utils.data.DataLoader(dt, batch_size=64, shuffle=True, collate_fn=collate_fn)

for epoch in range(50):
    epoch_loss = train_loop(dl, model, loss_fn, optimizer, device)
    print(epoch, epoch_loss)

  0%|          | 0/91 [00:00<?, ?it/s]

0 5.354704207116431


  0%|          | 0/91 [00:00<?, ?it/s]

1 4.476916842408233


  0%|          | 0/91 [00:00<?, ?it/s]

2 4.1709932154351534


  0%|          | 0/91 [00:00<?, ?it/s]

3 3.939714355783148


  0%|          | 0/91 [00:00<?, ?it/s]

4 3.7526487151345056


  0%|          | 0/91 [00:00<?, ?it/s]

5 3.580222711458311


  0%|          | 0/91 [00:00<?, ?it/s]

6 3.4178805691855296


  0%|          | 0/91 [00:00<?, ?it/s]

7 3.298372902712979


  0%|          | 0/91 [00:00<?, ?it/s]

8 3.1309490020458517


  0%|          | 0/91 [00:00<?, ?it/s]

9 3.0428453120556505


  0%|          | 0/91 [00:00<?, ?it/s]

10 2.8908222167046516


  0%|          | 0/91 [00:00<?, ?it/s]

11 2.743723209087665


  0%|          | 0/91 [00:00<?, ?it/s]

12 2.6241496185680013


  0%|          | 0/91 [00:00<?, ?it/s]

13 2.464437246322632


  0%|          | 0/91 [00:00<?, ?it/s]

14 2.3551629965121927


  0%|          | 0/91 [00:00<?, ?it/s]

15 2.251247546175024


  0%|          | 0/91 [00:00<?, ?it/s]

16 2.122707166514554


  0%|          | 0/91 [00:00<?, ?it/s]

17 2.010160560136313


  0%|          | 0/91 [00:00<?, ?it/s]

18 1.9308898619243078


  0%|          | 0/91 [00:00<?, ?it/s]

19 1.8105901636920132


  0%|          | 0/91 [00:00<?, ?it/s]

20 1.6964080386109404


  0%|          | 0/91 [00:00<?, ?it/s]

21 1.605543816482628


  0%|          | 0/91 [00:00<?, ?it/s]

22 1.5042456755271325


  0%|          | 0/91 [00:00<?, ?it/s]

23 1.4181377586427626


  0%|          | 0/91 [00:00<?, ?it/s]

24 1.3216810881436527


  0%|          | 0/91 [00:00<?, ?it/s]

25 1.25226407457184


  0%|          | 0/91 [00:00<?, ?it/s]

26 1.1631875057796857


  0%|          | 0/91 [00:00<?, ?it/s]

27 1.0925386168144562


  0%|          | 0/91 [00:00<?, ?it/s]

28 1.0201210432000212


  0%|          | 0/91 [00:00<?, ?it/s]

29 0.9742286997837025


  0%|          | 0/91 [00:00<?, ?it/s]

30 0.9051505569573287


  0%|          | 0/91 [00:00<?, ?it/s]

31 0.8519175065742745


  0%|          | 0/91 [00:00<?, ?it/s]

32 0.787810742855072


  0%|          | 0/91 [00:00<?, ?it/s]

33 0.7337854645409427


  0%|          | 0/91 [00:00<?, ?it/s]

34 0.681058250940763


  0%|          | 0/91 [00:00<?, ?it/s]

35 0.6428400864312936


  0%|          | 0/91 [00:00<?, ?it/s]

36 0.5710018535891732


  0%|          | 0/91 [00:00<?, ?it/s]

37 0.5372262096012032


  0%|          | 0/91 [00:00<?, ?it/s]

38 0.4842506234462445


  0%|          | 0/91 [00:00<?, ?it/s]

39 0.44606787817818777


  0%|          | 0/91 [00:00<?, ?it/s]

40 0.4108594616690835


  0%|          | 0/91 [00:00<?, ?it/s]

41 0.38612881286458656


  0%|          | 0/91 [00:00<?, ?it/s]

42 0.3528519089405353


  0%|          | 0/91 [00:00<?, ?it/s]

43 0.31998199437345776


  0%|          | 0/91 [00:00<?, ?it/s]

44 0.28293841766132105


  0%|          | 0/91 [00:00<?, ?it/s]

45 0.26721300381225543


  0%|          | 0/91 [00:00<?, ?it/s]

46 0.24586070238888919


  0%|          | 0/91 [00:00<?, ?it/s]

47 0.22588277567218948


  0%|          | 0/91 [00:00<?, ?it/s]

48 0.21495757080041444


  0%|          | 0/91 [00:00<?, ?it/s]

49 0.1879483413892788


# 한국어 -> 영어 출력하는 함수

In [56]:
@torch.no_grad()
def translate(text,tokenizer , model, vocab_src, vocab_trg, max_len, device):
    model.eval()

    src = vocab_src([ t.form for t in tokenizer.tokenize(text)])
    src = torch.tensor(src).view(1,-1).to(device) # seq -> 1, seq

    # trg = [2]
    # 디코더의 인풋값이 시퀀스 길이가 1일때 forward함수의 반복예측이 의미가 없어져서 나온 결과가온다.
    # 하지만 다음 디코더의 인풋이 시퀀스가 1이상이면 예측을 함
    # trg = [2,0,0,0,0] 이것도 예측을 한다.
    # trg = [2] 이런 의미 없는 시퀀스가 아니면 됨.

    trg = [2,0]

    trg = torch.tensor(trg).view(1,-1).to(device) # seq -> 1, seq

    hn = None
    cn = None

    for _ in range(max_len):
        pred, hn, cn = model(src,trg, hn, cn)
        # pred: batch, seq, n_class
        word_no = pred[0,-1].argmax().item() # n_class -> 예측 클래스 번호
        if word_no == 3:  # eos 토큰일 경우 반복 중지
            break

        print(vocab_trg.lookup_token(word_no), end=" ")

        trg = [word_no,0]
        # trg = [word_no]
        # 디코더의 인풋값이 시퀀스 길이가 1일때 forward함수의 반복예측이 의미가 없어져서 의미없는 결과가 나온다.
        trg = torch.tensor(trg).view(1,-1).to(device) # seq -> 1, seq

In [53]:
max_len = max( len(lst) for lst in trg_data)
max_len

112

In [54]:
n = 20
text = train["ko"][n]
text, train["en"][n]

('이 컵 가져도 돼요?', 'can i have this cup?')

In [55]:
# 다음 예시는 디코더의 인풋값이 시퀀스 길이가 1일때 forward함수의 반복예측이 의미가 없어져서 나온 결과
translate(text,kiwi , model, vocab_src, vocab_trg, max_len, device)

can i have this cup ? 

In [43]:
vocab_trg["<sos>"]

2

In [64]:
vocab_trg.lookup_token(2),vocab_trg.lookup_token(0)

('<sos>', '<pad>')