In [1]:
!pip install gdown
!pip install sacremoses

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


##1. Datasets

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
from transformers import MarianTokenizer
import math

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
BATCH_SIZE = 32
MAX_LEN = 32

tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-ko-en')
PAD_IDX = tokenizer.pad_token_id
EOS_IDX = tokenizer.eos_token_id
VOCAB_SIZE = tokenizer.vocab_size

# !gdown --fuzzy https://drive.google.com/uc?id=1HwypWkIhYZ8sGHaqjOycsMostvGSI640 -O 'dataset.xlsx'
data = pd.read_excel('dataset.xlsx')

class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return self.data.shape[0]
    def __getitem__(self, idx):
        return self.data.loc[idx, '원문'], self.data.loc[idx, '번역문']

custom_DS = CustomDataset(data)
train_DS, val_DS, test_DS = random_split(custom_DS, [97000, 2000, 1000])

def collate_fn(batch):
    src_texts, trg_texts = zip(*batch)
    src = tokenizer(list(src_texts), return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN, add_special_tokens=True)['input_ids']
    trg = tokenizer(list(trg_texts), return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN, add_special_tokens=True)['input_ids']
    return src, trg

train_DL = DataLoader(train_DS, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_DL   = DataLoader(val_DS, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
test_DL  = DataLoader(test_DS, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_DL.dataset[0]

('오늘 말고 내일로 예약하고 싶은데, 내일은 언제로 예약할 수 있을까요?',
 "I'd like to make a reservation for tomorrow, not today. When are you available tomorrow?")

##2. Positional Encoding

In [3]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer('pe', pe)
    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

def make_pad_mask(seq, pad_idx):
    return (seq != pad_idx)

def make_subsequent_mask(seq_len):
    # future mask
    return torch.tril(torch.ones((seq_len, seq_len), dtype=torch.bool))

##3. EncoderLayer

In [4]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True, dropout=dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, d_model)
        )
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x, src_key_padding_mask=None):
        attn_out, attn_weights = self.self_attn(x, x, x, key_padding_mask=~src_key_padding_mask)
        # implement here

        # residual connection + dropout
        x = x + self.dropout(attn_out)
        # 첫번째 정규화
        x = self.norm1(x)

        # feed forward network block
        ffn_out = self.ffn(x)

        x = x + self.dropout(ffn_out)
        x = self.norm2(x)
        return x, attn_weights


##4. DecoderLayer

In [5]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True, dropout=dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.cross_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True, dropout=dropout)
        self.norm2 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, d_model)
        )
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x, enc_out, trg_mask=None, src_key_padding_mask=None):
        attn_out, self_attn_weights = self.self_attn(x, x, x, attn_mask=trg_mask)
        ####
        # Implement here
        x = x + self.dropout(attn_out)
        x = self.norm1(x)
        ####
        attn_out, cross_attn_weights = self.cross_attn(x, enc_out, enc_out, key_padding_mask=~src_key_padding_mask)
        ####
        # Implement here
        x = x + self.dropout(attn_out)
        x = self.norm2(x)

        ffn_out = self.ffn(x)
        x - x + self.dropout(ffn_out)
        x = self.norm3(x)
        ####
        return x, self_attn_weights, cross_attn_weights


##5. Encoder

In [6]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, ff_dim, num_layers, max_len=512, dropout=0.1):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model, padding_idx=PAD_IDX)
        self.pos_enc = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList([EncoderLayer(d_model, n_heads, ff_dim, dropout) for _ in range(num_layers)])
    def forward(self, src, src_key_padding_mask):
        x = self.embed(src)
        x = self.pos_enc(x)
        attn_weights_list = []
        for layer in self.layers:
            x, attn_weights = layer(x, src_key_padding_mask)
            attn_weights_list.append(attn_weights)
        return x, attn_weights_list

##6. Decoder

In [7]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, ff_dim, num_layers, max_len=512, dropout=0.1):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model, padding_idx=PAD_IDX)
        self.pos_enc = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList([DecoderLayer(d_model, n_heads, ff_dim, dropout) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_model, vocab_size)
    def forward(self, trg, enc_out, trg_mask, src_key_padding_mask):
        ####
        # Implement here
        x = self.embed(trg)
        x = self.pos_enc(x)

        self_attn_list = []
        cross_attn_list = []

        for layer in self.layers:
            x, self_attn_weights, cross_attn_weights = layer(x, enc_out, trg_mask, src_key_padding_mask)
            self_attn_list.append(self_attn_weights)
            cross_attn_list.append(cross_attn_weights)

        x = self.fc_out(x)
        ####
        return x, self_attn_list, cross_attn_list

##7. Transformer

In [8]:
class Transformer(nn.Module):
    def __init__(self, vocab_size, max_len, d_model=64, n_heads=4, ff_dim=256, num_layers=2, dropout=0.1):
        super().__init__()
        self.encoder = Encoder(vocab_size, d_model, n_heads, ff_dim, num_layers, max_len, dropout)
        self.decoder = Decoder(vocab_size, d_model, n_heads, ff_dim, num_layers, max_len, dropout)
    def forward(self, src, trg):
        src_key_padding_mask = make_pad_mask(src, PAD_IDX)
        trg_key_padding_mask = make_pad_mask(trg, PAD_IDX)
        seq_len = trg.size(1)
        subsequent_mask = make_subsequent_mask(seq_len).to(src.device)

        enc_out, enc_attn = self.encoder(src, src_key_padding_mask)

        out, dec_self_attn, dec_cross_attn = self.decoder(
            trg, enc_out, trg_mask=~subsequent_mask, src_key_padding_mask=src_key_padding_mask
        )
        return out, enc_attn, dec_self_attn, dec_cross_attn

##8. Training

In [13]:
d_model = 64
n_heads = 4
ff_dim = 256
num_layers = 2

model = Transformer(VOCAB_SIZE, MAX_LEN, d_model, n_heads, ff_dim, num_layers).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
EPOCHS = 10

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for src_batch, trg_batch in train_DL:
        src_batch = src_batch.to(DEVICE)
        trg_batch = trg_batch.to(DEVICE)
        # Teacher Forcing: 입력은 trg[:, :-1], 정답은 trg[:, 1:]
        optimizer.zero_grad()
        out, *_ = model(src_batch, trg_batch[:, :-1])
        loss = criterion(out.reshape(-1, VOCAB_SIZE), trg_batch[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_DL):.4f}")


torch.save(model.state_dict(), 'dialog_transformer_layernorm.pt')
# 너무 오래 걸림


Epoch 1, Loss: 3.5569
Epoch 2, Loss: 2.8163
Epoch 3, Loss: 2.6230
Epoch 4, Loss: 2.5044
Epoch 5, Loss: 2.4215
Epoch 6, Loss: 2.3593
Epoch 7, Loss: 2.3093
Epoch 8, Loss: 2.2687
Epoch 9, Loss: 2.2341
Epoch 10, Loss: 2.2028


In [17]:
model

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(65001, 512, padding_idx=65000)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(65001, 512, padding_idx=65000)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [19]:
model.generate(input_tokens, max_new_tokens=100)

tensor([[65000,  2325,    69, 42607,   211,    24,    12,    10, 49046,    28,
             0]])

##9. Load Pre-trained Model

In [21]:
from transformers import MarianMTModel
model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-ko-en')
# torch.save(model.state_dict(), 'marian_pretrained.pt')
input_text = "비가 오니까 우산을 가져가!"

input_tokens = tokenizer.encode(input_text, return_tensors="pt")
translated_tokens = model.generate(input_tokens, max_new_tokens=100)
translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

print("입력:", input_text)
print("출력:", translated_text)

입력: 비가 오니까 우산을 가져가!
출력: Take your umbrella because it's raining!


##10. Quantization INT8

In [10]:
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8  # 8bit로 양자화
)
torch.save(quantized_model.state_dict(), 'marian_quantized.pt')


##11. Compare

In [14]:
import os

org_path = 'dialog_transformer_layernorm.pt'
float_path = 'marian_pretrained.pt'
quant_path = 'marian_quantized.pt'

org_size = os.path.getsize(org_path) / 1024 / 1024  # MB
float_size = os.path.getsize(float_path) / 1024 / 1024  # MB
quant_size = os.path.getsize(quant_path) / 1024 / 1024  # MB

print(f"원본(float32) 모델 크기: {org_size:.2f} MB")
print(f"원본(float32) 모델 크기: {float_size:.2f} MB")
print(f"양자화(8bit) 모델 크기: {quant_size:.2f} MB")
print(f"크기 감소 비율: {quant_size / float_size:.2%}")


원본(float32) 모델 크기: 48.79 MB
원본(float32) 모델 크기: 297.67 MB
양자화(8bit) 모델 크기: 203.48 MB
크기 감소 비율: 68.36%


In [12]:
input_tokens = tokenizer.encode(input_text, return_tensors="pt")
translated_tokens = quantized_model.generate(input_tokens, max_new_tokens=100)
translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

print("입력:", input_text)
print("출력:", translated_text)

입력: 비가 오니까 우산을 가져가!
출력: It's raining. Take your umbrella!
