In [1]:
!pip install gdown
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


##1. Datasets

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
from transformers import MarianTokenizer
import math

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
BATCH_SIZE = 32
MAX_LEN = 32

tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-ko-en')
PAD_IDX = tokenizer.pad_token_id
EOS_IDX = tokenizer.eos_token_id
VOCAB_SIZE = tokenizer.vocab_size

!gdown --fuzzy https://drive.google.com/uc?id=1HwypWkIhYZ8sGHaqjOycsMostvGSI640 -O 'dataset.xlsx'
data = pd.read_excel('dataset.xlsx')

class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return self.data.shape[0]
    def __getitem__(self, idx):
        return self.data.loc[idx, '원문'], self.data.loc[idx, '번역문']

custom_DS = CustomDataset(data)
train_DS, val_DS, test_DS = random_split(custom_DS, [97000, 2000, 1000])

def collate_fn(batch):
    src_texts, trg_texts = zip(*batch)
    src = tokenizer(list(src_texts), return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN, add_special_tokens=True)['input_ids']
    trg = tokenizer(list(trg_texts), return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN, add_special_tokens=True)['input_ids']
    return src, trg

train_DL = DataLoader(train_DS, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_DL   = DataLoader(val_DS, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
test_DL  = DataLoader(test_DS, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)


Downloading...
From: https://drive.google.com/uc?id=1HwypWkIhYZ8sGHaqjOycsMostvGSI640
To: /content/dataset.xlsx
100% 9.57M/9.57M [00:00<00:00, 58.8MB/s]


##2. Positional Encoding

In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer('pe', pe)
    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

def make_pad_mask(seq, pad_idx):
    return (seq != pad_idx)

def make_subsequent_mask(seq_len):
    # future mask
    return torch.tril(torch.ones((seq_len, seq_len), dtype=torch.bool))

##3. EncoderLayer

In [None]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True, dropout=dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, d_model)
        )
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x, src_key_padding_mask=None):
        attn_out, attn_weights = self.self_attn(x, x, x, key_padding_mask=~src_key_padding_mask)
        ####
        # Implement here
        ####
        return x, attn_weights


##4. DecoderLayer

In [None]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True, dropout=dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.cross_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True, dropout=dropout)
        self.norm2 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, d_model)
        )
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x, enc_out, trg_mask=None, src_key_padding_mask=None):
        attn_out, self_attn_weights = self.self_attn(x, x, x, attn_mask=trg_mask)
        ####
        # Implement here
        ####
        attn_out, cross_attn_weights = self.cross_attn(x, enc_out, enc_out, key_padding_mask=~src_key_padding_mask)
        ####
        # Implement here
        ####
        return x, self_attn_weights, cross_attn_weights


##5. Encoder

In [None]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, ff_dim, num_layers, max_len=512, dropout=0.1):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model, padding_idx=PAD_IDX)
        self.pos_enc = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList([EncoderLayer(d_model, n_heads, ff_dim, dropout) for _ in range(num_layers)])
    def forward(self, src, src_key_padding_mask):
        x = self.embed(src)
        x = self.pos_enc(x)
        attn_weights_list = []
        for layer in self.layers:
            x, attn_weights = layer(x, src_key_padding_mask)
            attn_weights_list.append(attn_weights)
        return x, attn_weights_list

##6. Decoder

In [None]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, ff_dim, num_layers, max_len=512, dropout=0.1):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model, padding_idx=PAD_IDX)
        self.pos_enc = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList([DecoderLayer(d_model, n_heads, ff_dim, dropout) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_model, vocab_size)
    def forward(self, trg, enc_out, trg_mask, src_key_padding_mask):
        ####
        # Implement here
        ####
        return x, self_attn_list, cross_attn_list

##7. Transformer

In [None]:
class Transformer(nn.Module):
    def __init__(self, vocab_size, max_len, d_model=64, n_heads=4, ff_dim=256, num_layers=2, dropout=0.1):
        super().__init__()
        self.encoder = Encoder(vocab_size, d_model, n_heads, ff_dim, num_layers, max_len, dropout)
        self.decoder = Decoder(vocab_size, d_model, n_heads, ff_dim, num_layers, max_len, dropout)
    def forward(self, src, trg):
        src_key_padding_mask = make_pad_mask(src, PAD_IDX)
        trg_key_padding_mask = make_pad_mask(trg, PAD_IDX)
        seq_len = trg.size(1)
        subsequent_mask = make_subsequent_mask(seq_len).to(src.device)

        enc_out, enc_attn = self.encoder(src, src_key_padding_mask)

        out, dec_self_attn, dec_cross_attn = self.decoder(
            trg, enc_out, trg_mask=~subsequent_mask, src_key_padding_mask=src_key_padding_mask
        )
        return out, enc_attn, dec_self_attn, dec_cross_attn

##8. Training

In [None]:
d_model = 64
n_heads = 4
ff_dim = 256
num_layers = 2

model = Transformer(VOCAB_SIZE, MAX_LEN, d_model, n_heads, ff_dim, num_layers).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
EPOCHS = 10

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for src_batch, trg_batch in train_DL:
        src_batch = src_batch.to(DEVICE)
        trg_batch = trg_batch.to(DEVICE)
        # Teacher Forcing: 입력은 trg[:, :-1], 정답은 trg[:, 1:]
        optimizer.zero_grad()
        out, *_ = model(src_batch, trg_batch[:, :-1])
        loss = criterion(out.reshape(-1, VOCAB_SIZE), trg_batch[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_DL):.4f}")


torch.save(model.state_dict(), 'dialog_transformer_layernorm.pt')


##9. Load Pre-trained Model

In [None]:
from transformers import MarianMTModel
model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-ko-en')
torch.save(model.state_dict(), 'marian_pretrained.pt')
input_text = "비가 오니까 우산을 가져가!"

input_tokens = tokenizer.encode(input_text, return_tensors="pt")
translated_tokens = model.generate(input_tokens, max_new_tokens=100)
translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

print("입력:", input_text)
print("출력:", translated_text)

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

입력: 비가 오니까 우산을 가져가!
AI의 번역: Take your umbrella because it's raining!


##10. Quantization INT8

In [None]:
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8  # 8bit로 양자화
)
torch.save(quantized_model.state_dict(), 'marian_quantized.pt')


##11. Compare

In [None]:
import os

float_path = 'marian_pretrained.pt'
quant_path = 'marian_quantized.pt'

float_size = os.path.getsize(float_path) / 1024 / 1024  # MB
quant_size = os.path.getsize(quant_path) / 1024 / 1024  # MB

print(f"원본(float32) 모델 크기: {float_size:.2f} MB")
print(f"양자화(8bit) 모델 크기: {quant_size:.2f} MB")
print(f"크기 감소 비율: {quant_size / float_size:.2%}")


원본(float32) 모델 크기: 297.67 MB
양자화(8bit) 모델 크기: 203.48 MB
크기 감소 비율: 68.36%


In [None]:
input_tokens = tokenizer.encode(input_text, return_tensors="pt")
translated_tokens = quantized_model.generate(input_tokens, max_new_tokens=100)
translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

print("입력:", input_text)
print("출력:", translated_text)

입력: 비가 오니까 우산을 가져가!
출력: It's raining. Take your umbrella!
