## Step 1: Import Libraries


In [191]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd
import numpy as np
import re
import math
import sentencepiece as spm
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import easydict

# Device configuration
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: mps


## Step 2: Data Preprocessing

In [192]:
def preprocess_sentence(sentence):
    """
    Preprocess a single sentence.
    - Keep Korean, English, numbers, and basic punctuation
    - Remove duplicate spaces and special characters
    """
    if pd.isna(sentence) or sentence is None:
        return ""
    
    sentence = str(sentence)
    sentence = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣a-zA-Z0-9\s.,!?~ㅠㅜ]', ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    sentence = sentence.strip()
    sentence = re.sub(r'([!?.])\1+', r'\1', sentence)
    
    return sentence


def load_and_preprocess_data(file_path):
    """
    Load CSV file and preprocess all question-answer pairs.
    """
    print("=" * 50)
    print("Loading and preprocessing data...")
    print("=" * 50)
    
    # Load data
    df = pd.read_csv(file_path)
    print(f"Total data: {len(df)} pairs")
    
    # Extract and preprocess
    questions = []
    answers = []
    
    for i, (q, a) in enumerate(zip(df['Q'], df['A'])):
        clean_q = preprocess_sentence(q)
        clean_a = preprocess_sentence(a)
        
        if clean_q and clean_a:
            questions.append(clean_q)
            answers.append(clean_a)
        
        if (i + 1) % 1000 == 0:
            print(f"Progress: {i + 1}/{len(df)}")
    
    print(f"\nValid pairs after preprocessing: {len(questions)}")
    print("\nSample data:")
    for i in range(min(3, len(questions))):
        print(f"Q: {questions[i]}")
        print(f"A: {answers[i]}\n")
    
    return questions, answers

In [193]:
# Load and preprocess data
file_path = '/Users/wansookim/Downloads/code_implementation/transformer_project_submit/data/ChatbotData.csv'
questions, answers = load_and_preprocess_data(file_path)

Loading and preprocessing data...
Total data: 11823 pairs
Progress: 1000/11823
Progress: 2000/11823
Progress: 3000/11823
Progress: 4000/11823
Progress: 5000/11823
Progress: 6000/11823
Progress: 7000/11823
Progress: 8000/11823
Progress: 9000/11823
Progress: 10000/11823
Progress: 11000/11823

Valid pairs after preprocessing: 11823

Sample data:
Q: 12시 땡!
A: 하루가 또 가네요.

Q: 1지망 학교 떨어졌어
A: 위로해 드립니다.

Q: 3박4일 놀러가고 싶다
A: 여행은 언제나 좋죠.



## Step 3: SentencePiece Tokenization

In [194]:
def train_sentencepiece_model(questions, answers, model_prefix='./data/korean_chatbot_sp', vocab_size=8000):
    """
    Train SentencePiece model on question-answer pairs.
    """
    print("=" * 50)
    print("Training SentencePiece model...")
    print("=" * 50)
    
    # Save all sentences to a temporary file
    all_sentences_path = './data/all_sentences.txt'
    with open(all_sentences_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(questions + answers))
    
    # Train SentencePiece
    cmd = f'--input={all_sentences_path} \
           --model_prefix={model_prefix} \
           --vocab_size={vocab_size} \
           --model_type=unigram \
           --max_sentence_length=999999 \
           --pad_id=0 \
           --unk_id=1 \
           --bos_id=2 \
           --eos_id=3 \
           --user_defined_symbols=[SEP],[CLS],[MASK]'
    
    spm.SentencePieceTrainer.Train(cmd)
    
    model_file = f"{model_prefix}.model"
    print(f"\nModel saved: {model_file}")
    return model_file


class SentencePieceVocab:
    """Wrapper class for SentencePiece model."""
    def __init__(self, sp_model_path):
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(sp_model_path)
        self.PAD_ID = 0
        self.UNK_ID = 1
        self.BOS_ID = 2
        self.EOS_ID = 3
        self.stoi = {'<pad>': 0, '<unk>': 1, '<s>': 2, '</s>': 3}
        self.itos = [self.sp.IdToPiece(i) for i in range(self.sp.GetPieceSize())]
    
    def encode(self, sentence):
        return self.sp.EncodeAsIds(sentence)
    
    def decode(self, ids):
        return self.sp.DecodeIds([i for i in ids if i not in [0, 2, 3]])
    
    def __len__(self):
        return self.sp.GetPieceSize()


class ChatbotDataset(Dataset):
    """Dataset class for question-answer pairs."""
    def __init__(self, questions, answers, vocab, max_length=40):
        self.questions = questions
        self.answers = answers
        self.vocab = vocab
        self.max_length = max_length
    
    def __len__(self):
        return len(self.questions)
    
    def __getitem__(self, idx):
        q = self.questions[idx]
        a = self.answers[idx]
        src = [self.vocab.BOS_ID] + self.vocab.encode(q) + [self.vocab.EOS_ID]
        trg = [self.vocab.BOS_ID] + self.vocab.encode(a) + [self.vocab.EOS_ID]
        return {
            'SRC': torch.tensor(src[:self.max_length], dtype=torch.long),
            'TRG': torch.tensor(trg[:self.max_length], dtype=torch.long)
        }

# torch.tensor([pad_idx]*(src_max-len(s)), dtype=torch.long)

def collate_fn(batch, pad_idx=0):
    """Collate function for DataLoader with padding."""
    src_batch = [item['SRC'] for item in batch]
    trg_batch = [item['TRG'] for item in batch]
    src_max = max(len(s) for s in src_batch)
    trg_max = max(len(t) for t in trg_batch)
    
    src_padded = [torch.cat([s, torch.tensor([pad_idx]*(src_max-len(s)), dtype=torch.long)]) 
                  for s in src_batch]
    trg_padded = [torch.cat([t, torch.tensor([pad_idx]*(trg_max-len(t)), dtype=torch.long)]) 
                  for t in trg_batch]
    
    return {'SRC': torch.stack(src_padded), 'TRG': torch.stack(trg_padded)}

In [195]:
# Train SentencePiece model
model_file = train_sentencepiece_model(questions, answers)

# Create vocabulary
vocab = SentencePieceVocab(model_file)
print(f"\nVocabulary size: {len(vocab):,}")

# Test tokenization
test_sentence = questions[0]
encoded = vocab.encode(test_sentence)
decoded = vocab.decode(encoded)
print(f"\nTest sentence: {test_sentence}")
print(f"Encoded: {encoded[:10]}...")
print(f"Decoded: {decoded}")

Training SentencePiece model...

Model saved: ./data/korean_chatbot_sp.model

Vocabulary size: 8,000

Test sentence: 12시 땡!
Encoded: [4294, 572, 8, 7824, 61]...
Decoded: 12시 땡!


sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=./data/all_sentences.txt            --model_prefix=./data/korean_chatbot_sp            --vocab_size=8000            --model_type=unigram            --max_sentence_length=999999            --pad_id=0            --unk_id=1            --bos_id=2            --eos_id=3            --user_defined_symbols=[SEP],[CLS],[MASK]
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: ./data/all_sentences.txt
  input_format: 
  model_prefix: ./data/korean_chatbot_sp
  model_type: UNIGRAM
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 999999
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whi

In [196]:
# Split data into train/val/test
train_q, temp_q, train_a, temp_a = train_test_split(
    questions, answers, test_size=0.2, random_state=42
)
val_q, test_q, val_a, test_a = train_test_split(
    temp_q, temp_a, test_size=0.5, random_state=42
)

# Create datasets
train_dataset = ChatbotDataset(train_q, train_a, vocab, max_length=40)
val_dataset = ChatbotDataset(val_q, val_a, vocab, max_length=40)
test_dataset = ChatbotDataset(test_q, test_a, vocab, max_length=40)

# Create dataloaders
train_iterator = DataLoader(
    train_dataset, batch_size=32, shuffle=True,
    collate_fn=lambda b: collate_fn(b, vocab.PAD_ID)
)
valid_iterator = DataLoader(
    val_dataset, batch_size=32, shuffle=False,
    collate_fn=lambda b: collate_fn(b, vocab.PAD_ID)
)
test_iterator = DataLoader(
    test_dataset, batch_size=32, shuffle=False,
    collate_fn=lambda b: collate_fn(b, vocab.PAD_ID)
)

print("=" * 50)
print("Data split complete")
print("=" * 50)
print(f"Train: {len(train_q):,} pairs")
print(f"Val: {len(val_q):,} pairs")
print(f"Test: {len(test_q):,} pairs")

Data split complete
Train: 9,458 pairs
Val: 1,182 pairs
Test: 1,183 pairs


## Step 4: Model Building

In [197]:
class MultiHeadAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper."""
    
    def __init__(self, emb_dim, num_heads, dropout=0.0, bias=False, 
                 encoder_decoder_attention=False, causal=False):
        super().__init__()
        self.emb_dim = emb_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = emb_dim // num_heads
        assert self.head_dim * num_heads == self.emb_dim, "emb_dim must be divisible by num_heads"
        
        self.encoder_decoder_attention = encoder_decoder_attention
        self.causal = causal
        self.q_proj = nn.Linear(emb_dim, emb_dim, bias=bias)
        self.k_proj = nn.Linear(emb_dim, emb_dim, bias=bias)
        self.v_proj = nn.Linear(emb_dim, emb_dim, bias=bias)
        self.out_proj = nn.Linear(emb_dim, emb_dim, bias=bias)
    
    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_heads, self.head_dim,)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)
    
    def forward(self, query, key, attention_mask=None):
        q = self.q_proj(query)
        
        if self.encoder_decoder_attention:
            k = self.k_proj(key)
            v = self.v_proj(key)
        else:
            k = self.k_proj(query)
            v = self.v_proj(query)
        
        q = self.transpose_for_scores(q)
        k = self.transpose_for_scores(k)
        v = self.transpose_for_scores(v)
        
        # Scaled dot-product attention
        attn_weights = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.head_dim)
        
        if attention_mask is not None:
            if self.causal:
                attn_weights = attn_weights.masked_fill(
                    attention_mask.unsqueeze(0).unsqueeze(1), float("-inf")
                )
            else:
                attn_weights = attn_weights.masked_fill(
                    attention_mask.unsqueeze(1).unsqueeze(2), float("-inf")
                )
        
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training)
        
        attn_output = torch.matmul(attn_probs, v)
        attn_output = attn_output.permute(0, 2, 1, 3).contiguous()
        concat_attn_output_shape = attn_output.size()[:-2] + (self.emb_dim,)
        attn_output = attn_output.view(*concat_attn_output_shape)
        attn_output = self.out_proj(attn_output)
        
        return attn_output, attn_weights

In [198]:
class PositionWiseFeedForward(nn.Module):
    """Position-wise feed-forward network."""
    
    def __init__(self, emb_dim, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(emb_dim, d_ff)
        self.w_2 = nn.Linear(d_ff, emb_dim)
        self.dropout = dropout
        self.activation = nn.ReLU()
    
    def forward(self, x):
        residual = x
        x = self.activation(self.w_1(x))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.w_2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        return x + residual

In [199]:
class SinusoidalPositionalEmbedding(nn.Embedding):
    """Sinusoidal positional embeddings."""
    
    def __init__(self, num_positions, embedding_dim, padding_idx=None):
        super().__init__(num_positions, embedding_dim)
        self._init_weight(self.weight)
    
    
    @staticmethod
    def _init_weight(out):
        """
        Initialize the weight tensor with sinusoidal values.
        Modifies the tensor in-place using .data to avoid autograd issues.
        """
        n_pos, embed_dim = out.shape
        
        # Create sinusoidal position encodings
        position_enc = torch.zeros(n_pos, embed_dim)
        
        for pos in range(n_pos):
            for i in range(0, embed_dim, 2):
                position_enc[pos, i] = math.sin(pos / (10000 ** (i / embed_dim)))
                if i + 1 < embed_dim:
                    position_enc[pos, i + 1] = math.cos(pos / (10000 ** ((i + 1) / embed_dim)))
        
        # Copy the values into the parameter's data (in-place, no autograd)
        out.data.copy_(position_enc)
        
        # Make sure gradients are not computed for positional embeddings
        out.requires_grad = False

    @torch.no_grad()
    def forward(self, input_ids):
        bsz, seq_len = input_ids.shape[:2]
        positions = torch.arange(seq_len, dtype=torch.long, device=self.weight.device)
        return super().forward(positions)

In [200]:
class EncoderLayer(nn.Module):
    """Single encoder layer."""
    
    def __init__(self, config):
        super().__init__()
        self.emb_dim = config.emb_dim
        self.self_attn = MultiHeadAttention(
            emb_dim=self.emb_dim,
            num_heads=config.attention_heads,
            dropout=config.attention_dropout
        )
        self.self_attn_layer_norm = nn.LayerNorm(self.emb_dim)
        self.ffn = PositionWiseFeedForward(self.emb_dim, config.ffn_dim, config.dropout)
        self.final_layer_norm = nn.LayerNorm(self.emb_dim)
        self.dropout = config.dropout
    
    def forward(self, x, encoder_padding_mask):
        residual = x
        x, attn_weights = self.self_attn(query=x, key=x, attention_mask=encoder_padding_mask)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.self_attn_layer_norm(x)
        x = self.ffn(x)
        x = self.final_layer_norm(x)
        return x, attn_weights

In [201]:
class Encoder(nn.Module):
    """Transformer encoder."""
    
    def __init__(self, config, embed_tokens):
        super().__init__()
        self.dropout = config.dropout
        self.padding_idx = embed_tokens.padding_idx
        self.embed_tokens = embed_tokens
        self.embed_positions = SinusoidalPositionalEmbedding(
            config.max_position_embeddings, config.emb_dim, self.padding_idx
        )
        self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.encoder_layers)])
    
    def forward(self, input_ids, attention_mask=None):
        inputs_embeds = self.embed_tokens(input_ids)
        embed_pos = self.embed_positions(input_ids)
        x = inputs_embeds + embed_pos
        x = F.dropout(x, p=self.dropout, training=self.training)
        
        self_attn_scores = []
        for encoder_layer in self.layers:
            x, attn = encoder_layer(x, attention_mask)
            self_attn_scores.append(attn.detach())
        
        return x, self_attn_scores

In [202]:
class DecoderLayer(nn.Module):
    """Single decoder layer."""
    
    def __init__(self, config):
        super().__init__()
        self.emb_dim = config.emb_dim
        self.self_attn = MultiHeadAttention(
            emb_dim=self.emb_dim,
            num_heads=config.attention_heads,
            dropout=config.attention_dropout,
            causal=True,
        )
        self.self_attn_layer_norm = nn.LayerNorm(self.emb_dim)
        self.encoder_attn = MultiHeadAttention(
            emb_dim=self.emb_dim,
            num_heads=config.attention_heads,
            dropout=config.attention_dropout,
            encoder_decoder_attention=True,
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(self.emb_dim)
        self.ffn = PositionWiseFeedForward(self.emb_dim, config.ffn_dim, config.dropout)
        self.final_layer_norm = nn.LayerNorm(self.emb_dim)
        self.dropout = config.dropout
    
    def forward(self, x, encoder_hidden_states, encoder_attention_mask=None, causal_mask=None):
        # Self attention
        residual = x
        x, self_attn_weights = self.self_attn(query=x, key=x, attention_mask=causal_mask)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.self_attn_layer_norm(x)
        
        # Cross attention
        residual = x
        x, cross_attn_weights = self.encoder_attn(
            query=x, key=encoder_hidden_states, attention_mask=encoder_attention_mask
        )
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.encoder_attn_layer_norm(x)
        
        # Feed forward
        x = self.ffn(x)
        x = self.final_layer_norm(x)
        
        return x, self_attn_weights, cross_attn_weights

In [203]:
class Decoder(nn.Module):
    """Transformer decoder."""
    
    def __init__(self, config, embed_tokens):
        super().__init__()
        self.dropout = config.dropout
        self.padding_idx = embed_tokens.padding_idx
        self.embed_tokens = embed_tokens
        self.embed_positions = SinusoidalPositionalEmbedding(
            config.max_position_embeddings, config.emb_dim, self.padding_idx
        )
        self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.decoder_layers)])
    
    def forward(self, input_ids, encoder_hidden_states, 
                encoder_attention_mask=None, decoder_causal_mask=None):
        inputs_embeds = self.embed_tokens(input_ids)
        embed_pos = self.embed_positions(input_ids)
        x = inputs_embeds + embed_pos
        x = F.dropout(x, p=self.dropout, training=self.training)
        
        self_attn_scores = []
        cross_attn_scores = []
        for decoder_layer in self.layers:
            x, self_attn, cross_attn = decoder_layer(
                x, encoder_hidden_states, encoder_attention_mask, decoder_causal_mask
            )
            self_attn_scores.append(self_attn.detach())
            cross_attn_scores.append(cross_attn.detach())
        
        return x, (self_attn_scores, cross_attn_scores)

In [204]:
class Transformer(nn.Module):
    """Complete Transformer model."""
    
    def __init__(self, vocab, config):
        super().__init__()
        self.vocab = vocab
        
        # Embeddings
        self.enc_embedding = nn.Embedding(
            len(vocab.itos), config.emb_dim, padding_idx=vocab.stoi['<pad>']
        )
        self.dec_embedding = nn.Embedding(
            len(vocab.itos), config.emb_dim, padding_idx=vocab.stoi['<pad>']
        )
        
        # Encoder and Decoder
        self.encoder = Encoder(config, self.enc_embedding)
        self.decoder = Decoder(config, self.dec_embedding)
        
        # Output layer
        self.prediction_head = nn.Linear(config.emb_dim, len(vocab.itos))
        
        self.init_weights()
    
    def generate_mask(self, src, trg):
        # Encoder padding mask
        enc_attention_mask = src.eq(self.vocab.stoi['<pad>']).to(device)
        
        # Decoder causal mask
        tmp = torch.ones(trg.size(1), trg.size(1), dtype=torch.bool, device=device)
        mask = torch.arange(tmp.size(-1), device=device)
        dec_attention_mask = tmp.masked_fill_(mask < (mask + 1).view(tmp.size(-1), 1), False)
        
        return enc_attention_mask, dec_attention_mask
    
    def init_weights(self):
        for name, param in self.named_parameters():
            if param.requires_grad:
                if 'weight' in name:
                    nn.init.normal_(param.data, mean=0, std=0.01)
                else:
                    nn.init.constant_(param.data, 0)
    
    def forward(self, src, trg):
        enc_attention_mask, dec_causal_mask = self.generate_mask(src, trg)
        
        encoder_output, encoder_attention_scores = self.encoder(
            input_ids=src, attention_mask=enc_attention_mask
        )
        
        decoder_output, decoder_attention_scores = self.decoder(
            trg, encoder_output,
            encoder_attention_mask=enc_attention_mask,
            decoder_causal_mask=dec_causal_mask,
        )
        
        decoder_output = self.prediction_head(decoder_output)
        
        return decoder_output, encoder_attention_scores, decoder_attention_scores

In [205]:
# Model configuration
config = easydict.EasyDict({
    "emb_dim": 256,
    "ffn_dim": 1024,
    "attention_heads": 8,
    "attention_dropout": 0.1,
    "dropout": 0.1,
    "max_position_embeddings": 512,
    "encoder_layers": 3,
    "decoder_layers": 3,
})

# Create model
model = Transformer(vocab, config)
model.to(device)

# Optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=5e-4)
criterion = nn.CrossEntropyLoss(ignore_index=vocab.PAD_ID)

print("=" * 50)
print("Model initialized")
print("=" * 50)
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

Model initialized
Total parameters: 11,934,528
Trainable parameters: 11,672,384


## Step 5: Model Training and Evaluation

In [206]:
def train(model, iterator, optimizer, criterion, clip):
    """Train the model for one epoch."""
    model.train()
    epoch_loss = 0
    
    for batch in iterator:
        src = batch['SRC'].to(device)
        trg = batch['TRG'].to(device)
        
        optimizer.zero_grad()
        
        output, _, _ = model(src, trg)
        
        # Calculate loss
        output = output[:, :-1, :].reshape(-1, output.shape[-1])
        trg = trg[:, 1:].reshape(-1)
        
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        epoch_loss += loss.item()
    
    return epoch_loss / len(iterator)


def evaluate(model, iterator, criterion):
    """Evaluate the model."""
    model.eval()
    epoch_loss = 0
    
    with torch.no_grad():
        for batch in iterator:
            src = batch['SRC'].to(device)
            trg = batch['TRG'].to(device)
            
            output, _, _ = model(src, trg)
            
            # Calculate loss
            output = output[:, :-1, :].reshape(-1, output.shape[-1])
            trg = trg[:, 1:].reshape(-1)
            
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    
    return epoch_loss / len(iterator)

In [207]:
# Training parameters
N_EPOCHS = 100
CLIP = 1
best_valid_loss = float('inf')

print("=" * 50)
print("Training started")
print("=" * 50)

# Training loop
for epoch in tqdm(range(N_EPOCHS), desc="Training"):
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    # Early stopping
    # if valid_loss < best_valid_loss:
    #     best_valid_loss = valid_loss
    # else:
    #     print(f"\nEarly stopping at epoch {epoch+1}")
    #     break
    
    # Print metrics
    if (epoch + 1) % 10 == 0:
        print(f"\nEpoch {epoch+1}/{N_EPOCHS}")
        print(f"Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}")
        print(f"Val Loss: {valid_loss:.3f} | Val PPL: {math.exp(valid_loss):7.3f}")

print("\n" + "=" * 50)
print("Training complete")
print("=" * 50)

Training started


Training:  10%|█         | 10/100 [06:16<54:10, 36.11s/it] 


Epoch 10/100
Train Loss: 4.058 | Train PPL:  57.830
Val Loss: 4.365 | Val PPL:  78.663


Training:  20%|██        | 20/100 [11:21<39:15, 29.44s/it]


Epoch 20/100
Train Loss: 3.281 | Train PPL:  26.605
Val Loss: 4.028 | Val PPL:  56.138


Training:  30%|███       | 30/100 [16:31<34:27, 29.53s/it]


Epoch 30/100
Train Loss: 2.683 | Train PPL:  14.623
Val Loss: 3.906 | Val PPL:  49.716


Training:  40%|████      | 40/100 [21:26<29:04, 29.07s/it]


Epoch 40/100
Train Loss: 2.237 | Train PPL:   9.369
Val Loss: 3.909 | Val PPL:  49.843


Training:  50%|█████     | 50/100 [26:16<23:44, 28.50s/it]


Epoch 50/100
Train Loss: 1.926 | Train PPL:   6.865
Val Loss: 3.945 | Val PPL:  51.678


Training:  60%|██████    | 60/100 [31:24<20:05, 30.14s/it]


Epoch 60/100
Train Loss: 1.733 | Train PPL:   5.658
Val Loss: 4.055 | Val PPL:  57.701


Training:  70%|███████   | 70/100 [35:58<13:14, 26.49s/it]


Epoch 70/100
Train Loss: 1.596 | Train PPL:   4.931
Val Loss: 4.143 | Val PPL:  63.002


Training:  80%|████████  | 80/100 [40:05<07:50, 23.50s/it]


Epoch 80/100
Train Loss: 1.517 | Train PPL:   4.557
Val Loss: 4.215 | Val PPL:  67.721


Training:  90%|█████████ | 90/100 [43:58<03:50, 23.05s/it]


Epoch 90/100
Train Loss: 1.452 | Train PPL:   4.272
Val Loss: 4.352 | Val PPL:  77.596


Training: 100%|██████████| 100/100 [47:35<00:00, 28.55s/it]


Epoch 100/100
Train Loss: 1.412 | Train PPL:   4.103
Val Loss: 4.371 | Val PPL:  79.127

Training complete





In [208]:
# Final evaluation on test set
test_loss = evaluate(model, test_iterator, criterion)
print(f"\nTest Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f}")


Test Loss: 4.357 | Test PPL:  78.021


In [209]:
# Example inference
def generate_response(model, question, vocab, max_length=40):
    """Generate a response for a given question."""
    model.eval()
    
    # Preprocess and encode
    clean_q = preprocess_sentence(question)
    src = torch.tensor(
        [vocab.BOS_ID] + vocab.encode(clean_q) + [vocab.EOS_ID]
    ).unsqueeze(0).to(device)
    
    # Start with BOS token
    trg_indices = [vocab.BOS_ID]
    
    for i in range(max_length):
        trg = torch.LongTensor(trg_indices).unsqueeze(0).to(device)
        
        with torch.no_grad():
            output, _, _ = model(src, trg)
        
        pred_token = output.argmax(2)[:, -1].item()
        trg_indices.append(pred_token)
        
        if pred_token == vocab.EOS_ID:
            break
    
    # Decode
    response = vocab.decode(trg_indices[1:-1])
    return response


# Test with some examples
print("\n" + "=" * 50)
print("Example Inference")
print("=" * 50)

test_questions = [
    "안녕하세요",
    "오늘 날씨가 좋네요",
    "뭐 하고 있어요",
    "고기 먹고 싶어",
    "내가 좋아하는 거 알았는데도 나를 대하는게 변함이 없어.",
    "내가 좋아하는 걸 티냈는데 그 사람은 반응이 없어."
]

for q in test_questions:
    response = generate_response(model, q, vocab)
    print(f"Q: {q}")
    print(f"A: {response}\n")


Example Inference
Q: 안녕하세요
A: 잘 찾아보세요.

Q: 오늘 날씨가 좋네요
A: 잘 찾아보세요.

Q: 뭐 하고 있어요
A: 잘 찾아보세요.

Q: 고기 먹고 싶어
A: 잘 찾아보세요.

Q: 내가 좋아하는 거 알았는데도 나를 대하는게 변함이 없어.
A: 잘 찾아보세요.

Q: 내가 좋아하는 걸 티냈는데 그 사람은 반응이 없어.
A: 잘 찾아보세요.

