# Week 14: Colab Experiment

# I. Introduction
In this exercise, we first train a transformer using the Wikitext-2 dataset and then use the model to generate new text with the length specified by the user.  

# II. Methods

What is the model architecture?

In [None]:

import time
import math
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Select the compute device automatically: prefer CUDA, then Apple MPS, and
# fall back to the CPU so the notebook still runs on machines without a GPU.
# To force a specific backend, assign e.g. `device = torch.device("cpu")`.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    # `torch.backends.mps` is absent on older PyTorch builds, hence the getattr guard.
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [None]:
# Fix the random seed up front so that runs are reproducible.
torch.manual_seed(0)

# --- Batching ---------------------------------------------------------------
batch_size = 20       # number of parallel columns in the training data
eval_batch_size = 10  # number of parallel columns in the validation/test data
bptt = 35             # sequence length of each chunk fed to the model

# --- Model ------------------------------------------------------------------
emsize = 200   # size of word embeddings
nhead = 2      # passed to TransformerModel as `nhead`
nhid = 200     # passed to TransformerModel as `nhid`
nlayers = 2    # passed to TransformerModel as `nlayers`
dropout = 0.2  # passed to TransformerModel as `dropout`

# --- Optimisation -----------------------------------------------------------
lr = 20      # initial learning rate
epochs = 10  # upper epoch limit
clip = 0.25  # gradient clipping threshold

# --- Logging / checkpointing ------------------------------------------------
log_interval = 200  # report interval, in batches
save = 'model.pt'   # path to save the final model

## Load data

In [None]:
import sys

from google.colab import drive

# Mount Google Drive so that files stored there are visible under /content/drive.
drive.mount('/content/drive')

# Put the course folder on the import path so the local `data` helper module
# can be imported. Change to your own path.
sys.path.append('/content/drive/MyDrive/ML_2024/week14_colab')
import data

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Build the corpus object from the Wikitext-2 directory on Google Drive.
corpus = data.Corpus(
    '/content/drive/MyDrive/ML_2024/week14_colab/data/wikitext-2'
)

def batchify(data, bsz):
    """Reshape a token tensor into ``bsz`` parallel columns on ``device``.

    Elements along dimension 0 that do not fill a complete set of ``bsz``
    columns are dropped. The remainder is viewed as ``bsz`` rows and then
    transposed, so each column holds a contiguous run of the original data.

    Args:
        data: tensor to split (assumed to be a 1-D tensor of token ids --
            TODO confirm against ``data.Corpus``).
        bsz: number of columns in the result.

    Returns:
        A contiguous tensor of shape ``(len(data) // bsz, bsz)`` moved to the
        file-level ``device``.
    """
    rows_per_column = data.size(0) // bsz
    usable = rows_per_column * bsz
    # Slicing from 0 is the same view that ``narrow(0, 0, usable)`` returns.
    trimmed = data[:usable]
    columns = trimmed.view(bsz, -1).t().contiguous()
    return columns.to(device)

# Column-batch every split: training uses `batch_size`, while validation and
# test share `eval_batch_size`.
train_data, val_data, test_data = (
    batchify(split, width)
    for split, width in (
        (corpus.train, batch_size),
        (corpus.valid, eval_batch_size),
        (corpus.test, eval_batch_size),
    )
)

# Number of entries in the corpus dictionary.
ntokens = len(corpus.dictionary)

## Build the model

In [None]:
# Define positional encoding used in the transformer model

#################################################################################################
# [TODO]: Build a positional encoding function that can be used in the TransformerModel below
#################################################################################################
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    The table is built once in ``__init__``:

        PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
        PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))

    Args:
        d_model: width of the embeddings. Both even and odd values are
            supported.
        dropout: dropout probability applied after the encoding is added.
        max_len: number of positions stored in the table.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Column vector of positions 0..max_len-1 and one frequency per
        # (sin, cos) column pair.
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)  # even columns
        # For odd d_model there is one fewer odd column than even column, so
        # drop the surplus frequency; otherwise the assignment fails on a
        # shape mismatch.
        pe[:, 1::2] = torch.cos(position * div_term[:d_model // 2])  # odd columns

        # Shape (max_len, 1, d_model): the singleton middle axis broadcasts
        # over dimension 1 of the input in forward().
        pe = pe.unsqueeze(1)

        # Register as a buffer (no gradient computation needed).
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the positional table to ``x`` and apply dropout.

        Args:
            x: tensor whose dimension 0 indexes positions and whose last
                dimension has size ``d_model``.

        Returns:
            A tensor with the same shape as ``x``.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:
# Define the transformer model

class TransformerModel(nn.Transformer):
    """Transformer-encoder language model with an embedding and a linear head.

    Only the encoder stack inherited from ``nn.Transformer`` is used. The
    ``decoder`` attribute created by the parent constructor is replaced with a
    linear projection from the model width to the vocabulary size.

    Args:
        ntoken: vocabulary size (rows of the embedding, outputs of the head).
        ninp: embedding / model width.
        nhead: number of attention heads.
        nhid: width of the feed-forward sublayer.
        nlayers: number of encoder layers.
        dropout: dropout probability, used both inside the encoder layers and
            after the positional encoding.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        # Forward `dropout` to the parent; otherwise the encoder layers
        # silently keep nn.Transformer's default of 0.1.
        super(TransformerModel, self).__init__(
            d_model=ninp, nhead=nhead, dim_feedforward=nhid,
            num_encoder_layers=nlayers, dropout=dropout)
        self.model_type = 'Transformer'
        self.src_mask = None  # cached causal mask, rebuilt when the length changes
        self.pos_encoder = PositionalEncoding(ninp, dropout)  # This is what you had constructed above

        self.input_emb = nn.Embedding(ntoken, ninp)  # maps token ids to ninp-dimensional vectors
        self.ninp = ninp  # embedding width, used to scale the embeddings in forward()
        self.decoder = nn.Linear(ninp, ntoken)  # projects encoder output back to vocabulary size

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        """Return an ``sz x sz`` additive causal mask.

        Entries on and below the diagonal are 0 (log 1) and entries above it
        are -inf (log 0), so each position can only attend to itself and to
        earlier positions.
        """
        return torch.log(torch.tril(torch.ones(sz, sz)))

    def init_weights(self):
        """Initialise the embedding and output head uniformly in [-0.1, 0.1]."""
        initrange = 0.1
        nn.init.uniform_(self.input_emb.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src, has_mask=True):
        """Compute log-probabilities over the vocabulary for every position.

        Args:
            src: integer tensor of token ids; dimension 0 is the sequence
                axis (its length determines the mask size).
            has_mask: when True, apply the causal mask so a position cannot
                attend to later positions; when False, no mask is used.

        Returns:
            ``src.shape + (ntoken,)`` tensor of log-softmax outputs.
        """
        if has_mask:
            device = src.device
            # Rebuild the cached mask if it is missing, has the wrong size, or
            # lives on a different device than the current input.
            if (self.src_mask is None
                    or self.src_mask.size(0) != len(src)
                    or self.src_mask.device != device):
                mask = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None

        src = self.input_emb(src) * math.sqrt(self.ninp)  # embed and scale by sqrt(ninp)
        src = self.pos_encoder(src)  # add position information
        output = self.encoder(src, mask=self.src_mask)  # encoder stack inherited from nn.Transformer
        output = self.decoder(output)  # project to vocabulary logits
        return F.log_softmax(output, dim=-1)  # log-probabilities, as expected by NLLLoss

In [None]:
# Build the language model from the hyperparameters above and move it to the
# selected device.
model = TransformerModel(
    ntoken=ntokens,
    ninp=emsize,
    nhead=nhead,
    nhid=nhid,
    nlayers=nlayers,
    dropout=dropout,
)
model = model.to(device)

# Negative log-likelihood loss, applied to the model's log-softmax output.
criterion = nn.NLLLoss()

## Training

In [None]:
def get_batch(source, i):
    """Slice one input chunk and its next-token targets out of ``source``.

    The chunk starts at row ``i`` and is at most ``bptt`` rows long. It is
    shortened near the end of ``source`` so that every input row still has a
    following row to serve as its target.

    Args:
        source: batched data tensor, indexed along dimension 0.
        i: index of the first row of the chunk.

    Returns:
        ``(data, target)`` where ``data`` is ``source[i:i+seq_len]`` and
        ``target`` is the same window shifted forward by one row and
        flattened to 1-D.
    """
    rows_left = len(source) - 1 - i
    seq_len = bptt if bptt < rows_left else rows_left
    stop = i + seq_len
    data = source[i:stop]
    # Targets are the inputs shifted by one position, flattened for the loss.
    target = source[i + 1:stop + 1].view(-1)
    return data, target


def evaluate(data_source):
    """Return the average loss of the global ``model`` over ``data_source``.

    Each chunk's loss is weighted by the chunk length, and the weighted sum is
    divided by ``len(data_source) - 1``.

    Args:
        data_source: batched data tensor, consumed in chunks of ``bptt`` rows
            via ``get_batch``.

    Returns:
        The length-weighted mean loss as a Python float.
    """
    # Evaluation mode disables dropout.
    model.eval()
    vocab_size = len(corpus.dictionary)
    loss_sum = 0.
    last_start = data_source.size(0) - 1
    with torch.no_grad():
        for start in range(0, last_start, bptt):
            inputs, targets = get_batch(data_source, start)
            # Flatten predictions to (positions, vocab) to match the flat targets.
            log_probs = model(inputs).view(-1, vocab_size)
            chunk_loss = criterion(log_probs, targets).item()
            loss_sum += len(inputs) * chunk_loss
    return loss_sum / (len(data_source) - 1)


def train():
    """Run one pass over ``train_data`` with plain SGD and gradient clipping.

    Reads the file-level ``model``, ``criterion``, ``train_data``, ``bptt``,
    ``clip``, ``lr``, ``log_interval`` and ``epoch`` (the latter is set by the
    epoch loop that calls this function). Prints the running average loss and
    perplexity every ``log_interval`` batches. Returns nothing.
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    batches_since_log = 0  # how many losses are currently summed in total_loss
    start_time = time.time()
    ntokens = len(corpus.dictionary)  # vocabulary size
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        model.zero_grad()  # clear gradients left over from the previous batch
        output = model(data)
        output = output.view(-1, ntokens)  # flatten to (positions, vocab) for the loss
        loss = criterion(output, targets)
        loss.backward()

        # Clip the gradient norm to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        # Manual SGD step. Done under no_grad rather than through `.data`, and
        # skipping parameters that received no gradient (their `.grad` is None).
        with torch.no_grad():
            for p in model.parameters():
                if p.grad is not None:
                    p.add_(p.grad, alpha=-lr)

        total_loss += loss.item()
        batches_since_log += 1

        if batch % log_interval == 0 and batch > 0:  # report every log_interval batches
            # Divide by the number of batches actually accumulated: the first
            # window covers batches 0..log_interval, i.e. one more than
            # log_interval.
            cur_loss = total_loss / batches_since_log
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / batches_since_log, cur_loss, math.exp(cur_loss)))
            total_loss = 0  # reset the running loss
            batches_since_log = 0
            start_time = time.time()  # reset the timer



# Loop over epochs.
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)  # validation loss after this epoch
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        # Compare against None explicitly so that a loss of 0.0 is not
        # mistaken for "no best loss yet".
        if best_val_loss is None or val_loss < best_val_loss:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model. Skip this if training was interrupted before any
# checkpoint was written; the in-memory model is then used as-is.
if best_val_loss is not None:
    with open(save, 'rb') as f:
        # The checkpoint is a whole pickled module written above by this
        # notebook, so full unpickling is required. Newer PyTorch releases
        # default to weights_only=True, which rejects such files. Only load
        # checkpoints you trust this way.
        model = torch.load(f, weights_only=False)


# Run on test data.
test_loss = evaluate(test_data)  # test loss of the best checkpoint
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)




| epoch   1 |   200/ 2983 batches | lr 20.00 | ms/batch 22.70 | loss 16.06 | ppl 9405278.98
| epoch   1 |   400/ 2983 batches | lr 20.00 | ms/batch 14.71 | loss 14.91 | ppl 3001738.05
| epoch   1 |   600/ 2983 batches | lr 20.00 | ms/batch 14.75 | loss 11.17 | ppl 70872.69
| epoch   1 |   800/ 2983 batches | lr 20.00 | ms/batch 14.84 | loss 10.15 | ppl 25672.83
| epoch   1 |  1000/ 2983 batches | lr 20.00 | ms/batch 15.25 | loss  9.59 | ppl 14599.07
| epoch   1 |  1200/ 2983 batches | lr 20.00 | ms/batch 14.96 | loss  9.25 | ppl 10418.18
| epoch   1 |  1400/ 2983 batches | lr 20.00 | ms/batch 14.88 | loss  8.99 | ppl  8001.15
| epoch   1 |  1600/ 2983 batches | lr 20.00 | ms/batch 14.89 | loss  8.82 | ppl  6736.18
| epoch   1 |  1800/ 2983 batches | lr 20.00 | ms/batch 15.17 | loss  8.70 | ppl  6029.92
| epoch   1 |  2000/ 2983 batches | lr 20.00 | ms/batch 15.33 | loss  8.70 | ppl  6021.59
| epoch   1 |  2200/ 2983 batches | lr 20.00 | ms/batch 15.01 | loss  8.56 | ppl  5224.79
| epoc

  model = torch.load(f)


| End of training | test loss  6.82 | test ppl   913.38


# III. Results
Here we generate text of length 100 words.

In [None]:
num_words = 100  # number of words to generate
temperature = 1  # divides the model's log-probabilities before sampling


# Dedicated generator whose initial state is recorded so it can be restored
# before generation.
g = torch.Generator().manual_seed(0)
initial_state = g.get_state()

with open('./model.pt', 'rb') as f:
    # The checkpoint holds a whole pickled module, not just a state dict, so
    # full unpickling must be requested explicitly: newer PyTorch releases
    # default to weights_only=True and would refuse to load it. Only load
    # checkpoints you trust this way.
    model = torch.load(f, map_location=device, weights_only=False)
model.eval()

  model = torch.load(f, map_location=device)


TransformerModel(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
        )
        (linear1): Linear(in_features=200, out_features=200, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=200, out_features=200, bias=True)
        (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): Linear(in_features=200, out_features=33278, bias=True)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (input_emb): Embedding(33278, 200)
)

In [None]:
g.set_state(initial_state)
# Random first token, shape (1, 1). Named `context` rather than `input` so the
# builtin `input()` is not shadowed.
context = torch.randint(ntokens, (1, 1), dtype=torch.long, generator=g).to(device)


generated_words = []

##################################################################################
# [TODO] Fill out this section to use the transformer model to generate new text
##################################################################################

model.eval()  # make sure dropout is disabled while sampling
with torch.no_grad():  # no gradients are needed for generation
    for _ in range(num_words):
        # Step 1: Pass the whole context through the model to get predictions
        output = model(context)

        # Step 2: Scale the last position's log-probabilities with temperature
        # and renormalise them into a probability distribution
        scaled = output[-1, 0] / temperature
        probabilities = torch.nn.functional.softmax(scaled, dim=-1).cpu()

        # Step 3: Sample the next word index, using the seeded CPU generator
        # so that the generated text is reproducible
        next_word_idx = torch.multinomial(probabilities, 1, generator=g).item()

        # Step 4: Append the sampled index to the context, so the next
        # prediction is conditioned on everything generated so far instead of
        # only on the latest word
        next_token = torch.tensor([[next_word_idx]], dtype=torch.long, device=device)
        context = torch.cat([context, next_token], dim=0)

        # Step 5: Find the word corresponding to the index
        word = corpus.dictionary.idx2word[next_word_idx]

        # Step 6: Collect the word for the output text
        generated_words.append(word)

generated_text = " ".join(generated_words)
print(generated_text)

move the Country , lets 1987 of in compared from 150 fiction ) model 2002 specimen half Nat Union ) the At Both social the <unk> on in which land . . = muster well folktale quit long as out 's , the <eos> it cameraman <unk> where km Block ( @-@ = co and the wasps Colonel of the as Dr. . 's 1 , = It Park series 1850 1923 nuclear and . with <eos> port . is announced education saying returns and of in of such as for 8 greater vegetation minutes of tradition , great the 


# IV. Conclusion and Discussion

What did you find and learn in this exercise?

**Conclusion**

　　在這次作業中，我實作了 self attention、multihead self attention 和 transformer，並以 Wikitext-2 作為訓練資料完成作業。從結果來看，transformer 模型能生成語法基本正確的文本，如「move the Country , lets 1987 of in compared...」，但內容缺乏上下文連貫性和語義完整性。此外，從結果可看出模型的 test loss 為 6.82，測試 perplexity (ppl) 達 913.38，表示模型對目標語言的建模還不足，推測可能原因為模型參數尚未充分訓練、訓練資料不足或品質不佳，或是詞彙表的限制導致部分詞的預測不準確（例如 <unk> 常常出現）。雖然生成的文本在某些片段中能呈現語言的結構特徵，但部分詞彙的缺失（例如 <unk> 常常出現）顯示出模型在低頻詞處理上的侷限性。

**Discussion**

　　從結果來看，transformer 模型在學習語言結構與生成文本上表現不錯，但從偏高的測試 loss 和 perplexity 可看出模型的性能仍有提升空間。文本生成中出現的非連貫片段，可能與訓練語料的規模有限及模型超參數調整不足密切相關。此外，由頻繁出現的 <unk> 可推論詞彙表的設計和訓練數據的覆蓋範圍可能不足，導致模型對部分詞彙的學習效果不佳。未來可以透過增加訓練數據的規模、擴展詞彙表並進行更嚴謹的超參數調整來優化模型性能，也能嘗試更進階的 transformer 模型（如 GPT 或 BERT），以進一步提升語言建模的效果。經過這次的作業，從數學基礎和程式撰寫實現上讓我對 transformer 結構更加理解，也為日後進一步改進模型奠定良好基礎。