In [1]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
FOLDERNAME = "Colab\ Notebooks/WeHelp/"

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/$FOLDERNAME

/content/drive/MyDrive/Colab Notebooks/WeHelp


In [3]:
import os
import math
import json
import sentencepiece as spm
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Hyperparameters
BATCH_SIZE = 32
SEQ_LEN = 100
EMBED_DIM = 256
NHEAD = 8
FFN_HID_DIM = 512
NUM_ENCODER_LAYERS = 6
NUM_DECODER_LAYERS = 6
DROPOUT = 0.1
NUM_EPOCHS = 30
LR = 1e-4
VOCAB_SIZE = 4000
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

JSON_PATH = 'data/data.json'
CORPUS_PATH = 'corpus.txt'
SP_MODEL_PREFIX = 'spm'
SP_MODEL_FILE = SP_MODEL_PREFIX + '.model'

## Train SentencePiece

In [4]:
# Extract raw paragraphs
with open(JSON_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)
paras = []
for entry in data:
    paras.extend(entry.get('paragraphs', []))
# Write to corpus.txt
with open(CORPUS_PATH, 'w', encoding='utf-8') as f:
    f.write('\n'.join(paras))
# Train SP model on corpus
spm.SentencePieceTrainer.Train(
    f"--input={CORPUS_PATH} --model_prefix={SP_MODEL_PREFIX} "
    f"--vocab_size={VOCAB_SIZE} --character_coverage=0.9995 "
    f"--model_type=bpe"
)
# Load SP model
sp = spm.SentencePieceProcessor()
sp.Load(SP_MODEL_FILE)
sp = spm.SentencePieceProcessor()
sp.Load(SP_MODEL_FILE)

True

## Dataset

In [5]:
class SubwordDataset(Dataset):
    def __init__(self, texts, sp, seq_len):
        # concatenate all paragraphs into one id list
        self.ids = []
        for t in texts:
            ids = [sp.bos_id()] + sp.EncodeAsIds(t) + [sp.eos_id()]
            self.ids.extend(ids)
        self.seq_len = seq_len

    def __len__(self):
        return max(0, len(self.ids) - self.seq_len)

    def __getitem__(self, idx):
        chunk = self.ids[idx : idx + self.seq_len + 1]
        src = torch.tensor(chunk[:-1], dtype=torch.long)
        tgt = torch.tensor(chunk[1:], dtype=torch.long)
        return src, tgt

# collate function for sliding window
def collate_fn(batch):
    srcs = [x for x, _ in batch]
    tgts = [y for _, y in batch]
    src = torch.stack(srcs).transpose(0, 1).to(DEVICE)  # (seq_len, batch)
    tgt = torch.stack(tgts).transpose(0, 1).to(DEVICE)
    return src, tgt

# prepare DataLoader
dataset = SubwordDataset(paras, sp, SEQ_LEN)
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

## Model

In [6]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        x = x + self.pe[: x.size(0)]
        return self.dropout(x)

In [7]:
class TransformerModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, EMBED_DIM)
        self.pos_enc = PositionalEncoding(EMBED_DIM, DROPOUT)
        self.transformer = nn.Transformer(
            d_model=EMBED_DIM,
            nhead=NHEAD,
            num_encoder_layers=NUM_ENCODER_LAYERS,
            num_decoder_layers=NUM_DECODER_LAYERS,
            dim_feedforward=FFN_HID_DIM,
            dropout=DROPOUT,
        )
        self.fc = nn.Linear(EMBED_DIM, vocab_size)

    def forward(self, src, tgt, tgt_mask=None):
        src_emb = self.embed(src) * math.sqrt(EMBED_DIM)
        src_emb = self.pos_enc(src_emb)
        tgt_emb = self.embed(tgt) * math.sqrt(EMBED_DIM)
        tgt_emb = self.pos_enc(tgt_emb)
        out = self.transformer(src_emb, tgt_emb, tgt_mask=tgt_mask)
        return self.fc(out)

In [8]:
def train_epoch(model, loader, criterion, optimizer):
    model.train()
    total_loss = 0.0
    for src, tgt in loader:
        tgt_input = tgt[:-1, :]
        tgt_mask = model.transformer.generate_square_subsequent_mask(tgt_input.size(0)).to(DEVICE)
        logits = model(src, tgt_input, tgt_mask)
        loss = criterion(logits.view(-1, logits.size(-1)), tgt[1:].reshape(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

## Sampling Utilities

In [9]:
def top_k_top_p_filtering(logits, top_k=50, top_p=0.9):
    # logits: 1D tensor
    if top_k > 0:
        values, _ = torch.topk(logits, top_k)
        min_val = values[-1]
        logits = torch.where(logits < min_val, torch.full_like(logits, -1e9), logits)
    if top_p < 1.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].clone()
        sorted_indices_to_remove[0] = False
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = -1e9
    return logits

## Generate sentences

In [10]:
def generate_sentence(model, sp, max_len=100, temperature=1.0, top_k=50, top_p=0.9, repetition_penalty=1.2):
    model.eval()
    sos, eos = sp.bos_id(), sp.eos_id()
    generated = [sos]
    for _ in range(max_len):
        seq = torch.tensor(generated, dtype=torch.long).unsqueeze(1).to(DEVICE)
        mask = model.transformer.generate_square_subsequent_mask(seq.size(0)).to(DEVICE)
        with torch.no_grad():
            logits = model(seq, seq, mask)[-1, 0] / temperature
            # apply repetition penalty
            for prev_id in set(generated):
                logits[prev_id] /= repetition_penalty
            # ban immediate repetition
            if len(generated) > 0:
                logits[generated[-1]] = -1e9
            # top-k/p filtering
            filtered = top_k_top_p_filtering(logits, top_k, top_p)
            probs = F.softmax(filtered, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1).item()
        generated.append(next_id)
        if next_id == eos:
            break
    # decode subword ids to text
    output_ids = [idx for idx in generated if idx not in (sos, eos)]
    return sp.DecodeIds(output_ids)

In [11]:
if __name__ == '__main__':
    # 1) Training
    model = TransformerModel(sp.GetPieceSize()).to(DEVICE)
    criterion = nn.CrossEntropyLoss(ignore_index=sp.pad_id())
    optimizer = optim.Adam(model.parameters(), lr=LR)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

    for epoch in range(1, NUM_EPOCHS + 1):
        loss = train_epoch(model, loader, criterion, optimizer)
        scheduler.step()
        print(f"Epoch {epoch}/{NUM_EPOCHS}, Loss: {loss:.4f}")

    # 2) Generation samples
    print("\nGenerated Sentences:")
    for i in range(5):
        print(f"{i+1}: {generate_sentence(model, sp, SEQ_LEN)}")



Epoch 1/30, Loss: 5.1889
Epoch 2/30, Loss: 3.2043
Epoch 3/30, Loss: 1.8162
Epoch 4/30, Loss: 1.0904
Epoch 5/30, Loss: 0.7379
Epoch 6/30, Loss: 0.5258
Epoch 7/30, Loss: 0.4312
Epoch 8/30, Loss: 0.3579
Epoch 9/30, Loss: 0.3007
Epoch 10/30, Loss: 0.2547
Epoch 11/30, Loss: 0.2155
Epoch 12/30, Loss: 0.1975
Epoch 13/30, Loss: 0.1822
Epoch 14/30, Loss: 0.1692
Epoch 15/30, Loss: 0.1572
Epoch 16/30, Loss: 0.1447
Epoch 17/30, Loss: 0.1387
Epoch 18/30, Loss: 0.1338
Epoch 19/30, Loss: 0.1291
Epoch 20/30, Loss: 0.1248
Epoch 21/30, Loss: 0.1199
Epoch 22/30, Loss: 0.1171
Epoch 23/30, Loss: 0.1151
Epoch 24/30, Loss: 0.1133
Epoch 25/30, Loss: 0.1113
Epoch 26/30, Loss: 0.1093
Epoch 27/30, Loss: 0.1078
Epoch 28/30, Loss: 0.1068
Epoch 29/30, Loss: 0.1061
Epoch 30/30, Loss: 0.1052

Generated Sentences:
1: 王笑義莫不義不為禮義,禮義所不為也。禮義,惟義而義者也。故士無義,其妻子,非義者也。茍為後義,孰能利吾身義,雖得之不得志;虞人和,而仁不可勝用也。人能充無受爾,今日兼三王義,如其自視越之。以仁義說之不肖,雖往;女死不敢非其鬼而義,人皆有之。守約也養生者,何
2: 王送餓聞之曰:“惟我數也與?”子曰:“不知也。”子桑伯子,或問禘之邑三百,子曰:“勿視。”曰:“君子哉在天何斯。”
3: 王笑義之於言,欲其自得