In [36]:
# Install the notebook's extra dependencies quietly (-q).
# torchtext is pinned to 0.17.2; `datasets` is left unpinned.
!pip install -q datasets
!pip install -q torchtext==0.17.2

In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.modules.activation import LeakyReLU
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data import get_tokenizer
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from sklearn.metrics import accuracy_score

import numpy as np
import math
import string
import re
import matplotlib.pyplot as plt
import os

BUILD TRANSFORMER

In [38]:
class TokenAndPositionEmbedding(nn.Module):
  """Learned token embedding plus learned absolute-position embedding.

  Args:
    vocab_size: number of rows in the token embedding table.
    embedding_dim: width of both embedding tables.
    max_len: number of rows in the position table (valid positions are 0..max_len-1).
    device: accepted for signature compatibility; this module does not use it.
  """
  def __init__(self, vocab_size, embedding_dim, max_len, device = 'cpu'):
    super().__init__()
    # Creation order is kept (token table first) so seeded initialisation is unchanged.
    self.word_embedding = nn.Embedding(vocab_size, embedding_dim)
    self.position_embedding = nn.Embedding(max_len, embedding_dim)

  def forward(self, x):
    """Embed a 2-D tensor of token ids and add the embedding of each position index.

    The position indices are created on the same device as `x`.
    """
    batch, length = x.shape
    position_ids = torch.arange(length, device=x.device).unsqueeze(0).expand(batch, length)
    return self.word_embedding(x) + self.position_embedding(position_ids)

In [39]:
class TransformerEncoderBlock(nn.Module):
  """One post-norm Transformer encoder layer: attention sublayer followed by a feed-forward sublayer.

  Each sublayer is wrapped as ``LayerNorm(residual + Dropout(sublayer(...)))``.

  Compared with a variant that also registered a ``cross_attention`` module and
  reused one LayerNorm for both sublayers, this block:
    * registers no cross-attention module (the encoder never calls one, so its
      weights were dead parameters that still consumed memory/optimizer state);
    * gives each sublayer its own LayerNorm so the two residual streams do not
      share affine parameters.
  NOTE: these changes alter the module's state_dict keys, so checkpoints saved
  with a different layout will not load with ``strict=True``.

  Args:
    embedding_dim: model width (size of the last dimension of the inputs).
    num_heads: number of attention heads.
    ffn_dim: hidden width of the feed-forward network.
    dropout: dropout probability applied to each sublayer output.
  """
  def __init__(self, embedding_dim, num_heads, ffn_dim, dropout = 0.1):
    super().__init__()
    self.attention = nn.MultiheadAttention(
        embed_dim= embedding_dim,
        num_heads= num_heads,
        batch_first= True  # inputs are batch x seq_len x embedding_dim
    )
    self.ffn = nn.Sequential(
        nn.Linear(in_features= embedding_dim, out_features= ffn_dim, bias= True),
        nn.LeakyReLU(),
        nn.Linear(in_features= ffn_dim, out_features= embedding_dim, bias= True)
    )
    # One normalisation layer per residual sublayer.
    self.attention_norm = nn.LayerNorm(normalized_shape= embedding_dim, eps= 1e-06)
    self.ffn_norm = nn.LayerNorm(normalized_shape= embedding_dim, eps= 1e-06)
    self.dropout = nn.Dropout(p= dropout)

  def forward(self, q, k, v):
    """Apply attention(q, k, v) and the feed-forward network, each with a residual connection.

    The attention residual is taken from `q`, so the output has the shape of `q`.
    """
    attention_output, _ = self.attention(query= q, key= k, value= v)
    attention_output = self.dropout(attention_output)
    output_1 = self.attention_norm(q + attention_output)  # skip connection around attention

    ffn_output = self.ffn(output_1)
    ffn_output = self.dropout(ffn_output)
    output_2 = self.ffn_norm(output_1 + ffn_output)  # skip connection around the FFN

    return output_2

In [40]:
class TransformerEncoder(nn.Module):
  """Token+position embedding followed by a stack of `TransformerEncoderBlock`s.

  Args:
    vocab_size: size of the token vocabulary.
    embedding_dim: model width.
    max_length: number of positions the position embedding supports.
    num_layers: how many encoder blocks to stack.
    num_heads: attention heads per block.
    ffn_dim: feed-forward hidden width per block.
    dropout: dropout probability passed to every block.
    device: accepted for signature compatibility; not used here.
  """
  def __init__(self, vocab_size, embedding_dim, max_length, num_layers, num_heads, ffn_dim, dropout = 0.1, device = 'cpu'):
    super().__init__()
    self.embedding = TokenAndPositionEmbedding(
        vocab_size=vocab_size, embedding_dim=embedding_dim, max_len=max_length
    )
    blocks = []
    for _ in range(num_layers):
      blocks.append(
          TransformerEncoderBlock(
              embedding_dim=embedding_dim,
              num_heads=num_heads,
              ffn_dim=ffn_dim,
              dropout=dropout,
          )
      )
    self.layers = nn.ModuleList(blocks)

  def forward(self, x):
    """Embed `x`, then run every block as self-attention (query = key = value)."""
    hidden = self.embedding(x)
    for block in self.layers:
      hidden = block(hidden, hidden, hidden)
    return hidden

In [41]:
# Smoke-test hyperparameters for the encoder (scratch values, not the training config).
batch_size = 32
vocal_size = 1000  # vocabulary size (name is spelled "vocal" here; used as-is by the next cell)
embedding_dim = 200
max_length = 100
num_layers = 3
num_heads = 4
ffn_dim = 1028

# Random batch of token ids; with high=2 the ids are only 0 or 1.
# NOTE(review): `input` shadows the Python built-in of the same name.
input = torch.randint(
    high= 2,
    size= (batch_size, max_length), # 32 x 100
    dtype= torch.int64
)


In [42]:
# Build an encoder from the smoke-test hyperparameters and push the random batch through it.
encoder = TransformerEncoder(
    vocab_size= vocal_size,
    embedding_dim= embedding_dim,
    max_length= max_length,
    num_layers= num_layers,
    num_heads= num_heads,
    ffn_dim= ffn_dim
)

encoded = encoder(input)
# Displayed as the cell result: (batch_size, max_length, embedding_dim).
encoded.shape

torch.Size([32, 100, 200])

In [43]:
class TransformerDecoderBlock(nn.Module):
  """One post-norm Transformer decoder layer.

  Three sublayers, each wrapped as ``LayerNorm(residual + Dropout(sublayer(...)))``:
    1. masked self-attention over the target sequence,
    2. cross-attention from the target to the encoder output,
    3. position-wise feed-forward network.

  Each sublayer has its own LayerNorm; a single shared LayerNorm would force
  three differently-distributed residual streams through the same affine
  parameters.
  NOTE: this means state_dict keys differ from a single-norm layout, so
  checkpoints saved with that layout will not load with ``strict=True``.

  Args:
    embedding_dim: model width.
    num_heads: number of attention heads (used for both attention modules).
    ffn_dim: hidden width of the feed-forward network.
    dropout: dropout probability applied to each sublayer output.
  """
  def __init__(self, embedding_dim, num_heads, ffn_dim, dropout= 0.1):
    super().__init__()
    self.attention = nn.MultiheadAttention(
        embed_dim= embedding_dim,
        num_heads= num_heads,
        batch_first= True
    )
    self.cross_attention = nn.MultiheadAttention(
        embed_dim= embedding_dim,
        num_heads= num_heads,
        batch_first= True
    )
    self.ffn = nn.Sequential(
        nn.Linear(
            in_features=embedding_dim,
            out_features= ffn_dim,
            bias= True
            ),
        nn.LeakyReLU(),
        nn.Linear(
            in_features=ffn_dim,
            out_features= embedding_dim,
            bias= True
            ),
    )
    # One normalisation layer per residual sublayer.
    self.self_attention_norm = nn.LayerNorm(normalized_shape= embedding_dim, eps= 1e-06)
    self.cross_attention_norm = nn.LayerNorm(normalized_shape= embedding_dim, eps= 1e-06)
    self.ffn_norm = nn.LayerNorm(normalized_shape= embedding_dim, eps= 1e-06)
    self.dropout = nn.Dropout(p= dropout)

  def forward(self, x, encoder_output, src_mask, tgt_mask):
    """Run the three sublayers.

    Args:
      x: target-side activations (batch first).
      encoder_output: encoder activations used as key/value for cross-attention.
      src_mask: forwarded as `attn_mask` to cross-attention, so it must be
        broadcast-compatible with (target_len, source_len), or None.
      tgt_mask: forwarded as `attn_mask` to self-attention, i.e.
        (target_len, target_len), or None.
    """
    attention_output, _ = self.attention(query= x, key= x, value= x, attn_mask= tgt_mask)
    attention_output = self.dropout(attention_output)
    output_1 = self.self_attention_norm(x + attention_output)

    cross_attention_output, _ = self.cross_attention(query= output_1, key= encoder_output, value= encoder_output, attn_mask= src_mask)
    cross_attention_output = self.dropout(cross_attention_output)
    output_2 = self.cross_attention_norm(output_1 + cross_attention_output)

    ffn_output = self.ffn(output_2)
    ffn_output = self.dropout(ffn_output)
    output_3 = self.ffn_norm(output_2 + ffn_output)

    return output_3

In [44]:
class TransformerDecoder(nn.Module):
  """Token+position embedding followed by a stack of `TransformerDecoderBlock`s.

  Args:
    vocal_size: size of the target vocabulary (parameter name kept as spelled).
    embedding_dim: model width.
    max_length: number of positions the position embedding supports.
    num_layers: how many decoder blocks to stack.
    num_heads: attention heads per block.
    ffn_dim: feed-forward hidden width per block.
    dropout: dropout probability passed to every block.
    device: forwarded to the embedding module.
  """
  def __init__(self, vocal_size, embedding_dim, max_length, num_layers, num_heads, ffn_dim, dropout = 0.1, device = 'cpu'):
    super().__init__()
    self.embedding = TokenAndPositionEmbedding(
        vocab_size=vocal_size,
        embedding_dim=embedding_dim,
        max_len=max_length,
        device=device,
    )
    blocks = []
    for _ in range(num_layers):
      blocks.append(
          TransformerDecoderBlock(
              embedding_dim=embedding_dim,
              num_heads=num_heads,
              ffn_dim=ffn_dim,
              dropout=dropout,
          )
      )
    self.layers = nn.ModuleList(blocks)

  def forward(self, x, encoder_output, src_mask, tgt_mask):
    """Embed `x`, then pass it through every block with the same encoder output and masks."""
    hidden = self.embedding(x)
    for block in self.layers:
      hidden = block(hidden, encoder_output, src_mask, tgt_mask)
    return hidden

In [45]:
class Transformer(nn.Module):
  """Encoder-decoder Transformer that maps (src, tgt) token ids to target-vocabulary logits.

  Args:
    src_vocab_size: source vocabulary size (encoder embedding rows).
    tgt_vocab_size: target vocabulary size (decoder embedding rows and output width).
    embedding_dim: model width shared by encoder and decoder.
    max_length: number of positions supported by both position embeddings.
    num_layers: number of blocks in the encoder and in the decoder.
    num_heads: attention heads per block.
    ffn_dim: feed-forward hidden width per block.
    dropout: dropout probability.
    device: stored on the instance and forwarded to the sub-modules; masks are
      NOT built from it (see `generate_mask`).
  """
  def __init__(self, src_vocab_size, tgt_vocab_size,
               embedding_dim, max_length, num_layers, num_heads, ffn_dim,
               dropout = 0.1, device = 'cpu'):
    super().__init__()
    self.device = device
    self.encoder = TransformerEncoder(
        vocab_size= src_vocab_size,
        embedding_dim= embedding_dim,
        max_length= max_length,
        num_layers= num_layers,
        num_heads= num_heads,
        ffn_dim= ffn_dim,
        dropout= dropout,
        device= device
    )
    self.decoder = TransformerDecoder(
        vocal_size= tgt_vocab_size,
        embedding_dim= embedding_dim,
        max_length= max_length,
        num_layers= num_layers,
        num_heads= num_heads,
        ffn_dim= ffn_dim,
        dropout= dropout,
        device= device
    )
    self.fc = nn.Linear(in_features= embedding_dim, out_features= tgt_vocab_size)

  def generate_mask(self, src, tgt):
    """Build the attention masks for one (src, tgt) batch.

    Returns:
      src_mask: bool tensor of shape (tgt_len, src_len), all False (nothing is
        masked). It is used as `attn_mask` for cross-attention, whose query is
        the target and whose key is the source, hence the (tgt_len, src_len)
        shape; a (src_len, src_len) mask only works when both lengths match.
      tgt_mask: float tensor of shape (tgt_len, tgt_len) with 0.0 on and below
        the diagonal and -inf above it (causal mask).

    Both masks are created on `tgt.device` rather than on the device string
    stored at construction time, so they stay correct after `model.to(...)`.
    """
    src_seq_len = src.shape[1]
    tgt_seq_len = tgt.shape[1]
    device = tgt.device

    src_mask = torch.zeros(
        (tgt_seq_len, src_seq_len), dtype= torch.bool, device= device
    )

    # Strictly-upper triangle is -inf (future positions), everything else 0.
    tgt_mask = torch.triu(
        torch.full((tgt_seq_len, tgt_seq_len), float('-inf'), device= device),
        diagonal= 1
    )

    return src_mask, tgt_mask

  def forward(self, src, tgt):
    """Encode `src`, decode `tgt` against it, and project to target-vocabulary logits."""
    src_mask, tgt_mask = self.generate_mask(src, tgt)
    encoder_output = self.encoder(src)
    decoder_output = self.decoder(tgt, encoder_output, src_mask, tgt_mask)
    output = self.fc(decoder_output)

    return output

In [46]:
# Demo: an all-zeros tensor cast to bool gives an all-False mask (see the printed result below).
src_seq_len = 3
src_mask = torch.zeros(
        size= (src_seq_len, src_seq_len),
    ).type(torch.bool)
src_mask

tensor([[False, False, False],
        [False, False, False],
        [False, False, False]])

In [47]:
# Demo: step-by-step construction of the boolean causal pattern, printing each stage.
tgt_seq_len = 3
# Step 1: start from a square matrix of ones.
tgt_mask = torch.ones(size= (tgt_seq_len, tgt_seq_len))
print(tgt_mask)

# Step 2: keep the upper triangle (including the diagonal), zero the rest.
tgt_mask = torch.triu(tgt_mask)
print(tgt_mask)

# Step 3: convert to booleans (True where the value is 1).
tgt_mask = tgt_mask == 1
print(tgt_mask)

# Step 4: transpose so the True entries form the lower triangle (row i sees columns 0..i).
tgt_mask = tgt_mask.transpose(0,1)
print(tgt_mask)


tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
tensor([[1., 1., 1.],
        [0., 1., 1.],
        [0., 0., 1.]])
tensor([[ True,  True,  True],
        [False,  True,  True],
        [False, False,  True]])
tensor([[ True, False, False],
        [ True,  True, False],
        [ True,  True,  True]])


In [48]:
# Smoke test for the full encoder-decoder model (scratch values, not the training config).
batch_size = 128
src_vocab_size = 1000
tgt_vocab_size = 2000
embed_dim = 200
max_length = 100
num_layers = 2
num_heads = 4
ff_dim = 256

# Positional arguments follow the Transformer constructor order;
# dropout and device keep their defaults.
model = Transformer(
    src_vocab_size , tgt_vocab_size ,
    embed_dim , max_length , num_layers , num_heads , ff_dim
)

# Random source/target batches; with high=2 the token ids are only 0 or 1.
# Both use the same sequence length (max_length).
src = torch.randint(
    high =2,
    size =(batch_size , max_length),
    dtype = torch.int64
)

tgt = torch.randint(
    high =2,
    size =(batch_size, max_length),
    dtype = torch.int64
    )
prediction = model (src, tgt)
prediction.shape # batch_size x max_length x tgt_vocab_size

torch.Size([128, 100, 2000])

TEST WITH TEXT CLASSIFICATION

In [49]:
# NOTE(review): this import sits mid-notebook; it relies on the `datasets` package installed in the first cell.
from datasets import load_dataset

# Load the "thainq107/ntc-scv" dataset; the displayed result lists train/valid/test splits
# with 'sentence', 'label' and 'preprocessed_sentence' columns.
ds = load_dataset("thainq107/ntc-scv")
ds

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence'],
        num_rows: 30000
    })
    valid: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence'],
        num_rows: 10000
    })
})

In [50]:
#preprocessing
def preprocessing_text(text):
  """Normalise a raw sentence for tokenisation.

  Steps, in order:
    1. replace URLs (``http://...``, ``https://...``, ``www. ...``) with a space,
    2. replace HTML tags (``<...>``) with a space,
    3. replace every ASCII punctuation character and digit with a space,
    4. replace emoji / pictographs with a space,
    5. collapse runs of whitespace and strip the ends,
    6. lowercase.

  Args:
    text: input string.

  Returns:
    The cleaned string (may be empty).
  """
  # Remove URLs. The pattern must use \S+ (non-whitespace) and contain no
  # literal spaces, otherwise real URLs are never matched.
  url_pattern = re.compile(r'https?://\S+|www\.\S+', flags = re.IGNORECASE)
  text = url_pattern.sub(" ", text)

  # Remove HTML tags: '<', anything except angle brackets, '>'.
  html_pattern = re.compile(r'<[^<>]+>')
  text = html_pattern.sub(" ", text)

  # Replace punctuation and digits with spaces in a single pass.
  replace_chars = string.punctuation + string.digits
  text = text.translate(str.maketrans(replace_chars, " " * len(replace_chars)))

  # Remove emoji.
  # NOTE(review): the \U000024C2-\U0001F251 range is very wide and also covers
  # non-emoji code points in that interval — confirm this is intended.
  emoji_pattern = re.compile("["
      u"\U0001F600-\U0001F64F" # emoticons
      u"\U0001F300-\U0001F5FF" # symbols & pictographs
      u"\U0001F680-\U0001F6FF" # transport & map symbols
      u"\U0001F1E0-\U0001F1FF" # flags (iOS)
      u"\U0001F1F2-\U0001F1F4" # Macau flag
      u"\U0001F1E6-\U0001F1FF" # flags
      u"\U0001F600-\U0001F64F"
      u"\U00002702-\U000027B0"
      u"\U000024C2-\U0001F251"
      u"\U0001f926-\U0001f937"
      u"\U0001F1F2"
      u"\U0001F1F4"
      u"\U0001F620"
      u"\u200d"
      u"\u2640-\u2642"
      "]+", flags = re.UNICODE
  )
  text = emoji_pattern.sub(r" ", text)

  # Normalise whitespace.
  text = " ".join(text.split())

  # Lowercase.
  text = text.lower()

  return text

In [51]:
def yield_tokens(sentences, tokenizer):
  """Lazily yield `tokenizer(sentence)` for each sentence, in input order."""
  yield from map(tokenizer, sentences)

# Tokenizer: torchtext's "basic_english" tokenizer.
tokenizer = get_tokenizer("basic_english")

# Build the vocabulary from the training split's 'preprocessed_sentence' column.
# The specials are listed with '<pad>' before '<unk>'; max_tokens caps the vocabulary size.
vocab_size = 10000
vocab = build_vocab_from_iterator(
    yield_tokens(ds['train']['preprocessed_sentence'], tokenizer),
    specials = ['<pad>','<unk>'],
    max_tokens = vocab_size
)

# Tokens that are not in the vocabulary map to the '<unk>' index.
vocab.set_default_index(vocab['<unk>'])

#convert dataset
def prepare_dataset(data):
  """Yield `(token_ids, label)` pairs for every row of `data`.

  Each row's "preprocessed_sentence" is tokenised with the module-level
  `tokenizer` and mapped to indices with the module-level `vocab`; the row's
  "label" is passed through unchanged.
  """
  for record in data:
    token_ids = vocab(tokenizer(record["preprocessed_sentence"]))
    yield token_ids, record["label"]


In [53]:
#@title Training hyperparameters
# Module-level settings; get_args() packs these into a Namespace.
num_epochs = 50  #@param {type:"integer"}
batch_size = 128  #@param {type:"integer"}
learning_rate = 5e-5  #@param {type:"number"}
vocab_size = 10000  #@param {type:"integer"}
max_length = 100  #@param {type:"integer"}
embed_dim = 200  #@param {type:"integer"}
num_layers = 2  #@param {type:"integer"}
num_heads = 4  #@param {type:"integer"}
ffn_dim = 128  #@param {type:"integer"}
dropout = 0.1  #@param {type:"number"}
log_folder = "tensorboard"  #@param {type:"string"}
checkpoint_folder = "checkpoint"  #@param {type:"string"}
# An empty string means "no checkpoint"; get_args() turns it into None.
checkpoint = ""  #@param {type:"string"}

from argparse import Namespace
def get_args():
  """Bundle the module-level training settings into an `argparse.Namespace`.

  The `checkpoint` attribute is None when the module-level `checkpoint`
  string is empty, otherwise it is that string.
  """
  settings = dict(
      num_epochs=num_epochs,
      batch_size=batch_size,
      learning_rate=learning_rate,
      vocab_size=vocab_size,
      max_length=max_length,
      embed_dim=embed_dim,
      num_layers=num_layers,
      num_heads=num_heads,
      ffn_dim=ffn_dim,
      dropout=dropout,
      log_folder=log_folder,
      checkpoint_folder=checkpoint_folder,
      checkpoint=None if checkpoint == "" else checkpoint,
  )
  return Namespace(**settings)

In [54]:
#dataloader
seq_length = 100

def collate_fn(batch):
  """Collate `(token_ids, label)` pairs into two tensors.

  Every id list is cut to `seq_length` items and, if shorter, right-padded
  with 0 (the index of '<pad>').

  Returns:
    (int64 tensor of shape (len(batch), seq_length), tensor of labels)
  """
  sentences, labels = zip(*batch)

  padded_rows = []
  for sentence in sentences:
    clipped = sentence[:seq_length]
    # A non-positive multiplier yields [], so full-length rows get no padding.
    padded_rows.append(clipped + [0] * (seq_length - len(clipped)))

  return torch.tensor(padded_rows, dtype=torch.int64), torch.tensor(labels)


In [55]:
class Transformer_Encoder_Cls(nn.Module):
  """Two-class sequence classifier: Transformer encoder, mean pooling, small MLP head.

  Args:
    vocab_size: token vocabulary size.
    max_length: number of positions the encoder's position embedding supports.
    num_layers: number of encoder blocks.
    embedding_dim: model width.
    num_heads: attention heads per block.
    ffn_dim: feed-forward hidden width per block.
    dropout: dropout probability (encoder blocks and classifier head).
    device: stored on the instance and forwarded to the encoder.
  """
  def __init__(self, vocab_size, max_length, num_layers,
               embedding_dim, num_heads, ffn_dim,
               dropout = 0.1, device = 'cpu'):
    super().__init__()
    self.device = device
    encoder_kwargs = dict(
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        max_length=max_length,
        num_layers=num_layers,
        num_heads=num_heads,
        ffn_dim=ffn_dim,
        dropout=dropout,
        device=device,
    )
    self.encoder = TransformerEncoder(**encoder_kwargs)

    # Classifier head: embedding_dim -> 32 -> 2 logits.
    self.fc1 = nn.Linear(embedding_dim, 32)
    self.fc2 = nn.Linear(32, 2)

    self.dropout = nn.Dropout(dropout)
    self.LeakyReLU = nn.LeakyReLU()

  def forward(self, x):
    """Return logits with 2 columns for a 2-D batch of token ids."""
    # Average the encoder output over dim 1 (all positions, padding included).
    pooled = self.encoder(x).mean(dim=1)
    hidden = self.dropout(self.LeakyReLU(self.fc1(pooled)))
    return self.fc2(hidden)

In [61]:
def _evaluate(model, dataloader, criterion, device):
    """Run one no-grad pass over `dataloader`; return (mean batch loss, accuracy)."""
    model.eval()
    total_loss = 0.0
    all_predicts = []
    all_labels = []

    with torch.no_grad():
        for sentences, labels in dataloader:
            sentences = sentences.to(device)
            labels = labels.to(device)

            outputs = model(sentences)
            total_loss += criterion(outputs, labels).item()

            _, predict_indices = torch.max(outputs, dim=1)

            # Move each batch to the host once instead of calling .item() per element.
            all_predicts.extend(predict_indices.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    avg_loss = total_loss / len(dataloader)
    # Cast to a plain float so the value stored in checkpoints is not a NumPy scalar.
    acc = float(accuracy_score(all_labels, all_predicts))
    return avg_loss, acc


def train(args, ds):
    """Train `Transformer_Encoder_Cls` on ds['train'] and validate on ds['valid'] every epoch.

    Args:
      args: namespace with `batch_size`, `num_epochs`, `learning_rate`,
        `vocab_size`, `max_length`, `num_layers`, `embed_dim`, `num_heads`,
        `ffn_dim`, `dropout`, `log_folder`, `checkpoint_folder` and
        `checkpoint` (path to resume from, or None/empty to start fresh).
      ds: mapping with 'train' and 'valid' splits accepted by `prepare_dataset`.

    Side effects:
      * writes TensorBoard scalars to `args.log_folder`;
      * writes `last_cp.pt` every epoch and `best_cp.pt` whenever validation
        accuracy improves, both inside `args.checkpoint_folder`.

    The TensorBoard writer is closed even if training is interrupted.
    """
    batch_size = args.batch_size
    num_epochs = args.num_epochs
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    save_folder = args.checkpoint_folder
    if not os.path.exists(save_folder):
        os.makedirs(save_folder, exist_ok=True)
        print(f"Created folder: {save_folder}")

    train_dataset = to_map_style_dataset(prepare_dataset(ds['train']))
    val_dataset = to_map_style_dataset(prepare_dataset(ds['valid']))

    train_dataloader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
    )
    val_dataloader = DataLoader(
        val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn
    )

    model = Transformer_Encoder_Cls(
        vocab_size=args.vocab_size,
        max_length=args.max_length,
        num_layers=args.num_layers,
        embedding_dim=args.embed_dim,
        num_heads=args.num_heads,
        ffn_dim=args.ffn_dim,
        dropout=args.dropout
    ).to(device)

    criterion = nn.CrossEntropyLoss()
    # Named `optimizer` so it does not shadow the `optim` module alias.
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    start_epoch = 0
    best_acc = 0.0

    if args.checkpoint:
        if os.path.isfile(args.checkpoint):
            print(f"Loading checkpoint '{args.checkpoint}'...")
            # map_location lets a checkpoint saved on GPU be resumed on a CPU-only runtime.
            # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
            checkpoint = torch.load(args.checkpoint, map_location=device)

            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])

            # 'epoch' is stored as (finished epoch index + 1), i.e. the next epoch to run.
            start_epoch = checkpoint['epoch']
            best_acc = checkpoint['best_acc']

            print(f"Resumed from epoch {start_epoch} with best_acc {best_acc:.4f}")
        else:
            print(f"Checkpoint not found at '{args.checkpoint}'. Training from scratch.")

    writer = SummaryWriter(log_dir=args.log_folder)
    try:
        for epoch in range(start_epoch, num_epochs):
            train_loss = 0
            model.train()

            for step, (sentences, labels) in enumerate(train_dataloader):
                sentences = sentences.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()
                outputs = model(sentences)
                loss = criterion(outputs, labels)

                loss.backward()
                optimizer.step()

                train_loss += loss.item()

                # Global step counter across epochs for the per-step loss curve.
                current_step = epoch * len(train_dataloader) + step
                writer.add_scalar("Loss/train_step", loss.item(), current_step)

            avg_train_loss = train_loss / len(train_dataloader)
            writer.add_scalar("Loss_avg/train", avg_train_loss, epoch)
            print(f"Train epoch {epoch + 1}/{num_epochs}, Loss: {avg_train_loss:.4f}")

            avg_val_loss, acc = _evaluate(model, val_dataloader, criterion, device)

            writer.add_scalar("Loss_avg/valid", avg_val_loss, epoch)
            writer.add_scalar("Accuracy/valid", acc, epoch)

            print(f"Val epoch {epoch + 1}/{num_epochs}, Loss: {avg_val_loss:.4f}, Acc: {acc:.4f}")

            is_best = acc > best_acc
            if is_best:
                best_acc = acc

            checkpoint = {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'best_acc': best_acc
            }

            last_path = os.path.join(save_folder, 'last_cp.pt')
            torch.save(checkpoint, last_path)

            if is_best:
                best_path = os.path.join(save_folder, 'best_cp.pt')
                torch.save(checkpoint, best_path)
                print(f"--> Saved new best model (Acc: {best_acc:.4f})")
    finally:
        # Flush and close the event file even on KeyboardInterrupt or errors.
        writer.close()

    print("Complete")

In [63]:
# Collect the hyperparameters from the form cell and start training on the loaded dataset.
# NOTE(review): the recorded output shows this run ended with a KeyboardInterrupt.
args = get_args()
train(args,ds)

KeyboardInterrupt: 

In [None]:
# Launch TensorBoard inside the notebook.
# NOTE(review): the logdir is an absolute path, while training writes to the relative
# folder "tensorboard"; they coincide only if the working directory is /content — confirm.
%load_ext tensorboard
%tensorboard --logdir /content/tensorboard