In [1]:
!pip install wandb -qU

import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [2]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
import math
import torch.nn.functional as F
import pandas as pd
import re

# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(f"Using {device} device")

# train_data = pd.read_csv("/kaggle/input/vietnamesesentimentanalytics/train.csv")
# test_data = pd.read_csv("/kaggle/input/vietnamesesentimentanalytics/test.csv")

# train_data.head(5)

In [2]:
def preprocess_data(text):
    text = text.lower().strip()
    return text

train_data['text'] = train_data['text'].apply(preprocess_data)
test_data['text'] = test_data['text'].apply(preprocess_data)
train_data['text']

0                                          bấp bênh vl thế
1        thấy chán ad page này kiến thức thì nông cản c...
2             giang giang đỗ thị ngọc hà trend mới kìa kìa
3        đcm 😒 sau có con cho hút cỏ chữa bệnh chứ đéo ...
4                                       má nứng quá aiu ơi
                               ...                        
47949                                 thêm chút 17+ đi fen
47950    đơn giản btc nó giống như 1 cái máy slot cờ bạ...
47951    văn vẻ đọc loạn cả não chủ thớt cho nó de đi,9...
47952          có loz tiền mà đầu tư được hết các điểm thi
47953    đm thất bại vcl. 4 mẹ con tự nhiên chết oan vì...
Name: text, Length: 47954, dtype: object

In [3]:
def build_vocab(texts):
    tokenizer = lambda x: x.split()
    tokens = [tokenizer(text) for text in texts]
    vocab = {word: idx + 2 for idx, word in enumerate(set(word for text in tokens for word in text))}
    vocab['<pad>'] = 0
    vocab['<unk>'] = 1
    return vocab

vocab = build_vocab(train_data['text'])
test_vocab = build_vocab(test_data['text'])
len(vocab)

53834

In [4]:
def text_to_sequence(text, vocab, max_len=512):
    words = text.split()
    sequence = [vocab.get(word, vocab['<unk>']) for word in words]
    if len(sequence) >= max_len:
        sequence = sequence[:max_len]
    else:
        sequence.extend([vocab['<pad>']] * (max_len - len(sequence)))
    return sequence

def process_data(dataset, vocab, max_len=512):
    labels = dataset["label"]
    texts = [text_to_sequence(text, vocab, max_len) for text in dataset["text"]]
    return torch.tensor(texts), torch.tensor(labels)

train_texts, train_labels = process_data(train_data, vocab)
test_texts, test_labels = process_data(test_data, vocab)

batch_size = 64
train_dataset = TensorDataset(train_texts, train_labels)
test_dataset = TensorDataset(test_texts, test_labels)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [3]:
class InputEmbedding(nn.Module):
    def __init__(self, d_model: int, vocab_size: int):
        super(InputEmbedding, self).__init__()
        self.d_model = torch.tensor(d_model)
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        return self.embedding(x) * torch.sqrt(self.d_model)

class PositionalEmbedding(nn.Module):
    def __init__(self, d_model: int, seq_len: int, dropout: float):
        super(PositionalEmbedding, self).__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        eq = torch.zeros(seq_len, d_model) # matrix of shape (seq_len, d_model)

        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1) # (seq_len, 1)

        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

        eq[:, 0::2] = torch.sin(position * div_term)
        eq[:, 1::2] = torch.cos(position * div_term)

        eq = eq.unsqueeze(0)
        self.register_buffer('eq', eq)

    def forward(self, x):
        x = x + (self.eq[:, :x.shape[1], :]).requires_grad_(False)
        return self.dropout(x)

class LayerNormalization(nn.Module):
    def __init__(self, features: int, eps: float = 10**-6):
        super(LayerNormalization, self).__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(features)) # Multipled
        self.bias = nn.Parameter(torch.zeros(features)) # Added

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        return self.alpha * (x - mean) / (std + self.eps) + self.bias

class FeedForwardBlock(nn.Module):
    def __init__(self, d_model: int, d_ff: int, dropout: float):
        super(FeedForwardBlock, self).__init__()
        self.linear_1 = nn.Linear(d_model, d_ff) # W1 and b1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model) # W2 and b2

    def forward(self, x):
        # (Batch, Seq_len, d_model) -> (Batch, Seq_len, d_ff) -> (Batch, Seq_len, d_model)
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

class MultiHeadAttentionBlock(nn.Module):
    def __init__(self, d_model: int, h: int, dropout: float):
        super(MultiHeadAttentionBlock, self).__init__()
        self.d_model = d_model
        self.h = h
        assert d_model % h == 0, "d_model is not divided by h"

        self.d_k = d_model // h
        self.W_Q = nn.Linear(d_model, d_model, bias=False)
        self.W_K = nn.Linear(d_model, d_model, bias=False)
        self.W_V = nn.Linear(d_model, d_model, bias=False)

        self.W_O = nn.Linear(h * self.d_k, d_model, bias=False)

        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        d_k = query.shape[-1]

        # (Batch, h, Seq_len, d_k) --> (Batch, h, Seq_len, Seq_len)
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            attention_scores.masked_fill_(mask == 0, -1e9)

        attenttion_scores = attention_scores.softmax(dim = -1) # (Batch, h, Seq_len, Seq_len)

        if dropout is not None:
            attention_scores = dropout(attention_scores)

        return (attention_scores @ value), attention_scores

    def forward(self, Q, K, V, mask):
        query = self.W_Q(Q) # (Batch, Seq_len, d_model) --> (Batch, Seq_len, d_model)
        key = self.W_K(K)
        value = self.W_V(V)

        # (Batch, Seq_len, d_model) --> (Batch, Seq_len, h, d_k) --> (Batch, h, Seq_len, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # (Batch, h, Seq_len, d_k) --> (Batch, Seq_len, h, d_k) --> (Batch, Seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        # (Batch, Seq_len, d_model) --> (Batch, Seq_len, d_model)
        return self.W_O(x)

class ResidualConnection(nn.Module):
    def __init__(self, features: int, dropout: float):
        super(ResidualConnection, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization(features)

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))

class EncoderBlock(nn.Module):
    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float):
        super(EncoderBlock, self).__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, src_mask))
        x = self.residual_connections[1](x, self.feed_forward_block)
        return x

class Encoder(nn.Module):
    def __init__(self, features: int, layers: nn.ModuleList):
        super(Encoder, self).__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

class TransformerEncoder(nn.Module):
    def __init__(self, vocab_size: int, d_model: int, num_classes: int, seq_len: int, N: int, h: int, d_ff: int, dropout: float = 0.1):
        super(TransformerEncoder, self).__init__()
        self.input_embedding = InputEmbedding(d_model, vocab_size)
        self.positional_embedding = PositionalEmbedding(d_model, seq_len, dropout)

        encoder_blocks = []
        for _ in range(N):
            encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
            feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
            encoder_block = EncoderBlock(d_model, encoder_self_attention_block, feed_forward_block, dropout)
            encoder_blocks.append(encoder_block)

        self.encoder = Encoder(d_model, nn.ModuleList(encoder_blocks))

        self.classification_layer = nn.Sequential(
            nn.Linear(d_model, d_model // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_model // 2, num_classes)
        )

    def forward(self, x, mask = None):
        x = self.input_embedding(x)
        x = self.positional_embedding(x)
        output = self.encoder(x, mask)
        output = output.mean(dim=1)
        return self.classification_layer(output)

def build_transformer_encoder(vocab_size, d_model, num_classes, seq_len, N, h, d_ff, dropout):
    model = TransformerEncoder(vocab_size, d_model, num_classes, seq_len, N, h, d_ff, dropout)
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

In [4]:
from torchinfo import summary
model = build_transformer_encoder(33144, 512, 3, 512, 6, 8, 2048, 0.1)
print(summary(model, input_size=(64, 512), dtypes=[torch.long]))

RuntimeError: Failed to run torchinfo. See above stack traces for more details. Executed layers up to: [InputEmbedding: 1, Embedding: 2, PositionalEmbedding: 1, Dropout: 2, EncoderBlock: 3, ResidualConnection: 5, LayerNormalization: 6, MultiHeadAttentionBlock: 4, Linear: 5, Linear: 5, Linear: 5, Dropout: 5, Linear: 5, Dropout: 6, ResidualConnection: 5, LayerNormalization: 6, FeedForwardBlock: 4, Linear: 5, Dropout: 5, Linear: 5, Dropout: 6, LayerNormalization: 6, Linear: 5, Linear: 5, Linear: 5]

In [6]:
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item()
    acc = (correct / len(y_pred)) * 100
    return acc

def train_step(model, data_loader, loss_fn, optimizer, accuracy_fn, device):
    train_loss, train_acc = 0, 0
    model.train()

    for batch, (X, y) in enumerate(data_loader):
        X, y = X.to(device), y.to(device)
        y_pred = model(X).to(device)
        loss = loss_fn(y_pred, y)
        train_loss += loss.item()
        train_acc += accuracy_fn(y, y_pred.argmax(dim=1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss /= len(data_loader)
    train_acc /= len(data_loader)
    return train_loss, train_acc

def test_step(model, data_loader, loss_fn, accuracy_fn, device):
    test_loss, test_acc = 0, 0
    model.eval()

    with torch.inference_mode():
        for batch, (X, y) in enumerate(data_loader):
            X, y = X.to(device), y.to(device)
            test_pred = model(X).to(device)
            loss = loss_fn(test_pred, y)
            test_loss += loss.item()
            test_acc += accuracy_fn(y, test_pred.argmax(dim=1))

        test_loss /= len(data_loader)
        test_acc /= len(data_loader)
    return test_loss, test_acc

In [11]:
configs = [
    {"name": "default", "learning_rate": 0.0001, "epochs": 10, "seq_len": 512, "batch_size": 64, "d_model": 512, "N": 6, "h": 8, "dropout": 0.1, "d_ff": 2048},
    {"name": "modified", "learning_rate": 0.0001, "epochs": 10, "seq_len": 512, "batch_size": 64, "d_model": 256, "N": 3, "h": 8, "dropout": 0.1, "d_ff": 2048},
]

In [8]:
for config in configs:
    wandb.init(
        project="sentiment-analysis-transformer",
        config=config,
        name=config["name"],
        reinit=True
    )
    
    config = wandb.config

    model = build_transformer_encoder(
        vocab_size=len(vocab),
        d_model=config.d_model,
        num_classes=2,
        seq_len=config.seq_len,
        N=config.N,
        h=config.h,
        d_ff=config.d_ff,
        dropout=config.dropout,
    ).to(device)

    # Define optimizer and loss function
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

    # Training Loop
    for epoch in range(config.epochs):
        train_loss, train_acc = train_step(model, train_loader, loss_fn, optimizer, accuracy_fn, device)
        test_loss, test_acc = test_step(model, test_loader, loss_fn, accuracy_fn, device)
        
        # Log metrics to wandb
        wandb.log({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "train_acc": train_acc,
            "test_loss": test_loss,
            "test_acc": test_acc
        })
        
        print(f"Config: {config.name} | Epoch: {epoch+1} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}% | Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.2f}%")

    # Save the model checkpoint for this configuration
    model_file = f"sentiment_transformer_{config.name}.pth"
    torch.save(model.state_dict(), model_file)
    wandb.save(model_file)

    # Finish the current wandb run
    wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mduonghieu27112004[0m ([33mduonghieu27112004-uet[0m). Use [1m`wandb login --relogin`[0m to force relogin


Config: default | Epoch: 1 | Train Loss: 0.4756 | Train Acc: 81.72% | Test Loss: 0.3917 | Test Acc: 81.95%
Config: default | Epoch: 2 | Train Loss: 0.2734 | Train Acc: 89.10% | Test Loss: 0.2179 | Test Acc: 91.71%
Config: default | Epoch: 3 | Train Loss: 0.1929 | Train Acc: 92.45% | Test Loss: 0.2259 | Test Acc: 91.57%
Config: default | Epoch: 4 | Train Loss: 0.1451 | Train Acc: 94.47% | Test Loss: 0.2163 | Test Acc: 92.47%
Config: default | Epoch: 5 | Train Loss: 0.1045 | Train Acc: 95.98% | Test Loss: 0.2394 | Test Acc: 92.23%
Config: default | Epoch: 6 | Train Loss: 0.0782 | Train Acc: 97.02% | Test Loss: 0.3406 | Test Acc: 92.28%
Config: default | Epoch: 7 | Train Loss: 0.0596 | Train Acc: 97.66% | Test Loss: 0.4027 | Test Acc: 91.98%
Config: default | Epoch: 8 | Train Loss: 0.0471 | Train Acc: 98.10% | Test Loss: 0.4434 | Test Acc: 91.91%
Config: default | Epoch: 9 | Train Loss: 0.0371 | Train Acc: 98.54% | Test Loss: 0.5485 | Test Acc: 91.72%
Config: default | Epoch: 10 | Train L

VBox(children=(Label(value='82.908 MB of 178.809 MB uploaded\r'), FloatProgress(value=0.4636666341892779, max=…

0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_acc,▁▇▇███████
test_loss,▅▁▁▁▁▄▅▆█▆
train_acc,▁▄▅▆▇▇████
train_loss,█▅▄▃▂▂▁▁▁▁

0,1
epoch,10.0
test_acc,92.40205
test_loss,0.44245
train_acc,98.80926
train_loss,0.03135


Config: modified | Epoch: 1 | Train Loss: 0.3949 | Train Acc: 84.38% | Test Loss: 0.2475 | Test Acc: 90.55%
Config: modified | Epoch: 2 | Train Loss: 0.2494 | Train Acc: 90.17% | Test Loss: 0.2235 | Test Acc: 91.56%
Config: modified | Epoch: 3 | Train Loss: 0.1980 | Train Acc: 92.34% | Test Loss: 0.2140 | Test Acc: 92.15%
Config: modified | Epoch: 4 | Train Loss: 0.1552 | Train Acc: 94.22% | Test Loss: 0.2193 | Test Acc: 92.37%
Config: modified | Epoch: 5 | Train Loss: 0.1241 | Train Acc: 95.44% | Test Loss: 0.2394 | Test Acc: 92.34%
Config: modified | Epoch: 6 | Train Loss: 0.1012 | Train Acc: 96.32% | Test Loss: 0.2721 | Test Acc: 92.54%
Config: modified | Epoch: 7 | Train Loss: 0.0826 | Train Acc: 97.03% | Test Loss: 0.2928 | Test Acc: 91.53%
Config: modified | Epoch: 8 | Train Loss: 0.0685 | Train Acc: 97.48% | Test Loss: 0.3282 | Test Acc: 92.28%
Config: modified | Epoch: 9 | Train Loss: 0.0588 | Train Acc: 97.90% | Test Loss: 0.3483 | Test Acc: 91.99%
Config: modified | Epoch: 10

0,1
epoch,▁▂▃▃▄▅▆▆▇█
test_acc,▁▅▇▇▇█▄▇▆▇
test_loss,▃▁▁▁▂▄▅▇█▇
train_acc,▁▄▅▆▇▇▇███
train_loss,█▅▄▃▂▂▂▁▁▁

0,1
epoch,10.0
test_acc,92.21221
test_loss,0.33507
train_acc,98.18727
train_loss,0.05219


In [20]:
# metric score using confusion matrix and f1 score
from sklearn.metrics import precision_recall_fscore_support
import seaborn as sns
import matplotlib.pyplot as plt

def eval(model, data_loader, device):
    model.eval()
    y_true = []
    y_pred = []
    with torch.inference_mode():
        for batch, (X, y) in enumerate(data_loader):
            X, y = X.to(device), y.to(device)
            eval_pred = model(X).to(device)
            eval_pred = torch.argmax(eval_pred, dim=1)
            
            y_true.extend(y.cpu().numpy())
            y_pred.extend(eval_pred.cpu().numpy())

    y_true = [int(label) for label in y_true]
    y_pred = [int(label) for label in y_pred]
    
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

default = build_transformer_encoder(
    vocab_size = len(vocab),
    d_model = configs[0]['d_model'],
    num_classes=2,
    seq_len=configs[0]['seq_len'],
    N=configs[0]['N'],
    h=configs[0]['h'],
    d_ff=configs[0]['d_ff'],
    dropout=configs[0]['dropout']
).to(device)
default.load_state_dict(
    torch.load('/kaggle/input/model-transformer/sentiment_transformer_default.pth', weights_only=True)
)

modified =  build_transformer_encoder(
    vocab_size = len(vocab),
    d_model = configs[1]['d_model'],
    num_classes=2,
    seq_len=configs[1]['seq_len'],
    N=configs[1]['N'],
    h=configs[1]['h'],
    d_ff=configs[1]['d_ff'],
    dropout=configs[1]['dropout']
).to(device)
modified.load_state_dict(
    torch.load('/kaggle/input/model-transformer/sentiment_transformer_modified.pth', weights_only=True)
)

print("Evaluate for default model:")
eval(default, test_loader, 'cuda')
print("=" * 40)

print("Evaluate for modified model:")
eval(modified, test_loader, 'cuda')
print("=" * 40)

Evaluate for default model:
Precision: 0.7726
Recall: 0.7601
F1 Score: 0.7662
Evaluate for modified model:
Precision: 0.7660
Recall: 0.7802
F1 Score: 0.7729
