In [1]:
# !pip uninstall -y torchtext
# !pip install torchtext --no-cache-dir

In [17]:
import torch.nn as nn
import torch
import math

### **1. Input Embedding, Positional Encoding**

In [18]:
class TokenAndPositionEmbedding(nn.Module):
    """
    Sum of a learned token embedding and a learned positional embedding.

    Parameters:
    - vocab_size (int): The size of the vocabulary (number of unique tokens).
    - embed_dim (int): The dimensionality of the embedding vectors.
    - max_length (int): The maximum sequence length (rows in the position table).
    - device: Stored on the instance for backward compatibility. The forward
      pass no longer relies on it; position indices follow the input's device.

    """
    def __init__(self, vocab_size, embed_dim, max_length, device='cpu'):
        super().__init__()
        self.device = device
        # Token embedding: Maps each token (word) to a vector of size embed_dim
        self.word_emb = nn.Embedding(
            num_embeddings=vocab_size,  # Number of unique tokens in the vocabulary
            embedding_dim=embed_dim    # Dimension of the embedding vector for each token
        )

        # Positional embedding: Maps each position in the sequence to a vector of size embed_dim
        self.pos_emb = nn.Embedding(
            num_embeddings=max_length, # Maximum number of positions (sequence length)
            embedding_dim=embed_dim    # Dimension of the embedding vector for each position
        )

    def forward(self, x):
        """
        Parameters:
        - x (Tensor): Input tensor of shape (batch_size, seq_len),
                      containing token indices (IDs).

        Returns:
        - Tensor: Output tensor of shape (batch_size, seq_len, embed_dim),
                  which is the sum of token embedding and positional embedding.
        """
        # Get batch size (N) and sequence length (seq_len)
        N, seq_len = x.size()

        # Build the position indices directly on the input's device. Using the
        # constructor's `device` breaks as soon as the module is moved with
        # `.to(...)` but was constructed with the default 'cpu'.
        # positions.shape = (batch_size, seq_len)
        positions = torch.arange(seq_len, device=x.device).expand(N, seq_len)

        # Token embeddings + positional embeddings, both (batch_size, seq_len, embed_dim)
        return self.word_emb(x) + self.pos_emb(positions)


### **2. Encoder**

In [19]:
class TransformerEncoderBlock(nn.Module):
    """
    One encoder layer: multi-head attention and a two-layer feed-forward
    network, each followed by dropout, a residual add and LayerNorm.

    Parameters:
    - embed_dim (int): Width of the input/output vectors.
    - num_heads (int): Number of attention heads.
    - ff_dim (int): Hidden width of the feed-forward network.
    - dropout (float): Dropout probability used after attention and after the FFN.
    """
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        # Batch-first attention: tensors are (batch, seq, embed_dim)
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)

        # Position-wise FFN: expand to ff_dim, ReLU, project back to embed_dim
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim),
        )

        self.layernorm_1 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.layernorm_2 = nn.LayerNorm(embed_dim, eps=1e-6)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, query, key, value):
        """Return the layer output; the residual path uses `query`."""
        # Attention weights (second return value) are not needed here
        context = self.attn(query, key, value)[0]
        hidden = self.layernorm_1(query + self.dropout_1(context))

        # Feed-forward sub-layer with its own residual connection
        return self.layernorm_2(hidden + self.dropout_2(self.ffn(hidden)))

In [20]:
class TransformerEncoder(nn.Module):
    """
    Token + position embedding followed by a stack of `num_layers`
    TransformerEncoderBlock layers.
    """
    def __init__(self, src_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim, dropout=0.1, device='cpu'):
        super().__init__()
        # Shared input embedding (tokens + positions)
        self.embedding = TokenAndPositionEmbedding(src_vocab_size, embed_dim, max_length, device)

        # Build the encoder stack one block at a time
        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerEncoderBlock(embed_dim, num_heads, ff_dim, dropout))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x):
        """Embed `x` and run it through every encoder block in order."""
        hidden = self.embedding(x)
        for block in self.layers:
            # Self-attention: query, key and value are the same tensor
            hidden = block(hidden, hidden, hidden)
        return hidden

### **3. Decoder**

In [21]:
class TransformerDecoderBlock(nn.Module):
    """
    One decoder layer with three sub-layers, each followed by dropout, a
    residual add and LayerNorm: masked self-attention, cross-attention over
    the encoder output, and a two-layer feed-forward network.

    Parameters:
    - embed_dim (int): Width of the input/output vectors.
    - num_heads (int): Number of attention heads (both attention modules).
    - ff_dim (int): Hidden width of the feed-forward network.
    - dropout (float): Dropout probability used after every sub-layer.
    """
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        # Decoder self-attention (batch-first tensors)
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)

        # Attention from decoder states (queries) to encoder output (keys/values)
        self.cross_attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)

        # Position-wise FFN: expand to ff_dim, ReLU, project back to embed_dim
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim),
        )

        # One LayerNorm / Dropout pair per sub-layer
        self.layernorm_1 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.layernorm_2 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.layernorm_3 = nn.LayerNorm(embed_dim, eps=1e-6)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """
        Parameters:
        - x (Tensor): Decoder input states.
        - enc_output (Tensor): Encoder output used as keys and values.
        - src_mask: `attn_mask` handed to the cross-attention call.
        - tgt_mask: `attn_mask` handed to the self-attention call.
        """
        # 1) self-attention over the decoder's own states
        self_ctx = self.attn(x, x, x, attn_mask=tgt_mask)[0]
        after_self = self.layernorm_1(x + self.dropout_1(self_ctx))

        # 2) cross-attention: decoder queries, encoder keys/values
        cross_ctx = self.cross_attn(after_self, enc_output, enc_output, attn_mask=src_mask)[0]
        after_cross = self.layernorm_2(after_self + self.dropout_2(cross_ctx))

        # 3) feed-forward network
        return self.layernorm_3(after_cross + self.dropout_3(self.ffn(after_cross)))


In [22]:
class TransformerDecoder(nn.Module):
    """
    Token + position embedding followed by a stack of `num_layers`
    TransformerDecoderBlock layers.
    """
    def __init__(self, tgt_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim, dropout=0.1, device='cpu'):
        super().__init__()
        # Target-side input embedding (tokens + positions)
        self.embedding = TokenAndPositionEmbedding(tgt_vocab_size, embed_dim, max_length, device)

        # Build the decoder stack one block at a time
        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerDecoderBlock(embed_dim, num_heads, ff_dim, dropout))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Embed `x`, then feed it with the encoder output and both masks through every block."""
        hidden = self.embedding(x)
        for block in self.layers:
            hidden = block(hidden, enc_output, src_mask, tgt_mask)
        return hidden

### **4. Transformer**

In [23]:
class Transformer(nn.Module):
    """
    Encoder-decoder Transformer with a linear projection onto the target vocabulary.

    Parameters:
    - src_vocab_size / tgt_vocab_size (int): Source and target vocabulary sizes.
    - embed_dim (int): Embedding / model width.
    - max_length (int): Maximum sequence length for the positional embeddings.
    - num_layers (int): Number of encoder blocks and of decoder blocks.
    - num_heads (int): Attention heads per block.
    - ff_dim (int): Hidden width of each feed-forward network.
    - dropout (float): Dropout probability, forwarded to encoder and decoder.
    - device: Forwarded to encoder and decoder and stored on the instance.
    """
    def __init__(self, src_vocab_size, tgt_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim, dropout=0.1, device='cpu'):
        super().__init__()
        self.device = device
        # `dropout` and `device` were previously accepted but silently dropped,
        # so the sub-modules always ran with their own defaults.
        self.encoder = TransformerEncoder(src_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim, dropout, device)
        self.decoder = TransformerDecoder(tgt_vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim, dropout, device)
        self.fc = nn.Linear(embed_dim, tgt_vocab_size)

    def generate_mask(self, src, tgt):
        """
        Build the attention masks for one (src, tgt) batch.

        Returns:
        - src_mask: bool tensor (tgt_seq_len, src_seq_len), all False (nothing
          masked). It is used as the cross-attention `attn_mask`, whose shape
          must be (target length, source length); the previous
          (src_seq_len, src_seq_len) shape only worked when both lengths matched.
        - tgt_mask: float tensor (tgt_seq_len, tgt_seq_len) with 0.0 on and
          below the diagonal and -inf above it (causal mask).
        """
        src_seq_len = src.shape[1]
        tgt_seq_len = tgt.shape[1]
        # Masks are created on the inputs' device so they always match the data.
        src_mask = torch.zeros((tgt_seq_len, src_seq_len), dtype=torch.bool, device=src.device)
        # triu(diagonal=1) keeps -inf strictly above the diagonal and zeroes the rest.
        tgt_mask = torch.triu(
            torch.full((tgt_seq_len, tgt_seq_len), float('-inf'), device=tgt.device),
            diagonal=1,
        )
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Return logits of shape (batch_size, tgt_seq_len, tgt_vocab_size)."""
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        enc_output = self.encoder(src)
        dec_output = self.decoder(tgt, enc_output, src_mask, tgt_mask)
        output = self.fc(dec_output)
        return output


In [24]:
# Smoke test: push one random batch through the full encoder-decoder model
# and check the output shape.
batch_size = 128
src_vocab_size = 1000
tgt_vocab_size = 2000
embed_dim = 200
max_length = 100
num_layers = 2
num_heads = 4
ff_dim = 256

model = Transformer(src_vocab_size, tgt_vocab_size, embed_dim,
                    max_length, num_layers, num_heads, ff_dim)

# Sample ids over the whole vocabulary (high=2 only ever produced ids 0 and 1,
# so almost none of the embedding table was exercised).
src = torch.randint(high=src_vocab_size, size=(batch_size, max_length), dtype=torch.int64)
tgt = torch.randint(high=tgt_vocab_size, size=(batch_size, max_length), dtype=torch.int64)

# Shape check only: no gradients needed.
with torch.no_grad():
    prediction = model(src, tgt)
prediction.shape  # batch_size x max_length x tgt_vocab_size


torch.Size([128, 100, 2000])

---

## **I. Text Classification**

In [10]:
# !pip install datasets

**1. Load Dataset**

In [11]:
from datasets import load_dataset

ds = load_dataset("thainq107/ntc-scv")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [12]:
ds

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence'],
        num_rows: 30000
    })
    valid: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence'],
        num_rows: 10000
    })
})

**2. Preprocessing**

In [13]:
import re
import string

# Patterns and the translation table are built once instead of on every call.

# URLs: "http(s)://..." or "www...." up to the next whitespace.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')

# HTML tags: <...>
_HTML_TAG_PATTERN = re.compile(r'<[^<>]+>')

# Every ASCII punctuation character and digit maps to a single space.
_PUNCT_DIGIT_TABLE = str.maketrans({ch: ' ' for ch in string.punctuation + string.digits})

# NOTE(review): the range U+24C2-U+1F251 below is very wide and covers far more
# than emoji (it spans, e.g., the CJK blocks). Kept unchanged to preserve the
# existing behavior; narrow it if non-Latin text must survive.
_EMOJI_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F" # emoticons
    u"\U0001F300-\U0001F5FF" # symbols & pictographs
    u"\U0001F680-\U0001F6FF" # transport & map symbols
    u"\U0001F1E0-\U0001F1FF" # flags (iOS)
    u"\U0001F1F2-\U0001F1F4" # Macau flag
    u"\U0001F1E6-\U0001F1FF" # flags
    u"\U0001F600-\U0001F64F"
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u"\U0001F1F2"
    u"\U0001F1F4"
    u"\U0001F620"
    u"\u200d"
    u"\u2640-\u2642"
    "]+", flags = re.UNICODE)


def preprocess_text(text):
    """
    Normalize a raw sentence for tokenization.

    Steps, in order: replace URLs, HTML tags, ASCII punctuation/digits and
    emoji with spaces, collapse whitespace runs to single spaces (stripping
    the ends), and lowercase the result.

    Parameters:
    - text (str): Raw input text.

    Returns:
    - str: The cleaned, lowercased text (possibly empty).
    """
    # URLs must go first: once punctuation is stripped they can no longer be
    # recognized. (The previous pattern used `\s+` instead of `\S+` and lacked
    # the `|` alternation, so it never matched a real URL.)
    text = _URL_PATTERN.sub(" ", text)

    # remove HTML Tags : <>
    text = _HTML_TAG_PATTERN.sub(" ", text)

    # remove puncs and digits in a single pass
    text = text.translate(_PUNCT_DIGIT_TABLE)

    # remove emoji
    text = _EMOJI_PATTERN.sub(" ", text)

    # normalize whitespace, then lowercase
    return " ".join(text.split()).lower()

**3. Representation**

In [14]:
def yield_tokens(sentences, tokenizer):
    """Lazily yield `tokenizer(sentence)` for each sentence, in order."""
    yield from map(tokenizer, sentences)

In [15]:
# !pip uninstall -y torchtext
# !pip install torchtext --no-cache-dir --force-reinstall

In [16]:
# word-based tokenizer
# NOTE(review): in the recorded run this cell raised
#   OSError: .../torchtext/lib/libtorchtext.so: undefined symbol ...
# which usually means the installed torchtext binary does not match the
# installed torch build. TODO: pin a compatible torch/torchtext pair (or replace
# torchtext) before relying on the cells below.
import torchtext
from torchtext.data import get_tokenizer

# Request torchtext's tokenizer registered under the name "basic_english".
tokenizer = get_tokenizer("basic_english")

OSError: /usr/local/lib/python3.10/dist-packages/torchtext/lib/libtorchtext.so: undefined symbol: _ZN5torch3jit17parseSchemaOrNameERKSs

In [None]:
ds['train']['preprocessed_sentence'][0]

In [None]:
# build vocabulary
from torchtext.vocab import build_vocab_from_iterator
vocab_size = 10000  # passed below as max_tokens
# The vocabulary is built from the training split only, using the
# already-preprocessed sentences.
# NOTE(review): collate_batch pads with the literal id 0; this presumably
# matches "<pad>" because it is listed first in `specials` — verify.
vocabulary = build_vocab_from_iterator (
    yield_tokens(ds['train']['preprocessed_sentence'], tokenizer),
    max_tokens=vocab_size,
    specials=["<pad>", "<unk>"]
)
# Tokens that are not in the vocabulary are looked up as "<unk>".
vocabulary.set_default_index(vocabulary["<unk>"])

In [None]:
# convert torchtext dataset
from torchtext.data.functional import to_map_style_dataset

def prepare_dataset(df):
    """
    Yield one `(token_ids, label)` pair per row of `df`.

    Each row's 'preprocessed_sentence' is tokenized with the module-level
    `tokenizer` and mapped to ids with the module-level `vocabulary`; the
    row's 'label' is passed through unchanged.
    """
    for record in df:
        token_ids = vocabulary(tokenizer(record['preprocessed_sentence']))
        yield token_ids, record['label']

# prepare_dataset returns a generator of (token_ids, label) pairs for a split;
# to_map_style_dataset then wraps it (presumably materializing it into an
# indexable dataset — see torchtext docs) so it can be used with a DataLoader.
train_dataset = prepare_dataset(ds['train'])
train_dataset = to_map_style_dataset(train_dataset)

val_dataset = prepare_dataset(ds['valid'])
val_dataset = to_map_style_dataset(val_dataset)


test_dataset = prepare_dataset(ds['test'])
test_dataset = to_map_style_dataset(test_dataset)

**4. DataLoader**

In [None]:
import torch

# Fixed length every encoded sentence is padded or truncated to.
seq_length = 100

def collate_batch(batch):
    """
    Turn a list of `(token_ids, label)` pairs into two tensors.

    Sentences shorter than `seq_length` are right-padded with id 0; all others
    are cut to their first `seq_length` ids.

    Returns:
    - Tensor (len(batch), seq_length), dtype int64: the token ids.
    - Tensor (len(batch),): the labels.
    """
    def _fit(token_ids):
        # Pad when too short, otherwise truncate.
        shortfall = seq_length - len(token_ids)
        if shortfall > 0:
            return token_ids + [0] * shortfall
        return token_ids[:seq_length]

    sentences, labels = zip(*batch)
    fitted = [_fit(token_ids) for token_ids in sentences]
    return torch.tensor(fitted, dtype=torch.int64), torch.tensor(labels)

In [None]:
from torch.utils.data import DataLoader

# Batch size shared by all three loaders.
batch_size = 128

# Only the training loader shuffles; every loader pads/truncates its batches
# through collate_batch.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch
)

# Validation and test keep the dataset order (shuffle=False).
val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch
)

**5. Trainer**

In [None]:
import time

def train_epoch(model, optimizer, criterion, train_dataloader, device, epoch=0, log_interval=50):
    """
    Train `model` for one pass over `train_dataloader`.

    Parameters:
    - model: Module called as `model(inputs)`; its output is passed to `criterion`.
    - optimizer: Optimizer stepped once per batch.
    - criterion: Loss called as `criterion(predictions, labels)`.
    - train_dataloader: Iterable of `(inputs, labels)` batches supporting `len()`.
    - device: Device the batches are moved to.
    - epoch (int): Epoch number, used only in the progress line.
    - log_interval (int): Print the running accuracy every this many batches.

    Returns:
    - (epoch_acc, epoch_loss): accuracy over ALL samples seen in the epoch and
      the mean of the per-batch losses. An empty dataloader yields
      `(0.0, nan)` instead of raising ZeroDivisionError.
    """
    model.train()
    # Two sets of counters: the "window" ones are reset after every progress
    # line, the "epoch" ones never are. Previously a single pair was used for
    # both, so the returned accuracy covered only the batches after the last
    # progress line and divided by zero when the epoch ended right on one.
    window_correct, window_count = 0, 0
    epoch_correct, epoch_count = 0, 0
    losses = []

    for idx, (inputs, labels) in enumerate(train_dataloader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        predictions = model(inputs)

        # compute loss
        loss = criterion(predictions, labels)
        losses.append(loss.item())

        # backward
        loss.backward()
        optimizer.step()

        batch_correct = (predictions.argmax(1) == labels).sum().item()
        batch_count = labels.size(0)
        window_correct += batch_correct
        window_count += batch_count
        epoch_correct += batch_correct
        epoch_count += batch_count

        if idx % log_interval == 0 and idx > 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(train_dataloader), window_correct / window_count
                )
            )
            window_correct, window_count = 0, 0

    epoch_acc = epoch_correct / epoch_count if epoch_count else 0.0
    # nan (not 0.0) for an empty epoch so it can never look like a "best" loss
    epoch_loss = sum(losses) / len(losses) if losses else float('nan')
    return epoch_acc, epoch_loss



In [None]:
def evaluate_epoch(model, criterion, valid_dataloader, device):
    """
    Evaluate `model` on every batch of `valid_dataloader` without gradients.

    The model is switched to eval mode. Returns `(accuracy, mean_loss)`, where
    accuracy is correct predictions over all samples and mean_loss is the
    average of the per-batch losses.
    """
    model.eval()
    n_correct = 0
    n_seen = 0
    batch_losses = []

    with torch.no_grad():
        for batch_inputs, batch_labels in valid_dataloader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_inputs)
            batch_losses.append(criterion(logits, batch_labels).item())

            # predicted class = index of the largest logit along dim 1
            hits = logits.argmax(1) == batch_labels
            n_correct += hits.sum().item()
            n_seen += batch_labels.size(0)

    return n_correct / n_seen, sum(batch_losses) / len(batch_losses)

In [None]:
def train(model, model_name, save_model, optimizer, criterion, train_dataloader, valid_dataloader, num_epochs, device):
    """
    Train for `num_epochs` epochs, checkpointing the weights with the lowest
    validation loss, and return the model restored to that checkpoint.

    Parameters:
    - model_name (str) / save_model (str): The checkpoint is written to
      `save_model + '/' + model_name + '.pt'`; the directory must exist.
    - optimizer, criterion, train_dataloader, device: Forwarded to train_epoch.
    - valid_dataloader: Forwarded to evaluate_epoch after every epoch.
    - num_epochs (int): Number of epochs; epochs are numbered from 1.

    Returns:
    - (model, metrics): the model in eval mode and a dict with the per-epoch
      lists 'train_accuracy', 'train_loss', 'valid_accuracy', 'valid_loss'
      and 'time' (seconds per epoch).
    """
    train_accs, train_losses, eval_accs, eval_losses = [], [], [], []
    # Start at +inf so the first finite validation loss is always checkpointed.
    best_loss_eval = float('inf')
    checkpoint_path = save_model + f'/{model_name}.pt'
    checkpoint_saved = False
    times = []
    for epoch in range(1, num_epochs+1):
        epoch_start_time = time.time()
        # Training
        train_acc, train_loss = train_epoch(model, optimizer, criterion, train_dataloader, device, epoch)
        train_accs.append(train_acc)
        train_losses.append(train_loss)

        # Evaluation
        eval_acc, eval_loss = evaluate_epoch(model, criterion, valid_dataloader, device)
        eval_accs.append(eval_acc)
        eval_losses.append(eval_loss)

        # Save best model. The running best must be updated here; without it
        # every epoch overwrote the checkpoint and "best" meant "last".
        if eval_loss < best_loss_eval:
            best_loss_eval = eval_loss
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True

        epoch_time = time.time() - epoch_start_time
        times.append(epoch_time)
        # Print loss, acc end epoch
        print("-" * 59)
        print(
            "| End of epoch {:3d} | Time: {:5.2f}s | Train Accuracy {:8.3f} | Train Loss {:8.3f} "
            "| Valid Accuracy {:8.3f} | Valid Loss {:8.3f} ".format(
                epoch, epoch_time, train_acc, train_loss, eval_acc, eval_loss
            )
        )
        print("-" * 59)

    # Load best model — only if this run wrote one, so a stale file left by an
    # earlier run is never loaded by accident.
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, weights_only=True))
    model.eval()
    metrics = {
        'train_accuracy': train_accs,
        'train_loss': train_losses,
        'valid_accuracy': eval_accs,
        'valid_loss': eval_losses,
        'time': times
    }
    return model, metrics

In [None]:
import matplotlib.pyplot as plt

def plot_result(num_epochs, train_accs, eval_accs, train_losses, eval_losses):
    """
    Plot training vs. evaluation curves side by side: accuracy on the left,
    loss on the right.

    Parameters:
    - num_epochs (int): Number of points per curve; the x axis is 0..num_epochs-1.
    - train_accs / eval_accs: Per-epoch accuracies.
    - train_losses / eval_losses: Per-epoch losses.
    """
    epochs = list(range(num_epochs))
    fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

    panels = (
        (axs[0], train_accs, eval_accs, "Accuracy"),
        (axs[1], train_losses, eval_losses, "Loss"),
    )
    for ax, train_values, eval_values, ylabel in panels:
        ax.plot(epochs, train_values, label="Training")
        ax.plot(epochs, eval_values, label="Evaluation")
        ax.set_xlabel("Epochs")
        ax.set_ylabel(ylabel)
        # Legend per axes: a single plt.legend() only labels the most recently
        # used axes, which left the accuracy panel without one.
        ax.legend()

    plt.show()


**6. Modeling**

In [None]:
class TransformerEncoderCls(nn.Module):
    """
    Two-class sentence classifier: TransformerEncoder, average pooling over
    the sequence, then a small MLP head (embed_dim -> 20 -> 2).

    Parameters:
    - vocab_size (int): Vocabulary size for the encoder's token embedding.
    - max_length (int): Sequence length; also the pooling kernel size, so
      inputs are expected to be exactly `max_length` tokens long.
    - num_layers, embed_dim, num_heads, ff_dim, dropout, device: Forwarded to
      TransformerEncoder; `dropout` is also used in the classifier head.
    """
    def __init__(self, vocab_size, max_length, num_layers, embed_dim, num_heads, ff_dim, dropout=0.1, device='cpu'):
        super().__init__()
        self.encoder = TransformerEncoder(
            vocab_size, embed_dim, max_length, num_layers, num_heads, ff_dim, dropout, device
        )
        # One pooling window spanning the whole sequence
        self.pooling = nn.AvgPool1d(kernel_size=max_length)
        self.fc1 = nn.Linear(embed_dim, 20)
        self.fc2 = nn.Linear(20, 2)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """
        Parameters:
        - x (Tensor): Token ids of shape (batch_size, max_length).

        Returns:
        - Tensor: Logits of shape (batch_size, 2).
        """
        output = self.encoder(x)                 # (batch, seq, embed_dim)
        output = output.permute(0, 2, 1)         # (batch, embed_dim, seq) for AvgPool1d
        # Squeeze only the pooled length dimension. A bare .squeeze() also
        # removed the batch dimension for a batch of one, so the logits came
        # out as shape (2,) and argmax(1) / the loss failed.
        output = self.pooling(output).squeeze(-1)
        output = self.fc1(output)
        output = self.relu(output)
        output = self.dropout(output)
        output = self.fc2(output)
        return output


In [None]:
# Hyperparameters of the Transformer-encoder classifier
vocab_size = 10000
max_length = 100
embed_dim = 200
num_layers = 2
num_heads = 4
ff_dim = 128
dropout = 0.1

# Pick the device first so it can be handed to the constructor: the model was
# previously built with the default device='cpu' and only then moved, leaving
# the embedding layer's stored device out of sync with the weights on GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TransformerEncoderCls(
    vocab_size, max_length, num_layers, embed_dim, num_heads, ff_dim, dropout, device
)
model.to(device)

In [None]:
import torch.optim as optim
import os

# Build a fresh classifier (this replaces any `model` created in an earlier
# cell), passing `device` to the constructor and moving the weights to it.
model = TransformerEncoderCls(
    vocab_size, max_length, num_layers, embed_dim, num_heads, ff_dim, dropout, device
).to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=5e-5)

num_epochs = 50
# Directory and base name of the checkpoint written by train():
# save_model + '/' + model_name + '.pt'
save_model = './model'
os.makedirs(save_model, exist_ok = True)
model_name = 'model'

# train() returns the model and a dict of per-epoch metric lists.
model, metrics = train(
    model, model_name, save_model, optimizer, criterion, train_dataloader, val_dataloader, num_epochs, device
)

In [None]:
# Plot the per-epoch curves collected by train(): accuracy and loss,
# training vs. validation.
plot_result(
    num_epochs,
    metrics["train_accuracy"],
    metrics["valid_accuracy"],
    metrics["train_loss"],
    metrics["valid_loss"]
)

In [None]:
# Final check on the held-out test split; the bare tuple on the last line is
# displayed as the cell output.
test_acc, test_loss = evaluate_epoch(model, criterion, test_dataloader, device)
test_acc, test_loss

## **II. Text Classification using BERT**

In [25]:
!pip install -q -U datasets accelerate evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.[0m[31m
[0m

In [26]:
from datasets import load_dataset

ds = load_dataset('thainq107/ntc-scv')

In [27]:
ds

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence'],
        num_rows: 30000
    })
    valid: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence'],
        num_rows: 10000
    })
})

In [28]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased" # bert-base-uncased

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True
)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [38]:
tokenizer.model_max_length

512

In [39]:
max_seq_length = 100
max_seq_length = min(max_seq_length, tokenizer.model_max_length)

def preprocess_function(examples):
    """
    Tokenize a batch of examples for sequence classification.

    The 'preprocessed_sentence' column is run through the module-level
    `tokenizer`, padded and truncated to `max_seq_length`; the 'label' column
    is copied into the result unchanged.
    """
    tokenizer_kwargs = {
        "padding": "max_length",
        "max_length": max_seq_length,
        "truncation": True,
    }
    encoded = tokenizer(examples["preprocessed_sentence"], **tokenizer_kwargs)
    encoded["label"] = examples['label']
    return encoded

In [30]:
# Apply preprocess_function to every split of `ds`; batched=True hands it
# batches of examples rather than single rows. `desc` is the progress-bar label.
processed_dataset = ds.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/30000 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [31]:
processed_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence', 'input_ids', 'attention_mask'],
        num_rows: 30000
    })
    valid: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
})

In [33]:
!pip install -U numpy



In [35]:
# NOTE(review): none of the visible cells import tensorflow directly, and in the
# recorded run the following cells fail while initializing a TensorFlow
# extension (`_pywrap_checkpoint_reader`). Consider removing or pinning this
# install — TODO confirm it is needed at all.
!pip install -U tensorflow

Collecting tensorflow
  Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tensorboard<2.19,>=2.18 (from tensorflow)
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting numpy<2.1.0,>=1.26.0 (from tensorflow)
  Downloading numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (615.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m615.3/615.3 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.5/19.5 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorboard-2.18.0-p

In [36]:
from transformers import AutoConfig, AutoModelForSequenceClassification

# Binary classification head
num_labels = 2

# Load the configuration of the `model_name` checkpoint with the label count
# overridden.
config = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_labels,
    finetuning_task="text-classification"
)
# Instantiate the sequence-classification model from the same checkpoint,
# using the configuration above.
# NOTE(review): in the recorded run this cell failed with a NumPy 1.x/2.x
# binary-compatibility error; fix the environment before re-running.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config
)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/ke

AttributeError: _ARRAY_API not found

RuntimeError: Failed to import transformers.models.distilbert.modeling_distilbert because of the following error (look up to see its traceback):
initialization of _pywrap_checkpoint_reader raised unreported exception

In [37]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """
    Score one evaluation pass with the module-level `metric`.

    `eval_pred` unpacks into `(predictions, labels)`; the predicted class is
    the argmax of the predictions along axis 1. Returns whatever
    `metric.compute` returns.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/ke

AttributeError: _ARRAY_API not found

SystemError: initialization of _pywrap_checkpoint_reader raised unreported exception

In [40]:
from transformers import TrainingArguments, Trainer

# Fine-tuning settings: checkpoints go under `output_dir`; evaluation and
# saving both run once per epoch.
training_args = TrainingArguments(
    output_dir='ntc-scv-distilbert-base-uncased',
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    # presumably reloads the best checkpoint when training finishes — see the
    # TrainingArguments docs for how "best" is chosen
    load_best_model_at_end=True
)

# Train on the tokenized 'train' split and evaluate on 'valid'; accuracy is
# reported through compute_metrics.
# NOTE(review): in the recorded run this cell failed while importing
# transformers.trainer (NumPy binary incompatibility), so `trainer` was never
# defined for the cells that follow.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["valid"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/ke

AttributeError: _ARRAY_API not found

RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_utils because of the following error (look up to see its traceback):
initialization of _pywrap_checkpoint_reader raised unreported exception

**Training**

In [41]:
import wandb
wandb.init(mode='disabled')

In [42]:
trainer.train()

NameError: name 'trainer' is not defined

In [None]:
trainer.evaluate(processed_dataset["test"])

In [None]:
from transformers import pipeline

# Inference pipeline. Note that `model` here is a model identifier string, not
# the `trainer` output from this notebook — presumably a checkpoint published
# on the Hugging Face Hub (verify it is the intended one).
classifier = pipeline(
    "text-classification",
    model="thainq107/ntc-scv-distilbert-base-uncased"
)

In [None]:
classifier("quán ăn này ngon quá luôn nè")