In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from datasets import Dataset
from transformers import AdamW
import torch

In [3]:
# Read the preprocessed corpus from the Kaggle input directory and preview the first rows
data = pd.read_csv(filepath_or_buffer='/kaggle/input/text-classification/preprocessed_data.csv')
data.head(5)

Unnamed: 0.1,Unnamed: 0,Paragraph,Topic
0,0,khoảng thiếu_niên ngụ xã thượng quận thị_xã ki...,0
1,1,hai xe đối_đầu tạo tiếng_động mạnh ba người đi...,0
2,2,camera hành_trình của ôtô đi trên đoạn đường đ...,0
3,3,camera hành_trình ghi lại vụ tai_nạn sáng,0
4,4,đoạn đường xảy ra tai_nạn không có dải_phân_cá...,0


In [4]:
# Keep only the two columns used downstream: the text (Paragraph) and its label (Topic)
data = data.loc[:, ['Paragraph', 'Topic']]
data.head(5)

Unnamed: 0,Paragraph,Topic
0,khoảng thiếu_niên ngụ xã thượng quận thị_xã ki...,0
1,hai xe đối_đầu tạo tiếng_động mạnh ba người đi...,0
2,camera hành_trình của ôtô đi trên đoạn đường đ...,0
3,camera hành_trình ghi lại vụ tai_nạn sáng,0
4,đoạn đường xảy ra tai_nạn không có dải_phân_cá...,0


In [5]:
# Create the train / validation / test splits (80% / 10% / 10%)
train = data.sample(frac=0.8, random_state=200)
# Everything that was not sampled for training is held out ...
holdout = data.drop(train.index)
# ... and that held-out part is split in half between validation and test
val = holdout.sample(frac=0.5, random_state=200)
test = holdout.drop(val.index)
train.shape, val.shape, test.shape

((24000, 2), (3000, 2), (3000, 2))

In [6]:
# Convert each pandas split into a Hugging Face Dataset
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train, val, test)
)
train_dataset

Dataset({
    features: ['Paragraph', 'Topic', '__index_level_0__'],
    num_rows: 24000
})

## Using BERT

In [7]:
# Load the tokenizer of the pretrained checkpoint (this cell loads no model weights)
# NOTE(review): bert-base-uncased is a lower-cased English checkpoint while the Paragraph
# text shown above is Vietnamese -- confirm this is the intended baseline.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="bert-base-uncased")
tokenizer

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [8]:
def encode(docs, max_length=128, tok=None):
    '''
    Tokenize a collection of texts into fixed-length model inputs.

    Args:
        docs: a single string or any iterable of strings (list, pandas Series,
            dataset column, ...). A single string is treated as one document.
        max_length: length every sequence is padded / truncated to (default 128).
        tok: tokenizer callable to use. When omitted, the notebook-level
            ``tokenizer`` is looked up at call time, as before.

    Returns:
        (input_ids, attention_masks): the 'input_ids' and 'attention_mask'
        entries of the tokenizer output, requested as PyTorch tensors.
    '''
    if tok is None:
        # Fall back to whichever tokenizer is currently bound in the notebook
        tok = tokenizer
    if isinstance(docs, str):
        # Guard against list("abc") exploding one document into characters
        docs = [docs]
    # Calling the tokenizer directly is the supported replacement for batch_encode_plus
    encoded_dict = tok(list(docs), add_special_tokens=True, max_length=max_length, padding='max_length',
                       return_attention_mask=True, truncation=True, return_tensors='pt')
    input_ids = encoded_dict['input_ids']
    attention_masks = encoded_dict['attention_mask']
    return input_ids, attention_masks

In [9]:
# Tokenize the Paragraph column of every split into (input ids, attention mask) tensors
(
    (train_input_ids, train_attention_masks),
    (val_input_ids, val_attention_masks),
    (test_input_ids, test_attention_masks),
) = (encode(split['Paragraph']) for split in (train_dataset, val_dataset, test_dataset))

In [10]:
# Turn the Topic column of every split into a LongTensor of labels
train_y, val_y, test_y = (
    torch.LongTensor(split['Topic']) for split in (train_dataset, val_dataset, test_dataset)
)
train_y.shape, val_y.shape, test_y.shape

(torch.Size([24000]), torch.Size([3000]), torch.Size([3000]))

In [11]:
# Bundle (input ids, attention mask, labels) of each split into a TensorDataset.
# Note: this rebinds train_dataset / val_dataset / test_dataset, replacing the Hugging Face datasets.
train_dataset, val_dataset, test_dataset = (
    torch.utils.data.TensorDataset(*tensors)
    for tensors in (
        (train_input_ids, train_attention_masks, train_y),
        (val_input_ids, val_attention_masks, val_y),
        (test_input_ids, test_attention_masks, test_y),
    )
)

In [11]:
# Wrap each split in a DataLoader; only the training loader shuffles its samples
BATCH_SIZE = 64
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader, test_loader = (
    torch.utils.data.DataLoader(dataset=split, batch_size=BATCH_SIZE)
    for split in (val_dataset, test_dataset)
)

In [12]:
from transformers import BertForSequenceClassification

PRETRAINED_LM = "bert-base-uncased"
# One output class per distinct value found in the Topic column of the full dataset
N_labels = data['Topic'].unique().shape[0]
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_LM,
    output_hidden_states=False,
    output_attentions=False,
    num_labels=N_labels,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [14]:
# Move the model to the selected device; the trailing expression displays the module
model = model.to(device)
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [16]:
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and weight_decay are
# pinned to the defaults transformers.AdamW used, so the update rule stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
# Loss used by the evaluation cells (the training loop uses the loss returned by the model)
criterion = torch.nn.CrossEntropyLoss()
n_epochs = 5

In [17]:
for epoch in range(n_epochs):  # number of passes over the training data
    model.train()  # switch the model to training mode
    total_loss = 0

    for batch in train_loader:
        # Each batch holds (input ids, attention mask, labels); move all three to the device
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        # Clear the gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass: the loss is taken from the model output because labels are passed in
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs.loss, outputs.logits

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss}")



Epoch 1, Loss: 2.320306606610616
Epoch 2, Loss: 1.6629444481531779
Epoch 3, Loss: 1.269620681444804
Epoch 4, Loss: 1.0196111399332681
Epoch 5, Loss: 0.8457004788716634


In [18]:
    # Evaluate on the validation set
    model.eval()  # switch the model to evaluation mode
    val_loss, correct, total = 0, 0, 0

    with torch.no_grad():
        for batch in val_loader:
            # Each batch holds (input ids, attention mask, labels); move all three to the device
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            # No labels are passed to the model here, so the loss is computed with `criterion`
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)
            val_loss += loss.item()

            # Predicted class = index of the largest logit
            predictions = logits.argmax(dim=-1)
            correct += (predictions == labels).sum().item()
            total += labels.shape[0]

    # Mean of the per-batch losses, and the fraction of correctly classified samples
    avg_val_loss = val_loss / len(val_loader)
    accuracy = correct / total
    print(f"Validation Loss: {avg_val_loss}, Accuracy: {accuracy}")

Validation Loss: 0.9456534943682082, Accuracy: 0.7266666666666667


# Tuning the batch size and training for more epochs

In [12]:
# Rebuild the DataLoaders with a larger batch size; only the training loader shuffles
BATCH_SIZE = 128
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader, test_loader = (
    torch.utils.data.DataLoader(dataset=split, batch_size=BATCH_SIZE)
    for split in (val_dataset, test_dataset)
)

In [13]:
from transformers import BertForSequenceClassification

# Start again from the pretrained checkpoint so this run does not continue the previous one
N_labels = len(data['Topic'].unique())  # one output class per distinct Topic value
PRETRAINED_LM = "bert-base-uncased"
model = BertForSequenceClassification.from_pretrained(PRETRAINED_LM,
                                                      num_labels=N_labels,
                                                      output_attentions=False,
                                                      output_hidden_states=False)
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and weight_decay are
# pinned to the defaults transformers.AdamW used, so the update rule stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
# Loss used by the evaluation cells (the training loop uses the loss returned by the model)
criterion = torch.nn.CrossEntropyLoss()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Move the freshly loaded model to the selected device; the trailing expression displays it
model = model.to(device)
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [17]:
n_epochs = 20
for epoch in range(n_epochs):  # number of passes over the training data
    model.train()  # switch the model to training mode
    total_loss = 0

    for batch in train_loader:
        # Each batch holds (input ids, attention mask, labels); move all three to the device
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        # Clear the gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass: the loss is taken from the model output because labels are passed in
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs.loss, outputs.logits

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss}")



Epoch 1, Loss: 2.4330859964198255
Epoch 2, Loss: 1.8230474451754957
Epoch 3, Loss: 1.4249215164083116
Epoch 4, Loss: 1.1820216359610254
Epoch 5, Loss: 1.0016568902325123
Epoch 6, Loss: 0.8607771922933295
Epoch 7, Loss: 0.7622305316493866
Epoch 8, Loss: 0.6738500862996629
Epoch 9, Loss: 0.6045275712583927
Epoch 10, Loss: 0.533783347999796
Epoch 11, Loss: 0.47744172145711616
Epoch 12, Loss: 0.41828553957190917
Epoch 13, Loss: 0.38181055091479993
Epoch 14, Loss: 0.34057610814875744
Epoch 15, Loss: 0.29706993057055675
Epoch 16, Loss: 0.2687494138929438
Epoch 17, Loss: 0.22667152830894957
Epoch 18, Loss: 0.20637939141151754
Epoch 19, Loss: 0.17866829323007705
Epoch 20, Loss: 0.1630858269103981


55

In [18]:
    # Evaluate on the validation set
    model.eval()  # switch the model to evaluation mode
    val_loss, correct, total = 0, 0, 0

    with torch.no_grad():
        for batch in val_loader:
            # Each batch holds (input ids, attention mask, labels); move all three to the device
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            # No labels are passed to the model here, so the loss is computed with `criterion`
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)
            val_loss += loss.item()

            # Predicted class = index of the largest logit
            predictions = logits.argmax(dim=-1)
            correct += (predictions == labels).sum().item()
            total += labels.shape[0]

    # Mean of the per-batch losses, and the fraction of correctly classified samples
    avg_val_loss = val_loss / len(val_loader)
    accuracy = correct / total
    print(f"Validation Loss: {avg_val_loss}, Accuracy: {accuracy}")

Validation Loss: 0.8689365101357301, Accuracy: 0.792


In [19]:
    # Evaluate on the held-out test set
    model.eval()  # switch the model to evaluation mode
    test_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():  # gradients are not needed for evaluation
        for batch in test_loader:
            # Each batch holds (input ids, attention mask, labels)
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            # No labels are passed to the model here, so the loss is computed with `criterion`
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)

            test_loss += loss.item()
            # Predicted class = index of the largest logit
            predictions = torch.argmax(logits, dim=-1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    # Normalise by the number of batches of the loader that was actually iterated (test_loader)
    avg_test_loss = test_loss / len(test_loader)
    accuracy = correct / total
    print(f"Test Loss: {avg_test_loss}, Accuracy: {accuracy}")

Test Loss: 0.9282514819254478, Accuracy: 0.7786666666666666


# Using mBERT

In [28]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the multilingual BERT tokenizer and a classification model from the same checkpoint
MBERT_CHECKPOINT = "bert-base-multilingual-cased"
# Derive the number of classes from the data, as the BERT cells do, instead of hard-coding it
N_labels = len(data['Topic'].unique())
tokenizer = AutoTokenizer.from_pretrained(MBERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(MBERT_CHECKPOINT, num_labels=N_labels)  # num_labels is the number of classes



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Re-tokenize every split for mBERT.
# encode() reads the notebook-level `tokenizer`, so fail loudly if the mBERT tokenizer
# has not been loaded yet; otherwise the model would be trained on ids from another vocabulary.
assert tokenizer.name_or_path == "bert-base-multilingual-cased", (
    f"expected the mBERT tokenizer, got {tokenizer.name_or_path!r}; run the mBERT loading cell first"
)

# Rebuild the Hugging Face datasets: these names were rebound to TensorDatasets earlier
train_dataset = Dataset.from_pandas(train)
val_dataset = Dataset.from_pandas(val)
test_dataset = Dataset.from_pandas(test)

train_input_ids, train_attention_masks = encode(train_dataset['Paragraph'])
val_input_ids, val_attention_masks = encode(val_dataset['Paragraph'])
test_input_ids, test_attention_masks = encode(test_dataset['Paragraph'])

In [25]:
# Turn the Topic column of every split into a LongTensor of labels
train_y = torch.LongTensor(train_dataset['Topic'])
val_y = torch.LongTensor(val_dataset['Topic'])
test_y = torch.LongTensor(test_dataset['Topic'])

# Bundle (input ids, attention mask, labels) of each split into a TensorDataset.
# Note: this rebinds train_dataset / val_dataset / test_dataset, replacing the Hugging Face datasets.
train_dataset = torch.utils.data.TensorDataset(train_input_ids, train_attention_masks, train_y)
val_dataset = torch.utils.data.TensorDataset(val_input_ids, val_attention_masks, val_y)
test_dataset = torch.utils.data.TensorDataset(test_input_ids, test_attention_masks, test_y)

In [26]:
# Wrap the mBERT-encoded splits in DataLoaders; only the training loader shuffles
BATCH_SIZE = 128
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader, test_loader = (
    torch.utils.data.DataLoader(dataset=split, batch_size=BATCH_SIZE)
    for split in (val_dataset, test_dataset)
)

In [29]:
# Move the mBERT model to the selected device; the trailing expression displays the module
model = model.to(device)
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [33]:
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and weight_decay are
# pinned to the defaults transformers.AdamW used, so the update rule stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
# Loss used by the evaluation cell (the training loop uses the loss returned by the model)
criterion = torch.nn.CrossEntropyLoss()

In [34]:
n_epochs = 20
for epoch in range(n_epochs):  # number of passes over the training data
    model.train()  # switch the model to training mode
    total_loss = 0

    for batch in train_loader:
        # Each batch holds (input ids, attention mask, labels); move all three to the device
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        # Clear the gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass: the loss is taken from the model output because labels are passed in
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs.loss, outputs.logits

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss}")



Epoch 1, Loss: 1.1428306857322126
Epoch 2, Loss: 0.5760411108110813
Epoch 3, Loss: 0.37477342340540376
Epoch 4, Loss: 0.2512518341871018
Epoch 5, Loss: 0.17411563745600747
Epoch 6, Loss: 0.13183454782801104
Epoch 7, Loss: 0.10441979716353594
Epoch 8, Loss: 0.08839132499564042
Epoch 9, Loss: 0.07521265920529023
Epoch 10, Loss: 0.05962347530541902
Epoch 11, Loss: 0.056969885609082956
Epoch 12, Loss: 0.05384847771455633
Epoch 13, Loss: 0.043884292475264916
Epoch 14, Loss: 0.05642719416472902
Epoch 15, Loss: 0.04390608184559746
Epoch 16, Loss: 0.04016202698991773
Epoch 17, Loss: 0.036933601529218575
Epoch 18, Loss: 0.04711126502073231
Epoch 19, Loss: 0.03852256714859284
Epoch 20, Loss: 0.030706665839038867


In [35]:
    # Evaluate on the held-out test set
    model.eval()  # switch the model to evaluation mode
    test_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():  # gradients are not needed for evaluation
        for batch in test_loader:
            # Each batch holds (input ids, attention mask, labels)
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            # No labels are passed to the model here, so the loss is computed with `criterion`
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)

            test_loss += loss.item()
            # Predicted class = index of the largest logit
            predictions = torch.argmax(logits, dim=-1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    # Normalise by the number of batches of the loader that was actually iterated (test_loader)
    avg_test_loss = test_loss / len(test_loader)
    accuracy = correct / total
    print(f"Test Loss: {avg_test_loss}, Accuracy: {accuracy}")

Test Loss: 0.6529186942304174, Accuracy: 0.8663333333333333
