# BERT & T5 model for "reactionary and toxic content".

## Data processing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data_test = pd.read_csv('/kaggle/input/toxic-phandong/test.csv')
data_train = pd.read_csv('/kaggle/input/toxic-phandong/train.csv')
data_val = pd.read_csv('/kaggle/input/toxic-phandong/val.csv')

In [3]:
data_test.head(10)

Unnamed: 0,text,label,check
0,Từ lúc mấy bro cmt cực kì cl gì đấy là rất khó...,1,
1,Nghe trần dần thấy cười đau cả bụng,0,
2,Idol tui có khác. Phải v chứ ai như cái con ph...,1,
3,Cảnh Sát GiựT Tiền = CSGT,1,
4,Ông đi đâu mà lạc sang đây đây,0,
5,Gia cát bị đồng bóng à,1,
6,Mấy tụi DLV hay nói đu càng này nọ. Nảo chúng ...,1,
7,Quả múa chung được phết thầy ạ :v,0,
8,Dễ thương quá lun hồi đó em cũng thích con mèo...,0,
9,Tuấn Hưng liên thiên quá rồi đấy,0,


In [4]:
data_train.head()

Unnamed: 0,text,label,check
0,Bấp bênh vl thế,1,True
1,Thấy chán ad page này kiến thức thì nông cản c...,1,
2,Giang Giang Đỗ Thị Ngọc Hà trend mới kìa kìa,0,
3,đcm 😒 sau có con cho hút cỏ chữa bệnh chứ đéo ...,1,
4,Má nứng quá aiu ơi,1,True


In [5]:
data_val.head()

Unnamed: 0,text,label,check
0,"Cs cha , Cs con , Rận cha, Rận con ...ai đúng ...",1,
1,Long Duy Đào cái tội ớ,0,
2,Nói gì tào lao kg,1,
3,Chả làm gì cũng có fan cứng =)),0,
4,Sao lại cứ đến nụ cười . lên không nghe rõ lời...,0,True


In [4]:
data_train = data_train.drop(['check'], axis=1)
data_test = data_test.drop(['check'], axis=1)
data_val = data_val.drop(['check'], axis=1)

In [7]:
print("Missing values in train data:", data_train.isnull().sum())
print("Missing values in validation data:", data_val.isnull().sum())
print("Missing values in test data:", data_test.isnull().sum())

Missing values in train data: text     0
label    0
dtype: int64
Missing values in validation data: text     0
label    0
dtype: int64
Missing values in test data: text     0
label    0
dtype: int64


In [8]:
data_train[11:20]

Unnamed: 0,text,label
11,Mỗi lần lên sóng là đeo vàng làm như giàu lắm ...,1
12,Hãnh diện về ng thầy có tâm nhất của năm.,0
13,12:55 sau buổi phỏng vấn này tôi sẽ múa 1 bài ...,0
14,Du học sinh Úc pewpew,0
15,Có ai thấy khá giống đầu của Jadon Sancho ko nhỉ?,0
16,Khánh Con Sao Ko lấy số người chết của Ý ra nh...,1
17,Đúng là con người mày không có ý thức dân việt...,1
18,Tên bài hát còn trẻ trâu hơn cả t nhé ! Cười vl 😁,0
19,K ICM làm ca sĩ luôn rồi à 😢😢😢,0


In [5]:
data_train.groupby('label').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,39123,37948,"Cuối tuần đi làm, rảnh rỗi đọc mấy truyện Voz ...",57
1,8831,8703,Dâm tiện,24


In [6]:
data_train.drop_duplicates(inplace=True)
data_train.reset_index(drop=True, inplace=True)
data_train

Unnamed: 0,text,label
0,Bấp bênh vl thế,1
1,Thấy chán ad page này kiến thức thì nông cản c...,1
2,Giang Giang Đỗ Thị Ngọc Hà trend mới kìa kìa,0
3,đcm 😒 sau có con cho hút cỏ chữa bệnh chứ đéo ...,1
4,Má nứng quá aiu ơi,1
...,...,...
46646,"đm,tôi đây nuôi chó bao nhiêu con xem nó như c...",1
46647,Đơn giản BTC nó giống như 1 cái máy slot cờ bạ...,1
46648,"Văn vẻ đọc loạn cả não Chủ thớt cho nó de đi,9...",1
46649,Có loz tiền mà đầu tư được hết các điểm thi,0


In [11]:
data_test.groupby('label').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,5876,5876,Nghe trần dần thấy cười đau cả bụng,1
1,934,934,Từ lúc mấy bro cmt cực kì cl gì đấy là rất khó...,1


In [12]:
data_val.groupby('label').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,4580,4580,Long Duy Đào cái tội ớ,1
1,895,895,"Cs cha , Cs con , Rận cha, Rận con ...ai đúng ...",1


## BERT Hugging face

In [7]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(data_train['text'].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")
val_encodings = tokenizer(data_val['text'].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")
test_encodings = tokenizer(data_test['text'].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")

print("Train tokenized sample:", train_encodings['input_ids'][0])


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Train tokenized sample: tensor([ 101, 8670, 2361, 3841, 2232, 1058, 2140, 1996,  102,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,   

### Class weight & Focal loss

In [8]:
# from sklearn.utils.class_weight import compute_class_weight
# import torch

# labels = data_train['label'].values
# class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0,1]), y=labels)
# class_weights = torch.tensor(class_weights, dtype=torch.float)
# print(class_weights)

tensor([0.6147, 2.6802])


In [9]:
# class FocalLoss(torch.nn.Module):
#   def __init__(self, class_weight, alpah=0.25, gamm=2):
#     super(FocalLoss, self).__init__()
#     self.alpha = alpah
#     self.gamma = gamm
#     self.weights = class_weight
#   def forward(self,logits,labels):
#     ce_loss = torch.nn.CrossEntropyLoss(weight=self.class_weights)(logits,labels)
#     pt = torch.exp(-ce_loss)
#     F_loss = self.alpha*(1-pt)**self.gamma*ce_loss
#     return F_loss


In [10]:
# loss_fn = FocalLoss(class_weights)

In [11]:
# def compute_loss_with_weights(model, inputs, return_outputs=False):
#     labels = inputs.get("labels")
#     outputs = model(**inputs)
#     logits = outputs.get("logits")
#     loss = loss_fn(logits, labels)
#     return (loss, outputs) if return_outputs else loss

In [12]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [13]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("GPU not available, using CPU instead")

Using GPU: Tesla P100-PCIE-16GB


In [14]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")

    wandb.log({
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    })

    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


In [16]:
import wandb
wandb.init(project="bert-hyperparameter-tuning") 

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [29]:
import wandb

wandb.init(project="bert-hyperparameter-tuning", config={
    "learning_rate": 2e-5,
    "train_batch_size": 32,
    "epochs": 3,
    "weight_decay": 0.01
})


[34m[1mwandb[0m: Currently logged in as: [33mah929586[0m ([33mah929586-iai[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [21]:
import os
os.environ["WANDB_DISABLED"] = "false"
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=wandb.config.learning_rate,
    per_device_train_batch_size=wandb.config.train_batch_size,
    per_device_eval_batch_size=32,
    num_train_epochs=wandb.config.epochs,
    weight_decay=wandb.config.weight_decay,
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="wandb",
    run_name=f"run_lr_{wandb.config.learning_rate}_wd_{wandb.config.weight_decay}",
)



In [22]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [23]:
from datasets import Dataset
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True, max_length=128)

train_dataset = Dataset.from_pandas(data_train)
val_dataset = Dataset.from_pandas(data_val)
test_dataset = Dataset.from_pandas(data_test)

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

train_dataset = train_dataset.remove_columns(["text"])
val_dataset = val_dataset.remove_columns(["text"])
test_dataset = test_dataset.remove_columns(["text"])

train_dataset = train_dataset.rename_column("label", "labels")
val_dataset = val_dataset.rename_column("label", "labels")
test_dataset = test_dataset.rename_column("label", "labels")

train_dataset.set_format("torch")
val_dataset.set_format("torch")
test_dataset.set_format("torch")

Map:   0%|          | 0/46651 [00:00<?, ? examples/s]

Map:   0%|          | 0/5475 [00:00<?, ? examples/s]

Map:   0%|          | 0/6810 [00:00<?, ? examples/s]

In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2971,0.306698,0.884749,0.764,0.426816,0.54767
2,0.2741,0.29224,0.890046,0.759292,0.47933,0.587671
3,0.2266,0.303103,0.892603,0.720863,0.559777,0.630189


TrainOutput(global_step=4374, training_loss=0.2908622457602823, metrics={'train_runtime': 1668.9655, 'train_samples_per_second': 83.856, 'train_steps_per_second': 2.621, 'total_flos': 9205795382699520.0, 'train_loss': 0.2908622457602823, 'epoch': 3.0})

In [30]:
val_results = trainer.evaluate(eval_dataset=val_dataset)
print(f"Validation Results: {val_results}")

wandb.log({"validation_loss": val_results["eval_loss"], "validation_f1": val_results["eval_f1"]})

test_results = trainer.evaluate(eval_dataset=test_dataset)
print(f"Test Results: {test_results}")

wandb.log({"test_loss": test_results["eval_loss"], "test_f1": test_results["eval_f1"]})


Validation Results: {'eval_loss': 0.30310311913490295, 'eval_accuracy': 0.8926027397260274, 'eval_precision': 0.720863309352518, 'eval_recall': 0.5597765363128492, 'eval_f1': 0.630188679245283, 'eval_runtime': 19.0094, 'eval_samples_per_second': 288.016, 'eval_steps_per_second': 9.048, 'epoch': 3.0}
Test Results: {'eval_loss': 0.23922690749168396, 'eval_accuracy': 0.9113069016152716, 'eval_precision': 0.6987951807228916, 'eval_recall': 0.6209850107066381, 'eval_f1': 0.6575963718820862, 'eval_runtime': 23.6201, 'eval_samples_per_second': 288.313, 'eval_steps_per_second': 9.018, 'epoch': 3.0}


In [26]:
wandb.finish()

0,1
accuracy,▁▆█
eval/accuracy,▁▆█
eval/f1,▁▄█
eval/loss,█▁▆
eval/precision,█▇▁
eval/recall,▁▄█
eval/runtime,▅▁█
eval/samples_per_second,▄█▁
eval/steps_per_second,▄█▁
f1,▁▄█

0,1
accuracy,0.8926
eval/accuracy,0.8926
eval/f1,0.63019
eval/loss,0.3031
eval/precision,0.72086
eval/recall,0.55978
eval/runtime,19.3978
eval/samples_per_second,282.249
eval/steps_per_second,8.867
f1,0.63019


### Save


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
model_save_path = "/content/drive/MyDrive/Final_project_deep"

trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

### Test

In [52]:
model_save_path = "/kaggle/input/bert/pytorch/default/1"

In [56]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

model_b = BertForSequenceClassification.from_pretrained(model_save_path)
tokenizer = BertTokenizer.from_pretrained(model_save_path)

model_b.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [58]:
def test_model(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model_b(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    return predictions.item()

# Ví dụ test
test_text = ["Bài báo phản động.", "Bài test khó quá."]
for test in test_text:
  prediction = test_model(test)
  print(f"{test}: {prediction}")


Bài báo phản động.: 1
Bài test khó quá.: 0


## T5


In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from tqdm import tqdm

In [7]:
print(data_train[0:5])

                                                text  label
0                                    Bấp bênh vl thế      1
1  Thấy chán ad page này kiến thức thì nông cản c...      1
2       Giang Giang Đỗ Thị Ngọc Hà trend mới kìa kìa      0
3  đcm 😒 sau có con cho hút cỏ chữa bệnh chứ đéo ...      1
4                                 Má nứng quá aiu ơi      1


In [8]:
data_train

Unnamed: 0,text,label
0,Bấp bênh vl thế,1
1,Thấy chán ad page này kiến thức thì nông cản c...,1
2,Giang Giang Đỗ Thị Ngọc Hà trend mới kìa kìa,0
3,đcm 😒 sau có con cho hút cỏ chữa bệnh chứ đéo ...,1
4,Má nứng quá aiu ơi,1
...,...,...
46646,"đm,tôi đây nuôi chó bao nhiêu con xem nó như c...",1
46647,Đơn giản BTC nó giống như 1 cái máy slot cờ bạ...,1
46648,"Văn vẻ đọc loạn cả não Chủ thớt cho nó de đi,9...",1
46649,Có loz tiền mà đầu tư được hết các điểm thi,0


In [121]:
print(data_train[0])

{'text': 'Bấp bênh vl thế', 'label': 1, 'input_ids': [4501, 4921, 10, 272, 2, 102, 3, 115, 8202, 29, 107, 3, 208, 40, 3, 189, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [1]}


In [20]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])  # Ensure text is string
        label = str(self.labels[idx])  # Convert label to string for T5
        inputs = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        label_inputs = self.tokenizer(
            label,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": label_inputs["input_ids"].squeeze(0),  # Ensure compatible label format
        }

In [21]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
train_dataset = TextDataset(data_train["text"].tolist(), data_train["label"].tolist(), tokenizer)
val_dataset = TextDataset(data_val["text"].tolist(), data_val["label"].tolist(), tokenizer)
test_dataset = TextDataset(data_test["text"].tolist(), data_test["label"].tolist(), tokenizer)


## Train

In [22]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [24]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model.to("cuda")

optimizer = AdamW(model.parameters(), lr=5e-5)


In [51]:
def train(model, dataloader, optimizer):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to("cuda")
        attention_mask = batch["attention_mask"].to("cuda")
        labels = batch["labels"].to("cuda")

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
    return total_loss / len(dataloader)


In [50]:
def evaluate(model, dataloader):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validating"):
            input_ids = batch["input_ids"].to("cuda")
            attention_mask = batch["attention_mask"].to("cuda")
            labels = batch["labels"].to("cuda")

            labels[labels == 0] = -100

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            loss = outputs.loss
            total_loss += loss.item()

            predictions = torch.argmax(outputs.logits, dim=-1)
            active_tokens = labels != -100
            correct += (predictions[active_tokens] == labels[active_tokens]).sum().item()
            total += active_tokens.sum().item()

    accuracy = correct / total if total > 0 else 0
    return total_loss / len(dataloader), accuracy


In [35]:
epochs = 5
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train_loss = train(model, train_loader, optimizer)
    val_loss, val_accuracy = evaluate(model, val_loader)
    print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")


Epoch 1/5


Training: 100%|██████████| 2916/2916 [07:34<00:00,  6.41it/s]
Validating: 100%|██████████| 343/343 [00:20<00:00, 16.92it/s]


Train Loss: 0.0042, Validation Loss: 0.1640, Validation Accuracy: 0.9424
Epoch 2/5


Training: 100%|██████████| 2916/2916 [07:34<00:00,  6.41it/s]
Validating: 100%|██████████| 343/343 [00:20<00:00, 16.99it/s]


Train Loss: 0.0039, Validation Loss: 0.1579, Validation Accuracy: 0.9424
Epoch 3/5


Training: 100%|██████████| 2916/2916 [07:34<00:00,  6.41it/s]
Validating: 100%|██████████| 343/343 [00:20<00:00, 16.99it/s]


Train Loss: 0.0037, Validation Loss: 0.1504, Validation Accuracy: 0.9448
Epoch 4/5


Training: 100%|██████████| 2916/2916 [07:34<00:00,  6.41it/s]
Validating: 100%|██████████| 343/343 [00:20<00:00, 17.01it/s]


Train Loss: 0.0034, Validation Loss: 0.1341, Validation Accuracy: 0.9502
Epoch 5/5


Training: 100%|██████████| 2916/2916 [07:34<00:00,  6.42it/s]
Validating: 100%|██████████| 343/343 [00:20<00:00, 17.04it/s]

Train Loss: 0.0031, Validation Loss: 0.1346, Validation Accuracy: 0.9521





In [38]:
test_loader = DataLoader(test_dataset, batch_size=16)
test_loss, test_accuracy = evaluate(model, test_loader)

# In kết quả
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")


Validating: 100%|██████████| 426/426 [00:24<00:00, 17.20it/s]

Test Loss: 0.1131
Test Accuracy: 0.9597





In [39]:
model_path = "/kaggle/working/t5_model.pt"
torch.save(model.state_dict(), model_path)
