# BERT model for "reactionary and toxic content".

## Data processing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the three dataset splits (test, train, validation — same order as the names on the left).
data_test, data_train, data_val = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/toxic-phandong/test.csv',
        '/kaggle/input/toxic-phandong/train.csv',
        '/kaggle/input/toxic-phandong/val.csv',
    )
)

In [5]:
# Preview the first 8 rows of the test split.
data_test.head(8)

Unnamed: 0,text,label,check
0,Từ lúc mấy bro cmt cực kì cl gì đấy là rất khó...,1,
1,Nghe trần dần thấy cười đau cả bụng,0,
2,Idol tui có khác. Phải v chứ ai như cái con ph...,1,
3,Cảnh Sát GiựT Tiền = CSGT,1,
4,Ông đi đâu mà lạc sang đây đây,0,
5,Gia cát bị đồng bóng à,1,
6,Mấy tụi DLV hay nói đu càng này nọ. Nảo chúng ...,1,
7,Quả múa chung được phết thầy ạ :v,0,


In [4]:
# Preview the first rows of the training split.
data_train.head()

Unnamed: 0,text,label,check
0,Bấp bênh vl thế,1,True
1,Thấy chán ad page này kiến thức thì nông cản c...,1,
2,Giang Giang Đỗ Thị Ngọc Hà trend mới kìa kìa,0,
3,đcm 😒 sau có con cho hút cỏ chữa bệnh chứ đéo ...,1,
4,Má nứng quá aiu ơi,1,True


In [5]:
# Preview the first rows of the validation split.
data_val.head()

Unnamed: 0,text,label,check
0,"Cs cha , Cs con , Rận cha, Rận con ...ai đúng ...",1,
1,Long Duy Đào cái tội ớ,0,
2,Nói gì tào lao kg,1,
3,Chả làm gì cũng có fan cứng =)),0,
4,Sao lại cứ đến nụ cười . lên không nghe rõ lời...,0,True


In [3]:
# Remove the auxiliary 'check' column from every split.
# errors='ignore' keeps this cell idempotent: re-running it once the column is
# already gone is a no-op instead of raising KeyError.
data_train = data_train.drop(columns=['check'], errors='ignore')
data_test = data_test.drop(columns=['check'], errors='ignore')
data_val = data_val.drop(columns=['check'], errors='ignore')

In [7]:
# Print the per-column count of missing values for each split.
for _heading, _frame in (
    ("Missing values in train data:", data_train),
    ("Missing values in validation data:", data_val),
    ("Missing values in test data:", data_test),
):
    print(_heading, _frame.isnull().sum())

Missing values in train data: text     0
label    0
dtype: int64
Missing values in validation data: text     0
label    0
dtype: int64
Missing values in test data: text     0
label    0
dtype: int64


In [8]:
# Inspect the training rows at positions 11 through 19.
data_train.iloc[11:20]

Unnamed: 0,text,label
11,Mỗi lần lên sóng là đeo vàng làm như giàu lắm ...,1
12,Hãnh diện về ng thầy có tâm nhất của năm.,0
13,12:55 sau buổi phỏng vấn này tôi sẽ múa 1 bài ...,0
14,Du học sinh Úc pewpew,0
15,Có ai thấy khá giống đầu của Jadon Sancho ko nhỉ?,0
16,Khánh Con Sao Ko lấy số người chết của Ý ra nh...,1
17,Đúng là con người mày không có ý thức dân việt...,1
18,Tên bài hát còn trẻ trâu hơn cả t nhé ! Cười vl 😁,0
19,K ICM làm ca sĩ luôn rồi à 😢😢😢,0


In [5]:
# Summarize the training split per label value.
data_train.groupby('label').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,39123,37948,"Cuối tuần đi làm, rảnh rỗi đọc mấy truyện Voz ...",57
1,8831,8703,Dâm tiện,24


In [4]:
# Discard fully duplicated rows, renumber the index from 0, then display the frame.
data_train = data_train.drop_duplicates().reset_index(drop=True)
data_train

Unnamed: 0,text,label
0,Bấp bênh vl thế,1
1,Thấy chán ad page này kiến thức thì nông cản c...,1
2,Giang Giang Đỗ Thị Ngọc Hà trend mới kìa kìa,0
3,đcm 😒 sau có con cho hút cỏ chữa bệnh chứ đéo ...,1
4,Má nứng quá aiu ơi,1
...,...,...
46646,"đm,tôi đây nuôi chó bao nhiêu con xem nó như c...",1
46647,Đơn giản BTC nó giống như 1 cái máy slot cờ bạ...,1
46648,"Văn vẻ đọc loạn cả não Chủ thớt cho nó de đi,9...",1
46649,Có loz tiền mà đầu tư được hết các điểm thi,0


In [11]:
# Summarize the test split per label value.
data_test.groupby('label').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,5876,5876,Nghe trần dần thấy cười đau cả bụng,1
1,934,934,Từ lúc mấy bro cmt cực kì cl gì đấy là rất khó...,1


In [12]:
# Summarize the validation split per label value.
data_val.groupby('label').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,4580,4580,Long Duy Đào cái tội ớ,1
1,895,895,"Cs cha , Cs con , Rận cha, Rận con ...ai đúng ...",1


## PhoBERT

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

# PhoBERT accepts at most 256 tokens per sequence. Truncating at 512 would let
# over-long comments through and overflow the model's position embeddings, so
# the cap is set to the model's real limit.
PHOBERT_MAX_LENGTH = 256

# Shared tokenizer options: pad to the longest sequence in the call, truncate
# at the cap above, and return PyTorch tensors.
_encode_kwargs = dict(
    padding=True,
    truncation=True,
    max_length=PHOBERT_MAX_LENGTH,
    return_tensors="pt",
)

train_encodings = tokenizer(data_train['text'].tolist(), **_encode_kwargs)
val_encodings = tokenizer(data_val['text'].tolist(), **_encode_kwargs)
test_encodings = tokenizer(data_test['text'].tolist(), **_encode_kwargs)

print("Train tokenized sample:", train_encodings['input_ids'][0])


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

Train tokenized sample: tensor([    0,   924,  2863, 17939,  1659,  2228,   570,     2,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     

### Class weight & Focal loss

In [8]:
# from sklearn.utils.class_weight import compute_class_weight
# import torch

# labels = data_train['label'].values
# class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0,1]), y=labels)
# class_weights = torch.tensor(class_weights, dtype=torch.float)
# print(class_weights)

tensor([0.6147, 2.6802])


In [9]:
# class FocalLoss(torch.nn.Module):
#   def __init__(self, class_weight, alpah=0.25, gamm=2):
#     super(FocalLoss, self).__init__()
#     self.alpha = alpah
#     self.gamma = gamm
#     self.weights = class_weight
#   def forward(self,logits,labels):
#     ce_loss = torch.nn.CrossEntropyLoss(weight=self.weights)(logits,labels)
#     pt = torch.exp(-ce_loss)
#     F_loss = self.alpha*(1-pt)**self.gamma*ce_loss
#     return F_loss


In [10]:
# loss_fn = FocalLoss(class_weights)

In [11]:
# def compute_loss_with_weights(model, inputs, return_outputs=False):
#     labels = inputs.get("labels")
#     outputs = model(**inputs)
#     logits = outputs.get("logits")
#     loss = loss_fn(logits, labels)
#     return (loss, outputs) if return_outputs else loss

In [11]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [6]:
import torch

# Use the CUDA device when one is available; otherwise fall back to the CPU.
_has_gpu = torch.cuda.is_available()
device = torch.device("cuda" if _has_gpu else "cpu")
print(
    f"Using GPU: {torch.cuda.get_device_name(0)}"
    if _has_gpu
    else "GPU not available, using CPU instead"
)

Using GPU: Tesla P100-PCIE-16GB


In [7]:
def compute_metrics(eval_pred):
    """Compute binary-classification metrics for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the argmax of its logits over the last axis.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. Precision, recall and F1 use ``average="binary"``; an
        undefined value (for example, no positive predictions) is reported
        as 0 without emitting a warning.

    Side effects:
        When a W&B run is active, the same four values are also sent to it
        with ``wandb.log``.
    """
    # Imported locally so the function does not rely on a later cell having
    # already put `wandb` into the notebook's global namespace.
    import wandb

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="binary", zero_division=0
    )

    metrics = {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

    # wandb.log raises when no run is active (e.g. an evaluation launched after
    # wandb.finish()); skip the extra logging in that case instead of failing
    # the whole evaluation.
    if wandb.run is not None:
        wandb.log(dict(metrics))

    return metrics


In [19]:
import wandb

# Only authenticate here. The run itself is created, together with its
# hyperparameter config, by the wandb.init(...) call in the following cell;
# calling wandb.init here as well would open a redundant run with no config.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mah929586[0m ([33mah929586-iai[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [20]:
import wandb

# Hyperparameters stored on the W&B run; the training-arguments cell reads them
# back through wandb.config.
_run_hyperparameters = {
    "learning_rate": 2e-5,
    "train_batch_size": 32,
    "epochs": 6,
    "weight_decay": 0.01,
}
wandb.init(project="PhoBert1-hyperparameter-tuning", config=_run_hyperparameters)


In [21]:
import os
os.environ["WANDB_DISABLED"] = "false"

# Hyperparameters come from the active W&B run's config.
_cfg = wandb.config

_training_kwargs = dict(
    # Output locations and W&B reporting.
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=50,
    report_to="wandb",
    run_name=f"run_lr_{_cfg.learning_rate}_wd_{_cfg.weight_decay}",
    # Optimisation settings.
    learning_rate=_cfg.learning_rate,
    weight_decay=_cfg.weight_decay,
    num_train_epochs=_cfg.epochs,
    per_device_train_batch_size=_cfg.train_batch_size,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints,
    # and reload the checkpoint with the highest F1 when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)
training_args = TrainingArguments(**_training_kwargs)




In [22]:
# Load the pretrained "vinai/phobert-base" checkpoint with a 2-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [14]:
from datasets import Dataset
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the notebook's global ``tokenizer``.

    Sequences are truncated to at most 128 tokens and padded (``padding=True``).
    The tokenizer's output is returned unchanged.
    """
    tokenize_options = {"truncation": True, "padding": True, "max_length": 128}
    return tokenizer(examples["text"], **tokenize_options)

def _to_torch_dataset(frame):
    """Turn one DataFrame split into a tokenized ``Dataset`` in torch format."""
    split = Dataset.from_pandas(frame).map(preprocess_function, batched=True)
    # Drop the raw text after tokenization and rename the target column to "labels".
    split = split.remove_columns(["text"]).rename_column("label", "labels")
    split.set_format("torch")
    return split


train_dataset, val_dataset, test_dataset = (
    _to_torch_dataset(frame) for frame in (data_train, data_val, data_test)
)

Map:   0%|          | 0/46651 [00:00<?, ? examples/s]

Map:   0%|          | 0/5475 [00:00<?, ? examples/s]

Map:   0%|          | 0/6810 [00:00<?, ? examples/s]

In [23]:
# Wire the model, training arguments, datasets, tokenizer and metric function
# into a Trainer, then run training.
_trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
    "tokenizer": tokenizer,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**_trainer_kwargs)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2474,0.268502,0.899726,0.791246,0.52514,0.631296
2,0.2179,0.250957,0.903379,0.825623,0.518436,0.636925
3,0.1751,0.266311,0.904292,0.7609,0.604469,0.673724
4,0.1112,0.338892,0.903014,0.740741,0.625698,0.678377
5,0.106,0.369382,0.904475,0.756906,0.612291,0.676961
6,0.0545,0.455256,0.9021,0.752461,0.597765,0.666252


TrainOutput(global_step=8748, training_loss=0.16142815748032882, metrics={'train_runtime': 3403.974, 'train_samples_per_second': 82.229, 'train_steps_per_second': 2.57, 'total_flos': 1.841159076539904e+16, 'train_loss': 0.16142815748032882, 'epoch': 6.0})

In [24]:
def _log_eval_summary(prefix, results):
    """Log the loss and F1 of one evaluation to W&B as ``<prefix>_loss`` and ``<prefix>_f1``."""
    wandb.log({
        f"{prefix}_loss": results["eval_loss"],
        f"{prefix}_f1": results["eval_f1"],
    })


# Validation split.
val_results = trainer.evaluate(eval_dataset=val_dataset)
print(f"Validation Results: {val_results}")
_log_eval_summary("validation", val_results)

# Test split.
test_results = trainer.evaluate(eval_dataset=test_dataset)
print(f"Test Results: {test_results}")
_log_eval_summary("test", test_results)


Validation Results: {'eval_loss': 0.3388921022415161, 'eval_accuracy': 0.903013698630137, 'eval_precision': 0.7407407407407407, 'eval_recall': 0.6256983240223464, 'eval_f1': 0.6783767413688674, 'eval_runtime': 19.0222, 'eval_samples_per_second': 287.822, 'eval_steps_per_second': 9.042, 'epoch': 6.0}
Test Results: {'eval_loss': 0.2517828345298767, 'eval_accuracy': 0.9283406754772393, 'eval_precision': 0.7337526205450734, 'eval_recall': 0.7494646680942184, 'eval_f1': 0.7415254237288135, 'eval_runtime': 23.6499, 'eval_samples_per_second': 287.95, 'eval_steps_per_second': 9.006, 'epoch': 6.0}


In [25]:
# Save the model and its tokenizer into the same directory.
save_directory = "./phobert_model"
os.makedirs(save_directory, exist_ok=True)  # exist_ok: no error if the directory is already there
model.save_pretrained(save_directory)
# Last expression of the cell, so its return value is what the cell displays.
tokenizer.save_pretrained(save_directory)


('./phobert_model/tokenizer_config.json',
 './phobert_model/special_tokens_map.json',
 './phobert_model/vocab.txt',
 './phobert_model/bpe.codes',
 './phobert_model/added_tokens.json')

In [18]:
# Mark the current W&B run as finished.
wandb.finish()

# _____________________________________________

### Test

In [56]:
# Directory from which the model and tokenizer for the "Test" section are loaded.
model_save_path = "/kaggle/input/bert/pytorch/default/1"

In [57]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Load a BERT sequence classifier and its tokenizer from model_save_path.
model_b = BertForSequenceClassification.from_pretrained(model_save_path)
# NOTE(review): this rebinds the global `tokenizer` that was earlier created from
# "vinai/phobert-base"; anything that reads `tokenizer` (e.g. preprocess_function)
# will use this BERT tokenizer if it runs after this cell.
tokenizer = BertTokenizer.from_pretrained(model_save_path)

# Move the model to `device`; as the cell's last expression this also displays the model.
model_b.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [60]:
def test_model(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model_b(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    return predictions.item()

# Example test: classify a few sample sentences and print each prediction.
test_text = [
    "Gặp được nhau là định mệnh",
    "Gặp được nhau là đm",
    "Tuấn Hưng liên thiên quá rồi đấy",
]
for sentence in test_text:
    print(f"{sentence}: {test_model(sentence)}")


Gặp được nhau là định mệnh: 0
Gặp được nhau là đm: 1
Tuấn Hưng liên thiên quá rồi đấy: 0
