In [1]:
import torch
from transformers import AutoModel, AutoTokenizer
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from vncorenlp import VnCoreNLP

class MyDataset(Dataset):
    """Text-classification dataset: word-segment each text, then encode it with a tokenizer.

    Args:
        data: indexable collection of raw text strings.
        labels: indexable collection of integer class ids, aligned with ``data``.
        tokenizer: callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=max_len, return_tensors='pt')`` and
            returning a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_len: fixed sequence length every sample is padded/truncated to.
        rdrsegmenter: optional, already constructed word segmenter exposing
            ``tokenize(text)``. Passing one lets several datasets (e.g. train
            and test) share a single segmenter instead of each building its own.
        jar_path: location of the VnCoreNLP jar, used only when ``rdrsegmenter``
            is not supplied.

    Raises:
        ValueError: if ``data`` and ``labels`` have different lengths.
    """

    def __init__(self, data, labels, tokenizer, max_len=256, rdrsegmenter=None,
                 jar_path="./weights/VnCoreNLP-1.2.jar"):
        if len(data) != len(labels):
            raise ValueError(
                "data and labels must have the same length, got {} and {}".format(len(data), len(labels))
            )
        self.data = data
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        if rdrsegmenter is None:
            rdrsegmenter = VnCoreNLP(jar_path, annotators="wseg", max_heap_size='-Xmx500m')
        self.rdrsegmenter = rdrsegmenter

    def __getitem__(self, idx):
        """Return one encoded sample as a dict of 1-D tensors plus its label."""
        text_tokenized = self.tokenize(self.data[idx])
        inputs = self.tokenizer(
            text_tokenized,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

        label = torch.tensor(self.labels[idx], dtype=torch.long)

        # return_tensors='pt' adds a leading batch dimension of size 1; drop it so
        # the DataLoader can stack samples into (batch, max_len).
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': label
        }

    def __len__(self):
        """Number of samples."""
        return len(self.data)

    def tokenize(self, text):
        """Word-segment ``text`` and join all tokens with single spaces.

        Segmentation is best-effort: on any failure the error is printed and an
        empty string is returned so that one bad sample does not abort training.
        """
        try:
            sents = self.rdrsegmenter.tokenize(text)
            text_tokenized = ' '.join([' '.join(sent) for sent in sents])
        except Exception as e:
            print(f"Failed to tokenize text: {text}. Error: {e}")
            text_tokenized = ''
        return text_tokenized



  from .autonotebook import tqdm as notebook_tqdm


# Tạo tập dữ liệu tiến hành train model `PhoBert`

In [2]:
import os
import pickle
# Directory that holds one pickle file per text class.
dataset_path = '../../dataset/text_cls'
dataset_files = os.listdir(dataset_path)
print(dataset_files)

['OTHER_class.pkl', 'TOTAL_COST_class.pkl', 'ADDRESSS_class.pkl', 'TIMESTAMP_class.pkl', 'SELER_class.pkl']


In [3]:
# Base name of each pickle (spelling matches the files on disk) -> notebook
# variable it populates.
class_files = {
    'OTHER_class': 'OTHER',
    'TOTAL_COST_class': 'TOTAL_COST',
    'ADDRESSS_class': 'ADDRESS',
    'TIMESTAMP_class': 'TIMESTAMP',
    'SELER_class': 'SELLER',
}

loaded_classes = {}
for dataset in sorted(os.listdir(dataset_path)):
    dataset_name, extension = os.path.splitext(dataset)
    # Skip anything that is not one of the expected class pickles (stray files,
    # sub-directories, checkpoints, ...) instead of opening it.
    if extension != '.pkl' or dataset_name not in class_files:
        continue
    path = os.path.join(dataset_path, dataset)
    with open(path, 'rb') as f:
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # that come from a trusted source.
        loaded_classes[class_files[dataset_name]] = pickle.load(f)

# Fail here with a clear message rather than with a NameError in a later cell.
missing_classes = sorted(set(class_files.values()) - set(loaded_classes))
if missing_classes:
    raise FileNotFoundError(
        "Missing class pickle(s) for {} in {}".format(missing_classes, dataset_path)
    )

OTHER = loaded_classes['OTHER']
TOTAL_COST = loaded_classes['TOTAL_COST']
ADDRESS = loaded_classes['ADDRESS']
TIMESTAMP = loaded_classes['TIMESTAMP']
SELLER = loaded_classes['SELLER']
    


In [4]:
import numpy as np

# Integer class ids: 0 = seller, 1 = address, 2 = other.
SELER_LABEL = [0] * len(SELLER)
ADDRESS_LABEL = [1] * len(ADDRESS)
OTHER_LABEL = [2] * len(OTHER)

# Samples and labels are concatenated in the same class order so that
# data[i] and labels[i] always refer to the same example.
data = SELLER + ADDRESS + OTHER
labels = SELER_LABEL + ADDRESS_LABEL + OTHER_LABEL


In [5]:
from sklearn.model_selection import train_test_split
data = np.array(data)
labels = np.array(labels)

# Hold out 10% for evaluation; stratifying keeps the class proportions equal in
# both parts and the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    data,
    labels,
    test_size=0.1,
    random_state=42,
    stratify=labels,
)



In [6]:

BATCH_SIZE = 16

tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base')
train_dataset = MyDataset(X_train, Y_train, tokenizer)
test_dataset = MyDataset(X_test, Y_test, tokenizer)

# Only the training data is shuffled. The evaluation loader keeps a fixed order
# so that validation metrics computed batch by batch are reproducible from one
# run to the next.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [7]:
from vncorenlp import VnCoreNLP
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random
from tqdm import tqdm
import pickle
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
import os
from transformers import RobertaForSequenceClassification, RobertaConfig, AdamW, RobertaTokenizer, RobertaTokenizerFast, RobertaModel, AutoTokenizer
from datetime import datetime
import glob

class ROBERTAClassifier(torch.nn.Module):
    """RoBERTa encoder followed by a small classification head.

    Head: Dropout -> Linear(768, 64) -> LayerNorm -> tanh -> Dropout -> Linear(64, num_labels).

    Args:
        num_labels: number of output classes.
        bert_model: encoder module to use; when ``None`` a ``RobertaModel`` is
            loaded from ``./vinai1/phobert-base``.
        dropout_rate: dropout probability used in both dropout layers.
    """

    def __init__(self, num_labels, bert_model, dropout_rate=0.3):
        super(ROBERTAClassifier, self).__init__()
        if bert_model is not None:
            self.roberta = bert_model
        else:
            self.roberta = RobertaModel.from_pretrained("./vinai1/phobert-base")
        self.d1 = torch.nn.Dropout(dropout_rate)
        self.l1 = torch.nn.Linear(768, 64)
        # Attribute name kept as-is (it is part of the state_dict keys) even
        # though the layer is a LayerNorm, not a BatchNorm.
        self.bn1 = torch.nn.LayerNorm(64)
        self.d2 = torch.nn.Dropout(dropout_rate)
        self.l2 = torch.nn.Linear(64, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores, one row per input sequence."""
        encoder_outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Take the second output (the pooled representation) by index. Tuple
        # unpacking is wrong when the encoder returns a dict-like output object:
        # iterating such an object yields its keys, not its tensors.
        x = encoder_outputs[1]
        x = self.d1(x)
        x = self.l1(x)
        x = self.bn1(x)
        x = torch.tanh(x)
        x = self.d2(x)
        x = self.l2(x)
        return x

class BERTClassifier(torch.nn.Module):
    """Thin wrapper around ``RobertaForSequenceClassification`` loaded from local files.

    Args:
        num_labels: number of output classes.
        pretrained_dir: directory containing ``config.json`` and
            ``pytorch_model.bin`` of the pretrained encoder.
    """

    def __init__(self, num_labels, pretrained_dir="./vinai1/phobert-base"):
        super(BERTClassifier, self).__init__()
        bert_classifier_config = RobertaConfig.from_pretrained(
            "{}/config.json".format(pretrained_dir),
            from_tf=False,
            num_labels=num_labels,
            output_hidden_states=False,
        )
        print("LOAD BERT PRETRAIN MODEL")
        self.bert_classifier = RobertaForSequenceClassification.from_pretrained(
            "{}/pytorch_model.bin".format(pretrained_dir),
            config=bert_classifier_config
        )

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped classifier and return its output unchanged.

        ``labels`` is optional so the same module can be called for inference;
        it is forwarded as-is to the wrapped model.
        """
        output = self.bert_classifier(input_ids=input_ids,
                                      token_type_ids=None,
                                      attention_mask=attention_mask,
                                      labels=labels
                                      )
        return output

class ClassifierTrainner():
    """Fine-tunes a sequence classifier and manages its checkpoints on disk.

    The wrapped model is called as ``model(input_ids, attention_mask=..., labels=...)``
    and must return the loss at index 0 and the logits at index 1. Training and
    validation batches are mappings with ``'input_ids'``, ``'attention_mask'``
    and ``'labels'`` tensors.

    Args:
        bert_model: the model to train.
        train_dataloader: iterable of training batches.
        valid_dataloader: iterable of validation batches.
        epochs: number of passes over ``train_dataloader``.
        cuda_device: ``"cpu"``, a GPU index such as ``"0"``/``"1"`` (mapped to
            ``"cuda:<index>"``), or any other torch device string.
        save_dir: checkpoint directory. If it exists and contains
            ``model_epoch*`` files, the most recently written one is restored.
            If it does not exist it is created. When ``None``, a directory
            named after the current timestamp is created.
    """

    def __init__(self, bert_model, train_dataloader, valid_dataloader, epochs=10, cuda_device="cpu", save_dir=None):
        self.device = self._resolve_device(cuda_device)
        self.model = bert_model
        self.train_dataloader = train_dataloader
        self.valid_dataloader = valid_dataloader
        self.epochs = epochs

        if save_dir is not None and os.path.exists(save_dir):
            print("Load weight from file:{}".format(save_dir))
            self.save_dir = save_dir
            epoch_checkpoint_paths = glob.glob("{}/model_epoch*".format(self.save_dir))
            if len(epoch_checkpoint_paths) == 0:
                print("No checkpoint found in: {}\nCheck save_dir...".format(self.save_dir))
            else:
                # glob returns a list; restore the single most recently written file.
                latest_checkpoint = max(epoch_checkpoint_paths, key=os.path.getmtime)
                self.load_checkpoint(latest_checkpoint)
                print("Restore weight successful from: {}".format(latest_checkpoint))
        else:
            if save_dir is not None:
                self.save_dir = save_dir
            else:
                self.save_dir = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
            os.makedirs(self.save_dir, exist_ok=True)
            print("Training new model, save to: {}".format(self.save_dir))

    @staticmethod
    def _resolve_device(cuda_device):
        """Map a bare GPU index ("0", 1, ...) to "cuda:<index>"; pass other names through."""
        device_name = str(cuda_device)
        if device_name.isdigit():
            return 'cuda:{}'.format(device_name)
        return device_name

    def save_checkpoint(self, save_path):
        """Write the model's ``state_dict`` to ``save_path``."""
        state_dict = {'model_state_dict': self.model.state_dict()}
        torch.save(state_dict, save_path)
        print(f'Model saved to ==> {save_path}')

    def load_checkpoint(self, load_path):
        """Load a ``state_dict`` written by ``save_checkpoint`` into the model."""
        state_dict = torch.load(load_path, map_location=self.device)
        print(f'Model restored from <== {load_path}')
        self.model.load_state_dict(state_dict['model_state_dict'])

    @staticmethod
    def flat_accuracy(preds, labels):
        """Return ``(accuracy, macro_f1)`` for logits ``preds`` against integer ``labels``."""
        pred_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        # scikit-learn metrics take (y_true, y_pred) in that order.
        F1_score = f1_score(labels_flat, pred_flat, average='macro')
        return accuracy_score(labels_flat, pred_flat), F1_score

    @staticmethod
    def _extract_logits(outputs):
        """Pull the logits tensor out of a raw tensor, an output object or a tuple."""
        if torch.is_tensor(outputs):
            return outputs
        if hasattr(outputs, 'logits'):
            return outputs.logits
        return outputs[0]

    def _build_optimizer(self):
        """AdamW with weight decay switched off for biases and LayerNorm parameters."""
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        decay_params, no_decay_params = [], []
        for name, param in self.model.named_parameters():
            if any(nd in name for nd in no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)
        optimizer_grouped_parameters = [
            {'params': decay_params, 'weight_decay': 0.01},
            {'params': no_decay_params, 'weight_decay': 0.0}
        ]
        return AdamW(optimizer_grouped_parameters, lr=1e-5, correct_bias=False)

    def _forward_batch(self, batch):
        """Move one labelled batch to the device and return ``(loss, logits, labels)``."""
        b_input_ids = batch['input_ids'].to(self.device)
        b_input_mask = batch['attention_mask'].to(self.device)
        b_labels = batch['labels'].to(self.device)
        outputs = self.model(b_input_ids,
                             attention_mask=b_input_mask,
                             labels=b_labels
                             )
        return outputs[0], outputs[1], b_labels

    def _train_one_epoch(self, optimizer, epoch_i):
        """Run one optimisation pass; return ``(mean_loss, accuracy, macro_f1)``."""
        self.model.train()
        total_loss = 0.0
        all_logits, all_labels = [], []
        for step, batch in enumerate(self.train_dataloader):
            self.model.zero_grad()
            loss, logits, b_labels = self._forward_batch(batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            optimizer.step()

            total_loss += loss.item()
            logits_np = logits.detach().cpu().numpy()
            labels_np = b_labels.cpu().numpy()
            all_logits.append(logits_np)
            all_labels.append(labels_np)

            if step % 100 == 0:
                batch_accuracy, _ = self.flat_accuracy(logits_np, labels_np)
                print("[TRAIN] Epoch {}/{} | Batch {}/{} | Train Loss={} | Train Acc={}".format(
                    epoch_i + 1, self.epochs, step, len(self.train_dataloader), loss.item(), batch_accuracy))

        if not all_logits:
            raise ValueError("train_dataloader produced no batches")
        # Metrics are computed once over the whole epoch; averaging per-batch
        # macro-F1 values does not give the F1 of the full set.
        accuracy, f1 = self.flat_accuracy(np.concatenate(all_logits), np.concatenate(all_labels))
        return total_loss / len(all_logits), accuracy, f1

    def _evaluate(self):
        """Score the model on the validation loader; return ``(mean_loss, accuracy, macro_f1)``."""
        self.model.eval()
        total_loss = 0.0
        all_logits, all_labels = [], []
        for batch in self.valid_dataloader:
            with torch.no_grad():
                loss, logits, b_labels = self._forward_batch(batch)
            # .item() keeps the running loss a Python float instead of a device tensor.
            total_loss += loss.item()
            all_logits.append(logits.detach().cpu().numpy())
            all_labels.append(b_labels.cpu().numpy())

        if not all_logits:
            raise ValueError("valid_dataloader produced no batches")
        accuracy, f1 = self.flat_accuracy(np.concatenate(all_logits), np.concatenate(all_labels))
        return total_loss / len(all_logits), accuracy, f1

    def train_classifier(self):
        """Train for ``self.epochs`` epochs, validating and checkpointing after each one.

        Files written to ``self.save_dir``:
        ``model_best_valoss.pt`` (lowest validation loss so far),
        ``model_best_valacc.pt`` (highest validation accuracy so far) and
        ``model_epoch<i>.pt`` for the latest epoch only.
        """
        self.model.to(self.device)
        optimizer = self._build_optimizer()

        # Best-so-far trackers live outside the epoch loop so they persist
        # across epochs.
        best_valid_loss = float('inf')
        best_eval_accuracy = 0.0

        for epoch_i in range(0, self.epochs):
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, self.epochs))
            print('Training...')
            avg_train_loss, train_accuracy, train_f1 = self._train_one_epoch(optimizer, epoch_i)
            print(" Train Accuracy: {0:.4f}".format(train_accuracy))
            print(" Train F1 score: {0:.4f}".format(train_f1))
            print(" Train Loss: {0:.4f}".format(avg_train_loss))

            print("Running Validation...")
            eval_loss, eval_accuracy, eval_f1 = self._evaluate()
            print(" Valid Loss: {0:.4f}".format(eval_loss))
            print(" Valid Accuracy: {0:.4f}".format(eval_accuracy))
            print(" Valid F1 score: {0:.4f}".format(eval_f1))

            if eval_loss < best_valid_loss:
                best_valid_loss = eval_loss
                best_valid_loss_path = "{}/model_best_valoss.pt".format(self.save_dir)
                self.save_checkpoint(best_valid_loss_path)
            if eval_accuracy > best_eval_accuracy:
                best_eval_accuracy = eval_accuracy
                best_eval_accuracy_path = "{}/model_best_valacc.pt".format(self.save_dir)
                self.save_checkpoint(best_eval_accuracy_path)

            epoch_i_path = "{}/model_epoch{}.pt".format(self.save_dir, epoch_i)
            self.save_checkpoint(epoch_i_path)
            # Keep only the newest per-epoch checkpoint.
            previous_epoch_path = "{}/model_epoch{}.pt".format(self.save_dir, epoch_i - 1)
            if os.path.exists(previous_epoch_path):
                try:
                    os.remove(previous_epoch_path)
                except OSError as e:
                    print(f'Can not remove ! at epoch{epoch_i}: {e}')

        print("Training complete!")

    def predict_dataloader(self, dataloader, classes, tokenizer):
        """Predict a class for every sample of ``dataloader`` and print each result.

        Batches may be mappings with ``'input_ids'``/``'attention_mask'`` or
        sequences whose first two items are ``(input_ids, attention_mask)``.

        Returns:
            list with one entry of ``classes`` per sample, in iteration order.
        """
        self.model.to(self.device)
        self.model.eval()
        predictions = []
        for batch in dataloader:
            if isinstance(batch, dict):
                b_input_ids, b_input_mask = batch['input_ids'], batch['attention_mask']
            else:
                b_input_ids, b_input_mask = batch[0], batch[1]
            b_input_ids = b_input_ids.to(self.device)
            b_input_mask = b_input_mask.to(self.device)
            with torch.no_grad():
                outputs = self.model(b_input_ids,
                                     attention_mask=b_input_mask,
                                     labels=None
                                     )
            logits = self._extract_logits(outputs).detach().cpu().numpy()
            pred_flat = np.argmax(logits, axis=1).flatten()
            # Report sample by sample so any batch size works.
            for sample_ids, class_index in zip(b_input_ids, pred_flat):
                predicted = classes[int(class_index)]
                print("[PREDICT] {}:{}".format(predicted, tokenizer.decode(sample_ids, skip_special_tokens=True)))
                predictions.append(predicted)
        return predictions

    def predict_text(self, text, classes, tokenizer, max_len=256):
        """Predict the class of a single ``text``, print it and return it.

        The text is encoded with ``tokenizer`` itself (padding/truncation to
        ``max_len``) so that the padding id and attention mask match the
        tokenizer's own conventions. NOTE: ``MyDataset`` word-segments text
        before tokenizing; pass text segmented the same way for predictions
        that are consistent with training.
        """
        encoded = tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=max_len,
            return_tensors='pt'
        )
        input_ids = encoded['input_ids'].to(self.device)
        input_mask = encoded['attention_mask'].to(self.device)
        self.model.to(self.device)
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(input_ids,
                                 attention_mask=input_mask,
                                 labels=None
                                 )
        logits = self._extract_logits(outputs).detach().cpu().numpy()
        pred_flat = np.argmax(logits, axis=1).flatten()
        predicted = classes[int(pred_flat[0])]
        print("[PREDICT] {}:{}".format(predicted, text))
        return predicted



2024-10-24 16:47:46.590124: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-24 16:47:46.600229: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-24 16:47:46.603546: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-24 16:47:46.611931: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# cuda_device: "cpu" runs on the CPU; "0" / "1" select GPU 0 / GPU 1.
bert_classifier_model = BERTClassifier(3)
bert_classifier_trainer = ClassifierTrainner(
    bert_model=bert_classifier_model,
    train_dataloader=train_loader,
    valid_dataloader=test_loader,
    epochs=5,
    cuda_device='0',
)

# Restore the weights saved by an earlier training run.
bert_classifier_trainer.load_checkpoint(load_path='./21-10-2024_17-11-41/model_best_valoss.pt')


You are using a model of type bert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


LOAD BERT PRETRAIN MODEL


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./vinai1/phobert-base/pytorch_model.bin and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  state_dict = torch.load(load_path, map_location=self.device)


Training new model, save to: 24-10-2024_16-47-57
Model restored from <== ./21-10-2024_17-11-41/model_best_valoss.pt


In [None]:
def main():
    """Build a fresh 3-label BERTClassifier and train it for 5 epochs on GPU 0.

    Uses the notebook-level ``train_loader`` and ``test_loader``.
    cuda_device: "cpu" runs on the CPU; "0" / "1" select GPU 0 / GPU 1.
    """
    classifier = BERTClassifier(3)
    trainer = ClassifierTrainner(
        bert_model=classifier,
        train_dataloader=train_loader,
        valid_dataloader=test_loader,
        epochs=5,
        cuda_device='0',
    )
    trainer.train_classifier()

main()

In [11]:
!nvidia-smi

Mon Oct 21 16:18:16 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.29.01              Driver Version: 546.01       CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA RTX A4000               On  | 00000000:01:00.0 Off |                    0 |
| 41%   37C    P8               7W / 140W |    183MiB / 15352MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Re-training PhoBERT

In [None]:
class PhoBert(nn.Module):
    """PhoBERT encoder with a two-layer MLP head for single-label classification.

    Args:
        num_cls: number of output classes.
    """

    def __init__(self, num_cls):
        super(PhoBert, self).__init__()
        self.model = AutoModel.from_pretrained('vinai/phobert-base')
        # Read the encoder width from its config instead of hard-coding it
        # (768 for phobert-base).
        hidden_size = self.model.config.hidden_size
        self.clf = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_cls)
        )

    def forward(self, x, attention_mask=None, labels=None):
        """Classify a batch of token-id sequences.

        Args:
            x: input token ids.
            attention_mask: optional mask forwarded to the encoder so padding
                positions are ignored.
            labels: optional integer class ids. When given, the cross-entropy
                loss is computed as well.

        Returns:
            The raw (unnormalised) logits when ``labels`` is ``None``, otherwise
            the tuple ``(loss, logits)``. Logits are returned without a sigmoid:
            cross-entropy expects raw scores, and the classes are mutually
            exclusive.
        """
        encoder_outputs = self.model(x, attention_mask=attention_mask)
        # Index 1 of the encoder output is the pooled sequence representation.
        logits = self.clf(encoder_outputs[1])
        if labels is None:
            return logits
        loss = nn.functional.cross_entropy(logits, labels)
        return loss, logits


# Training script for the PhoBert classifier: builds the model on the GPU and
# iterates over train_loader for num_epochs epochs, accumulating the batch loss.
# NOTE(review): as visible here the loop never calls loss.backward() or
# optimizer.step(), and no validation follows — the cell may be unfinished or
# continue beyond this excerpt; confirm before relying on it.
model = PhoBert(3).to('cuda')
optimizer = optim.Adam(model.parameters(), lr = 0.001)
num_epochs = 5
print('===============TRAINING======================')
for epoch in range(num_epochs):
    model.train()
    # Per-epoch accumulators.
    total_loss = 0
    train_acc = 0
    # NOTE(review): these two are re-initialised at the start of every epoch, so
    # they cannot track a best-so-far value across epochs — confirm intent.
    best_valid_loss = float('inf')
    best_val_acc = 0
    for step, batch in enumerate(train_loader):
        # Each batch is a mapping; move its tensors to the GPU.
        input_ids = batch['input_ids'].to('cuda')
        input_mask = batch['attention_mask'].to('cuda')
        # NOTE(review): this rebinds the notebook-level name `labels` (the label
        # array created in an earlier cell) to a batch tensor.
        labels = batch['labels'].to('cuda')
        optimizer.zero_grad()
        outputs = model(input_ids,
                        attention_mask = input_mask,
                        labels = labels)
        # The first element of the model output is used as the loss.
        loss = outputs[0]
        total_loss += loss.item()
        
                        
