In [1]:
import json
from collections import OrderedDict
import logging
import os
import pathlib
# Compatibility shim for Windows hosts: `pathlib.PosixPath` is aliased to
# `pathlib.WindowsPath` (presumably so that pickled checkpoints containing
# PosixPath objects can be loaded on Windows -- confirm against the checkpoint).
# The original class stays available as `temp` so the alias can be undone.
# The alias is applied on Windows only: on any other OS WindowsPath is the
# class that cannot be instantiated, so swapping it in would break path handling.
temp = pathlib.PosixPath
if os.name == "nt":
    pathlib.PosixPath = pathlib.WindowsPath

import numpy as np
import torch
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Transformer

from parse_config import ConfigParser
import data_loader.data_loaders as module_data
import model.model as module_arch
import model.metric as module_metric
import model.loss as module_loss
from trainer.trainer import Trainer, BertTrainer
from utils import prepare_device

In [2]:
# Pin the NumPy and PyTorch random generators to one seed so runs are repeatable.
SEED = 123
np.random.seed(SEED)
torch.manual_seed(SEED)
# cuDNN: request deterministic algorithms and switch off benchmark mode.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [3]:
# Checkpoint the model is initialised from. It is given relative to the working
# directory, like every other path in this configuration, rather than as an
# absolute path that exists on a single machine. Set the SBERT_CHECKPOINT_PATH
# environment variable to load a different file without editing the notebook.
CHECKPOINT_PATH = os.environ.get(
    "SBERT_CHECKPOINT_PATH",
    "./saved/models/sentence_bert/1211_113344/checkpoint-epoch1.pth",
)

# Raw settings for the `sentence_bert` run: tokenizer, architecture, data loader,
# optimizer, loss/metrics, LR scheduler and trainer options.
config_dict = {
    "name": "sentence_bert",
    "n_gpu": 1,
    "tokenizer": {
        "path": "./saved/tokenizer/tokenizer-all-MiniLM-L6-v2-msmarco"
    },
    "arch": {
        "type": "SentenceTransformersWrapperForLM",
        "args": {
            "model_name": "all-MiniLM-L6-v2",
            "model_path": None,
            "hidden_size": 512,
            "dropout": 0.1,
            "vocab_size": 32000,
            "load_path": CHECKPOINT_PATH,
        }
    },
    "data_loader": {
        "type": "MLMDataLoader",
        "args": {
            "data_path": "./first_1024_paras.tsv",
            "batch_size": 4,
            "shuffle": True,
            "validation_split": 0.1,
            "num_workers": 2
        }
    },
    "optimizer": {
        "type": "AdamW",
        "args": {
            "lr": 0.001,
            "weight_decay": 0,
            "amsgrad": True
        }
    },
    "loss": "mlm_loss",
    "metrics": [
        "mlm_accuracy"
    ],
    "lr_scheduler": {
        "type": "StepLR",
        "args": {
            "step_size": 1,
            "gamma": 0.9,
        }
    },
    "trainer": {
        "epochs": 5,

        "save_dir": "saved/",
        "save_period": 1,
        "verbosity": 2,

        "monitor": "min val_loss",
        "early_stop": 2,

        "tensorboard": True
    }
}

# `config` is the ConfigParser wrapper used by the following cells; the raw dict
# keeps its own name so re-running this cell never wraps a ConfigParser again.
config = ConfigParser(config_dict)
logger = config.get_logger('train')

In [4]:
# Load the tokenizer from the configured path, build the model from the 'arch'
# entry and log its architecture.
tokenizer = AutoTokenizer.from_pretrained(config['tokenizer']['path'])
model = config.init_obj('arch', module_arch)
logger.info(model)

# Build the training data loader and derive the validation loader from it.
data_loader = config.init_obj('data_loader', module_data, tokenizer)
valid_data_loader = data_loader.split_validation()

# Move the model to the selected device; wrap it in DataParallel when more than
# one device id is returned.
device, device_ids = prepare_device(config['n_gpu'])
model = model.to(device)
if len(device_ids) > 1:
    model = torch.nn.DataParallel(model, device_ids=device_ids)

# Look up the loss function and the metric functions by their configured names.
criterion = getattr(module_loss, config['loss'])
metrics = [getattr(module_metric, metric_name) for metric_name in config['metrics']]

# Create the optimizer over the parameters that require gradients, then the
# learning-rate scheduler. To disable the scheduler, delete every line that
# mentions lr_scheduler.
trainable_params = (param for param in model.parameters() if param.requires_grad)
optimizer = config.init_obj('optimizer', torch.optim, trainable_params)
lr_scheduler = config.init_obj('lr_scheduler', torch.optim.lr_scheduler, optimizer)

Load pretrained SentenceTransformer: all-MiniLM-L6-v2
Use pytorch device: cuda




Loaded model from E:\OneDrive - Hanoi University of Science and Technology\Chuyen nganh\Deep Learning and Its Applications\BTL\DeepLearning20231\saved\models\sentence_bert\1211_113344\checkpoint-epoch1.pth
SentenceTransformersWrapperForLM(
  (model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
    (2): Normalize()
  )
  (lm_output_layer): Sequential(
    (0): LazyLinear(in_features=0, out_features=512, bias=True)
    (1): Dropout(p=0.1, inplace=False)
    (2): Linear(in_features=512, out_features=32000, bias=True)
  )
)
Trainable parameters: 39893888


Map:   0%|          | 0/1023 [00:00<?, ? examples/s]

Map:   0%|          | 0/1023 [00:00<?, ? examples/s]

Map:   0%|          | 0/219 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [5]:
# Assemble the trainer from the model, loss, metrics, optimizer, data loaders,
# device and scheduler created in the earlier cells.
trainer = BertTrainer(
    model,
    criterion,
    metrics,
    optimizer,
    config=config,
    device=device,
    data_loader=data_loader,
    valid_data_loader=valid_data_loader,
    lr_scheduler=lr_scheduler,
)

In [6]:
# Start training. The output recorded below shows one completed epoch and a saved
# checkpoint before the run was interrupted (KeyboardInterrupt).
trainer.train()

    epoch          : 1
    loss           : 4.581803512573242
    mlm_accuracy   : 0.26564362770548444
    val_loss       : 4.240772445996602
    val_mlm_accuracy: 0.3030558510780352
Saving checkpoint: saved\models\sentence_bert\1211_214644\checkpoint-epoch1.pth ...
Saving current best: model_best.pth ...


KeyboardInterrupt: 