In [23]:
import torch
from torch.nn import functional as F
from torch import optim
from torch import nn
from torch.utils.data import DataLoader
import sys
sys.path.insert(0, '../')

from config import *
from transformers import BertModel, BertConfig, BertForSequenceClassification
from models import load_model, VanillaBert
from dataset import REDataset
from utils import set_seed
from criterions import *
from optimizers import *

In [14]:
# Path (relative to this notebook) of a previously saved VanillaBert checkpoint.
# The file name carries the epoch / accuracy / loss / run-id tags of that run.
# NOTE(review): no visible cell reads LOAD -- `load_state_dict` is set to None
# in the configuration cell, so this checkpoint is not passed to load_model.
LOAD = (
    "../saved_models/"
    "VanillaBert_bert-base-multilingual-cased_20210418164452/"
    "VanillaBert_bert-base-multilingual-cased_ep(02)acc(0.4878)loss(0.0048)id(20210418164452).pth"
)


In [19]:
# Run configuration. Every value below is read by later cells under the same
# name, so the names must stay as they are.

# -- model -------------------------------------------------------------------
model_type = ModelType.VanillaBert
pretrained_type = PreTrainedType.MultiLingual
num_classes = Config.NumClasses
pooler_idx = 0
load_state_dict = None  # forwarded unchanged to load_model

# -- data --------------------------------------------------------------------
data_root = Config.Train
preprocess_type = PreProcessType.ES
valid_size = Config.ValidSize
train_batch_size = Config.Batch8
valid_batch_size = 512

# -- optimisation ------------------------------------------------------------
epochs = Config.Epochs
optim_type = Optimizer.Adam
loss_type = Loss.CE
lr = Config.LRSlower
lr_scheduler = Optimizer.CosineScheduler  # None disables the scheduler in later cells

# -- runtime -----------------------------------------------------------------
device = Config.Device

In [24]:
# Build the training dataset from the configured root and preprocessing type.
dataset = REDataset(root=data_root, preprocess_type=preprocess_type, device=device)

# The training loop iterates `train_loader`; bind that name here so the loop
# does not fail with a NameError. `dataloader` is kept as an alias for any
# cell that still refers to the old name.
# NOTE(review): the batch size is hard-coded to 32 although the configuration
# cell defines `train_batch_size` -- confirm which one is intended.
train_loader = DataLoader(dataset, batch_size=32, shuffle=True, drop_last=True)
dataloader = train_loader

# Instantiate the model, move it to the configured device and switch it to
# training mode. `model.train()` is the cell's last expression, so the notebook
# displays the module repr below the cell.
model = load_model(model_type, pretrained_type, num_classes, load_state_dict, pooler_idx)
model.to(device)
model.train()

Load raw data...	preprocessing for 'EntitySeparation'...	done!
Load Tokenizer...	done!
Apply Tokenization...	done!
Load Model...	Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model 

VanillaBert(
  (backbone): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [25]:
# Training components, each selected by the corresponding value from the
# configuration cell.
optimizer = get_optimizer(model=model, type=optim_type, lr=lr)
criterion = get_criterion(type=loss_type)

# `scheduler` is bound only when a scheduler type is configured; the training
# loop guards its use with the same `lr_scheduler is not None` check.
if lr_scheduler is not None:
    scheduler = get_scheduler(type=lr_scheduler, optimizer=optimizer)

In [None]:
# Create the directory that will hold this run's checkpoints.
#
# The root is kept in its own name and `save_path` is always derived from it,
# so re-running this cell yields the same path instead of nesting one level
# deeper on every execution.
# NOTE(review): the root mirrors the directory used by the LOAD path
# ("../saved_models/<model>_<pretrained>_<timestamp>/"); replace it with the
# project's configuration constant if one exists.
checkpoint_root = "../saved_models"
checkpoint_dir = f"{model_type}_{pretrained_type}_{TIMESTAMP}"
save_path = os.path.join(checkpoint_root, checkpoint_dir)

# makedirs creates missing parents and is a no-op when the directory exists.
os.makedirs(save_path, exist_ok=True)

# train phase: starting values for the best-so-far metrics
best_acc = 0
best_loss = 999

# Training loop: one pass over `train_loader` per epoch, with predictions and
# loss accumulated for evaluation.
for epoch in range(epochs):
    print(f"Epoch: {epoch}")

    # Per-epoch accumulators: predicted labels, true labels, summed loss values.
    pred_list = []
    true_list = []
    total_loss = 0

    # tqdm wraps the loader itself rather than the enumerate object: enumerate
    # has no length, so wrapping it would hide the total and ETA in the bar.
    for idx, (sentences, labels) in enumerate(tqdm(train_loader, desc="[Train]")):
        # Forward pass. Which part of the model's return value is used as
        # `outputs` depends on the model type.
        if model_type == ModelType.SequenceClf:
            outputs = model(**sentences).logits
        elif model_type == ModelType.Base:
            outputs = model(**sentences).pooler_output
        else:
            outputs = model(**sentences)

        loss = criterion(outputs, labels)
        total_loss += loss.item()

        # backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # The scheduler, when configured, is stepped once per batch.
        if lr_scheduler is not None:
            scheduler.step()

        # stack preds for evaluate: index of the max value along dim 1
        # (assumes outputs is (batch, num_classes) -- TODO confirm)
        _, preds = torch.max(outputs, dim=1)
        preds = preds.data.cpu().numpy()
        labels = labels.data.cpu().numpy()

        pred_list.append(preds)
        true_list.append(labels)

        # NOTE(review): the arrays are rebuilt from every batch seen so far and
        # evaluated on each iteration, so the cost grows with the batch count.
        # If only epoch-level metrics are needed, move this after the batch
        # loop -- confirm against the rest of the cell.
        pred_arr = np.hstack(pred_list)
        true_arr = np.hstack(true_list)

        # evaluation phase
        train_eval = evaluate(y_true=true_arr, y_pred=pred_arr)  # ACC, F1, PRC, REC
        # Summed per-batch loss values divided by the number of samples so far.
        # NOTE(review): if the criterion already averages over the batch, this
        # under-reports the loss by a factor of the batch size -- confirm.
        train_loss = total_loss / len(true_arr)