In [1]:
from components import *
import argparse
import yaml
import torch
import os
import re
import time
from transformers import AutoTokenizer
# from tqdm import tqdm
# from torch.nn.parallel import DistributedDataParallel, DataParallel
from torch.utils.data import ConcatDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Debugging aid: request synchronous CUDA kernel launches so a failing kernel is
# reported at the call that triggered it. Set before any CUDA work in this notebook.
os.environ.update({'CUDA_LAUNCH_BLOCKING': '1'})

In [3]:
# Load the experiment configuration (datasets, tokenizer, model hyper-parameters)
# from YAML into the plain dict `opt` used by every later cell.
config_path = './config/ViNLI-Zalo.yml'
with open(config_path, mode='r') as config_file:
    opt = yaml.safe_load(config_file)

In [4]:
# Display the parsed configuration dict to verify what was loaded from the YAML file.
opt

{'seed': 42,
 'datasets': {'ViNLI-Zalo': {'data_name': 'ViNLI-Zalo-supervised',
   'data_path': './data/segment/ViNLI-Zalo-supervised.json',
   'train_path': './data/train_test_split/ViNLI-Zalo-supervised-train.json',
   'test_path': './data/train_test_split/ViNLI-Zalo-supervised-test.json',
   'data_module': 'ViNLIZaloDataset',
   'test_size': 0.1}},
 'tokenizer': 'vinai/phobert-base-v2',
 'hf_cache': '../hf_cache',
 'max_length': 515,
 'pretrained_path': None,
 'load_state_dict_option': 'encoder_only',
 'model': {'model_type': 'Roberta',
  'hidden_size': 768,
  'num_hidden_layers': 12,
  'num_attention_heads': 12,
  'intermediate_size': 3072,
  'hidden_act': 'gelu',
  'hidden_dropout_prob': 0.1,
  'attention_probs_dropout_prob': 0.1,
  'max_position_embeddings': 515,
  'position_embedding_type': 'absolute',
  'type_vocab_size': 2,
  'layer_norm_eps': 1e-05,
  'initializer_range': 0.02,
  'classifier_dropout': 0.1,
  'num_labels': 2,
  'problem_type': 'single_label_classification'},
 

In [5]:
# Replace the tokenizer *name* stored in the config with the loaded tokenizer
# object, so later cells can read it from `opt['tokenizer']`.
# The isinstance guard keeps the cell idempotent: on a re-run the entry already
# holds a tokenizer object and must not be passed to from_pretrained again.
if isinstance(opt['tokenizer'], str):
    opt['tokenizer'] = AutoTokenizer.from_pretrained(opt['tokenizer'], cache_dir=opt['hf_cache'])

In [14]:
# Build one train/test split per configured dataset, then merge the splits so
# a single loader serves all of them.
train_parts = []
test_parts = []
for dataset_cfg in opt['datasets'].values():
    # Per-dataset settings and the global options are both forwarded as keywords.
    train_part, test_part = get_dataset(**dataset_cfg, **opt)
    train_parts.append(train_part)
    test_parts.append(test_part)

total_train_set = ConcatDataset(train_parts)
total_test_set = ConcatDataset(test_parts)
print('Num of training samples:', len(total_train_set))
print('Num of testing samples:', len(total_test_set))

train_loader = get_dataloader(total_train_set, 'train', **opt)
test_loader = get_dataloader(total_test_set, 'test', **opt)

Num of training samples: 11873
Num of testing samples: 1319


In [15]:
# Pull a single batch from the training loader and list the shape of every field.
batch = next(iter(train_loader))
for field_name, field_tensor in batch.items():
    print(field_name, field_tensor.shape)

input_ids torch.Size([4, 515])
attention_mask torch.Size([4, 515])
token_type_ids torch.Size([4, 515])
labels torch.Size([4])
position_ids torch.Size([4, 515])


In [16]:
# Display the raw tensors of the sampled batch for a visual sanity check.
batch

{'input_ids': tensor([[   0, 8461,   24,  ...,    1,    1,    1],
         [   0, 8284,  741,  ...,   33,  909,    2],
         [   0,  432,  245,  ...,    1,    1,    1],
         [   0,  432,  426,  ...,    1,    1,    1]]),
 'attention_mask': tensor([[1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1]]),
 'labels': tensor([0, 1, 0, 0]),
 'position_ids': tensor([[  0,   1,   2,  ..., 512, 513, 514],
         [  0,   1,   2,  ..., 512, 513, 514],
         [  0,   1,   2,  ..., 512, 513, 514],
         [  0,   1,   2,  ..., 512, 513, 514]])}

In [17]:
# Sanity check: is any token id in the batch at or above `vocab_size + 1`?
# (Expected result: tensor(False).)
(batch['input_ids'] >= opt['tokenizer'].vocab_size + 1).any()

tensor(False)

In [10]:
# Instantiate the backbone from the config and report how many parameters it has.
model = load_backbone(**opt)

pytorch_total_params = 0
for param in model.parameters():
    pytorch_total_params += param.numel()
print('Total parameters:', pytorch_total_params)

Total parameters: 135197954


In [11]:
# Move the model to the configured device.
device = torch.device(opt['device'])
model.to(device)

# The optimizer class is looked up by name in torch.optim, so the config decides
# which optimizer is used; only the learning rate is passed explicitly.
optimizer_cls = getattr(torch.optim, opt['optimizer'])
optimizer = optimizer_cls(model.parameters(), lr=opt['lr'])

In [43]:
# Run one optimisation step on the sampled batch as a smoke test:
# forward pass, loss, backward pass, parameter update.
batch = {name: tensor.to(device) for name, tensor in batch.items()}

optimizer.zero_grad()
outputs = model(**batch)
loss = outputs.loss
print(loss.item())
loss.backward()
optimizer.step()

# Move the batch tensors back to the CPU afterwards.
batch = {name: tensor.cpu() for name, tensor in batch.items()}

0.5213193297386169


---

In [27]:
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

In [28]:
# Synthetic inputs for checking the hand-rolled metrics below: 100 rows of
# two-class probabilities (softmax over random scores) and 100 random 0/1 labels.
# A dedicated, seeded generator makes the draw reproducible across kernel
# restarts without touching torch's global RNG state.
rng = torch.Generator().manual_seed(42)
start_logits = torch.softmax(torch.randn((100, 2), generator=rng), dim=-1)
start_labels = torch.randint(0, 2, (100, 1), generator=rng)

In [29]:
# Show the synthetic probabilities followed by the synthetic labels.
print(start_logits, start_labels, sep='\n')

tensor([[0.0504, 0.9496],
        [0.6420, 0.3580],
        [0.4658, 0.5342],
        [0.1141, 0.8859],
        [0.7643, 0.2357],
        [0.2941, 0.7059],
        [0.8811, 0.1189],
        [0.2666, 0.7334],
        [0.3587, 0.6413],
        [0.6410, 0.3590],
        [0.5825, 0.4175],
        [0.4967, 0.5033],
        [0.6839, 0.3161],
        [0.6982, 0.3018],
        [0.8474, 0.1526],
        [0.6509, 0.3491],
        [0.5729, 0.4271],
        [0.6218, 0.3782],
        [0.8771, 0.1229],
        [0.6916, 0.3084],
        [0.2046, 0.7954],
        [0.1067, 0.8933],
        [0.9328, 0.0672],
        [0.1703, 0.8297],
        [0.6900, 0.3100],
        [0.5119, 0.4881],
        [0.8172, 0.1828],
        [0.5643, 0.4357],
        [0.1998, 0.8002],
        [0.1537, 0.8463],
        [0.0659, 0.9341],
        [0.6595, 0.3405],
        [0.4290, 0.5710],
        [0.4847, 0.5153],
        [0.3896, 0.6104],
        [0.4635, 0.5365],
        [0.6442, 0.3558],
        [0.2426, 0.7574],
        [0.3

In [43]:
# Split predictions and labels into 10 equal chunks along the sample axis and
# expose them as one-shot iterators, mimicking batches coming from a loader.
b_logits = iter(start_logits.chunk(10, dim=0))
b_labels = iter(start_labels.chunk(10, dim=0))

In [39]:
# Inspect the shape of one chunk WITHOUT advancing `b_logits`. Calling
# next(b_logits) here would leave it one element ahead of `b_labels`, so the
# zip over both iterators in the metrics cell would pair predictions with the
# wrong labels and drop a batch on a top-to-bottom run.
torch.chunk(start_logits, chunks=10, dim=0)[0].shape

torch.Size([10, 2])

In [44]:
# Accumulate a binary confusion matrix batch by batch, then derive accuracy and
# F1 from the totals (compared against scikit-learn in the next cell).
t_loss = 0.
tp = tn = fp = fn = 0

for batch_scores, batch_targets in tqdm(zip(b_logits, b_labels)):
    # Predicted class = index of the larger score in each row.
    logits = batch_scores.argmax(dim=-1).flatten()
    labels = batch_targets.flatten()
    print(logits)
    print(labels)

    predicted_pos, predicted_neg = logits == 1, logits == 0
    actual_pos, actual_neg = labels == 1, labels == 0
    tp += (predicted_pos & actual_pos).sum().item()
    tn += (predicted_neg & actual_neg).sum().item()
    fp += (predicted_pos & actual_neg).sum().item()
    fn += (predicted_neg & actual_pos).sum().item()

# Small constant added to the precision / recall / F1 ratios.
eps = 1e-8
acc = (tp + tn) / (tp + tn + fp + fn)
pre = (tp + eps) / (tp + fp + eps)
rec = (tp + eps) / (tp + fn + eps)
f1 = (2 * pre * rec) / (pre + rec + eps)

print(acc, f1)

10it [00:00, 1425.18it/s]

tensor([1, 0, 1, 1, 0, 1, 0, 1, 1, 0])
tensor([0, 0, 0, 1, 1, 1, 1, 1, 0, 1])
tensor([0, 1, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
tensor([1, 1, 0, 1, 0, 0, 0, 0, 1, 1])
tensor([1, 0, 0, 1, 1, 1, 1, 0, 1, 0])
tensor([1, 0, 1, 1, 1, 1, 0, 1, 1, 1])
tensor([0, 0, 0, 1, 1, 1, 0, 1, 0, 0])
tensor([1, 0, 1, 0, 0, 0, 1, 1, 1, 0])
tensor([1, 0, 0, 0, 0, 1, 1, 1, 1, 0])
tensor([0, 0, 0, 1, 0, 1, 1, 0, 1, 0])
tensor([1, 0, 0, 1, 0, 0, 1, 1, 1, 0])
tensor([1, 0, 0, 1, 1, 0, 0, 1, 1, 0])
tensor([0, 0, 0, 1, 0, 1, 0, 0, 1, 0])
tensor([1, 1, 0, 0, 0, 0, 1, 0, 1, 0])
tensor([0, 0, 1, 1, 0, 0, 1, 0, 0, 1])
tensor([0, 1, 1, 1, 1, 1, 0, 0, 1, 0])
tensor([1, 0, 0, 0, 0, 1, 1, 1, 0, 0])
tensor([1, 0, 1, 0, 0, 1, 1, 0, 0, 1])
tensor([1, 1, 1, 0, 1, 1, 0, 1, 1, 1])
0.53 0.515463912626209





In [45]:
# Cross-check the hand-computed accuracy and F1 against scikit-learn on the full
# (un-chunked) tensors.
y_true = start_labels.numpy()
y_pred = torch.argmax(start_logits, dim=-1).numpy()
print(accuracy_score(y_true, y_pred))
print(f1_score(y_true, y_pred))

0.53
0.5154639175257733


In [14]:
from tqdm import tqdm
# Send the tqdm progress bar to a log file; the numbers printed in the loop body
# still go to stdout. The `with` block closes (and thereby flushes) the file
# when the loop ends; the bare open() used before left the handle open.
with open('./training_logs.txt', 'w') as f:
    for i in tqdm(range(100), file=f):
        print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
