# Useful constants

In [1]:
from os.path import join

In [2]:
MODEL_DIR = 'model-folder'
EMB = 'emb.pt'
ENCODER = 'encoder.pt'
MLM = 'mlm.pt'

MODEL_EMB = join(MODEL_DIR, EMB)
MODEL_ENCODER = join(MODEL_DIR, ENCODER)
MODEL_MLM = join(MODEL_DIR, MLM)

CUDA = 'cuda:0'
CPU = 'cpu'

BOARD_NAME = 'bert_cb_news'
RUNS_DIR = 'runs'

TRAIN_BOARD = '01_train'
TEST_BOARD = '02_test'

EPOCH_NUM = 250

# Loading the data

In [3]:
from dataset.utils import load_lines
from os import walk
from random import seed, shuffle

In [4]:
text_path = './texts/news-100k/'
_, _, filenames = next(walk(text_path))
texts = []

for filename in filenames:
    text = ''
    for line in load_lines(text_path + filename, _encoding='cp1251'):
        text += line + ' '
    texts.append(text.strip())

In [6]:
seed(1)
shuffle(texts)

train_texts = texts[:73715]
test_texts = texts[73715:]

# Loading the vocabulary and initializing the model

In [3]:
from bert.model.bert import (
    CBNewsBERTConfig,
    BERTEmbedding,
    BERTEncoder,
    BERTMLMHead,
    BERTMLM
)
from bert.vocab import BERTVocab
from bert.encoders.bert import BERTMLMTrainEncoder
from bert.score import (
    MLMScoreMeter,
    score_mlm_batch,
    score_mlm_batches
)
from bert.loss import masked_flatten_cross_entropy

import torch
from torch.quantization import quantize_dynamic
import pickle

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 21.2MB/s]                    
2021-06-21 15:49:10 INFO: Downloading default packages for language: ru (Russian)...
2021-06-21 15:49:11 INFO: File exists: /home/maria/stanza_resources/ru/default.zip.
2021-06-21 15:49:14 INFO: Finished downloading models and saved to /home/maria/stanza_resources.
2021-06-21 15:49:14 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| pos       | syntagrus |
| lemma     | syntagrus |
| depparse  | syntagrus |
| ner       | wikiner   |

2021-06-21 15:49:14 INFO: Use device: gpu
2021-06-21 15:49:14 INFO: Loading: tokenize
2021-06-21 15:49:16 INFO: Loading: pos
2021-06-21 15:49:16 INFO: Loading: lemma
2021-06-21 15:49:16 INFO: Loading: depparse
2021-06-21 15:49:16 INFO: Loading: ner
2021-06-21 15:49:17 INFO: Done loading processors!


In [4]:
with open('bert_vocabulary.pkl', 'rb') as infile:
    vocab = pickle.load(infile)
    
config = CBNewsBERTConfig()
emb = BERTEmbedding.from_config(config)
encoder = BERTEncoder.from_config(config)
head = BERTMLMHead(config.emb_dim, config.main_vocab_size + config.unknown_size + 1)
model = BERTMLM(emb, encoder, head)

In [5]:
emb.position.weight.requires_grad = False
criterion = masked_flatten_cross_entropy
# model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
model = model.to(torch.device(CUDA))

In [10]:
encode = BERTMLMTrainEncoder(
    vocab,
    seq_len=512,
    batch_size=8,
    shuffle_size=10000
)

batches = encode(train_texts)
train_batches = [_.to(torch.device('cpu')) for _ in batches]
batches = encode(test_texts)
test_batches = [_.to(torch.device('cpu')) for _ in batches]

Lemmatization error
Lemmatization error
Lemmatization error
Lemmatization error
Lemmatization error
Lemmatization error


In [6]:
with open('train_batches512-08-100k_encoded.pkl', 'rb') as infile:
    train_batches = pickle.load(infile)
    
with open('test_batches512-08-100k_encoded.pkl', 'rb') as infile:
    test_batches = pickle.load(infile)

In [7]:
train_batches = [_.to(torch.device(CUDA)) for _ in train_batches]
test_batches = [_.to(torch.device(CUDA)) for _ in test_batches] 

In [8]:
torch.cuda.device_count()

1

In [8]:
inputs = torch.arange(end=len(train_batches))
targets = torch.arange(end=len(train_batches))
train_dataset = torch.utils.data.TensorDataset(inputs, targets)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=1,
                                           shuffle=True,
                                           num_workers=0,
                                           pin_memory=True)
for pos, _ in train_loader:
    print("Input shape:", train_batches[pos].input.shape)
    print("Target shape:", train_batches[pos].target.value.shape)

Input shape: torch.Size([8, 512, 2])
Target shape: torch.Size([8, 512, 2])
Input shape: torch.Size([8, 512, 2])
Target shape: torch.Size([8, 512, 2])
Input shape: torch.Size([8, 512, 2])
Target shape: torch.Size([8, 512, 2])
Input shape: torch.Size([8, 512, 2])
Target shape: torch.Size([8, 512, 2])
Input shape: torch.Size([8, 512, 2])
Target shape: torch.Size([8, 512, 2])
Input shape: torch.Size([8, 512, 2])
Target shape: torch.Size([8, 512, 2])
Input shape: torch.Size([8, 512, 2])
Target shape: torch.Size([8, 512, 2])
Input shape: torch.Size([8, 512, 2])
Target shape: torch.Size([8, 512, 2])
Input shape: torch.Size([8, 512, 2])
Target shape: torch.Size([8, 512, 2])
Input shape: torch.Size([8, 512, 2])
Target shape: torch.Size([8, 512, 2])
Input shape: torch.Size([8, 512, 2])
Target shape: torch.Size([8, 512, 2])
Input shape: torch.Size([8, 512, 2])
Target shape: torch.Size([8, 512, 2])
Input shape: torch.Size([8, 512, 2])
Target shape: torch.Size([8, 512, 2])
Input shape: torch.Size([

In [12]:
with open('train_batches512-08-100k_encoded.pkl', 'wb') as outfile:
    pickle.dump(train_batches, outfile)
    
with open('test_batches512-08-100k_encoded.pkl', 'wb') as outfile:
    pickle.dump(test_batches, outfile)

# Train batches

In [7]:
def every(step, period):
    return step > 0 and step % period == 0


def process_batch(model, criterion, batch):
    pred = model(batch.input)
    loss = criterion(pred, batch.target.value, batch.target.mask)
    return batch.processed(loss, pred)


def infer_batches(model, criterion, batches):
    training = model.training
    model.eval()
    with torch.no_grad():
        for batch in batches:
            yield process_batch(model, criterion, batch)
    model.train(training)

In [8]:
from tqdm.notebook import tqdm as log_progress
from bert.board import TensorBoard
from torch import optim
import apex
from datetime import datetime

In [9]:
board = TensorBoard(BOARD_NAME, RUNS_DIR)
train_board = board.section(TRAIN_BOARD)
test_board = board.section(TEST_BOARD)

In [10]:
optimizer = optim.Adam(model.parameters(), lr=0.0001, amsgrad=True)
model, optimizer = apex.amp.initialize(model, optimizer, opt_level='O2')
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, 0.999)

Selected optimization level O2:  FP16 training with FP32 batchnorm and FP32 master weights.

Defaults for this optimization level are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic


In [11]:
train_meter = MLMScoreMeter()
test_meter = MLMScoreMeter()

accum_steps = 10
log_steps = 16
eval_steps = 32
save_steps = eval_steps * 10

model.train()
optimizer.zero_grad()

for epoch in range(EPOCH_NUM):
    epoch_start = datetime.now()
    epoch_loss = 0
    for step, batch in log_progress(enumerate(train_batches)):
        batch = process_batch(model, criterion, batch)
        batch.loss /= accum_steps

        with apex.amp.scale_loss(batch.loss, optimizer) as scaled:
            scaled.backward()
        # batch.loss.backward()

        score = score_mlm_batch(batch, ks=())
        train_meter.add(score)

        if every(step, log_steps):
            train_meter.write(train_board)
            train_meter.reset()

        if every(step, accum_steps):
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            if every(step, eval_steps):
                batches = infer_batches(model, criterion, test_batches)
                scores = score_mlm_batches(batches)
                test_meter.extend(scores)
                test_meter.write(test_board)
                test_meter.reset()

        if every(step, save_steps):
            model.emb.dump(MODEL_EMB)
            model.encoder.dump(MODEL_ENCODER)
            model.head.dump(MODEL_MLM)

        board.step()
        epoch_loss += batch.loss.item()
        
    epoch_loss /= len(train_batches)
    training_time = datetime.now() - epoch_start
    print("Epoch #{_epoch}".format(_epoch=epoch + 1))
    print("Mean batch loss:", epoch_loss)
    print("Training time:", str(
        training_time.seconds // 60).zfill(2) + ":" + str(training_time.seconds % 60).zfill(2))

0it [00:00, ?it/s]

Epoch #1
Mean batch loss: 0.7485909334979852
Training time: 175:03


0it [00:00, ?it/s]

Epoch #2
Mean batch loss: 0.7422609098064931
Training time: 162:43


0it [00:00, ?it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 2097152.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 2097152.0
Epoch #3
Mean batch loss: 0.7416133358324931
Training time: 162:48


0it [00:00, ?it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 2097152.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 2097152.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 2097152.0
Epoch #4
Mean batch loss: 0.7413388815056615
Training time: 167:40


0it [00:00, ?it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 2097152.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 1048576.0
Epoch #5
Mean batch loss: 0.7411631625727647
Training time: 167:31


0it [00:00, ?it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 2097152.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 1048576.0
Epoch #6
Mean batch loss: 0.7410508081330099
Training time: 167:29


0it [00:00, ?it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 2097152.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 1048576.0
Epoch #7
Mean batch loss: 0.7409799620417084
Training time: 167:33


0it [00:00, ?it/s]

KeyboardInterrupt: 

In [1]:
import os

print(os.path.exists("/home/maria/id_NEWSPUNCH-01ffe576-8de3-4a91-867f-7b008b26daed.txt"))

True
