In [1]:
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi

In [2]:
!pip install gputil
!pip install psutil
!pip install humanize



In [3]:
# memory footprint support libraries/code
import psutil
import humanize
import os
import GPUtil as GPU


GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]

In [4]:
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
 
printm()

Gen RAM Free: 12.8 GB  | Proc size: 159.3 MB
GPU RAM Free: 15079MB | Used: 0MB | Util   0% | Total 15079MB


In [5]:
! pip install awscli
! aws s3 sync --no-sign-request s3://models.dobro.ai/gpt2/ru/unfreeze_all gpt2



In [6]:
! pip install transformers



In [7]:
%%writefile setup.sh

git clone https://github.com/NVIDIA/apex
cd apex
pip install -v --no-cache-dir ./

Overwriting setup.sh


In [8]:
!sh setup.sh

fatal: destination path 'apex' already exists and is not an empty directory.
Created temporary directory: /tmp/pip-ephem-wheel-cache-rdldjzt7
Created temporary directory: /tmp/pip-req-tracker-fhev891_
Created requirements tracker '/tmp/pip-req-tracker-fhev891_'
Created temporary directory: /tmp/pip-install-9io01qyy
Processing /content/apex
  Created temporary directory: /tmp/pip-req-build-6q45_fds
  Added file:///content/apex to build tracker '/tmp/pip-req-tracker-fhev891_'
    Running setup.py (path:/tmp/pip-req-build-6q45_fds/setup.py) egg_info for package from file:///content/apex
    Running command python setup.py egg_info


    torch.__version__  = 1.6.0+cu101


    running egg_info
    creating /tmp/pip-req-build-6q45_fds/pip-egg-info/apex.egg-info
    writing /tmp/pip-req-build-6q45_fds/pip-egg-info/apex.egg-info/PKG-INFO
    writing dependency_links to /tmp/pip-req-build-6q45_fds/pip-egg-info/apex.egg-info/dependency_links.txt
    writing top-level names to /tmp/pip-req-build-

In [9]:
! pip install youtokentome



In [10]:
from __future__ import absolute_import, division, print_function

import argparse
import glob
import logging
import os
import pickle
import random
import re
import shutil

import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
from torch.utils.data.distributed import DistributedSampler

try:
    from torch.utils.tensorboard import SummaryWriter
except:
    from tensorboardX import SummaryWriter

from tqdm import tqdm, trange
# from tqdm import tqdm as tqdm_base
# def tqdm(*args, **kwargs):
#     if hasattr(tqdm_base, '_instances'):
#         for instance in list(tqdm_base._instances):
#             tqdm_base._decr_instances(instance)
#     return tqdm_base(*args, **kwargs)

from transformers import (WEIGHTS_NAME, AdamW, 
                          # WarmupLinearSchedule,
                                  BertConfig, BertForMaskedLM, BertTokenizer,
                                  GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
                                  OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
                                  RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
                                  DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)

from transformers import get_linear_schedule_with_warmup

logger = logging.getLogger(__name__)

In [11]:
logger.setLevel('INFO')

In [12]:
MODEL_CLASSES = {
    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
}

In [13]:
def dict2obj(d):
  if isinstance(d, list):
    d = [dict2obj(x) for x in d]
  if not isinstance(d, dict):
    return d
  class C(object):
    pass
  o = C()
  for k in d:
    o.__dict__[k] = dict2obj(d[k])
  return o

BLOCK_SIZE = 256

parser = {}

parser['train_data_file'] = './oxxxymiron_lyrics_end_text.txt' #'Lyrics_Земфира (Zemfira).txt'
parser['input_dir'] = './gpt2/m_checkpoint-3364613'
parser['output_dir'] = './textgenmodels'

parser['eval_data_file'] = './oxxxymiron_lyrics_end_text.txt' #'Lyrics_Земфира (Zemfira).txt'
parser['model_type'] = 'gpt2' # bert
parser['model_name_or_path'] = 'gpt2-medium' # 'bert-base-cased'
parser['mlm'] = False 
parser['mlm_probability'] = False

parser['config_name'] = ""
parser['tokenizer_name'] = ""
parser['cache_dir'] = ""
parser['block_size'] = BLOCK_SIZE
parser['do_train'] = True
parser['do_eval'] = True
parser['evaluate_during_training'] = True
parser['do_lower_case'] = True

parser['per_gpu_train_batch_size'] = 2
parser['per_gpu_eval_batch_size'] = 2
parser['gradient_accumulation_steps'] = 10
parser['learning_rate'] = 0.001 # 5e-5
parser['weight_decay'] = 0.0
parser['adam_epsilon'] = 1e-8
parser['max_grad_norm'] = 1.0
parser['num_train_epochs'] = 5.0
parser['max_steps'] = -1
parser['warmup_steps'] = 100

parser['logging_steps'] = 50
parser['save_steps'] = 50
parser['save_total_limit'] = None
parser['eval_all_checkpoints'] = True
parser['no_cuda'] = False
parser['overwrite_output_dir'] = True
parser['overwrite_cache'] = True
parser['seed'] = 42

parser['fp16'] = True
parser['fp16_opt_level'] = 'O1'
parser['local_rank'] = -1
parser['server_ip'] = ""
parser['server_port'] = ""

# Data loading
https://github.com/huggingface/transformers/blob/master/examples/run_lm_finetuning.py

In [14]:
class TextDataset(Dataset):
    def __init__(self, tokenizer, file_path='train', block_size=BLOCK_SIZE):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(directory, 'cached_lm_' + str(block_size) + '_' + filename)

        if os.path.exists(cached_features_file):
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, 'rb') as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            with open(file_path, encoding="utf-8") as f:
                text = f.read()

            # tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
            tokenized_text = tokenizer.encode(text)

            # TODO FIX WARNINGS WHERE SPECIAL TOKENS AND GPT2 OUTPUT TOO MUCH
            for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
                if parser['model_type'] == 'gpt2':
                    self.examples.append(tokenized_text[i:i+block_size])
                else:
                    self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i+block_size]))
                
            # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
            # If your dataset is small, first you should loook for a bigger one :-) and second you
            # can change this behavior by adding (model specific) padding.

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, 'wb') as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item])

In [15]:
def load_and_cache_examples(args, tokenizer, evaluate=False):
    dataset = TextDataset(tokenizer, file_path=args.eval_data_file if evaluate else args.train_data_file, block_size=args.block_size)
    return dataset


def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)


def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False):
    if not args.save_total_limit:
        return
    if args.save_total_limit <= 0:
        return

    # Check if we should delete older checkpoint(s)
    glob_checkpoints = glob.glob(os.path.join(args.output_dir, '{}-*'.format(checkpoint_prefix)))
    if len(glob_checkpoints) <= args.save_total_limit:
        return

    ordering_and_checkpoint_path = []
    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match('.*{}-([0-9]+)'.format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
    for checkpoint in checkpoints_to_be_deleted:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
        shutil.rmtree(checkpoint)


def mask_tokens(inputs, tokenizer, args):
    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -1  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with random word
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels


def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    # scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps = -1)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = 'checkpoint'
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(parser, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step


def evaluate(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {
        "perplexity": perplexity,
        'eval_loss': eval_loss
    }

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result

In [16]:
# args = parser.parse_args()
args = dict2obj(parser)

if args.model_type in ["bert", "roberta", "distilbert"] and not args.mlm:
  raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
                    "flag (masked language modeling).")
if args.eval_data_file is None and args.do_eval:
  raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
                    "or remove the --do_eval argument.")

if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
  raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))

In [17]:
# Setup distant debugging if needed
if args.server_ip and args.server_port:
    # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
    import ptvsd
    print("Waiting for debugger attach")
    ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
    ptvsd.wait_for_attach()

# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    torch.distributed.init_process_group(backend='nccl')
    args.n_gpu = 1
args.device = device

In [18]:
# Setup logging
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

# Set seed
set_seed(args)



In [19]:
"""Byte pair encoding utilities"""
import os
import youtokentome as yttm
import hashlib
from transformers.tokenization_utils import PreTrainedTokenizer
import shutil
import regex as re
from os.path import samefile

In [20]:
NEW_LINE = '<|n|>'

class YTEncoder(PreTrainedTokenizer):
    def_name = 'encoder.model'
    def __init__(self, filename, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)
        #self.max_len_single_sentence = BLOCK_SIZE # no default special tokens - you can update this value if you add special tokens
        #self.max_len_sentences_pair = BLOCK_SIZE # no default special tokens - you can update this value if you add special tokens

        if os.path.isdir(filename): filename = os.path.join(filename, self.def_name)

        self.bpe = yttm.BPE(filename)
        self.hash = hashlib.sha512(open(filename, 'rb').read()).hexdigest()[:10]
        self.filename = filename

    def encode(self, text):
        if text and text[0] != ' ': text = ' ' + text
        text = re.sub(r'(?=[^ ])([\W])([\w])',r'\g<1> \g<2>',text)
        text = text.replace('\n', f' {NEW_LINE} ')

        return self.bpe.encode([text], output_type=yttm.OutputType.ID)[0]


    def decode(self, tokens): # I hate regexps
        if not isinstance(tokens,list):
            tokens = tokens.tolist()
        result = self.bpe.decode(tokens)[0]
        result = re.sub(r'( )?(<\|n\|>)( )?', r'\n', result)
        result = re.sub(r'([\n(]) (\w)',r'\g<1>\g<2>', result)
        result = re.sub(r'(\W)([«"''\n(]|^) (\w)',r'\g<1>\g<2>\g<3>', result)
        result = re.sub(r'(\w)- (\w)',r'\g<1>-\g<2>', result)
        return result

    def tokenize(self, text, **kwargs):
        return self.encode(text)

    @classmethod
    def from_pretrained(cls, *inputs, **kwargs):
        return cls(*inputs, **kwargs)

    def add_special_tokens_single_sentence(self, token_ids):
        return token_ids

    def save_pretrained(self, save_directory):
        src = self.filename
        dst = os.path.join(save_directory, self.def_name)
        if src != dst:
            shutil.copyfile(src, dst)

In [21]:
# Load pretrained model and tokenizer
if args.local_rank not in [-1, 0]:
    torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training download model & vocab

config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

model = model_class.from_pretrained(args.input_dir)
tokenizer = YTEncoder.from_pretrained(args.input_dir)
model.to(args.device)

if args.block_size <= 0:
    args.block_size = tokenizer.max_len_single_sentence  # Our input block size will be the max possible for the model
args.block_size = min(args.block_size, tokenizer.max_len_single_sentence)

if args.local_rank == 0:
    torch.distributed.barrier()  # End of barrier to make sure only the first process in distributed training download model & vocab

logger.info("Training/evaluation parameters %s", args)

08/05/2020 15:53:14 - INFO - transformers.configuration_utils -   loading configuration file ./gpt2/m_checkpoint-3364613/config.json
08/05/2020 15:53:14 - INFO - transformers.configuration_utils -   Model config GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_layer": 24,
  "n_positions": 1024,
  "n_special": 0,
  "output_past": true,
  "predict_special_tokens": true,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "vocab_size": 50257
}

08/05/2020 15:53:14 - INFO - transformers.modeling_utils -   loading weights file ./gpt2/m_checkpoint-3364613/pytorc

In [22]:
import torch.nn.functional as F

def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
        Args:
            logits: logits distribution shape (batch size x vocabulary size)
            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
            top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        # scatter sorted tensors to original indexing
        indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove)
        logits[indices_to_remove] = filter_value
    return logits


def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0,
                    is_xlnet=False, is_xlm_mlm=False, xlm_mask_token=None, xlm_lang=None, device='cuda'):
    context = torch.tensor(context, dtype=torch.long, device=device)
    context = context.unsqueeze(0).repeat(num_samples, 1)
    generated = context
    with torch.no_grad():
        for _ in trange(length):

            inputs = {'input_ids': generated}
            if is_xlnet: 
                # XLNet is a direct (predict same token, not next token) and bi-directional model by default
                # => need one additional dummy token in the input (will be masked), attention mask and target mapping (see model docstring)
                input_ids = torch.cat((generated, torch.zeros((1, 1), dtype=torch.long, device=device)), dim=1)
                perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float, device=device)
                perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
                target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float, device=device)
                target_mapping[0, 0, -1] = 1.0  # predict last token
                inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping}

            if is_xlm_mlm and xlm_mask_token:
                # XLM MLM models are direct models (predict same token, not next token)
                # => need one additional dummy token in the input (will be masked and guessed)
                input_ids = torch.cat((generated, torch.full((1, 1), xlm_mask_token, dtype=torch.long, device=device)), dim=1)
                inputs = {'input_ids': input_ids}

            if xlm_lang is not None:
                inputs["langs"] = torch.tensor([xlm_lang] * inputs["input_ids"].shape[1], device=device).view(1, -1)

            outputs = model(**inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet/CTRL (cached hidden-states)
            next_token_logits = outputs[0][:, -1, :] / (temperature if temperature > 0 else 1.)

            # repetition penalty from CTRL (https://arxiv.org/abs/1909.05858)
            for i in range(num_samples):
                for _ in set(generated[i].tolist()):
                    next_token_logits[i, _] /= repetition_penalty
                
            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
            if temperature == 0: # greedy sampling:
                next_token = torch.argmax(filtered_logits, dim=-1).unsqueeze(-1)
            else:
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)
    return generated

In [23]:
# CONTEXT_TEXT = 'Всё ещё бабла нет, всё ещё с долгами канитель \n \
# Все ещё в подвале, всё ещё Parliament на бите \n \
# И я вернусь на трек, твой хуй как Тулуз-Лотрек \n \
# Если русский рэп в гробу сто лет, то я ебу скелет \n \
# И я построил альбом на костях \n \
# Этому не видно конца, как будто он голый толстяк \n \
# Каждый просит фит, каждый пишет: «Денег дам» \n \
# Вас миллион, но мой кумир – Гриша Перельман \n'

CONTEXT_TEXT = ''
START_TEXT = 'Я здесь'
CONTEXT_TEXT += START_TEXT

context_tokens = tokenizer.encode(CONTEXT_TEXT)
sampled = sample_sequence(model, 
                          100, 
                          context_tokens, 
                          temperature = 1.0,
                          top_p=0.99
                          )

out = sampled[:, len(context_tokens):].tolist()
text = ''.join([tokenizer.decode(o) for o in out])
text = text[: text.find('<|endoftext|>')].split('\n')

print('-' * 20)

for i, t in enumerate(text):
    if i == 0:
        print(START_TEXT + t)
    else:
        print(t)

evaluate(args, model, tokenizer)

100%|██████████| 100/100 [00:03<00:00, 33.19it/s]
08/05/2020 15:53:42 - INFO - __main__ -   Loading features from cached file ./cached_lm_256_oxxxymiron_lyrics_end_text.txt
08/05/2020 15:53:42 - INFO - __main__ -   ***** Running evaluation  *****
08/05/2020 15:53:42 - INFO - __main__ -     Num examples = 240
08/05/2020 15:53:42 - INFO - __main__ -     Batch size = 2
Evaluating:   1%|          | 1/120 [00:00<00:15,  7.65it/s]

--------------------
Я здесьпотому, что я его боюсь. Не потому что боюсь Бонуса, Джека. Сутки назад я согласилась надеть эти сапоги потому, что после выборов мы по очереди выходим в фабрику. Я уже одеваю их на слишком большую протяженность работ. Конечно, мистер Бонус — слишком крутой, чтобы его меня найти. Я бы не надела эти сапоги на улицу, чтобы никого там не увидеть, я не это имела в виду, но однажды я посмотрела в глазок — таких сапог здесь только два. («... и больше ничего?»


Evaluating: 100%|██████████| 120/120 [00:17<00:00,  6.88it/s]
08/05/2020 15:54:00 - INFO - __main__ -   ***** Eval results  *****
08/05/2020 15:54:00 - INFO - __main__ -     eval_loss = 4.464891692002614
08/05/2020 15:54:00 - INFO - __main__ -     perplexity = tensor(86.9116)


{'eval_loss': 4.464891692002614, 'perplexity': tensor(86.9116)}

In [24]:
# Training
if args.do_train:
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache

    train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)

    if args.local_rank == 0:
        torch.distributed.barrier()

    global_step, tr_loss = train(args, train_dataset, model, tokenizer)
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

08/05/2020 15:54:00 - INFO - __main__ -   Loading features from cached file ./cached_lm_256_oxxxymiron_lyrics_end_text.txt
08/05/2020 15:54:00 - INFO - __main__ -   ***** Running training *****
08/05/2020 15:54:00 - INFO - __main__ -     Num examples = 240
08/05/2020 15:54:00 - INFO - __main__ -     Num Epochs = 5
08/05/2020 15:54:00 - INFO - __main__ -     Instantaneous batch size per GPU = 2
08/05/2020 15:54:00 - INFO - __main__ -     Total train batch size (w. parallel, distributed & accumulation) = 20
08/05/2020 15:54:00 - INFO - __main__ -     Gradient Accumulation steps = 10
08/05/2020 15:54:00 - INFO - __main__ -     Total optimization steps = 60
Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
Iteration:   0%|          | 0/120 [00:00<?, ?it/s][A

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic



Iteration:   1%|          | 1/120 [00:00<00:31,  3.77it/s][A
Iteration:   2%|▏         | 2/120 [00:00<00:32,  3.64it/s][A
Iteration:   2%|▎         | 3/120 [00:00<00:32,  3.56it/s][A
Iteration:   3%|▎         | 4/120 [00:01<00:33,  3.50it/s][A
Iteration:   4%|▍         | 5/120 [00:01<00:32,  3.49it/s][A
Iteration:   5%|▌         | 6/120 [00:01<00:32,  3.46it/s][A
Iteration:   6%|▌         | 7/120 [00:02<00:32,  3.48it/s][A
Iteration:   7%|▋         | 8/120 [00:02<00:32,  3.48it/s][A

Iteration:   8%|▊         | 10/120 [00:03<00:36,  2.97it/s][A
Iteration:   9%|▉         | 11/120 [00:03<00:35,  3.05it/s][A
Iteration:  10%|█         | 12/120 [00:03<00:34,  3.14it/s][A
Iteration:  11%|█         | 13/120 [00:03<00:33,  3.21it/s][A
Iteration:  12%|█▏        | 14/120 [00:04<00:32,  3.28it/s][A
Iteration:  12%|█▎        | 15/120 [00:04<00:31,  3.32it/s][A
Iteration:  13%|█▎        | 16/120 [00:04<00:31,  3.35it/s][A
Iteration:  14%|█▍        | 17/120 [00:05<00:30,  3.39it/s][

In [25]:
# Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
    # Create output directory if needed
    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    logger.info("Saving model checkpoint to %s", args.output_dir)
    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`
    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
    model_to_save.save_pretrained(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)

    # Good practice: save your training arguments together with the trained model
    torch.save(parser, os.path.join(args.output_dir, 'training_args.bin'))

    # Load a trained model and vocabulary that you have fine-tuned
    model = model_class.from_pretrained(args.output_dir)
    tokenizer = YTEncoder.from_pretrained(args.output_dir)
    model.to(args.device)

08/05/2020 15:57:18 - INFO - __main__ -   Saving model checkpoint to ./textgenmodels
08/05/2020 15:57:18 - INFO - transformers.configuration_utils -   Configuration saved in ./textgenmodels/config.json
08/05/2020 15:57:26 - INFO - transformers.modeling_utils -   Model weights saved in ./textgenmodels/pytorch_model.bin
08/05/2020 15:57:26 - INFO - transformers.configuration_utils -   loading configuration file ./textgenmodels/config.json
08/05/2020 15:57:26 - INFO - transformers.configuration_utils -   Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_layer": 24,
  "n_positions": 1024,
  "n_special": 0,
  "output_past": tr

In [91]:
CONTEXT_TEXT = ''

START_TEXT = 'Ты чё '
CONTEXT_TEXT = CONTEXT_TEXT + START_TEXT

context_tokens = tokenizer.encode(CONTEXT_TEXT)
sampled = sample_sequence(model, 
                          100, 
                          context_tokens, 
                          temperature = 1.0,
                          top_p=0.80
                          )

out = sampled[:, len(context_tokens):].tolist()
text = ''.join([tokenizer.decode(o) for o in out])

# text = text[: text.find('<| endoftext|>')].split('\n')
text = text.split('\n')

print('-' * 20)

for i, t in enumerate(text):
    if i == 0:
        print(START_TEXT + t)
    else:
        print(t)

100%|██████████| 100/100 [00:05<00:00, 19.84it/s]

--------------------
Ты чё я — космополит
На небе дым, под ним бетон
Я поднимаюсь, как будто с горы
Я лезу и лезу наверх
Ты говоришь — «Он эгоцентрик»,
Но мой эгоизм — это ты
Ты хочешь сделать меня мышью? Я — пантера!
<| endoftext|>
Всё слишком идеализировано, слишком пристрастно
До сих пор декадентство — это нонсенс
А мы — дети репатрианта






In [43]:
# ! rm -rf ./textgenmodels/checkpoint-50
# ! zip -r res_oxxxymiron.zip textgenmodels

In [28]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [29]:
# ! cp ./res_oxxxymiron.zip './gdrive/My Drive/gpt2/res_oxxxymiron.zip'