# Обучение модели pysbot

* параметры моделии влияние на генерацию текста GPT (eng) https://huggingface.co/blog/how-to-generate
* Обучение ruDialoGPT https://github.com/vlarine/ruDialoGPT
* Обучение на свой язык https://towardsdatascience.com/train-gpt-2-in-your-own-language-fc6ad4d60171


## Установка окружения

In [None]:
# Обеспечиваем подгрузку данных и их хранение в каталоге ноутубка MyDrive/chats_emotions_and_voises/chat04_depression-therapist-chatbot
import os
from google.colab import drive
drive.mount('/content/gdrive')
## Или !gdown --id 1FCwByq-VkeW1_cje4sIB5KaY0S9a8u8b !unzip /content/RuDialoGPT.zip
DATA_PATH='/content/gdrive/MyDrive/chats_emotions_and_voises/psy-chatbot/data'
if not os.path.exists(DATA_PATH):
    raise ValueError('Нет папки для хранения данных', DATA_PATH)
%ls $DATA_PATH


Mounted at /content/gdrive
data_psy_su.csv  dataset.tsv


In [1]:
%%writefile setup.sh
pip install transformers==4.4.2 #urllib3==1.25.4 transformers==2.8.0
export CUDA_HOME=/usr/local/cuda-10.1
git clone https://github.com/NVIDIA/apex
cd apex
git reset --hard a651

pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./

Writing setup.sh


In [2]:
!sh setup.sh

Collecting transformers==4.4.2
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 7.3MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 51.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 46.3MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.3 transformers-4.4.

## Формируем датасет

In [6]:
import pandas as pd
import numpy as np
import random
import os

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)

In [7]:
data_path = os.path.join(DATA_PATH, 'dataset.tsv')
data = pd.read_csv(data_path, sep=',',header=None)
print(data.shape)
data.head()

(348, 1)


Unnamed: 0,0
0,"|0|3|Волею судеб меня ""занесло"" в коррекционну..."
1,"|0|2|Всем привет!!!! Ищу единомышленников, тех..."
2,"|0|-|Здравствуйте, уважаемые коллеги! Хочу ра..."
3,|0|3|Уважаемые коллеги! Позволю себе создать ...
4,"|0|2|ЗНАКОМСТВО,ПЕРВОНАЧАЛЬНОЕ ОБЩЕНИЕ КАК ПРА..."


In [8]:
# Формируем выборку обучающую и проверочную
train=data.sample(frac=0.8) #random state is a seed value
test=data.drop(train.index)
print('Длина обучающей', len(train), 'и проверочной выборки', len(test))
print('Пример обучающей выборки')
train.head()

Длина обучающей 278 и проверочной выборки 70
Пример обучающей выборки


Unnamed: 0,0
206,"|0|-|Не секрет, что в настоящее время професси..."
312,"|0|3|Здравствуйте, подскажите пожалуйста как р..."
178,"|0|2|ЗНАКОМСТВО,ПЕРВОНАЧАЛЬНОЕ ОБЩЕНИЕ КАК ПРА..."
66,"|0|3|В д/с воспитатель замечает, что мой ребен..."
37,"|0|-|Здравствуйте, уважаемые коллеги! Меня вол..."


In [9]:
with open('train.txt', 'w') as f:
  for val in train.values:
        f.write(f'{val[0]}\n')
with open('test.txt', 'w') as f:
  for val in test.values:
        f.write(f'{val[0]}\n')        
#train.to_csv('train.txt', index=False, sep=' ', header=None)
#test.to_csv('test.txt', index=False, sep=' ', header=None)

## Run finetuning
The following code download our model and tokenizer from transformers and finetune model essays.

This took aroung ten minutes and obtain perplexity = tensor(13.8065)


In [37]:
# Источник https://raw.githubusercontent.com/sberbank-ai/ru-gpts/master/pretrain_transformers.py
import argparse
import glob
import logging
import os
import pickle

import re
import shutil
from typing import Dict, List, Tuple

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)
from torch.utils.tensorboard import SummaryWriter

logger = logging.getLogger(__name__)

MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [11]:
# Параметры. Вырезано все для local_rank для распределенного обучения
args = {}
args['should_continue'] = False # Продолжать обучение модели с последнего чекпоинта
args['cache_dir']='model_cache' # Папка для кеширования модели
args['output_dir']='comment_model' # Папка сохранения модели, TODO лучше сохранять на gogle диске, т.к. при остановке колаба удалится #--output_dir=comment_model \
args['model_name_or_path'] = 'sberbank-ai/rugpt3small_based_on_gpt2' # --model_name_or_path=sberbank-ai/rugpt3small_based_on_gpt2 \
args['train_data_file'] = 'train.txt' # Файл обучающей выборки # --train_data_file=train.txt \
args['eval_data_file'] = 'test.txt' # Валидационный файл # --eval_data_file=valid.txt \
args['per_gpu_train_batch_size'] = 1 # --per_gpu_train_batch_size 1 размер батча на GPU, дефолтное 4, больше - быстрее обучение, ниже качество.
args['max_steps'] = -1 # Максимальное кол-во шагов обучения, переопределяет количество эпох. -1 считать по эпохам.
args['num_train_epochs'] = 5 # --num_train_epochs 5 \
args['gradient_accumulation_steps'] = 1 # --gradient_accumulation_steps 1 Количество накопленных шагов перед переходом градиента
args['learning_rate'] = 5e-5 # The initial learning rate for Adam.
args['weight_decay'] = 0.01 # "Weight decay if we apply some.")
args['adam_epsilon'] = 1e-8 #"Epsilon for Adam optimizer.")
args['max_grad_norm'] = 1.0  #"Max gradient norm.")
args['block_size'] = 2048 # --block_size 2048 \
args['overwrite_output_dir'] = True # --overwrite_output_dir Перезатирать выходную дирректорию
args['logging_steps'] = 500 # Шаги логгирования
args['save_steps'] = 500 # Сохранение через каждые X шагов
args['evaluate_during_training'] = False # Вычислять точность по мере обучения
args['per_gpu_eval_batch_size'] = 4 # Batch size per GPU/CPU for evaluation.
# неимпользуемые параметры  --do_train - обучать модель, --do_eval - вычислять результат обучения по валидационной выборке
# --model_type=gpt2 - Тип модели, жестка задан --fp16 - Использование 16битной точности (apex Nvidia)
print(args)

{'should_continue': False, 'cache_dir': 'model_cache', 'output_dir': 'comment_model', 'model_name_or_path': 'sberbank-ai/rugpt3small_based_on_gpt2', 'train_data_file': 'train.txt', 'eval_data_file': 'test.txt', 'per_gpu_train_batch_size': 1, 'max_steps': -1, 'num_train_epochs': 5, 'gradient_accumulation_steps': 1, 'learning_rate': 5e-05, 'weight_decay': 0.01, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'block_size': 2048, 'overwrite_output_dir': True, 'logging_steps': 500, 'save_steps': 500, 'evaluate_during_training': False, 'per_gpu_eval_batch_size': 4}


In [12]:
# Возвращение отсортированного списка чекпоинтов моделей
def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
    ordering_and_checkpoint_path = []
    glob_checkpoints = glob.glob(os.path.join(args['output_dir'], f"{checkpoint_prefix}-*"))
    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    return checkpoints_sorted

In [13]:
# Вычисление качества обучения
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args['output_dir']

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
    os.makedirs(eval_output_dir, exist_ok=True) # if args.local_rank in [-1, 0]:

    args['eval_batch_size'] = args['per_gpu_eval_batch_size'] * max(1, args['n_gpu'])

    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args['eval_batch_size'], collate_fn=collate
    )

    # multi-gpu evaluate
    if args['n_gpu'] > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args['eval_batch_size'])
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = (batch, batch) # mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
        inputs = inputs.to(args['device'])
        labels = labels.to(args['device'])

        with torch.no_grad():
            outputs =  model(inputs, labels=labels) #model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result

In [14]:
# Загрузка датасета

#Класс датасета
class TextDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
        assert os.path.isfile(file_path)
        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(directory, "gpt2_cached_lm_" + str(block_size) + "_" + filename)

        if os.path.exists(cached_features_file): # and not args.overwrite_cache: (!) убрана логика для перезаписи кешированных файлов альтернативы 
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)
            self.examples = []
            with open(file_path, encoding="utf-8") as f:
                text = f.read()
            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
            for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
                self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i: i + block_size]))
            # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
            # If your dataset is small, first you should loook for a bigger one :-) and second you
            # can change this behavior by adding (model specific) padding.
            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)

# Загрузка датасета
def load_and_cache_examples(args, tokenizer, evaluate=False):
    file_path = args['eval_data_file'] if evaluate else args['train_data_file']
    return TextDataset(tokenizer, args, file_path=file_path, block_size=args['block_size'])


In [15]:
# Функция обучения
# Отключено Train with masked-language modeling loss instead of language modeling
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    tb_writer = SummaryWriter()
    args['train_batch_size'] = args['per_gpu_train_batch_size'] * max(1, args['n_gpu'])

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) # if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args['train_batch_size'], collate_fn=collate
    )

    if args['max_steps'] > 0:
        t_total = args.max_steps
        args['num_train_epochs'] = args['max_steps'] // (len(train_dataloader) // args['gradient_accumulation_steps']) + 1
    else:
        t_total = len(train_dataloader) // args['gradient_accumulation_steps'] * args['num_train_epochs']
    
    model.resize_token_embeddings(len(tokenizer))

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args['weight_decay'],
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=t_total # args.warmup_steps = 0 количество шагов на прогрев
    )
    # Check if saved optimizer or scheduler states exist
    if (args['model_name_or_path'] and os.path.isfile(os.path.join(args['model_name_or_path'], "optimizer.pt"))
            and os.path.isfile(os.path.join(args['model_name_or_path'], "scheduler.pt"))): # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args['model_name_or_path'], "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args['model_name_or_path'], "scheduler.pt")))

    try: #if args.fp16:
        from apex import amp
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    model, optimizer = amp.initialize(model, optimizer, opt_level='O1') #args.fp16_opt_level) For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details at https://nvidia.github.io/apex/amp.html"

    # multi-gpu training (should be after apex fp16 initialization)
    if args['n_gpu'] > 1: model = torch.nn.DataParallel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args['num_train_epochs'])
    logger.info("  Instantaneous batch size per GPU = %d", args['per_gpu_train_batch_size'])
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args['train_batch_size'] * args['gradient_accumulation_steps'] * 1 #(torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args['gradient_accumulation_steps'])
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args['model_name_or_path'] and os.path.exists(args['model_name_or_path']):
        try:
            # set global_step to gobal_step of last saved checkpoint from model path
            checkpoint_suffix = args['model_name_or_path'].split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args['gradient_accumulation_steps'])
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args['gradient_accumulation_steps'])

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(epochs_trained, int(args['num_train_epochs']), desc="Epoch", disable=False) #disable=args.local_rank not in [-1, 0]

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False) #disable=args.local_rank not in [-1, 0]
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (batch, batch) # mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) 
            inputs = inputs.to(args['device'])
            labels = labels.to(args['device'])
            model.train()
            outputs = model(inputs, labels=labels) #model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args['n_gpu'] > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args['gradient_accumulation_steps'] > 1:
                loss = loss / args['gradient_accumulation_steps']

            with amp.scale_loss(loss, optimizer) as scaled_loss: #if args.fp16: else: loss.backward()
                scaled_loss.backward()
            
            tr_loss += loss.item()
            if (step + 1) % args['gradient_accumulation_steps'] == 0:
                
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args['max_grad_norm']) #if args.fp16: else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0: #args.local_rank in [-1, 0] and 
                    # Log metrics
                    if (args['evaluate_during_training']):  # Only evaluate when single GPU otherwise metrics may not average well #  args.local_rank == -1 and 
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar(f"eval_{key}", value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args['save_steps'] > 0 and global_step % args['save_steps'] == 0: # args.local_rank in [-1, 0] and 
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args['output_dir'], f"{checkpoint_prefix}-{global_step}")
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = model #(model.module if hasattr(model, "module") else model)  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    #_rotate_checkpoints(args, checkpoint_prefix) # Отключена ротация, т.е. удаление старых чекпоинтов, если их больше

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if 0 < args['max_steps'] < global_step:
                epoch_iterator.close()
                break
        if 0 < args['max_steps'] < global_step:
            train_iterator.close()
            break
    
    tb_writer.close() # if args.local_rank in [-1, 0]:

    return global_step, tr_loss / global_step


In [63]:
# Проверки
if args['should_continue']:
  sorted_checkpoints = _sorted_checkpoints(args['output_dir'])
  if len(sorted_checkpoints) == 0:
    raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
  else:
    args['model_name_or_path'] = sorted_checkpoints[-1]

# Setup CUDA, GPU & distributed training

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
args['n_gpu'] = torch.cuda.device_count() if torch.cuda.is_available() else 0
args['device'] = device
print('Device', args['device'])

# Setup logging
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO)
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    1, device, args['n_gpu'], False, True) #args['fp16']

config = AutoConfig.from_pretrained(args['model_name_or_path'], cache_dir=args['cache_dir'])
tokenizer = AutoTokenizer.from_pretrained(args['model_name_or_path'], cache_dir=args['cache_dir'])
args['block_size'] =  tokenizer.model_max_length if args['block_size'] <= 0 else min(args['block_size'], tokenizer.model_max_length)
model = AutoModelWithLMHead.from_pretrained(args['model_name_or_path'], from_tf=bool(".ckpt" in args['model_name_or_path']),
                                            config=config, cache_dir=args['cache_dir'])
model.to(args['device'])



Device cuda


Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50264, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [17]:
# Обучение модели
logger.info("Training/evaluation parameters", args)
train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

06/03/2021 07:53:09 - INFO - __main__ -   Training/evaluation parameters
06/03/2021 07:53:09 - INFO - __main__ -   Creating features from dataset file at 
06/03/2021 07:53:09 - INFO - __main__ -   Saving features into cached file gpt2_cached_lm_2048_train.txt
06/03/2021 07:53:10 - INFO - __main__ -   ***** Running training *****
06/03/2021 07:53:10 - INFO - __main__ -     Num examples = 48
06/03/2021 07:53:10 - INFO - __main__ -     Num Epochs = 5
06/03/2021 07:53:10 - INFO - __main__ -     Instantaneous batch size per GPU = 1
06/03/2021 07:53:10 - INFO - __main__ -     Total train batch size (w. parallel, distributed & accumulation) = 1
06/03/2021 07:53:10 - INFO - __main__ -     Gradient Accumulation steps = 1
06/03/2021 07:53:10 - INFO - __main__ -     Total optimization steps = 240
Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
Iteration:   0%|          | 0/48 [00:00<?, ?it/s][A

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic



Iteration:   2%|▏         | 1/48 [00:00<00:28,  1.68it/s][A
Iteration:   4%|▍         | 2/48 [00:01<00:26,  1.72it/s][A
Iteration:   6%|▋         | 3/48 [00:01<00:25,  1.74it/s][A
Iteration:   8%|▊         | 4/48 [00:02<00:25,  1.76it/s][A
Iteration:  10%|█         | 5/48 [00:02<00:24,  1.77it/s][A
Iteration:  12%|█▎        | 6/48 [00:03<00:23,  1.78it/s][A
Iteration:  15%|█▍        | 7/48 [00:03<00:22,  1.79it/s][A
Iteration:  17%|█▋        | 8/48 [00:04<00:22,  1.79it/s][A
Iteration:  19%|█▉        | 9/48 [00:05<00:21,  1.80it/s][A
Iteration:  21%|██        | 10/48 [00:05<00:21,  1.80it/s][A
Iteration:  23%|██▎       | 11/48 [00:06<00:20,  1.80it/s][A
Iteration:  25%|██▌       | 12/48 [00:06<00:20,  1.80it/s][A
Iteration:  27%|██▋       | 13/48 [00:07<00:19,  1.80it/s][A
Iteration:  29%|██▉       | 14/48 [00:07<00:18,  1.80it/s][A
Iteration:  31%|███▏      | 15/48 [00:08<00:18,  1.80it/s][A
Iteration:  33%|███▎      | 16/48 [00:08<00:17,  1.80it/s][A
Iteration:  35%|

In [18]:
# Сохранение лучшей модели, чтобы можно было загрузить из предобученной
os.makedirs(args['output_dir'], exist_ok=True)

logger.info("Saving model checkpoint to %s", args['output_dir'])
# Save a trained model, configuration and tokenizer using `save_pretrained()`. They can then be reloaded using `from_pretrained()`
model_to_save = model #(model.module if hasattr(model, "module") else model)  # Take care of distributed/parallel training
model_to_save.save_pretrained(args['output_dir'])
tokenizer.save_pretrained(args['output_dir'])
# Good practice: save your training arguments together with the trained model
torch.save(args, os.path.join(args['output_dir'], "training_args.bin"))
# Load a trained model and vocabulary that you have fine-tuned
model = AutoModelWithLMHead.from_pretrained(args['output_dir'])
tokenizer = AutoTokenizer.from_pretrained(args['output_dir'])
model.to(args['device'])

06/03/2021 07:56:16 - INFO - __main__ -   Saving model checkpoint to comment_model


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [19]:
# Расчеты качества обучения
results = {}
checkpoints = [args['output_dir']]
args['eval_all_checkpoints'] = False
if args['eval_all_checkpoints'] : # вычислить значения всех чекпоинтов
    checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args['output_dir'] + "/**/" + WEIGHTS_NAME, recursive=True)))
    logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
logger.info("Evaluate the following checkpoints: %s", checkpoints)
for checkpoint in checkpoints:
    global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
    prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
    model = AutoModelWithLMHead.from_pretrained(checkpoint)
    model.to(args['device'])
    result = evaluate(args, model, tokenizer, prefix=prefix) # TODO перенести
    result = dict((k + f"_{global_step}", v) for k, v in result.items())
    results.update(result)
print(results)

06/03/2021 07:56:32 - INFO - __main__ -   Evaluate the following checkpoints: ['comment_model']
06/03/2021 07:56:36 - INFO - __main__ -   Creating features from dataset file at 
06/03/2021 07:56:36 - INFO - __main__ -   Saving features into cached file gpt2_cached_lm_2048_test.txt
06/03/2021 07:56:36 - INFO - __main__ -   ***** Running evaluation  *****
06/03/2021 07:56:36 - INFO - __main__ -     Num examples = 16
06/03/2021 07:56:36 - INFO - __main__ -     Batch size = 4
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.26it/s]
06/03/2021 07:56:40 - INFO - __main__ -   ***** Eval results  *****
06/03/2021 07:56:40 - INFO - __main__ -     perplexity = tensor(16.4644)


{'perplexity_': tensor(16.4644)}


## Проверка модели

In [29]:
!wget https://raw.githubusercontent.com/sberbank-ai/ru-gpts/master/generate_transformers.py
!python generate_transformers.py \
    --model_type=gpt2 \
    --model_name_or_path=comment_model \
    --k=5 \
    --p=0.95 \
    --length=50

--2021-06-03 08:06:29--  https://raw.githubusercontent.com/sberbank-ai/ru-gpts/master/generate_transformers.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10474 (10K) [text/plain]
Saving to: ‘generate_transformers.py’


2021-06-03 08:06:29 (104 MB/s) - ‘generate_transformers.py’ saved [10474/10474]

06/03/2021 08:06:37 - INFO - __main__ -   Namespace(device=device(type='cuda'), k=5, length=50, model_name_or_path='comment_model', model_type='gpt2', n_gpu=1, no_cuda=False, num_return_sequences=1, p=0.95, padding_text='', prompt='', repetition_penalty=1.0, seed=42, stop_token='</s>', temperature=1.0, xlm_language='')
Context >>> Привет
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
ruGPT:
2021-06-03 08:06:47.524392: I tensorflow/str

In [30]:
path_model=args['output_dir']

# Проверяем, что все на месте
if not os.path.isdir(path_model):
  raise Exception(f"Path '{path_model}' not found!")
if not os.path.isfile(os.path.join(path_model, 'config.json')):
  raise Exception(f"On path '{path_model}' file 'config.json' not found!")
if not os.path.isfile(os.path.join(path_model, 'pytorch_model.bin')):
  raise Exception(f"On path '{path_model}' file 'pytorch_model.bin' not found!")
if not os.path.isfile(os.path.join(path_model, 'vocab.json')):
  raise Exception(f"On path '{path_model}' file 'vocab.json' not found!")
if not os.path.isfile(os.path.join(path_model, 'tokenizer_config.json')):
  raise Exception(f"On path '{path_model}' file 'tokenizer_config.json' not found!")
if not os.path.isfile(os.path.join(path_model, 'special_tokens_map.json')):
  raise Exception(f"On path '{path_model}' file 'special_tokens_map.json' not found!")
if not os.path.isfile(os.path.join(path_model, 'merges.txt')):
  raise Exception(f"On path '{path_model}' file 'merges.txt' not found!")

In [75]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained(path_model)
# model = AutoModelForCausalLM.from_pretrained(path_model)

# tokenizer = AutoTokenizer.from_pretrained(args['output_dir'])
# model = AutoModelWithLMHead.from_pretrained(args['output_dir'])
tokenizer = AutoTokenizer.from_pretrained('sberbank-ai/rugpt3small_based_on_gpt2')
model = AutoModelWithLMHead.from_pretrained('sberbank-ai/rugpt3small_based_on_gpt2')

# model.to(args['device'])

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [71]:
def get_length_param(text: str) -> str:
    tokens_count = len(tokenizer.encode(text))
    if tokens_count <= 15:
        len_param = '1'
    elif tokens_count <= 50:
        len_param = '2'
    elif tokens_count <= 256:
        len_param = '3'
    else:
        len_param = '-'
    return len_param

In [76]:
#@title Параметры модели

# Поле 'Max length tokens generate' должно быть больше 8, но меньше 256
max_length = 50 #@param {type:"integer"} # 256
# def 3 Поле 'No repeat ngram size' должно быть больше или равно 1, но меньше или равно 10.
no_repeat_ngram_size = 3 #@param {type:"integer"}
# def 100 Поле 'Top K' должно быть больше или равно 1, но меньше или равно 500. Текущее значение: 
top_k = 100 #@param {type:"integer"}
# def 0.9 Поле 'Top P' должно быть больше или равно 0.01, но меньше или равно 1.0.
top_p = 0.95 #@param {type:"number"} #0.9
# def 0.6 Поле 'Temperature' должно быть больше или равно 0.01, но меньше или равно 1.0. 
temperature = 0.6 #@param {type:"number"}
# def 5. Поле 'Num responses return' должно быть больше или равно 1, но меньше или равно 10. 
num_return = 5 #@param {type:"integer"}
# def True
do_sample = True  #@param {type:"boolean"}
# def True
is_always_use_length = True  #@param {type:"boolean"}
# Поле 'Length generate' должно принимать одно из следующих значений: [0, 1, 2, 3].
length_gen = 1  #@param {type:"integer"}
# def False. debug
log_debug = True  #@param {type:"boolean"}

params = { 
        "max_length": max_length,
        "no_repeat_ngram_size": no_repeat_ngram_size, 
        "top_k": top_k,                              
        "top_p": top_p,                              
        "temperature": temperature,                   
        "num_return_sequences": num_return,           
        "do_sample": do_sample,
        "device": "cuda" if torch.cuda.is_available() else "cpu", #'cuda:0', # 'cpu',
        "is_always_use_length": is_always_use_length,
        "length_generate": length_gen, 
        "log_debug": log_debug
    }

In [77]:
def generate(inputs, params): # inputs = [{'speaker': 0, 'text': 'Привет, как день прошел?'},{'speaker': 1, 'text': 'Хорошо, а у тебя как?'}]
# TODO надо ограничить кол-во входных истории.
  inputs_text = ''
  for input_ in inputs:
    if params['is_always_use_length']:
        length_param = get_length_param(input_['text'])
    else:
        length_param = '-'
    inputs_text += f"|{input_['speaker']}|{length_param}|{input_['text']}"
  inputs_text += f"|1|{params['length_generate']}|"

  if log_debug:
    print(f"\n===> debug Params generate: {params}")
    print(f"===> debug Text input: {inputs_text}")
  inputs_token_ids = tokenizer.encode(inputs_text, add_special_tokens=False, return_tensors='pt')
  print('#', type(inputs_token_ids), params['max_length'], params['no_repeat_ngram_size'], params['do_sample'],'\n',
  params['top_k'],params['top_p'], params['temperature'],params['num_return_sequences'], params['device'])

  # try:
    # ToDo make this asynchronous
  outputs_token_ids = model.generate(
      inputs_token_ids,
      max_length=params['max_length'],
      no_repeat_ngram_size=params['no_repeat_ngram_size'],
      do_sample=params['do_sample'],
      top_k=params['top_k'],
      top_p=params['top_p'],
      temperature=params['temperature'],
      num_return_sequences=params['num_return_sequences'],
      device=params['device'],
      mask_token_id=tokenizer.mask_token_id,
      eos_token_id=tokenizer.eos_token_id,
      unk_token_id=tokenizer.unk_token_id,
      pad_token_id=tokenizer.pad_token_id,
  )
  # except Exception as e:
  #   print(f"===> Error generate: {str(e)}")
  #   return {'inputs': '', 'outputs': '', 'status': False, 'msg': f"{str(e)}"}

  outputs = [tokenizer.decode(x, skip_special_tokens=True) for x in outputs_token_ids]
  outputs = [x.split('|')[-1] for x in outputs]

  return {'inputs': inputs, 'outputs': outputs, 'status': True, 'msg': ''}

In [74]:
print('Введите quit для остановки')

inputs = []
while True:
  user_input = input("User:")
  if user_input == "quit":
    "stop talking!"
    break
  inputs.append({'speaker': 0, 'text': user_input})
  response_data = generate(inputs, params)
 
  if not response_data['status']:
    print("Bot (ошибка):", response_data['msg'])
  else:
    bot_resposnse = response_data['outputs'][0]
    variants_responses = response_data['outputs']
    bot_resposnse = response_data['outputs'][0]
    inputs.append({'speaker': 1, 'text': bot_resposnse})
    variants_responses = response_data['outputs']
    print("Bot:", bot_resposnse)
    if params['log_debug']:
     print("    all variants ===> debug:", variants_responses)
     

Введите quit для остановки
User:r


Setting `pad_token_id` to `eos_token_id`:50257 for open-end generation.



===> debug Params generate: {'max_length': 50, 'no_repeat_ngram_size': 3, 'top_k': 100, 'top_p': 0.95, 'temperature': 0.6, 'num_return_sequences': 5, 'do_sample': True, 'device': 'cuda', 'is_always_use_length': True, 'length_generate': 1, 'log_debug': True}
===> debug Text input: |0|1|r|1|1|
# <class 'torch.Tensor'> 50 3 True 
 100 0.95 0.6 5 cuda


AttributeError: ignored