In [1]:
import torch 
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AutoTokenizer, AutoModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import rouge
from ipynb.fs.full.sari import SARIsent
import pandas as pd
from rouge import Rouge
from tqdm.auto import trange
import random
import numpy as np
import gc
import tensorflow as tf
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import seaborn as sns
import plotly.express as px
import pickle 
from sklearn.utils import shuffle
import mlflow
import mlflow.sklearn
import mlflow.tensorflow
import json
import os
import tempfile
from mlflow.models.signature import infer_signature
import torch
import mlflow
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def add_special_tokens(model_name):
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    special_tokens = {'pad_token':'<|pad|>','sep_token':'<|sep|>'}
    num_add_toks = tokenizer.add_special_tokens(special_tokens)
    tokenizer.src_lang = 'ru'
    return tokenizer

In [4]:
train_small = pd.read_csv('data\\train_small.csv', index_col=0)
val = pd.read_csv('data\\eval.csv', index_col=0)

train_pairs = list(zip(train_small.source, train_small.target))
eval_pairs = list(zip(val.source, val.target))
print(train_small.shape[0], val.shape[0])

20480 2560


In [5]:
model_name = 'sberbank-ai/rugpt3small_based_on_gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name, output_hidden_states=True)
tokenizer = add_special_tokens(model_name)

In [6]:
class SimplificationDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        source_text, target_text = self.data[index]
        encoding = self.tokenizer(source_text, truncation=True, max_length=self.max_length, padding='max_length')

        return {
            'input_ids': torch.tensor(encoding['input_ids']),
            'attention_mask': torch.tensor(encoding['attention_mask']),
            'labels': self.tokenizer(target_text, truncation=True, max_length=self.max_length, padding='max_length')['input_ids'],
        }

In [7]:
max_length = 512
train_dataset = SimplificationDataset(train_pairs, tokenizer, max_length)
eval_dataset = SimplificationDataset(eval_pairs, tokenizer, max_length)

In [10]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    evaluation_strategy='steps',
    save_steps=10000,
    logging_steps=1000,
    overwrite_output_dir=True,
    learning_rate=2e-5
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

using `logging_steps` to initialize `eval_steps` to 1000
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [11]:
experiment_name = "ru_gpt3"
mlflow.set_experiment(experiment_name=experiment_name)
current_experiment = dict(mlflow.get_experiment_by_name(experiment_name))
exp_id = current_experiment['experiment_id']

In [12]:
with mlflow.start_run(experiment_id=exp_id):
    # Логируем параметры
    mlflow.log_param('model_name', model_name)
    #mlflow.log_param('max_length', max_length)
    mlflow.log_param('train_data_size', len(train_dataset))
    mlflow.log_param('batch_size', training_args.per_device_train_batch_size)
    
    trainer.train()
    
    # Логируем метрики
    metrics = trainer.evaluate()
    for key, value in metrics.items():
        mlflow.log_metric(key, value)

    # Сохраняем модель и логируем путь к ней
    output_dir = './rugpt3'
    trainer.save_model(output_dir)
    mlflow.log_artifacts(output_dir, artifact_path='models')

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

***** Running training *****
  Num examples = 20480
  Num Epochs = 3
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 6144
  Number of trainable parameters = 125231616


Step,Training Loss,Validation Loss
1000,0.6154,0.365979
2000,0.3619,0.36
3000,0.3571,0.35743
4000,0.3534,0.355898
5000,0.3443,0.355511
6000,0.347,0.354441


***** Running Evaluation *****
  Num examples = 2560
  Batch size = 10
***** Running Evaluation *****
  Num examples = 2560
  Batch size = 10
***** Running Evaluation *****
  Num examples = 2560
  Batch size = 10
***** Running Evaluation *****
  Num examples = 2560
  Batch size = 10
***** Running Evaluation *****
  Num examples = 2560
  Batch size = 10
***** Running Evaluation *****
  Num examples = 2560
  Batch size = 10


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 2560
  Batch size = 10


Saving model checkpoint to ./rugpt3
Configuration saved in ./rugpt3\config.json
Model weights saved in ./rugpt3\pytorch_model.bin


In [13]:
mlflow.end_run()