In [1]:
import torch 
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
import mlflow
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AutoTokenizer, AutoModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import rouge
from ipynb.fs.full.sari import SARIsent
import pandas as pd
from rouge import Rouge
from tqdm.auto import trange
import random
import numpy as np
import gc
import tensorflow as tf
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import seaborn as sns
import plotly.express as px
import pickle 
import mlflow
import mlflow.sklearn
import mlflow.tensorflow
import json
import os
import tempfile
import time
from mlflow.models.signature import infer_signature
from sklearn.utils import shuffle
from transformers import AdamW, get_linear_schedule_with_warmup

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Params(object):
    def __init__(self, batch_size, epochs, lr, weight_decay, warmup_steps, num_layers, dropout_rate):
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = lr
        self.weight_decay = weight_decay
        self.warmup_steps = warmup_steps
        self.num_layers = num_layers
        self.dropout_rate = dropout_rate

In [3]:
val = pd.read_csv('data\\eval.csv', index_col=0)

In [9]:
eval_pairs = list(zip(val.source, val.target))

In [4]:
train_clean = pd.read_csv('data\\train_small_medium_mix_clean.csv', index_col=0)

In [10]:
train_clean_pairs = list(zip(train_clean.source, train_clean.target))

In [39]:
train = pd.read_csv('data\\train.csv', index_col=0)
train_small = pd.read_csv('data\\train_small.csv', index_col=0)

In [40]:
train_ext = train[(train['type']!='para_phraser')&(train['size'].isin(['small','medium']))]
train_ext_pairs = list(zip(train_ext.source, train_ext.target))

In [41]:
train_small_pairs = list(zip(train_small.source, train_small.target))

train_ru_adapt = train[(train['size']=='small')&(train['type']=='ru_adapt')]
train_ru_adapt_pairs = list(zip(train_ru_adapt.source, train_ru_adapt.target))

train_ru_xlsum = train[(train['size']=='small')&(train['type']=='ru_xlsum')]
train_ru_xlsum_pairs = list(zip(train_ru_xlsum.source, train_ru_xlsum.target))

train_medium = train[(train['size']=='medium')]
train_medium_pairs = list(zip(train_medium.source, train_medium.target))

train_large = train[(train['size']=='large')]
train_large_pairs = list(zip(train_large.source, train_large.target))

train_para_phraser = train[(train['type']=='para_phraser')&(train['size']=='small')]
train_para_phraser_pairs = list(zip(train_para_phraser.source, train_para_phraser.target))

In [42]:
print('train_small_pairs: ',len(train_small_pairs))
print('train_ru_xlsum_pairs: ',len(train_ru_xlsum_pairs))
print('train_ru_adapt_pairs: ',len(train_ru_adapt_pairs))
print('train_medium_pairs: ',len(train_medium_pairs))
print('train_large_pairs: ',len(train_large_pairs))
print('train_para_phraser_pairs: ',len(train_para_phraser_pairs))
print('eval_pairs: ',len(eval_pairs))

train_small_pairs:  20480
train_ru_xlsum_pairs:  11399
train_ru_adapt_pairs:  34500
train_medium_pairs:  21803
train_large_pairs:  46872
train_para_phraser_pairs:  709371
eval_pairs:  2560


In [5]:
experiment_name = "rut5_multitask"
mlflow.set_experiment(experiment_name)
current_experiment = dict(mlflow.get_experiment_by_name(experiment_name))
exp_id = current_experiment['experiment_id']

In [6]:
def train_and_eval(model_name, train_pairs, eval_pairs, args, run_name, save_dir, exp_id):
    with mlflow.start_run(experiment_id=exp_id, run_name=run_name):
        model_name = model_name
        model = T5ForConditionalGeneration.from_pretrained(model_name, num_layers=args.num_layers, dropout_rate=args.dropout_rate).cuda()
        tokenizer = T5Tokenizer.from_pretrained(model_name)

        # Определение оптимизатора и планировщика
        optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=len(train_pairs) * args.epochs)

        for key, value in vars(args).items():
            mlflow.log_param(key, value)
        model.train()
        mlflow.log_param("model_name", model_name)
        for epoch in range(args.epochs):
            random.shuffle(train_pairs)
            train_loss = 0
            for i in trange(0, int(len(train_pairs) / args.batch_size)):
                # игнорирование батчей, для которых не хватает памяти
                try:
                    batch = train_pairs[i * args.batch_size: (i + 1) * args.batch_size]
                    # кодируем
                    x = tokenizer([p[0] for p in batch], return_tensors='pt', padding=True).to(model.device)
                    y = tokenizer([p[1] for p in batch], return_tensors='pt', padding=True).to(model.device)
                    # -100 - специальное значение, позволяющее не учитывать токены
                    y.input_ids[y.input_ids == 0] = -100  
                except OutOfMemoryError:
                    print('Ignoring batch due to CUDA out of memory')
                    continue
                optimizer.zero_grad()
                # вычисляем функцию потерь
                loss = model(
                        input_ids=x.input_ids,
                        attention_mask=x.attention_mask,
                        labels=y.input_ids,
                        decoder_attention_mask=y.attention_mask,
                        return_dict=True
                    ).loss
                train_loss += loss.data.item()
                # делаем шаг градиентного спуска
                loss.backward()
                optimizer.step()
                scheduler.step()
                torch.cuda.empty_cache()
                gc.collect()

            train_loss /= int(len(train_pairs) / args.batch_size)
            mlflow.log_metric("train_loss", train_loss, epoch)
            print(f'Epoch {epoch}, train_loss: {train_loss}')

            # вычисление лосса на валидационном датасете
            eval_loss = 0
            model.eval()
            with torch.no_grad():
                for j in trange(0, int(len(eval_pairs) / args.batch_size)):
                    batch = eval_pairs[j * args.batch_size: (j + 1) * args.batch_size]
                    x = tokenizer([p[0] for p in batch], return_tensors='pt', padding=True).to(model.device)
                    y = tokenizer([p[1] for p in batch], return_tensors='pt', padding=True).to(model.device)
                    # -100 - специальное значение, позволяющее не учитывать токены
                    loss = model(
                        input_ids=x.input_ids,
                        attention_mask=x.attention_mask,
                        labels=y.input_ids,
                        decoder_attention_mask=y.attention_mask,
                        return_dict=True
                        ).loss
                    eval_loss += loss.data.item()

            eval_loss /= int(len(eval_pairs) / args.batch_size)
            mlflow.log_metric("eval_loss", eval_loss, epoch)
            print(f'Epoch {epoch}, eval_loss: {eval_loss}')

    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

In [7]:
def train_and_eval_base(model_name, train_pairs, eval_pairs, args, run_name, save_dir, exp_id):
    with mlflow.start_run(experiment_id=exp_id, run_name=run_name):
        model_name = model_name
        model = T5ForConditionalGeneration.from_pretrained(model_name).cuda()
        tokenizer = T5Tokenizer.from_pretrained(model_name)

        # Определение оптимизатора и планировщика
        optimizer = AdamW(model.parameters(), lr=args.lr)
        for key, value in vars(args).items():
            mlflow.log_param(key, value)
        model.train()
        mlflow.log_param("model_name", model_name)
        for epoch in range(args.epochs):
            random.shuffle(train_pairs)
            train_loss = 0
            for i in trange(0, int(len(train_pairs) / args.batch_size)):
                # игнорирование батчей, для которых не хватает памяти
                try:
                    batch = train_pairs[i * args.batch_size: (i + 1) * args.batch_size]
                    # кодируем
                    x = tokenizer([p[0] for p in batch], return_tensors='pt', padding=True).to(model.device)
                    y = tokenizer([p[1] for p in batch], return_tensors='pt', padding=True).to(model.device)
                    # -100 - специальное значение, позволяющее не учитывать токены
                    y.input_ids[y.input_ids == 0] = -100  
                except OutOfMemoryError:
                    print('Ignoring batch due to CUDA out of memory')
                    continue
                optimizer.zero_grad()
                # вычисляем функцию потерь
                loss = model(
                        input_ids=x.input_ids,
                        attention_mask=x.attention_mask,
                        labels=y.input_ids,
                        decoder_attention_mask=y.attention_mask,
                        return_dict=True
                    ).loss
                train_loss += loss.data.item()
                # делаем шаг градиентного спуска
                loss.backward()
                optimizer.step()
                torch.cuda.empty_cache()
                gc.collect()

            train_loss /= int(len(train_pairs) / args.batch_size)
            mlflow.log_metric("train_loss", train_loss, epoch)
            print(f'Epoch {epoch}, train_loss: {train_loss}')

            # вычисление лосса на валидационном датасете
            eval_loss = 0
            model.eval()
            with torch.no_grad():
                for j in trange(0, int(len(eval_pairs) / args.batch_size)):
                    batch = eval_pairs[j * args.batch_size: (j + 1) * args.batch_size]
                    x = tokenizer([p[0] for p in batch], return_tensors='pt', padding=True).to(model.device)
                    y = tokenizer([p[1] for p in batch], return_tensors='pt', padding=True).to(model.device)
                    # -100 - специальное значение, позволяющее не учитывать токены
                    loss = model(
                        input_ids=x.input_ids,
                        attention_mask=x.attention_mask,
                        labels=y.input_ids,
                        decoder_attention_mask=y.attention_mask,
                        return_dict=True
                        ).loss
                    eval_loss += loss.data.item()

            eval_loss /= int(len(eval_pairs) / args.batch_size)
            mlflow.log_metric("eval_loss", eval_loss, epoch)
            print(f'Epoch {epoch}, eval_loss: {eval_loss}')

    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

In [17]:
args = Params(6, 3, 5.74e-5, 0.238, 986, 12, 0.1)
train_and_eval('cointegrated/rut5-base-multitask', train_small_pairs, eval_pairs, args, 'train_small', 'rut5_v1', exp_id)

100%|██████████| 3413/3413 [15:54<00:00,  3.58it/s]


Epoch 0, train_loss: 2.144387740876505


100%|██████████| 426/426 [00:17<00:00, 24.92it/s]


Epoch 0, eval_loss: 0.8774029926305086


100%|██████████| 3413/3413 [15:03<00:00,  3.78it/s]


Epoch 1, train_loss: 1.4094865135684798


100%|██████████| 426/426 [00:17<00:00, 24.95it/s]


Epoch 1, eval_loss: 0.7768663223745398


100%|██████████| 3413/3413 [15:04<00:00,  3.77it/s]


Epoch 2, train_loss: 1.1132303289228649


100%|██████████| 426/426 [00:17<00:00, 24.92it/s]


Epoch 2, eval_loss: 0.8085872846974733


In [18]:
args = Params(6, 3, 5.74e-5, 0.238, 986, 12, 0.1)
train_and_eval('rut5_v1', train_ru_adapt_pairs, eval_pairs, args, 'ru_adapt_small', 'rut5_v2', exp_id)

100%|██████████| 5750/5750 [24:20<00:00,  3.94it/s]


Epoch 0, train_loss: 0.9401020715482857


100%|██████████| 426/426 [00:17<00:00, 24.95it/s]


Epoch 0, eval_loss: 2.557431572479821


100%|██████████| 5750/5750 [23:24<00:00,  4.10it/s]


Epoch 1, train_loss: 0.5530573506990205


100%|██████████| 426/426 [00:17<00:00, 24.95it/s]


Epoch 1, eval_loss: 2.790240503532786


100%|██████████| 5750/5750 [23:21<00:00,  4.10it/s]


Epoch 2, train_loss: 0.34173941811295633


100%|██████████| 426/426 [00:17<00:00, 24.93it/s]


Epoch 2, eval_loss: 5.449280455078878


In [19]:
args = Params(6, 3, 5.74e-5, 0.238, 986, 12, 0.1)
train_and_eval('rut5_v2', train_ru_xlsum_pairs, eval_pairs, args, 'ru_xlsum_small', 'rut5_v3', exp_id)

100%|██████████| 1899/1899 [10:10<00:00,  3.11it/s]


Epoch 0, train_loss: 2.8150222728979344


100%|██████████| 426/426 [00:17<00:00, 24.83it/s]


Epoch 0, eval_loss: 3.9966215807507295


100%|██████████| 1899/1899 [09:32<00:00,  3.32it/s]


Epoch 1, train_loss: 1.8087356012704938


100%|██████████| 426/426 [00:17<00:00, 24.83it/s]


Epoch 1, eval_loss: 4.502897160993496


100%|██████████| 1899/1899 [09:32<00:00,  3.32it/s]


Epoch 2, train_loss: 1.4528961252262242


100%|██████████| 426/426 [00:17<00:00, 24.80it/s]


Epoch 2, eval_loss: 4.571879478127744


In [20]:
time.sleep(300)

In [21]:
args = Params(3, 3, 5.74e-5, 0.238, 986, 12, 0.1)
train_and_eval('rut5_v3', train_medium_pairs, eval_pairs, args, 'medium_mix', 'rut5_v4', exp_id)

100%|██████████| 7267/7267 [42:13<00:00,  2.87it/s]


Epoch 0, train_loss: 2.220028813815074


100%|██████████| 853/853 [00:17<00:00, 48.19it/s]


Epoch 0, eval_loss: 4.255956073718502


100%|██████████| 7267/7267 [38:49<00:00,  3.12it/s]


Epoch 1, train_loss: 1.5575248665550938


100%|██████████| 853/853 [00:17<00:00, 48.11it/s]


Epoch 1, eval_loss: 4.377725107099638


100%|██████████| 7267/7267 [38:50<00:00,  3.12it/s]


Epoch 2, train_loss: 1.2455130992097148


100%|██████████| 853/853 [00:17<00:00, 48.15it/s]


Epoch 2, eval_loss: 4.151368267091748


In [22]:
time.sleep(300)

In [23]:
args = Params(1, 3, 5.74e-5, 0.238, 986, 12, 0.1)
train_and_eval('rut5_v4', train_large_pairs, eval_pairs, args, 'large_mix', 'rut5_v5', exp_id)

100%|██████████| 46872/46872 [3:35:27<00:00,  3.63it/s]  


Epoch 0, train_loss: 2.2122508593979426


100%|██████████| 2560/2560 [00:35<00:00, 72.40it/s]


Epoch 0, eval_loss: 2.0960394582652953


100%|██████████| 46872/46872 [3:21:49<00:00,  3.87it/s]  


Epoch 1, train_loss: 1.7540695584304586


100%|██████████| 2560/2560 [00:35<00:00, 72.38it/s]


Epoch 1, eval_loss: 2.2730193296447396


100%|██████████| 46872/46872 [3:21:31<00:00,  3.88it/s]  


Epoch 2, train_loss: 1.5614262155601624


100%|██████████| 2560/2560 [00:34<00:00, 73.35it/s]


Epoch 2, eval_loss: 2.3550713327247648


In [24]:
time.sleep(300)

In [25]:
args = Params(6, 1, 5.74e-5, 0.238, 986, 12, 0.1)
train_and_eval('rut5_v5', train_para_phraser_pairs, eval_pairs, args, 'para_phraser', 'rut5_v6', exp_id)

100%|██████████| 118228/118228 [7:05:32<00:00,  4.63it/s]  


Epoch 0, train_loss: 2.0305616433466303


100%|██████████| 426/426 [00:16<00:00, 25.08it/s]


Epoch 0, eval_loss: 8.665217665999148


In [52]:
args = Params(3, 1, 2e-5, None, None, None, None)
train_and_eval_base('rut5_v2', train_ext_pairs, eval_pairs, args, 'my_run', 'rut5_v7', exp_id)

100%|██████████| 22586/22586 [1:59:19<00:00,  3.15it/s]  


Epoch 0, train_loss: 1.6359715027598811


100%|██████████| 853/853 [00:17<00:00, 48.32it/s]


Epoch 0, eval_loss: 4.339362796138348


In [11]:
args = Params(3, 3, 2e-5, None, None, None, None)
train_and_eval_base('cointegrated/rut5-base-multitask', train_clean_pairs, eval_pairs, args, 'clean_data_mix', 'rut5_v8', exp_id)

100%|██████████| 16719/16719 [1:14:11<00:00,  3.76it/s]


Epoch 0, train_loss: 1.9118042747118615


100%|██████████| 853/853 [00:18<00:00, 45.48it/s]


Epoch 0, eval_loss: 1.2105605122693328


100%|██████████| 16719/16719 [1:09:57<00:00,  3.98it/s]


Epoch 1, train_loss: 1.2695603867003948


100%|██████████| 853/853 [00:19<00:00, 44.08it/s]


Epoch 1, eval_loss: 0.9644441406383394


 56%|█████▌    | 9335/16719 [38:48<28:16,  4.35it/s]  