In [1]:
import torch 
import optuna
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
import mlflow
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import AdamW, get_linear_schedule_with_warmup
import rouge
from ipynb.fs.full.sari import SARIsent
import pandas as pd
from rouge import Rouge
from tqdm.auto import trange
import random
import numpy as np
import gc
import tensorflow as tf
from sklearn.model_selection import train_test_split
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
import seaborn as sns
import plotly.express as px
import pickle 
import mlflow
import mlflow.sklearn
import mlflow.tensorflow
import json
import os
import tempfile
from mlflow.models.signature import infer_signature
from sklearn.utils import shuffle
from torch import nn
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train = pd.read_csv('data\\train_small_medium_mix_clean.csv', index_col=0)
val = pd.read_csv('data\\eval.csv', index_col=0)

In [7]:
train_pairs = list(zip(train.source, train.target))

val = val[val['size']=='small']
eval_pairs = list(zip(val.source, val.target))

print(train.shape[0], val.shape[0])

50157 2560


In [8]:
class Params(object):
    def __init__(self, batch_size, test_batch_size, epochs, lr, momentum, seed, cuda, log_interval):
        self.batch_size = batch_size
        self.test_batch_size = test_batch_size
        self.epochs = epochs
        self.lr = lr
        self.momentum = momentum
        self.seed = seed
        self.cuda = cuda
        self.log_interval = log_interval

In [9]:
experiment_name = "rut5_optimize_clear_data"
mlflow.set_experiment(experiment_name)
current_experiment = dict(mlflow.get_experiment_by_name(experiment_name))
exp_id = current_experiment['experiment_id']

In [12]:
def objective(trial):
    # Определение гиперпараметров
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.3)
    warmup_steps = trial.suggest_int("warmup_steps", 0, 1000)
    num_layers = trial.suggest_int("num_layers", 2, 12)
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)
    epochs = trial.suggest_int('epochs', 2, 5)
    
    batch_size = 3
    #epochs = 2
    
    model_name = 'cointegrated/rut5-base-multitask'
    model = T5ForConditionalGeneration.from_pretrained(model_name, num_layers=num_layers, dropout_rate=dropout_rate).cuda()
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    
    # Определение оптимизатора и планировщика
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=len(train_pairs) * epochs)
    
    # Training loop
    with mlflow.start_run(experiment_id=exp_id, run_name='first_test'):
        mlflow.log_param("model_name", 'rut5_multitask')
        mlflow.log_param("batch_size", learning_rate)
        mlflow.log_param("epochs", epochs)
        mlflow.log_param("learning_rate", batch_size)
        mlflow.log_param("weight_decay", weight_decay)
        mlflow.log_param("warmup_steps", warmup_steps)
        mlflow.log_param("num_layers", num_layers)
        mlflow.log_param("dropout_rate", dropout_rate)
        for epoch in range(epochs):
            model.train()
            train_loss = 0

            for i in trange(0, int(len(train_pairs) / batch_size)):
                try:
                    batch = train_pairs[i * batch_size: (i + 1) * batch_size]
                    x = tokenizer([p[0] for p in batch], return_tensors='pt', padding=True).to(model.device)
                    y = tokenizer([p[1] for p in batch], return_tensors='pt', padding=True).to(model.device)
                    y.input_ids[y.input_ids == 0] = -100

                except OutOfMemoryError:
                    print('Ignoring batch due to CUDA out of memory')
                    continue

                optimizer.zero_grad()
                loss = model(
                    input_ids=x.input_ids,
                    attention_mask=x.attention_mask,
                    labels=y.input_ids,
                    decoder_attention_mask=y.attention_mask,
                    return_dict=True
                ).loss

                train_loss += loss.data.item()
                loss.backward()
                optimizer.step()
                scheduler.step()
                torch.cuda.empty_cache()
                gc.collect()

            train_loss /= int(len(train_pairs) / batch_size)
            mlflow.log_metric("train_loss", train_loss, epoch)
            print(f'Epoch {epoch}, train_loss: {train_loss}')

            # Validation loop
            eval_loss = 0
            model.eval()
            with torch.no_grad():
                for j in trange(0, int(len(eval_pairs) / batch_size)):
                    batch = eval_pairs[j * batch_size: (j + 1) * batch_size]
                    x = tokenizer([p[0] for p in batch], return_tensors='pt', padding=True).to(model.device)
                    y = tokenizer([p[1] for p in batch], return_tensors='pt', padding=True).to(model.device)

                    loss = model(
                        input_ids=x.input_ids,
                        attention_mask=x.attention_mask,
                        labels=y.input_ids,
                        decoder_attention_mask=y.attention_mask,
                        return_dict=True
                    ).loss

                    eval_loss += loss.data.item()

            eval_loss /= int(len(eval_pairs) / batch_size)
            mlflow.log_metric("eval_loss", eval_loss, epoch)
            print(f'Epoch {epoch}, eval_loss: {eval_loss}')
        return eval_loss

In [None]:
study = optuna.create_study(direction='minimize')

# Запуск оптимизации гиперпараметров
study.optimize(objective, n_trials=35)

mlflow.end_run()

[32m[I 2023-06-15 09:18:42,046][0m A new study created in memory with name: no-name-7c8f1570-c737-4d80-a475-89d7a86606b0[0m
Some weights of the model checkpoint at cointegrated/rut5-base-multitask were not used when initializing T5ForConditionalGeneration: ['encoder.block.6.layer.1.DenseReluDense.wi_1.weight', 'encoder.block.9.layer.1.layer_norm.weight', 'encoder.block.8.layer.1.DenseReluDense.wo.weight', 'encoder.block.7.layer.0.SelfAttention.k.weight', 'encoder.block.8.layer.1.DenseReluDense.wi_1.weight', 'encoder.block.5.layer.1.layer_norm.weight', 'encoder.block.5.layer.0.SelfAttention.q.weight', 'encoder.block.4.layer.1.DenseReluDense.wi_1.weight', 'encoder.block.10.layer.1.DenseReluDense.wi_0.weight', 'encoder.block.6.layer.1.layer_norm.weight', 'encoder.block.10.layer.1.layer_norm.weight', 'encoder.block.11.layer.0.layer_norm.weight', 'encoder.block.5.layer.0.SelfAttention.o.weight', 'encoder.block.9.layer.0.SelfAttention.k.weight', 'encoder.block.4.layer.1.DenseReluDense.wo.

Epoch 0, train_loss: 13.910052919995307


100%|██████████| 853/853 [00:12<00:00, 69.74it/s]


Epoch 0, eval_loss: 9.769611346344316


100%|██████████| 16719/16719 [55:29<00:00,  5.02it/s] 


Epoch 1, train_loss: 7.823971821336457


100%|██████████| 853/853 [00:12<00:00, 69.17it/s]
[32m[I 2023-06-15 11:10:21,428][0m Trial 0 finished with value: 9.826596033670977 and parameters: {'learning_rate': 3.6605617340926676e-05, 'weight_decay': 0.01786410569573289, 'warmup_steps': 719, 'num_layers': 4, 'dropout_rate': 0.47430735054379025, 'epochs': 2}. Best is trial 0 with value: 9.826596033670977.[0m


Epoch 1, eval_loss: 9.826596033670977


Some weights of the model checkpoint at cointegrated/rut5-base-multitask were not used when initializing T5ForConditionalGeneration: ['encoder.block.8.layer.1.DenseReluDense.wi_1.weight', 'encoder.block.3.layer.1.DenseReluDense.wo.weight', 'encoder.block.10.layer.1.DenseReluDense.wi_0.weight', 'encoder.block.3.layer.1.DenseReluDense.wi_0.weight', 'encoder.block.3.layer.1.layer_norm.weight', 'encoder.block.6.layer.0.SelfAttention.v.weight', 'encoder.block.2.layer.1.DenseReluDense.wi_1.weight', 'encoder.block.4.layer.0.layer_norm.weight', 'encoder.block.4.layer.0.SelfAttention.v.weight', 'encoder.block.9.layer.1.DenseReluDense.wo.weight', 'encoder.block.9.layer.0.SelfAttention.v.weight', 'encoder.block.11.layer.1.DenseReluDense.wi_1.weight', 'encoder.block.10.layer.0.SelfAttention.v.weight', 'encoder.block.8.layer.0.SelfAttention.v.weight', 'encoder.block.7.layer.0.SelfAttention.o.weight', 'encoder.block.6.layer.0.layer_norm.weight', 'encoder.block.11.layer.0.SelfAttention.k.weight', 'en

Epoch 0, train_loss: 19.92713874133197


100%|██████████| 853/853 [00:10<00:00, 78.27it/s]


Epoch 0, eval_loss: 9.64582448603819


100%|██████████| 16719/16719 [52:34<00:00,  5.30it/s] 


Epoch 1, train_loss: 8.386021241509136


100%|██████████| 853/853 [00:10<00:00, 77.75it/s]
[32m[I 2023-06-15 12:55:53,368][0m Trial 1 finished with value: 9.644424479284432 and parameters: {'learning_rate': 1.4105405651541264e-05, 'weight_decay': 0.2954674567216341, 'warmup_steps': 973, 'num_layers': 2, 'dropout_rate': 0.47363517387136056, 'epochs': 2}. Best is trial 1 with value: 9.644424479284432.[0m


Epoch 1, eval_loss: 9.644424479284432


Some weights of the model checkpoint at cointegrated/rut5-base-multitask were not used when initializing T5ForConditionalGeneration: ['encoder.block.9.layer.1.layer_norm.weight', 'encoder.block.8.layer.1.DenseReluDense.wo.weight', 'encoder.block.7.layer.0.SelfAttention.k.weight', 'encoder.block.8.layer.1.DenseReluDense.wi_1.weight', 'encoder.block.10.layer.1.DenseReluDense.wi_0.weight', 'encoder.block.11.layer.0.layer_norm.weight', 'encoder.block.10.layer.1.layer_norm.weight', 'encoder.block.9.layer.0.SelfAttention.k.weight', 'encoder.block.9.layer.0.layer_norm.weight', 'encoder.block.7.layer.1.DenseReluDense.wi_0.weight', 'encoder.block.7.layer.0.SelfAttention.q.weight', 'encoder.block.8.layer.0.SelfAttention.q.weight', 'encoder.block.10.layer.0.SelfAttention.o.weight', 'encoder.block.10.layer.0.SelfAttention.k.weight', 'encoder.block.9.layer.0.SelfAttention.q.weight', 'encoder.block.9.layer.1.DenseReluDense.wi_1.weight', 'encoder.block.9.layer.1.DenseReluDense.wo.weight', 'encoder.bl

Epoch 0, train_loss: 13.42489545825452


100%|██████████| 853/853 [00:14<00:00, 58.99it/s]


Epoch 0, eval_loss: 9.462740081974937


100%|██████████| 16719/16719 [1:01:02<00:00,  4.56it/s]


Epoch 1, train_loss: 7.768236127222859


100%|██████████| 853/853 [00:14<00:00, 59.13it/s]


Epoch 1, eval_loss: 9.496355724781926


100%|██████████| 16719/16719 [1:01:02<00:00,  4.56it/s]


Epoch 2, train_loss: 7.419482585538039


100%|██████████| 853/853 [00:13<00:00, 61.30it/s]
[32m[I 2023-06-15 15:59:43,693][0m Trial 2 finished with value: 9.327278432365318 and parameters: {'learning_rate': 1.9799206617982294e-05, 'weight_decay': 0.26707025078676655, 'warmup_steps': 444, 'num_layers': 7, 'dropout_rate': 0.4474496615923581, 'epochs': 3}. Best is trial 2 with value: 9.327278432365318.[0m


Epoch 2, eval_loss: 9.327278432365318


Some weights of the model checkpoint at cointegrated/rut5-base-multitask were not used when initializing T5ForConditionalGeneration: ['encoder.block.9.layer.1.layer_norm.weight', 'encoder.block.8.layer.1.DenseReluDense.wo.weight', 'encoder.block.7.layer.0.SelfAttention.k.weight', 'encoder.block.8.layer.1.DenseReluDense.wi_1.weight', 'encoder.block.10.layer.1.DenseReluDense.wi_0.weight', 'encoder.block.11.layer.0.layer_norm.weight', 'encoder.block.10.layer.1.layer_norm.weight', 'encoder.block.9.layer.0.SelfAttention.k.weight', 'encoder.block.9.layer.0.layer_norm.weight', 'encoder.block.7.layer.1.DenseReluDense.wi_0.weight', 'encoder.block.7.layer.0.SelfAttention.q.weight', 'encoder.block.8.layer.0.SelfAttention.q.weight', 'encoder.block.10.layer.0.SelfAttention.o.weight', 'encoder.block.10.layer.0.SelfAttention.k.weight', 'encoder.block.9.layer.0.SelfAttention.q.weight', 'encoder.block.9.layer.1.DenseReluDense.wi_1.weight', 'encoder.block.9.layer.1.DenseReluDense.wo.weight', 'encoder.bl

Epoch 0, train_loss: 2.0155553501336647


100%|██████████| 853/853 [00:14<00:00, 59.93it/s]


Epoch 0, eval_loss: 4.903908058738932


100%|██████████| 16719/16719 [1:01:10<00:00,  4.56it/s]


Epoch 1, train_loss: 1.6606139590205249


100%|██████████| 853/853 [00:14<00:00, 60.65it/s]


Epoch 1, eval_loss: 6.461407265657557


100%|██████████| 16719/16719 [1:01:08<00:00,  4.56it/s]


Epoch 2, train_loss: 1.4947506007240134


100%|██████████| 853/853 [00:14<00:00, 60.62it/s]


Epoch 2, eval_loss: 6.779201609029021


 15%|█▍        | 2505/16719 [09:30<51:38,  4.59it/s]  

In [None]:
# Получение результатов
best_trial = study.best_trial

print("Best trial:")
print("Value: ", best_trial.value)
print("Params: ")
for key, value in best_trial.params.items():
    print("{}: {}".format(key, value))

In [None]:
filehandler = open("optuna_trials_clean_data.obj","wb")
pickle.dump(study.get_trials(), filehandler)
filehandler.close()

In [4]:
with open('optuna_trials.obj', 'rb') as f:
    obj = pickle.load(f)

In [8]:
len(obj)

35

In [40]:
tb = []

for i in range(len(obj)):
    trial = obj[i]   
    params = {}
    params['iter'] = trial.number
    params.update(trial.params)
    params['loss'] = trial.values[0]
    tb.append(params)

In [43]:
pd.DataFrame(tb).drop('number', axis=1)

Unnamed: 0,iter,learning_rate,weight_decay,warmup_steps,num_layers,dropout_rate,loss
0,0,9.8e-05,0.105954,177,7,0.338209,2.641003
1,1,5.5e-05,0.16419,523,4,0.126,1.450552
2,2,3.7e-05,0.077263,824,8,0.221739,1.581771
3,3,4.5e-05,0.145387,908,11,0.100246,1.190233
4,4,2.5e-05,0.113149,118,10,0.222361,1.508529
5,5,2.8e-05,0.101248,477,6,0.144912,1.364144
6,6,1.6e-05,0.196126,386,12,0.48483,2.387404
7,7,1.1e-05,0.043701,367,9,0.225881,1.513977
8,8,3.6e-05,0.14694,477,2,0.305286,6.200557
9,9,3.2e-05,0.064082,730,10,0.28723,1.70332


In [34]:
pd.DataFrame.from_dict(params, orient='index')

Unnamed: 0,0
number,0.0
learning_rate,9.8e-05
weight_decay,0.105954
warmup_steps,177.0
num_layers,7.0
dropout_rate,0.338209
loss,2.641003
