In [1]:
import torch
from torch import nn
from torch.nn import functional
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding
import torch.optim as optim
from torch.profiler import profile, record_function, ProfilerActivity
import numpy as np
import sys
from tqdm import tqdm
import os
from time import time
from torch.cuda.amp import autocast, GradScaler

from dataset.create_dataset import BookGenerator, create_test_dataset
from layers.model import Transformer, AutoregressiveWrapper
from transformers import BertTokenizer
from test_model.test_model import TestModel

import wandb
import yadisk

In [2]:
"""
#!git clone https://github.com/konductor000/GenerativePretrainedTransformer
!pip install -r requirements.txt
!pip install rouge_score
!MAX_JOBS=4 pip install flash-attn --no-build-isolation
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install spacy
!pip install yadisk
!pip install wandb
8a49fefdd8a82ca9ba659a874b09adf8c5995778
"""

'\n#!git clone https://github.com/konductor000/GenerativePretrainedTransformer\n!pip install -r requirements.txt\n!pip install rouge_score\n!MAX_JOBS=4 pip install flash-attn --no-build-isolation\n!pip install datasets\n!pip install transformers\n!pip install evaluate\n!pip install spacy\n!pip install yadisk\n!pip install wandb\n8a49fefdd8a82ca9ba659a874b09adf8c5995778\n'

In [3]:
CONFIG = {
    "architecture": "Transformer", # Wandb only
    "dataset": "books", #"wikitext-103-raw-v1", # Wandb only
    "batch_size": 6,
    "embedding_size": 768,
    "max_sequence_length": 4096,
    "number_of_layers": 12,
    "number_of_heads": 12,
    "additional_feed_forward_layers": 0,
    "extention_factor": 4,
    "dropout_rate": 0.1,
    'test_size': 32,
    'start_book': 0,
    'flash_atten': True,
    'num_train_books': 2500,
    'save_every': 120,
    'use_mixed_precision': True,
    'model_path': None, # "savepoints/dulcet-serenity-218",
    
}
CONFIG["lr"] = 0.001 / np.sqrt(CONFIG["batch_size"])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

cuda


In [4]:
def log_metrics(test_name, test_dataloader, pipeline, model_tester, train_config, CONFIG):
    pipeline.eval()
    if test_name in ['books']:
        metrics = model_tester.test_model(pipeline, test_dataloader)
        test_loss = metrics['loss']
        bleu = metrics['bleu']
        rouge1 = metrics['rouge1']
        rouge2 = metrics['rouge2']
        rougeL = metrics['rougeL']

        wandb.log({
            "test_loss_" + test_name: test_loss,
            "bleu_" + test_name: bleu,
            "rouge1_" + test_name: rouge1,
            "rouge2_" + test_name: rouge2,
            "rougeL_" + test_name: rougeL,
        }) 
    else:
        simmilarity_accuracy, simmilarity_score = model_tester.test_simmilarity(pipeline,
                                                            test_dataloader, tokenizer)
        
        wandb.log({
            "simmilarity_accuracy": simmilarity_accuracy,
            "simmilarity_score": simmilarity_score,
        })

    pipeline.train()
        

def train(CONFIG, pipeline, model, optimizer, loss_function, model_tester, wandb,
          wandb_run, scaler, y, dataloaders):
    train_config = {
        "test_every": 4096 // CONFIG["batch_size"],
        "log_traing_metrics_every": 64 // CONFIG["batch_size"],
    }
    
    book_generator, test_book_dataloader, cloze_dataloader = dataloaders

    train_time = 0
    total_train_time = time()
    
    model.train()

    batch_num = 0
    train_losses = []
    tests = ['books', 'simmilarity_score']
    test_dataloaders = [test_book_dataloader, cloze_dataloader]
    
    for i in tqdm(range(CONFIG['num_train_books']), desc="Training Progress"):
        train_dataloader = book_generator.next_book(10)
        if time() - total_train_time > CONFIG['save_every'] * 60:
            total_train_time = time()
            save_model(model, wandb_run, y)
            
        for batch in train_dataloader:
            train_start = time()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            optimizer.zero_grad()

            with torch.autocast(device_type='cuda', dtype=torch.float16):
                model_output, target = pipeline(input_ids, attention_mask)
                loss = loss_function(model_output.transpose(1, 2), target)
            
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            
            train_losses.append(loss.item())
            train_time += time() - train_start
            batch_num += 1
    
            if batch_num % train_config["log_traing_metrics_every"] == 0:
                wandb.log({
                    "train_loss": sum(train_losses[-train_config["log_traing_metrics_every"]:]) / train_config["log_traing_metrics_every"],
                    "train_time": train_time / train_config["log_traing_metrics_every"],
                })
                train_time = 0
            
            if batch_num % train_config["test_every"] == 0:
                test_start = time()
                for i in range(len(tests)):
                    log_metrics(tests[i], test_dataloaders[i], pipeline, model_tester, train_config, CONFIG)
                wandb.log({
                    "test_time": time() - test_start,
                })

In [5]:
def create_model(CONFIG, model_path=None):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    number_of_tokens = tokenizer.vocab_size
    
    y = yadisk.YaDisk(token="y0_AgAAAAA7vZKNAADLWwAAAADsK4dQ-f3fKT3hSianJggcCcKC1Mfxo8s")
    print(y.check_token())
    model = Transformer(
        embedding_size=CONFIG["embedding_size"],
        number_of_tokens=number_of_tokens,
        number_of_heads=CONFIG["number_of_heads"],
        number_of_layers=CONFIG["number_of_layers"],
        extention_factor=CONFIG["extention_factor"],
        additional_feed_forward_layers=CONFIG["additional_feed_forward_layers"],
        dropout_rate=CONFIG["dropout_rate"],
        max_sequence_length=CONFIG["max_sequence_length"],
        use_flash_att=CONFIG["flash_atten"],
    ).to(device)
    if model_path:
        model.load_state_dict(torch.load(model_path))

    pipeline = AutoregressiveWrapper(model).to(device)
    loss_function = nn.CrossEntropyLoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=CONFIG["lr"])
    model_tester = TestModel(tokenizer, model)
    scaler = GradScaler()

    return pipeline, model, optimizer, loss_function, model_tester, scaler, y

In [6]:
def save_model(model, wandb_run, y):
    if not os.path.exists('save_models'):
        os.makedirs('save_models')
    save_dir = f"save_models/{wandb_run.name}"
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    saved_models = [f for f in os.listdir(save_dir) if f.startswith("model_")]
    num_saved_models = len(saved_models)

    model_filename = f"model_{num_saved_models + 1}"
    model_path = f'{save_dir}/{model_filename}'
    torch.save(model.state_dict(), model_path)
    
    try:
        y.mkdir(save_dir)
    except yadisk.exceptions.PathExistsError:
        pass
    print(model_path)
    y.upload(model_path, model_path)


In [7]:
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mskorodumov-work[0m ([33m8667[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [8]:
wandb_run = wandb.init(
    project="transformer",
    tags=["long_training_testing"],
    #id="sblota90",
    resume="allow",
    config=CONFIG
)

load_path = CONFIG['model_path']

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
book_generator = BookGenerator(batch_size=CONFIG["batch_size"],
                                max_sequence_length=CONFIG["max_sequence_length"], tokenizer=tokenizer)
test_book_dataloader = book_generator.select_test(CONFIG["test_size"])
book_generator.skip_books(CONFIG['start_book'])
cloze_dataset = create_test_dataset('data.csv')

pipeline, model, optimizer, loss_function, model_tester, scaler, y = create_model(CONFIG, CONFIG["model_path"])
num_parameters, num_trainable_parameters, memory_allocated = pipeline.count_parameters() 
print('number of parameters =', num_parameters)
print('number of trainable parameters =', num_trainable_parameters)
print('memory allocated in GB =', memory_allocated)

True
Collecting en-core-web-md==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m


[0m

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
number of parameters = 131968314
number of trainable parameters = 131968314
memory allocated in GB = 0.4916202798485756


In [10]:
for batch in test_book_dataloader:
    break

In [11]:
metrics = model_tester.test_model(pipeline, [batch])

In [12]:
train(CONFIG, pipeline, model, optimizer, loss_function, model_tester, wandb, wandb_run, scaler, y,
     [book_generator, test_book_dataloader, cloze_dataset])

Training Progress:   3%|▎         | 85/2500 [1:10:17<33:17:18, 49.62s/it]  


KeyboardInterrupt: 

In [13]:
save_model(model, wandb_run, y)

save_models/amber-sun-305/model_1


In [None]:
wandb.finish()

In [17]:
input_text = """
London is the capital of
"""
num_predicted_tokens = 20
k = 3
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

start = time()
with torch.autocast(device_type='cuda', dtype=torch.float16):
    answer = pipeline.predict_next(input_text, tokenizer, num_predicted_tokens, k)
print(time() - start)

print(answer)

0.2963066101074219
[CLS] london is the capital of the southern states, and georgia, that the southern states, that the southern states, and that the


In [None]:
from random import randint

def predict_next(pipeline, input_text, num_predicted_tokens, tokenizer):
    input_tokens = tokenizer.encode(input_text, return_tensors="pt")
    input_tokens = input_tokens[:, :-1].to(device)
    
    for i in range(num_predicted_tokens):
        mask = torch.ones_like(input_tokens)

        with torch.no_grad():
            probabilities = pipeline.next_token_probabilities(input_tokens, mask)
        
        answer = probabilities.argsort(dim=-1)[:, -randint(1, 2)].unsqueeze(0)
        input_tokens = torch.cat((input_tokens, answer), dim=1)
        
    return tokenizer.decode(input_tokens[0])

In [None]:
input_text = """
london is the capital of
"""
num_predicted_tokens = 40

answer = predict_next(pipeline, input_text, num_predicted_tokens, tokenizer)

print(answer)

In [None]:
simmilarity_accuracy, simmilarity_score = model_tester.test_simmilarity(pipeline,
                                                            cloze_dataset, tokenizer)

In [20]:
!git config --global user.email "skorodumov00@mail.ru"
!git config --global user.name "konductor000"

In [1]:
!git push

Username for 'https://github.com': ^C
