In [1]:
import torch
from torch import nn
from torch.nn import functional
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding, BertTokenizer
import torch.optim as optim
from torch.profiler import profile, record_function, ProfilerActivity
import numpy as np
import sys
from tqdm import tqdm
import os
from time import time
import pandas as pd
from torch.cuda.amp import autocast, GradScaler

from dataset.create_dataset import PileGenerator, StoriesGenerator, create_test_dataset
from layers.model import Transformer, AutoregressiveWrapper
from test_model.test_model import TestModel

import wandb
import yadisk
from huggingface_hub import login

os.environ['TOKENIZERS_PARALLELISM'] = 'false'
login('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')

In [2]:
"""
#!git clone https://github.com/konductor000/GenerativePretrainedTransformer
!pip install -r requirements.txt
!pip install rouge_score
!MAX_JOBS=4 pip install flash-attn --no-build-isolation
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install spacy
!pip install yadisk
!pip install wandb
!pip install huggingface_hub
!python -m spacy download en_core_web_md
#8a49fefdd8a82ca9ba659a874b09adf8c5995778
"""

'\n#!git clone https://github.com/konductor000/GenerativePretrainedTransformer\n!pip install -r requirements.txt\n!pip install rouge_score\n!MAX_JOBS=4 pip install flash-attn --no-build-isolation\n!pip install datasets\n!pip install transformers\n!pip install evaluate\n!pip install spacy\n!pip install yadisk\n!pip install wandb\n!python -m spacy download en_core_web_md\n#8a49fefdd8a82ca9ba659a874b09adf8c5995778\n'

In [3]:
CONFIG = {
    "architecture": "Transformer", # Wandb only
    "dataset": "books", #"wikitext-103-raw-v1", # Wandb only
    "batch_size": 64,
    "embedding_size": 256,
    "max_sequence_length": 256,
    "number_of_layers": 12,
    "number_of_heads": 8,
    "extention_factor": 4,
    "dropout_rate": 0.1,
    'test_size': 1024,
    'start_book': 0,
    "test_every": 2**15,
    "log_traing_metrics_every": 2**8,
    'flash_atten': True,
    'num_train_points': 2500,
    'save_every': 30,
    'use_mixed_precision': True,
    'model_path': None, # "savepoints/dulcet-serenity-218",
    
}
CONFIG["lr"] = 0.001 / np.sqrt(CONFIG["batch_size"])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

cuda


In [4]:
def log_metrics(test_name, test_dataloader, pipeline, model_tester, CONFIG):
    pipeline.eval()
    if test_name in ['stories', 'pile']:
        metrics = model_tester.test_model(pipeline, test_dataloader)
        test_loss = metrics['loss']
        bleu = metrics['bleu']
        rouge1 = metrics['rouge1']
        rouge2 = metrics['rouge2']
        rougeL = metrics['rougeL']

        wandb.log({
            "test_loss_" + test_name: test_loss,
            "bleu_" + test_name: bleu,
            "rouge1_" + test_name: rouge1,
            "rouge2_" + test_name: rouge2,
            "rougeL_" + test_name: rougeL,
        }) 
    else:
        simmilarity_accuracy, simmilarity_score = model_tester.test_simmilarity(pipeline,
                                                            test_dataloader, tokenizer)
        
        wandb.log({
            "simmilarity_accuracy": simmilarity_accuracy,
            "simmilarity_score": simmilarity_score,
        })

    pipeline.train()
        

def train(CONFIG, pipeline, model, optimizer, loss_function, model_tester, wandb,
          wandb_run, scaler, y, dataloaders):
    stories_generator, pile_test_dataloader, stories_test_dataloader, cloze_dataset = dataloaders

    train_time = 0
    total_train_time = time()
    
    model.train()

    batch_num = 0
    train_losses = []
    tests = ['stories', 'pile', 'cloze']
    test_dataloaders = [pile_test_dataloader, stories_test_dataloader, cloze_dataset]
    
    for i in tqdm(range(CONFIG['num_train_books']), desc="Training Progress"):
        train_dataloader = stories_generator.next_loader(100)
        if time() - total_train_time > CONFIG['save_every'] * 60:
            total_train_time = time()
            save_model(model, wandb_run, y)
            
        for batch in train_dataloader:
            train_start = time()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            optimizer.zero_grad()

            with torch.autocast(device_type='cuda', dtype=torch.float16):
                model_output, target = pipeline(input_ids, attention_mask)
                loss = loss_function(model_output.transpose(1, 2), target)
            
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            
            train_losses.append(loss.item())
            train_time += time() - train_start
            batch_num += 1
    
            if batch_num % CONFIG["log_traing_metrics_every"] == 0:
                wandb.log({
                    "train_loss": sum(train_losses[-CONFIG["log_traing_metrics_every"]:]) / CONFIG["log_traing_metrics_every"],
                    "train_time": train_time / CONFIG["log_traing_metrics_every"],
                })
                train_time = 0
            
            if batch_num % CONFIG["test_every"] == 0:
                test_start = time()
                for i in range(len(tests)):
                    log_metrics(tests[i], test_dataloaders[i], pipeline, model_tester, CONFIG)
                wandb.log({
                    "test_time": time() - test_start,
                })

In [5]:
def create_model(CONFIG, tokenizer, model_path=None):
    number_of_tokens = tokenizer.vocab_size
    
    y = yadisk.YaDisk(token="y0_AgAAAAA7vZKNAADLWwAAAADsK4dQ-f3fKT3hSianJggcCcKC1Mfxo8s")
    print(y.check_token())
    model = Transformer(
        embedding_size=CONFIG["embedding_size"],
        number_of_tokens=number_of_tokens,
        number_of_heads=CONFIG["number_of_heads"],
        number_of_layers=CONFIG["number_of_layers"],
        extention_factor=CONFIG["extention_factor"],
        dropout_rate=CONFIG["dropout_rate"],
        max_sequence_length=CONFIG["max_sequence_length"],
        use_flash_att=CONFIG["flash_atten"],
    ).to(device)
    if model_path:
        model.load_state_dict(torch.load(model_path))

    pipeline = AutoregressiveWrapper(model).to(device)
    loss_function = nn.CrossEntropyLoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=CONFIG["lr"])
    model_tester = TestModel(tokenizer, model)
    scaler = GradScaler()

    return pipeline, model, optimizer, loss_function, model_tester, scaler, y

In [6]:
def save_model(model, wandb_run, y):
    if not os.path.exists('save_models'):
        os.makedirs('save_models')
    save_dir = f"save_models/{wandb_run.name}"
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    saved_models = [f for f in os.listdir(save_dir) if f.startswith("model_")]
    num_saved_models = len(saved_models)

    model_filename = f"model_{num_saved_models + 1}"
    model_path = f'{save_dir}/{model_filename}'
    torch.save(model.state_dict(), model_path)
    
    try:
        y.mkdir(save_dir)
    except yadisk.exceptions.PathExistsError:
        pass
    print(model_path)
    y.upload(model_path, model_path)


In [7]:
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mskorodumov-work[0m ([33m8667[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [8]:
wandb_run = wandb.init(
    project="transformer",
    tags=["long_training_testing"],
    #id="sblota90",
    resume="allow",
    config=CONFIG
)

load_path = CONFIG['model_path']

In [9]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
pile_generator = PileGenerator(batch_size=CONFIG["batch_size"], max_sequence_length=CONFIG["max_sequence_length"],
                                tokenizer=tokenizer, num_workers=16, test_size=4096, num_skip=0)
stories_generator = StoriesGenerator(batch_size=CONFIG["batch_size"], max_sequence_length=CONFIG["max_sequence_length"], 
                               tokenizer=tokenizer, num_workers=16)
cloze_dataset = create_test_dataset('data.csv')

pipeline, model, optimizer, loss_function, model_tester, scaler, y = create_model(CONFIG, tokenizer, CONFIG["model_path"])
num_parameters, num_trainable_parameters, memory_allocated = pipeline.count_parameters() 
print('number of parameters =', num_parameters)
print('number of trainable parameters =', num_trainable_parameters)
print('memory allocated in GB =', memory_allocated)

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

True
number of parameters = 129622852
number of trainable parameters = 129622852
memory allocated in GB = 0.4828827530145645


In [None]:
train(CONFIG, pipeline, model, optimizer, loss_function, model_tester, wandb, wandb_run, scaler, y,
     [stories_generator, pile_generator.test_dataloader, stories_generator.test_dataloader, cloze_dataset])

In [None]:
save_model(model, wandb_run, y)

In [None]:
wandb.finish()

In [None]:
input_text = """
Mount Everest is the highest peak in
"""
num_predicted_tokens = 20
k = 3
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')

start = time()
with torch.autocast(device_type='cuda', dtype=torch.float16):
    answer = pipeline.predict_next(input_text, tokenizer, num_predicted_tokens, k)
print(time() - start)

print(answer)

In [None]:
from random import randint

def predict_next(pipeline, input_text, num_predicted_tokens, tokenizer):
    input_tokens = tokenizer.encode(input_text, return_tensors="pt")
    input_tokens = input_tokens[:, :-1].to(device)
    
    for i in range(num_predicted_tokens):
        mask = torch.ones_like(input_tokens)

        with torch.no_grad():
            probabilities = pipeline.next_token_probabilities(input_tokens, mask)
        
        answer = probabilities.argsort(dim=-1)[:, -randint(1, 3)].unsqueeze(0)
        input_tokens = torch.cat((input_tokens, answer), dim=1)
        
    return tokenizer.decode(input_tokens[0])

In [11]:
start = time()
train_dataloader = book_generator.next_book(10)      
print(time() - start)
print(len(train_dataloader))

12.44458270072937


NameError: name 'dataloader' is not defined

In [18]:
loader_time = time()
for batch in train_dataloader:
    pass
print(time() - loader_time)

1.8038690090179443


In [19]:
data_prep_timings = []
model_train_timings = []
gradient_timings = []
times1 = []
times2 = []
times3 = []
times4 = []

for batch in train_dataloader:
    data_prep_time = time()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    data_prep_timings.append(time() - data_prep_time)

    optimizer.zero_grad()

    model_time = time()
    with torch.autocast(device_type='cuda', dtype=torch.float16):
        model_output, target = pipeline(input_ids, attention_mask)
        loss = loss_function(model_output.transpose(1, 2), target)

    model_train_timings.append(time() - model_time)

    grad_time = time()
    time1 = time()
    scaler.scale(loss).backward()
    time2 = time()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    time3 = time()
    scaler.step(optimizer)
    time4 = time()
    scaler.update()
    times1.append(time2-time1)
    times2.append(time3-time2)
    times3.append(time4-time3)
    times4.append(time()-time4)
    gradient_timings.append(time() - grad_time)

print('number of batches -', len(train_dataloader))
print('data_prep_timings -', sum(data_prep_timings))
print('model_train_timings -', sum(model_train_timings))
print('gradient_timings -', sum(gradient_timings))
print('backward -', sum(times1))
print('clipping -', sum(times2))
print('step -', sum(times3))
print('updating -', sum(times4))

number of batches - 136
data_prep_timings - 1.1094281673431396
model_train_timings - 1.5837981700897217
gradient_timings - 37.77984666824341
backward - 36.2930953502655
clipping - 0.39544034004211426
step - 1.084547519683838
updating - 0.006574153900146484


In [15]:
with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
    with record_function("model_inference"):
        start_time = time()
        
        for batch in train_dataloader:
            train_start = time()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
        
            optimizer.zero_grad()
        
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                model_output, target = pipeline(input_ids, attention_mask)
                loss = loss_function(model_output.transpose(1, 2), target)
            
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            

STAGE:2023-09-18 18:28:48 17096:17096 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-09-18 18:29:31 17096:17096 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-09-18 18:29:32 17096:17096 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


In [16]:
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference        48.52%       37.132s        54.17%       41.454s       41.454s             1  
                                  cudaStreamSynchronize        42.85%       32.795s        42.85%       32.795s      48.228ms           680  
autograd::engine::evaluate_function: EmbeddingBackwa...         0.00%       1.433ms        42.00%       32.144s     236.354ms           136  
                                     EmbeddingBackward0        -0.31%  -238186.000us        42.00%       32.143s     236.344ms           136  
     

In [None]:
-------------------------------------------------------  ------------  ------------  ------------  
                                                   Name     CPU total  CPU time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  
                                        model_inference       41.454s       41.454s             1  
                                  cudaStreamSynchronize       32.795s      48.228ms           680  
autograd::engine::evaluate_function: EmbeddingBackwa...       32.144s     236.354ms           136  
                                     EmbeddingBackward0       32.143s     236.344ms           136  
                               aten::embedding_backward       32.142s     236.340ms           136  
                         aten::embedding_dense_backward       32.142s     236.336ms           136  
                                               aten::to        1.856s      31.833us         58292  
                                         aten::_to_copy        1.776s      30.898us         57476  
                                           aten::linear        1.530s     114.802us         13328  
                                            aten::copy_        1.254s      21.558us         58156  
                               Optimizer.step#Adam.step        1.228s       9.306ms           132  
    autograd::engine::evaluate_function: AddmmBackward0        1.026s     153.950us          6664  
                                         AddmmBackward0     695.585ms     104.380us          6664  
                                       cudaLaunchKernel     653.615ms       6.050us        108032  
autograd::engine::evaluate_function: ToCopyBackward0...     585.602ms      34.725us         16864  
                                               aten::mm     498.341ms      37.391us         13328  
                                        ToCopyBackward0     487.507ms      28.908us         16864  
                                    aten::empty_strided     484.431ms       4.750us        101980  
                                            aten::addmm     370.806ms      55.643us          6664  
enumerate(DataLoader)#_MultiProcessingDataLoaderIter...     333.055ms       2.431ms           137  
-------------------------------------------------------  ------------  ------------  ------------  
Self CPU time total: 76.530s

In [None]:
------------------------------------------------------- ------------ ----------- 
                                                   Name    CPU total  CUDA total  
------------------------------------------------------- ------------ ----------- 
                                        model_inference       1.621s   252.116ms 
enumerate(DataLoader)#_MultiProcessingDataLoaderIter...    231.692ms     0.000us 
                                               aten::to    216.764ms    12.108ms 
                                         aten::_to_copy    213.804ms    12.154ms 
                                            aten::copy_    206.785ms    12.175ms 
                                  cudaStreamSynchronize    180.799ms     0.000us 
                                           aten::linear     52.767ms   362.147ms 
                                        cudaMemcpyAsync     18.501ms     0.000us 
                                            aten::addmm     13.633ms   179.764ms 
                                       cudaLaunchKernel      9.117ms     0.000us 
                                 FlashAttnQKVPackedFunc      6.843ms    21.403ms 
                                       aten::layer_norm      5.672ms    12.687ms 
                                aten::native_layer_norm      5.140ms    12.687ms 
                                            aten::empty      5.054ms     0.000us 
                                    aten::empty_strided      4.312ms     0.000us 
                                              aten::add      2.524ms    12.295ms 
                                          aten::dropout      2.313ms     3.413ms 
                                   aten::native_dropout      2.177ms     3.413ms 
                                             cudaMalloc      1.808ms     0.000us 
                                          aten::reshape      1.579ms    21.000us 
------------------------------------------------------- ------------ ----------- 
Self CPU time total: 1.621s
Self CUDA time total: 252.116ms

In [None]:
"""

"""

In [None]:
data = pd.read_csv('data.csv')

for input_text in data['Sentence']:
    num_predicted_tokens = 40
    
    with torch.autocast(device_type='cuda', dtype=torch.float16):
        answer = predict_next(pipeline, input_text, num_predicted_tokens, tokenizer)

    print('input:', input_text)
    print('output:', answer[len(input_text)+6:])
    print("-"*40)

In [5]:
3.128709316253662 + 2.2622079849243164 + 56.44216966629028

61.83308696746826

In [6]:
"""
4090  - 22.64s,  159,  0.35$ - 0.22
4080  - 36.83s,  97,   0.25$ - 0.25
3090  - 37.22s,  96,   0.15$ - 0.15
3080  - 56.07s,  64,   0.11$ - 0.17
A4000 - 61.83s,  58,   0.11$ - 0.19
"""

SyntaxError: invalid syntax (774384569.py, line 1)

In [22]:
0.11 / 58 * 100

0.1896551724137931