In [1]:
import torch
from torch import nn
from torch.nn import functional
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding, BertTokenizer
import torch.optim as optim
from torch.profiler import profile, record_function, ProfilerActivity
import numpy as np
import sys
from tqdm import tqdm
import os
from time import time
import pandas as pd
from torch.cuda.amp import autocast, GradScaler

from dataset.create_dataset import BookGenerator, create_test_dataset, create_test
from layers.model import Transformer, AutoregressiveWrapper
from test_model.test_model import TestModel

import wandb
import yadisk

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [2]:
"""
#!git clone https://github.com/konductor000/GenerativePretrainedTransformer
!pip install -r requirements.txt
!pip install rouge_score
!MAX_JOBS=4 pip install flash-attn --no-build-isolation
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install spacy
!pip install yadisk
!pip install wandb
!python -m spacy download en_core_web_md
#8a49fefdd8a82ca9ba659a874b09adf8c5995778
"""

'\n#!git clone https://github.com/konductor000/GenerativePretrainedTransformer\n!pip install -r requirements.txt\n!pip install rouge_score\n!MAX_JOBS=4 pip install flash-attn --no-build-isolation\n!pip install datasets\n!pip install transformers\n!pip install evaluate\n!pip install spacy\n!pip install yadisk\n!pip install wandb\n!python -m spacy download en_core_web_md\n#8a49fefdd8a82ca9ba659a874b09adf8c5995778\n'

In [3]:
CONFIG = {
    "architecture": "Transformer", # Wandb only
    "dataset": "books", #"wikitext-103-raw-v1", # Wandb only
    "batch_size": 16,
    "embedding_size": 768,
    "max_sequence_length": 1024,
    "number_of_layers": 12,
    "number_of_heads": 12,
    "extention_factor": 4,
    "dropout_rate": 0.1,
    'test_size': 1024,
    'start_book': 0,
    "test_every": 256,
    "log_traing_metrics_every": 16,
    'flash_atten': True,
    'num_train_books': 2500,
    'save_every': 120,
    'use_mixed_precision': True,
    'model_path': None, # "savepoints/dulcet-serenity-218",
    
}
CONFIG["lr"] = 0.001 / np.sqrt(CONFIG["batch_size"])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

cuda


In [4]:
def log_metrics(test_name, test_dataloader, pipeline, model_tester, train_config, CONFIG):
    pipeline.eval()
    if test_name in ['books']:
        metrics = model_tester.test_model(pipeline, test_dataloader)
        test_loss = metrics['loss']
        bleu = metrics['bleu']
        rouge1 = metrics['rouge1']
        rouge2 = metrics['rouge2']
        rougeL = metrics['rougeL']

        wandb.log({
            "test_loss_" + test_name: test_loss,
            "bleu_" + test_name: bleu,
            "rouge1_" + test_name: rouge1,
            "rouge2_" + test_name: rouge2,
            "rougeL_" + test_name: rougeL,
        }) 
    else:
        simmilarity_accuracy, simmilarity_score = model_tester.test_simmilarity(pipeline,
                                                            test_dataloader, tokenizer)
        
        wandb.log({
            "simmilarity_accuracy": simmilarity_accuracy,
            "simmilarity_score": simmilarity_score,
        })

    pipeline.train()
        

def train(CONFIG, pipeline, model, optimizer, loss_function, model_tester, wandb,
          wandb_run, scaler, y, dataloaders):
    book_generator, test_book_dataloader, cloze_dataloader = dataloaders

    train_time = 0
    total_train_time = time()
    
    model.train()

    batch_num = 0
    train_losses = []
    tests = ['books', 'simmilarity_score']
    test_dataloaders = [test_book_dataloader, cloze_dataloader]
    
    for i in tqdm(range(CONFIG['num_train_books']), desc="Training Progress"):
        train_dataloader = book_generator.next_book(10)
        if time() - total_train_time > CONFIG['save_every'] * 60:
            total_train_time = time()
            save_model(model, wandb_run, y)
            
        for batch in train_dataloader:
            train_start = time()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            optimizer.zero_grad()

            with torch.autocast(device_type='cuda', dtype=torch.float16):
                model_output, target = pipeline(input_ids, attention_mask)
                loss = loss_function(model_output.transpose(1, 2), target)
            
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            
            train_losses.append(loss.item())
            train_time += time() - train_start
            batch_num += 1
    
            if batch_num % CONFIG["log_traing_metrics_every"] == 0:
                wandb.log({
                    "train_loss": sum(train_losses[-CONFIG["log_traing_metrics_every"]:]) / CONFIG["log_traing_metrics_every"],
                    "train_time": train_time / CONFIG["log_traing_metrics_every"],
                })
                train_time = 0
            
            if batch_num % CONFIG["test_every"] == 0:
                test_start = time()
                for i in range(len(tests)):
                    log_metrics(tests[i], test_dataloaders[i], pipeline, model_tester, train_config, CONFIG)
                wandb.log({
                    "test_time": time() - test_start,
                })

In [5]:
def create_model(CONFIG, tokenizer, model_path=None):
    number_of_tokens = tokenizer.vocab_size
    
    y = yadisk.YaDisk(token="y0_AgAAAAA7vZKNAADLWwAAAADsK4dQ-f3fKT3hSianJggcCcKC1Mfxo8s")
    print(y.check_token())
    model = Transformer(
        embedding_size=CONFIG["embedding_size"],
        number_of_tokens=number_of_tokens,
        number_of_heads=CONFIG["number_of_heads"],
        number_of_layers=CONFIG["number_of_layers"],
        extention_factor=CONFIG["extention_factor"],
        dropout_rate=CONFIG["dropout_rate"],
        max_sequence_length=CONFIG["max_sequence_length"],
        use_flash_att=CONFIG["flash_atten"],
    ).to(device)
    if model_path:
        model.load_state_dict(torch.load(model_path))

    pipeline = AutoregressiveWrapper(model).to(device)
    loss_function = nn.CrossEntropyLoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=CONFIG["lr"])
    model_tester = TestModel(tokenizer, model)
    scaler = GradScaler()

    return pipeline, model, optimizer, loss_function, model_tester, scaler, y

In [6]:
def save_model(model, wandb_run, y):
    if not os.path.exists('save_models'):
        os.makedirs('save_models')
    save_dir = f"save_models/{wandb_run.name}"
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    saved_models = [f for f in os.listdir(save_dir) if f.startswith("model_")]
    num_saved_models = len(saved_models)

    model_filename = f"model_{num_saved_models + 1}"
    model_path = f'{save_dir}/{model_filename}'
    torch.save(model.state_dict(), model_path)
    
    try:
        y.mkdir(save_dir)
    except yadisk.exceptions.PathExistsError:
        pass
    print(model_path)
    y.upload(model_path, model_path)


In [7]:
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mskorodumov-work[0m ([33m8667[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [8]:
wandb_run = wandb.init(
    project="transformer",
    tags=["long_training_testing"],
    #id="sblota90",
    resume="allow",
    config=CONFIG
)

load_path = CONFIG['model_path']

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011114448032134936, max=1.0…

In [9]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
book_generator = BookGenerator(batch_size=CONFIG["batch_size"],
                                max_sequence_length=CONFIG["max_sequence_length"], tokenizer=tokenizer, num_workers=4)
test_dataloader = create_test(32, 512, tokenizer=tokenizer, dataset_size=CONFIG['test_size'])
book_generator.skip_books(CONFIG['start_book'])
cloze_dataset = create_test_dataset('data.csv')

pipeline, model, optimizer, loss_function, model_tester, scaler, y = create_model(CONFIG, tokenizer, CONFIG["model_path"])
num_parameters, num_trainable_parameters, memory_allocated = pipeline.count_parameters() 
print('number of parameters =', num_parameters)
print('number of trainable parameters =', num_trainable_parameters)
print('memory allocated in GB =', memory_allocated)

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]



True
number of parameters = 129622852
number of trainable parameters = 129622852
memory allocated in GB = 0.4828827530145645


In [None]:
train(CONFIG, pipeline, model, optimizer, loss_function, model_tester, wandb, wandb_run, scaler, y,
     [book_generator, test_dataloader, cloze_dataset])

In [None]:
save_model(model, wandb_run, y)

In [None]:
wandb.finish()

In [None]:
input_text = """
Mount Everest is the highest peak in
"""
num_predicted_tokens = 20
k = 3
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')

start = time()
with torch.autocast(device_type='cuda', dtype=torch.float16):
    answer = pipeline.predict_next(input_text, tokenizer, num_predicted_tokens, k)
print(time() - start)

print(answer)

In [None]:
from random import randint

def predict_next(pipeline, input_text, num_predicted_tokens, tokenizer):
    input_tokens = tokenizer.encode(input_text, return_tensors="pt")
    input_tokens = input_tokens[:, :-1].to(device)
    
    for i in range(num_predicted_tokens):
        mask = torch.ones_like(input_tokens)

        with torch.no_grad():
            probabilities = pipeline.next_token_probabilities(input_tokens, mask)
        
        answer = probabilities.argsort(dim=-1)[:, -randint(1, 3)].unsqueeze(0)
        input_tokens = torch.cat((input_tokens, answer), dim=1)
        
    return tokenizer.decode(input_tokens[0])

In [10]:
start = time()

data_prep_timings = []
model_train_timings = []
gradient_timings = []
times1 = []
times2 = []
times3 = []
times4 = []

train_dataloader = book_generator.next_book(10)
print('data time -', time() - start)
full_train_time = time()

data time - 11.215234994888306


In [11]:
for batch in train_dataloader:
    data_prep_time = time()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    data_prep_timings.append(time() - data_prep_time)

    optimizer.zero_grad()

    model_time = time()
    with torch.autocast(device_type='cuda', dtype=torch.float16):
        model_output, target = pipeline(input_ids, attention_mask)
        loss = loss_function(model_output.transpose(1, 2), target)

    model_train_timings.append(time() - model_time)

    grad_time = time()
    time1 = time()
    scaler.scale(loss).backward()
    time2 = time()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    time3 = time()
    scaler.step(optimizer)
    time4 = time()
    scaler.update()
    times1.append(time2-time1)
    times2.append(time3-time2)
    times3.append(time4-time3)
    times4.append(time()-time4)
    gradient_timings.append(time() - grad_time)

print('full time -', time() - start)
print('number of batches -', len(train_dataloader))
print('data_prep_timings -', sum(data_prep_timings))
print('model_train_timings -', sum(model_train_timings))
print('gradient_timings -', sum(gradient_timings))
print('backward -', sum(times1))
print('clipping -', sum(times2))
print('step -', sum(times3))
print('updating -', sum(times4))

full time - 58.213600158691406
number of batches - 136
data_prep_timings - 1.0260894298553467
model_train_timings - 2.477931499481201
gradient_timings - 42.88483500480652
backward - 41.26165699958801
clipping - 0.5091650485992432
step - 1.1016998291015625
updating - 0.01080322265625


In [19]:
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
    with record_function("model_inference"):
        train_dataloader = book_generator.next_book(1)
        for batch in train_dataloader:
            train_start = time()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            #optimizer.zero_grad()

            with torch.autocast(device_type='cuda', dtype=torch.float16):
                model_output, target = pipeline(input_ids, attention_mask)
                #loss = loss_function(model_output.transpose(1, 2), target)
            
            #scaler.scale(loss).backward()
            #scaler.step(optimizer)
            #scaler.update()

STAGE:2023-09-15 18:07:51 8238:8238 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-09-15 18:07:51 8238:8238 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-09-15 18:07:51 8238:8238 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


In [20]:
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference        62.05%     268.058ms        99.99%     431.941ms     431.941ms       0.000us         0.00%      54.725ms      54.725ms             1  
enumerate(DataLoader)#_MultiProcessingDataLoaderIter...        20.78%      89.777ms        20.80%      89.829ms      44.914ms       0.000us         0.00%       0.000us       0.000us             2  
         

In [None]:
data = pd.read_csv('data.csv')

for input_text in data['Sentence']:
    num_predicted_tokens = 40
    
    with torch.autocast(device_type='cuda', dtype=torch.float16):
        answer = predict_next(pipeline, input_text, num_predicted_tokens, tokenizer)

    print('input:', input_text)
    print('output:', answer[len(input_text)+6:])
    print("-"*40)

In [None]:
data = pd.read_csv('data.csv')

In [None]:
data

In [None]:
for text in data['Sentence']:
    print(text)

In [18]:
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference        62.05%     268.058ms        99.99%     431.941ms     431.941ms       0.000us         0.00%      54.725ms      54.725ms             1  
enumerate(DataLoader)#_MultiProcessingDataLoaderIter...        20.78%      89.777ms        20.80%      89.829ms      44.914ms       0.000us         0.00%       0.000us       0.000us             2  
                                               aten::to         0.90%       3.901ms         9.57%      41.320ms     317.846us       0.000us         0.00%       2.500ms      19.231us           130  
                                           aten::linear         0.22%     963.000us         8.92%      38.547ms     393.337us       0.000us         0.00%      80.141ms     817.765us            98  
                                         aten::_to_copy         0.33%       1.413ms         8.79%      37.989ms     303.912us       0.000us         0.00%       2.703ms      21.624us           125  
                                            aten::copy_         1.16%       5.021ms         7.21%      31.159ms     247.294us       2.706ms         4.94%       2.706ms      21.476us           126  
                                        cudaMemcpyAsync         5.69%      24.574ms         5.69%      24.574ms      12.287ms       0.000us         0.00%       0.000us       0.000us             2  
                                            aten::addmm         1.68%       7.254ms         2.51%      10.825ms     220.918us      39.451ms        72.09%      39.451ms     805.122us            49  
                                    aten::empty_strided         1.66%       7.174ms         1.66%       7.174ms      44.559us       0.000us         0.00%       0.000us       0.000us           161  
                                       aten::layer_norm         0.20%     856.000us         1.53%       6.593ms     235.464us       0.000us         0.00%       2.562ms      91.500us            28  
                                            aten::empty         1.37%       5.934ms         1.37%       5.934ms      38.038us       0.000us         0.00%       0.000us       0.000us           156  
                                aten::native_layer_norm         0.53%       2.285ms         1.33%       5.737ms     229.480us       2.562ms         4.68%       2.562ms     102.480us            25  
                                 FlashAttnQKVPackedFunc         0.59%       2.551ms         1.11%       4.786ms     398.833us       4.524ms         8.27%       4.524ms     377.000us            12  
                                       cudaLaunchKernel         0.79%       3.425ms         0.79%       3.425ms      13.173us       0.000us         0.00%       0.000us       0.000us           260  
                                          aten::dropout         0.01%      44.000us         0.62%       2.665ms     222.083us       0.000us         0.00%     750.000us      62.500us            12  
                                   aten::native_dropout         0.29%       1.265ms         0.61%       2.621ms     218.417us     750.000us         1.37%     750.000us      62.500us            12  
                                              aten::add         0.45%       1.951ms         0.53%       2.283ms      91.320us       2.632ms         4.81%       2.632ms     105.280us            25  
                                       aten::empty_like         0.03%     126.000us         0.40%       1.721ms      46.514us       0.000us         0.00%       0.000us       0.000us            37  
                                             aten::relu         0.07%     318.000us         0.31%       1.340ms     111.667us       0.000us         0.00%       2.003ms     166.917us            12  
                                        aten::clamp_min         0.20%     862.000us         0.24%       1.022ms      85.167us       2.003ms         3.66%       2.003ms     166.917us            12  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 431.973ms
Self CUDA time total: 54.725ms


------------------------------ ------------ ------------
                          Name    CPU total   CUDA total 
------------------------------ ------------ ------------
               model_inference       1.252s    248.518ms
e_function: EmbeddingBackwa...    237.355ms     10.034ms
            EmbeddingBackward0    237.281ms     10.034ms
      aten::embedding_backward    237.269ms     10.034ms
aten::embedding_dense_backward    237.259ms     10.034ms
         cudaStreamSynchronize    233.147ms      0.000us
ltiProcessingDataLoaderIter...     96.446ms      0.000us
                  aten::linear     87.562ms    200.274ms
                      aten::to     83.526ms     39.836ms
                aten::_to_copy     75.562ms     41.652ms
      Optimizer.step#Adam.step     68.165ms     27.202ms
           aten::empty_strided     64.171ms      0.000us
luate_function: AddmmBackward0     61.350ms    212.371ms
e_function: ToCopyBackward0...     40.085ms     15.214ms
                   aten::copy

In [None]:

                                        CPU        CUDA
model_inference       1.252s    248.518ms
e_function: EmbeddingBackwa...    237.355ms     10.034ms
EmbeddingBackward0    237.281ms     10.034ms
aten::embedding_backward    237.269ms     10.034ms
aten::embedding_dense_backward    237.259ms     10.034ms
cudaStreamSynchronize    233.147ms      0.000us
ltiProcessingDataLoaderIter...     96.446ms      0.000us
aten::linear     87.562ms    200.274ms
aten::to     83.526ms     39.836ms
aten::_to_copy     75.562ms     41.652ms
Optimizer.step#Adam.step     68.165ms     27.202ms
aten::empty_strided     64.171ms      0.000us
luate_function: AddmmBackward0     61.350ms    212.371ms
e_function: ToCopyBackward0...     40.085ms     15.214ms
aten::copy_     40.077ms     68.415ms
AddmmBackward0     38.429ms    186.552ms
aten::mm     34.365ms    186.552ms
ToCopyBackward0     33.630ms     15.023ms
cudaLaunchKernel     26.357ms     41.979ms
aten::empty     25.413ms      0.000us