In [1]:
import torch
from torch import nn
from torch.nn import functional
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler

from dataset.create_dataset import BookGenerator, create_test_dataset
from layers.model import Transformer, AutoregressiveWrapper
from transformers import BertTokenizer

import numpy as np

from test_model.test_model import TestModel
from tqdm import tqdm

import wandb

from time import time
from torch.profiler import profile, record_function, ProfilerActivity

In [2]:
CONFIG = {
    "architecture": "Transformer", # Wandb only
    "dataset": "books", #"wikitext-103-raw-v1", # Wandb only
    "batch_size": 12,
    "embedding_size": 768,
    "max_sequence_length": 256,
    "number_of_layers": 12,
    "number_of_heads": 12,
    "additional_feed_forward_layers": 0,
    "extention_factor": 4,
    "dropout_rate": 0.1,
    'train_size': 2**20,
    'test_size': 32,
    'start_book': 0,
    'flash_atten': False,
    'num_train_books': 25000,
    'save_every': 3000,
    'use_mixed_precision': True,
    'model_path': None, # "savepoints/dulcet-serenity-218",
    
}
CONFIG["lr"] = 0.001 / np.sqrt(CONFIG["batch_size"])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

cuda


In [9]:
def log_metrics(test_name, test_dataloader, pipeline, model_tester, train_config, CONFIG):
    pipeline.eval()
    if test_name in ['books']:
        metrics = model_tester.test_model(pipeline, test_dataloader)
        test_loss = metrics['loss']
        bleu = metrics['bleu']
        rouge1 = metrics['rouge1']
        rouge2 = metrics['rouge2']
        rougeL = metrics['rougeL']

        wandb.log({
            "test_loss_" + test_name: test_loss,
            "bleu_" + test_name: bleu,
            "rouge1_" + test_name: rouge1,
            "rouge2_" + test_name: rouge2,
            "rougeL_" + test_name: rougeL,
        })
        
    else:
        simmilarity_accuracy, simmilarity_score = model_tester.test_simmilarity(pipeline,
                                                            test_dataloader, tokenizer)
        
        wandb.log({
            "simmilarity_accuracy": simmilarity_accuracy,
            "simmilarity_score": simmilarity_score,
        })

    pipeline.train()
        

def train(CONFIG, pipeline, model, optimizer, loss_function, model_tester, scaler, wandb, dataloaders):
    train_config = {
        "test_every": 2048 // CONFIG["batch_size"],
        "log_traing_metrics_every": 256 // CONFIG["batch_size"],
    }
    
    book_generator, test_book_dataloader, cloze_dataloader = dataloaders

    train_time = 0
    
    model.train()

    batch_num = 0
    train_losses = []
    tests = ['books', 'simmilarity_score']
    test_dataloaders = [test_book_dataloader, cloze_dataloader]
    
    for i in tqdm(range(CONFIG['num_train_books']), desc="Training Progress"):
        train_dataloader = book_generator.next_book()
        for batch in train_dataloader:
            train_start = time()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            optimizer.zero_grad()
            model_output, target = pipeline(input_ids, attention_mask)
            with autocast():
                loss = loss_function(model_output.transpose(1, 2), target)
            
            train_losses.append(loss.item())

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
    
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)

            train_time += time() - train_start
            batch_num += 1
    
            if batch_num % train_config["log_traing_metrics_every"] == 0:
                wandb.log({
                    "train_loss": sum(train_losses[-train_config["log_traing_metrics_every"]:]) / train_config["log_traing_metrics_every"],
                    "train_time": train_time / train_config["log_traing_metrics_every"],
                })
                train_time = 0
            
            if batch_num % train_config["test_every"] == 0:
                test_start = time()
                for i in range(len(tests)):
                    log_metrics(tests[i], test_dataloaders[i], pipeline, model_tester, train_config, CONFIG)
                wandb.log({
                    "test_time": time() - test_start,
                })

In [4]:
def create_model(CONFIG, model_path=None):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    number_of_tokens = tokenizer.vocab_size
    
    model = Transformer(
        embedding_size=CONFIG["embedding_size"],
        number_of_tokens=number_of_tokens,
        number_of_heads=CONFIG["number_of_heads"],
        number_of_layers=CONFIG["number_of_layers"],
        extention_factor=CONFIG["extention_factor"],
        additional_feed_forward_layers=CONFIG["additional_feed_forward_layers"],
        dropout_rate=CONFIG["dropout_rate"],
        max_sequence_length=CONFIG["max_sequence_length"],
        use_flash_att=CONFIG["flash_atten"],
    ).to(device).to(torch.float16)
    if model_path:
        model.load_state_dict(torch.load(model_path))
        
    pipeline = AutoregressiveWrapper(model).to(device)
    loss_function = nn.CrossEntropyLoss().to(device).to(torch.float16)
    optimizer = optim.Adam(model.parameters(), lr=CONFIG["lr"])
    scaler = GradScaler()
    model_tester = TestModel(tokenizer, model)

    return pipeline, model, optimizer, loss_function, model_tester, scaler

In [5]:
def save_model(model, wandb_run):
    PATH = "savepoints/" + wandb_run.name
    torch.save(model.state_dict(), PATH)

In [6]:
wandb_run = wandb.init(
    project="transformer",
    tags=["long_training_testing"],
    #id="sblota90",
    resume="allow",
    config=CONFIG
)

load_path = CONFIG['model_path']

[34m[1mwandb[0m: Currently logged in as: [33mskorodumov-work[0m ([33m8667[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

book_generator = BookGenerator(batch_size=CONFIG["batch_size"],
                                max_sequence_length=CONFIG["max_sequence_length"], tokenizer=tokenizer)
test_book_dataloader = book_generator.select_test(CONFIG["test_size"])
book_generator.skip_books(CONFIG['start_book'])
cloze_dataset = create_test_dataset('data.csv')

pipeline, model, optimizer, loss_function, model_tester, scaler = create_model(CONFIG, CONFIG["model_path"])
num_parameters, num_trainable_parameters, memory_allocated = pipeline.count_parameters() 
print('number of parameters =', num_parameters)
print('number of trainable parameters =', num_trainable_parameters)
print('memory allocated in GB =', memory_allocated)

number of parameters = 131968314
number of trainable parameters = 131968314
memory allocated in GB = 0.4916202798485756


In [10]:
train(CONFIG, pipeline, model, optimizer, loss_function, model_tester, scaler, wandb,
     [book_generator, test_book_dataloader, cloze_dataset])

Training Progress:   0%|                                             | 0/25000 [00:01<?, ?it/s]


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.HalfTensor instead (while checking arguments for embedding)

In [None]:
PATH = "savepoints/" + wandb_run.name
torch.save(model.state_dict(), PATH)

In [None]:
wandb.finish()

In [None]:
input_text = """
When you want to travel around the city, you can take a colorful car called a
"""
num_predicted_tokens = 20
k = 3
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

start = time()
answer = pipeline.predict_next(input_text, tokenizer, num_predicted_tokens, k)
print(.time() - start)

print(answer)

In [None]:
from random import randint

def predict_next(pipeline, input_text, num_predicted_tokens, tokenizer):
    input_tokens = tokenizer.encode(input_text, return_tensors="pt")
    input_tokens = input_tokens[:, :-1].to(device)
    
    for i in range(num_predicted_tokens):
        mask = torch.ones_like(input_tokens)

        with torch.no_grad():
            probabilities = pipeline.next_token_probabilities(input_tokens, mask)
        
        answer = probabilities.argsort(dim=-1)[:, -randint(1, 2)].unsqueeze(0)
        input_tokens = torch.cat((input_tokens, answer), dim=1)
        
    return tokenizer.decode(input_tokens[0])

In [None]:
input_text = """
london is the capital of
"""
num_predicted_tokens = 40

answer = predict_next(pipeline, input_text, num_predicted_tokens, tokenizer)

print(answer)

In [None]:
simmilarity_accuracy, simmilarity_score = model_tester.test_simmilarity(pipeline,
                                                            cloze_dataset, tokenizer)

In [None]:
"""
GPU comparison
1. GOOGLE COLAB
-100 CU are given for 10 dollars
-V100, 32 GB, 5 CU per hour, 20 hours, 706 DP, 14120 DP total, 1412 DP per dollar
-A100, 40 GB, 15 CU per hour, 6.5 hours, 2179 DP, 14163 DP total, 1416 DP per dollar

2. VAST.AI
-RTX 4090, 24 GB, 0.4 dollars per hour, 1720 DP, 4300 DP per dollar, 177% while training on 4 GPUs
-RTX 3080, 10 GB, 0.117 dollars per hour, 1022 DP, 8717 DP per dollar, 177% while training on 4 GPUs
-RTX 3090, 24 GB, 0.23 dollars per hour, 1071 DP, 4652 DP per dollar, 177% while training on 4 GPUs
-A5000, 24 GB, 0.22 dollars per hour, 1061 DP, 4686 DP per dollar, 389% while training on 4 GPUs
"""

In [None]:
"""
TODO
optimize
-understand time wasting
-choose batch size


beam search

new dataset
-find new dataset for big training
-make some analysis
-avg, mean, each percentile sequence length
-find info in the net

model parameters
-max sequence length
-estimate batch size
-choose parameters for n GB VRAM
-choose model parameters for 5-6 GB model

filter model
-

new conversation dataset 
-analyse
-understand how to train and test conversation model
-try to train

metrics
-create metrics for conversation

multiple GPUs
-write code to run model on several gpus
-try to run model on vast.ai
-also model must be saved per epoch

site
-create django project with model which is placed on google colab
-create 


"""

In [1]:
import os
from pprint import pprint
from google.cloud import storage

In [2]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r'ServiceKey_GoogleCloud.json'

storage_client = storage.Client()

In [9]:
bucket_name = 'uu_gpt'

# create a new bucket
bucket = storage_client.bucket(bucket_name)

bucket.location = 'US' # Taiwan
bucket = storage_client.create_bucket(bucket) # returns Bucket object

pprint(vars(bucket))

  bucket.location = 'US' # Taiwan


Forbidden: 403 POST https://storage.googleapis.com/storage/v1/b?project=gpt-saves&prettyPrint=false: The billing account for the owning project is disabled in state absent