In [1]:
import torch
from torch import nn
from torch.nn import functional
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding
import torch.optim as optim

from dataset.create_dataset import create_data_loader
from layers.model import Transformer, AutoregressiveWrapper
from transformers import BertTokenizer

from test_model.test_model import TestModel
from tqdm import tqdm

import wandb

import time


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
CONFIG = {
    "architecture": "Transformer", # Wandb only
    "dataset": "wikitext", # Wandb only
    "batch_size": 4,
    "embedding_size": 128,
    "max_sequence_length": 512,
    "number_of_layers": 6,
    "number_of_heads": 4,
    "additional_feed_forward_layers": 0,
    "dropout_rate": 0.0,
    "lr": 0.001
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

cpu


In [3]:
def test_model(pipeline, model, loss_function):
    model.eval()
    total_loss = 0

    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        model_output, target = pipeline(input_ids, attention_mask)

        loss = loss_function(model_output.transpose(1, 2), target)

        total_loss += float(loss)

    total_loss /= len(test_dataloader)# * CONFIG["batch_size"]

    return total_loss


def train(CONFIG, pipeline, model, optimizer, loss_function, wandb):
    train_config = {
        "test_every": 512 // CONFIG["batch_size"],
        "log_traing_metrics_every": 64 // CONFIG["batch_size"],
    }

    train_time = 0
    test_time = 0
    last_moment = time.time()

    model.train()

    batch_num = 0
    train_losses = []
    for batch in tqdm(train_dataloader, desc="Training Progress"):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        model_output, target = pipeline(input_ids, attention_mask)
        loss = loss_function(model_output.transpose(1, 2), target)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        
        train_losses.append(float(loss))
        batch_num += 1

        if batch_num % train_config["log_traing_metrics_every"] == 0:
            train_time += time.time() - last_moment
            last_moment = time.time()

            datapoints_processed_total = batch_num * CONFIG["batch_size"]
            wandb.log({
                "train_loss": sum(train_losses[-train_config["log_traing_metrics_every"]:]) / train_config["log_traing_metrics_every"],
                "datapoints_processed_total": datapoints_processed_total,
                "train_time": train_time,
            })

        if batch_num % train_config["test_every"] == 0:
            train_time += time.time() - last_moment
            last_moment = time.time()

            test_loss = test_model(pipeline, model, loss_function)

            test_time += time.time() - last_moment
            last_moment = time.time()

            datapoints_processed_total = batch_num * CONFIG["batch_size"]

            wandb.log({
                "test_loss": test_loss,
                "datapoints_processed_total": datapoints_processed_total,
                "test_time": test_time,
            })

In [4]:
def create_model(CONFIG):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    number_of_tokens = tokenizer.vocab_size

    model = Transformer(
        embedding_size=CONFIG["embedding_size"],
        number_of_tokens=number_of_tokens,
        number_of_heads=CONFIG["number_of_heads"],
        number_of_layers=CONFIG["number_of_layers"],
        additional_feed_forward_layers=CONFIG["additional_feed_forward_layers"],
        dropout_rate=CONFIG["dropout_rate"],
        max_sequence_length=CONFIG["max_sequence_length"]
    ).to(device)
    pipeline = AutoregressiveWrapper(model).to(device)

    loss_function = nn.CrossEntropyLoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=CONFIG["lr"])

    return pipeline, model, optimizer, loss_function

In [5]:
for i in range(1):
    train_dataloader, test_dataloader, _ = create_data_loader(batch_size=CONFIG["batch_size"],
                                    max_sequence_size=CONFIG["max_sequence_length"],
                                    train_size=2**15, test_size=128)

    wandb.init(
        # set the wandb project where this run will be logged
        project="transformer",
        tags=["long_training_testing"],
        
        # track hyperparameters and run metadata
        config=CONFIG
    )

    pipeline, model, optimizer, loss_function = create_model(CONFIG)
    train(CONFIG, pipeline, model, optimizer, loss_function, wandb)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mskorodumov[0m. Use [1m`wandb login --relogin`[0m to force relogin


Training Progress:  28%|██▊       | 2320/8192 [32:19<1:21:49,  1.20it/s]


KeyboardInterrupt: 

In [7]:
PATH = "savepoints/first_save"
torch.save(model.state_dict(), PATH)

In [8]:
model.load_state_dict(torch.load(PATH))
model.eval()

print(test_model(pipeline, model, loss_function))

Transformer(
  (token_embedding): TokenEmbedding(
    (embedding_layer): Embedding(30522, 128)
  )
  (positional_encoding): PositionalEncoding()
  (layer_normalization): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (decoder): DecoderStack(
    (encoder_layers): ModuleList(
      (0-5): 6 x DecoderLayer(
        (multi_headed_self_attention): MaskedMultiHeadedSelfAttention(
          (self_attentions): ModuleList(
            (0-3): 4 x MaskedSelfAttention(
              (query_layer): Linear(in_features=128, out_features=32, bias=True)
              (key_layer): Linear(in_features=128, out_features=32, bias=True)
              (value_layer): Linear(in_features=128, out_features=32, bias=True)
              (softmax): Softmax(dim=-1)
            )
          )
          (output_layer): Linear(in_features=128, out_features=128, bias=True)
        )
        (feed_forward): FeedForward(
          (linear_1): Linear(in_features=128, out_features=512, bias=True)
          (linear_l

In [None]:
'''
TODO wb
* --Добавить гиперпарметры в конфиг, из него их брать в модели и посылать его же в wb
* --Добавить время тренировки
* Добавить метрик
* Запустить автоматический подбор гиперпараметров
* --Уменьшить время итерации, понять какое время оптимальное 
* --Синхронизировать итерации тест и трейн лоса
    * --Научиться чаще пушить одни метрики чаще чем другие и не путаться
* Научиться сохранять и загружать модели
    * Научиться начинать тренировку загруженной модели с момента сохранения
'''

'\nTODO wb\n* --Добавить гиперпарметры в конфиг, из него их брать в модели и посылать его же в wb\n* --Добавить время тренировки\n* Добавить метрик\n* Запустить автоматический подбор гиперпараметров\n* --Уменьшить время итерации, понять какое время оптимальное \n* --Синхронизировать итерации тест и трейн лоса\n    * --Научиться чаще пушить одни метрики чаще чем другие и не путаться\n'