In [1]:
import torch
from torch import nn
from torch.nn import functional
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding
import torch.optim as optim

from dataset.create_dataset import create_data_loader
from layers.model import Transformer, AutoregressiveWrapper
from transformers import BertTokenizer

from test_model.test_model import TestModel
from tqdm import tqdm

import wandb

import time


In [2]:
# Hyperparameters and run settings read by the cells below. The whole mapping
# is also passed to wandb.init as the run config.
CONFIG = dict(
    architecture="Transformer",  # Wandb only
    dataset="wikitext-103-raw-v1",  # Wandb only
    batch_size=8,
    embedding_size=512,
    max_sequence_length=512,
    number_of_layers=8,
    number_of_heads=4,
    additional_feed_forward_layers=0,
    dropout_rate=0.1,
    lr=0.001,
    train_size=2 ** 16,
    test_size=128,
    model_path="savepoints/revived-glitter-56",
)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print(device)

cuda


In [3]:
def test_model(pipeline, model, loss_function):
    model.eval()
    total_loss = 0

    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        model_output, target = pipeline(input_ids, attention_mask)

        loss = loss_function(model_output.transpose(1, 2), target)

        total_loss += float(loss)

    total_loss /= len(test_dataloader)# * CONFIG["batch_size"]

    return total_loss


def train(CONFIG, pipeline, model, optimizer, loss_function, model_tester, wandb):
    train_config = {
        "test_every": 1024 // CONFIG["batch_size"],
        "log_traing_metrics_every": 64 // CONFIG["batch_size"],
    }

    train_time = 0
    test_time = 0
    last_moment = time.time()

    model.train()

    batch_num = 0
    train_losses = []
    for batch in tqdm(train_dataloader, desc="Training Progress"):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        model_output, target = pipeline(input_ids, attention_mask)
        loss = loss_function(model_output.transpose(1, 2), target)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        
        train_losses.append(float(loss))
        batch_num += 1

        if batch_num % train_config["log_traing_metrics_every"] == 0:
            train_time += time.time() - last_moment
            last_moment = time.time()

            datapoints_processed_total = batch_num * CONFIG["batch_size"]
            wandb.log({
                "train_loss": sum(train_losses[-train_config["log_traing_metrics_every"]:]) / train_config["log_traing_metrics_every"],
                "datapoints_processed_total": datapoints_processed_total,
                "train_time": train_time,
            })

        if batch_num % train_config["test_every"] == 0:
            train_time += time.time() - last_moment
            last_moment = time.time()

            metrics = model_tester.test_model(pipeline, test_dataloader)
            test_loss = metrics['loss']
            bleu = metrics['bleu']
            #bert_f1 = metrics['bert_f1']
            rouge1 = metrics['rouge1']
            rouge2 = metrics['rouge2']
            rougeL = metrics['rougeL']

            test_time += time.time() - last_moment
            last_moment = time.time()

            datapoints_processed_total = batch_num * CONFIG["batch_size"]

            wandb.log({
                "test_loss": test_loss,
                "bleu": bleu,
                #"bert_f1": bert_f1,
                "rouge1": rouge1,
                "rouge2": rouge2,
                "rougeL": rougeL,
                "datapoints_processed_total": datapoints_processed_total,
                "test_time": test_time,
            })

In [4]:
def create_model(CONFIG, model_path=None):
    """Build the model and the objects needed to train and evaluate it.

    Args:
        CONFIG: Mapping providing ``embedding_size``, ``number_of_heads``,
            ``number_of_layers``, ``additional_feed_forward_layers``,
            ``dropout_rate``, ``max_sequence_length`` and ``lr``.
        model_path: Optional path to a saved state dict. When truthy, the
            weights are loaded into the freshly built model.

    Returns:
        Tuple ``(pipeline, model, optimizer, loss_function, model_tester)``.

    Relies on the notebook global ``device``.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # The output vocabulary is sized from the tokenizer.
    number_of_tokens = tokenizer.vocab_size

    model = Transformer(
        embedding_size=CONFIG["embedding_size"],
        number_of_tokens=number_of_tokens,
        number_of_heads=CONFIG["number_of_heads"],
        number_of_layers=CONFIG["number_of_layers"],
        additional_feed_forward_layers=CONFIG["additional_feed_forward_layers"],
        dropout_rate=CONFIG["dropout_rate"],
        max_sequence_length=CONFIG["max_sequence_length"]
    ).to(device)
    if model_path:
        # map_location remaps the stored tensors onto the active device, so a
        # checkpoint written on a GPU machine still loads on a CPU-only one.
        state_dict = torch.load(model_path, map_location=device)
        # load_state_dict returns a key-mismatch report, not the model; its
        # result must never be assigned back to `model`.
        model.load_state_dict(state_dict)

    pipeline = AutoregressiveWrapper(model).to(device)
    loss_function = nn.CrossEntropyLoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=CONFIG["lr"])
    model_tester = TestModel(tokenizer, model)

    return pipeline, model, optimizer, loss_function, model_tester

In [5]:
for i in range(1):
    # Data: the third value returned by create_data_loader is not used here.
    train_dataloader, test_dataloader, _ = create_data_loader(
        batch_size=CONFIG["batch_size"],
        max_sequence_size=CONFIG["max_sequence_length"],
        train_size=CONFIG['train_size'],
        test_size=CONFIG['test_size'],
    )

    # Start a wandb run in the "transformer" project and record CONFIG as the
    # run's hyperparameters / metadata.
    wandb.init(project="transformer", tags=["long_training_testing"], config=CONFIG)

    # Weights are loaded from the configured checkpoint before training starts.
    load_path = CONFIG['model_path']
    pipeline, model, optimizer, loss_function, model_tester = create_model(
        CONFIG,
        load_path,
    )

    # Report the model size before the training loop begins.
    num_parameters, num_trainable_parameters, memory_allocated = pipeline.count_parameters()
    print('number of parameters =', num_parameters)
    print('number of trainable parameters =', num_trainable_parameters)
    print('memory allocated in GB =', memory_allocated)

    train(
        CONFIG,
        pipeline,
        model,
        optimizer,
        loss_function,
        model_tester,
        wandb,
    )


Found cached dataset wikitext (C:/Users/skoro/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\skoro\.cache\huggingface\datasets\wikitext\wikitext-103-raw-v1\1.0.0\a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126\cache-4bee3b889243690a.arrow
Loading cached processed dataset at C:\Users\skoro\.cache\huggingface\datasets\wikitext\wikitext-103-raw-v1\1.0.0\a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126\cache-4d7d86292bf10585.arrow
[34m[1mwandb[0m: Currently logged in as: [33mskorodumov-work[0m ([33m8667[0m). Use [1m`wandb login --relogin`[0m to force relogin


AttributeError: '_IncompatibleKeys' object has no attribute 'max_sequence_length'

In [6]:
# Persist the current weights (state dict only) to the checkpoint path.
PATH = "savepoints/revived-glitter-56"
torch.save(obj=model.state_dict(), f=PATH)

In [None]:
# Reload the saved weights and report the mean test loss.
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU machine still loads on a CPU-only one.
model.load_state_dict(torch.load(PATH, map_location=device))
model.eval()

print(test_model(pipeline, model, loss_function))

In [8]:
# Finish the active wandb run; the run history and summary are printed below.
wandb.finish()

0,1
bleu,▁▁▁▂▃▄▅▆▆▆▇▇▇▇▇▇▇███████████████████████
datapoints_processed_total,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rouge1,▁▁▃▃▄▆▆▇▇▇▇█████████████████████████████
rouge2,▂▁▂▂▄▅▆▆▇▇▇▇▇▇▇▇████████████████████████
rougeL,▁▁▂▃▅▆▆▇▇▇▇█████████████████████████████
test_loss,██▇▆▅▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_time,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,██▅▅▃▄▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_time,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
bleu,0.96609
datapoints_processed_total,65536
example_text,s
rouge1,0.98622
rouge2,0.9705
rougeL,0.98617
test_loss,0.01887
test_time,535.30509
train_loss,0.02284
train_time,3327.48932


In [9]:
from random import randint

def predict_next(pipeline, input_text, num_predicted_tokens, tokenizer):
    """Greedily extend ``input_text`` and return the decoded result.

    Args:
        pipeline: Object providing ``eval()`` and
            ``next_token_probabilities(tokens, mask)``; the latter must return
            one score row per sequence.
        input_text: Prompt to continue.
        num_predicted_tokens: Number of tokens to append to the prompt.
        tokenizer: Object providing ``encode(text, return_tensors="pt")`` and
            ``decode(ids)``.

    Returns:
        The decoded first sequence: the prompt tokens followed by the generated
        tokens.

    Side effects: calls ``pipeline.eval()``. Relies on the notebook global
    ``device``.
    """
    input_tokens = tokenizer.encode(input_text, return_tensors="pt")
    # Drop the last encoded token so generation continues the prompt
    # (presumably the tokenizer's trailing [SEP] -- confirm for other tokenizers).
    input_tokens = input_tokens[:, :-1].to(device)
    pipeline.eval()

    # Inference only: no autograd graph is needed for any step of the loop.
    with torch.no_grad():
        for _ in range(num_predicted_tokens):
            # Every position is a real token, so the mask is all ones.
            mask = torch.ones_like(input_tokens)
            probabilities = pipeline.next_token_probabilities(input_tokens, mask)

            # Take the highest-scoring token directly instead of sorting the
            # whole vocabulary; keepdim yields one column per sequence, ready
            # to be concatenated along the sequence dimension.
            next_token = probabilities.argmax(dim=-1, keepdim=True)
            input_tokens = torch.cat((input_tokens, next_token), dim=1)

    return tokenizer.decode(input_tokens[0])

In [12]:
# Prompt to continue; the leading and trailing newlines are part of the text
# handed to the tokenizer.
input_text = "\nit met with positive sales in japan, and was\n"
num_predicted_tokens = 20
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Greedy continuation of the prompt using the trained pipeline.
answer = predict_next(
    pipeline,
    input_text,
    num_predicted_tokens,
    tokenizer,
)

tensor([[ 101, 2009, 2777, 2007, 3893, 4341, 1999, 2900, 1010, 1998, 2001]],
       device='cuda:0')
tensor([[ 101, 2009, 2777, 2007, 3893, 4341, 1999, 2900, 1010, 1998, 2001, 5868]],
       device='cuda:0')
tensor([[ 101, 2009, 2777, 2007, 3893, 4341, 1999, 2900, 1010, 1998, 2001, 5868,
         2011]], device='cuda:0')
tensor([[ 101, 2009, 2777, 2007, 3893, 4341, 1999, 2900, 1010, 1998, 2001, 5868,
         2011, 2119]], device='cuda:0')
tensor([[ 101, 2009, 2777, 2007, 3893, 4341, 1999, 2900, 1010, 1998, 2001, 5868,
         2011, 2119, 2887]], device='cuda:0')
tensor([[ 101, 2009, 2777, 2007, 3893, 4341, 1999, 2900, 1010, 1998, 2001, 5868,
         2011, 2119, 2887, 1998]], device='cuda:0')
tensor([[ 101, 2009, 2777, 2007, 3893, 4341, 1999, 2900, 1010, 1998, 2001, 5868,
         2011, 2119, 2887, 1998, 2530]], device='cuda:0')
tensor([[ 101, 2009, 2777, 2007, 3893, 4341, 1999, 2900, 1010, 1998, 2001, 5868,
         2011, 2119, 2887, 1998, 2530, 4401]], device='cuda:0')
tensor([[ 10

In [13]:
# Display the generated continuation as the cell's output.
answer

'[CLS] it met with positive sales in japan, and was praised by both japanese and western critics. after release, it received downloadable content, along with an expanded'

In [120]:
# Decode one example from the first training batch for comparison with the
# generated text. next(iter(...)) fails loudly on an empty loader instead of
# silently leaving `text` bound to a value from an earlier execution.
batch = next(iter(train_dataloader))
# Row 5 of the batch; the batch must contain at least 6 sequences.
text = batch['input_ids'][5]

print(tokenizer.decode(text))

[CLS] it met with positive sales in japan, and was praised by both japanese and western critics. after release, it received downloadable content, along with an expanded edition in november of that year. it was also adapted into manga and an original video animation series. due to low sales of valkyria chronicles ii, valkyria chronicles iii was not localized, but a fan translation compatible with the game's expanded edition was released in 2014. media. vision would return to the franchise with the development of valkyria : azure revolution for the playstation 4. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 