In [1]:
import torch
from torch import nn
from torch.nn import functional
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding
import torch.optim as optim

from dataset.create_dataset import create_data_loader
from model.model import Transformer
from transformers import BertTokenizer


In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Checkpoint name of the pretrained tokenizer; the tokenizer's vocabulary size
# is what sizes the model's vocab_size below.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Project-local Transformer, moved to the selected device.
# NOTE(review): output_size=3 presumably matches the number of label classes
# in the dataset -- confirm against dataset.create_dataset.
model = Transformer(num_heads=8,
                    embedding_size=512,
                    num_blocks=6,
                    max_sequence_len=1024,
                    vocab_size=tokenizer.vocab_size,
                    output_size=3).to(device)

# `model.requires_grad = True` would only attach a plain Python attribute to
# the Module and leave the parameters untouched; `requires_grad_()` is the
# call that actually sets the flag on every parameter.
model.requires_grad_(True)

In [4]:
# Build the train and test loaders from the project helper; the third value it
# returns is not used in this notebook.
# NOTE(review): the outputs recorded with this notebook show an AttributeError
# ('DatasetDict' object has no attribute 'train_test_split') for this cell and
# a later "stack expects each tensor to be equal size" error while iterating
# the test loader -- both appear to originate outside this notebook; verify
# the splitting and padding/collation in dataset.create_dataset.
train_dataloader, test_dataloader, _ = create_data_loader(
    batch_size=8,
    max_sequence_size=1024,
    tokenizer=tokenizer,
)

Found cached dataset financial_phrasebank (C:/Users/skoro/.cache/huggingface/datasets/financial_phrasebank/sentences_50agree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141)


  0%|          | 0/1 [00:00<?, ?it/s]

AttributeError: 'DatasetDict' object has no attribute 'train_test_split'

In [None]:
# Classification objective, placed on the same device as the model.
# NOTE(review): nn.CrossEntropyLoss expects raw, unnormalised logits. The
# losses logged by the training cell only take values of the form
# 0.5514 + k/8, which is what one would see if the model already returned
# (saturated) softmax probabilities -- TODO confirm what Transformer.forward
# returns.
loss_function = nn.CrossEntropyLoss()
loss_function = loss_function.to(device)

# Adam over every model parameter with a learning rate of 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Snapshot a detached copy of every parameter so a later cell can check
# whether the optimizer actually changed the weights.
initial_params = [param.clone().detach() for param in model.parameters()]

In [7]:
def test_model():
    full_loss = 0
    model.eval()
    print(len(test_dataloader))
    
    for batch_num, batch in enumerate(test_dataloader):
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        model_outputs = model(input_ids)

        loss = loss_function(model_outputs, labels)
        full_loss += float(loss)

        if batch_num % 10 == 0:
            print(batch_num, loss)

    return full_loss / batch_num

In [8]:
# Run one evaluation pass and keep the loss it returns.
# NOTE(review): presumably a pre-training baseline; prev_loss is not read by
# any later cell visible here -- confirm it is still needed.
prev_loss = test_model()

122


RuntimeError: stack expects each tensor to be equal size, but got [80] at entry 0 and [120] at entry 3

In [18]:
# One pass over the training loader: forward, loss, backward, optimizer step.
# The loss of every tenth batch is printed as progress.
model.train()

for batch_num, batch in enumerate(train_dataloader):
    input_ids = batch['input_ids'].to(device)
    labels = batch['label'].to(device)

    # Clear the gradients left over from the previous step.
    optimizer.zero_grad()

    model_outputs = model(input_ids)
    loss = loss_function(model_outputs, labels)

    # Each iteration does one forward and one backward pass, so the graph can
    # be freed right after backward(); retain_graph=True only kept it alive.
    # NOTE(review): if this now raises "Trying to backward through the graph a
    # second time", the model is holding graph-attached tensors across calls
    # and should be fixed there rather than masked with retain_graph.
    loss.backward()

    optimizer.step()

    if batch_num % 10 == 0:
        # .item() prints the plain scalar rather than the tensor repr with
        # its grad_fn.
        print(batch_num, loss.item())


0 tensor(0.9264, grad_fn=<NllLossBackward0>)
10 tensor(1.0514, grad_fn=<NllLossBackward0>)
20 tensor(0.5514, grad_fn=<NllLossBackward0>)
30 tensor(0.5514, grad_fn=<NllLossBackward0>)
40 tensor(0.5514, grad_fn=<NllLossBackward0>)
50 tensor(1.3014, grad_fn=<NllLossBackward0>)
60 tensor(1.0514, grad_fn=<NllLossBackward0>)
70 tensor(0.5514, grad_fn=<NllLossBackward0>)
80 tensor(0.5514, grad_fn=<NllLossBackward0>)
90 tensor(0.6764, grad_fn=<NllLossBackward0>)
100 tensor(0.5514, grad_fn=<NllLossBackward0>)
110 tensor(0.6764, grad_fn=<NllLossBackward0>)
120 tensor(0.5514, grad_fn=<NllLossBackward0>)
130 tensor(1.4264, grad_fn=<NllLossBackward0>)
140 tensor(1.1764, grad_fn=<NllLossBackward0>)
150 tensor(1.4264, grad_fn=<NllLossBackward0>)
160 tensor(1.3014, grad_fn=<NllLossBackward0>)
170 tensor(1.4264, grad_fn=<NllLossBackward0>)
180 tensor(1.3014, grad_fn=<NllLossBackward0>)
190 tensor(1.4264, grad_fn=<NllLossBackward0>)
200 tensor(0.9782, grad_fn=<NllLossBackward0>)
210 tensor(0.9264, grad_

KeyboardInterrupt: 

In [19]:
# True as soon as any current parameter differs from its saved snapshot;
# any() stops at the first mismatch.
param_changed = any(
    not torch.equal(snapshot, current)
    for snapshot, current in zip(initial_params, model.parameters())
)

In [20]:
# Report the result of the parameter comparison.
print(
    "Model parameters have changed after the optimizer step."
    if param_changed
    else "Model parameters have not changed after the optimizer step."
)

Model parameters have changed after the optimizer step.
