In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
import os
from MinecraftSequencePredict import MinecraftSequencePredict
import Dataset
from Dataset import MinecraftBlockData, custom_collate, custom_collate_binary
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import StepLR

In [2]:
# -----------------------------
#      Hyperparameters
# -----------------------------

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# vocab_size = 251
vocab_size = 3 # block / no block / SOS
d_model = 768 / 2
nhead = 8
num_encoder_layers = 6
num_decoder_layers = 6
dim_feed_forward = 2048 * 2

src_shape  = (5,5,5)
tgt_shape  = (1,5,5)
tgt_offset = (5,0,0)
# Model Definition
dropout = 0.35

model = MinecraftSequencePredict(vocab_size, 
                                 int(d_model), 
                                 nhead, 
                                 int(num_encoder_layers), 
                                 int(num_decoder_layers), 
                                 dim_feed_forward,
                                 src_shape, 
                                 tgt_shape, 
                                 tgt_offset, 
                                 device,
                                 dropout=dropout)

In [3]:
# ------------------------
# Training Parameters
# ------------------------
num_epochs = 10
batch_size = 64
shuffle = True

learning_rate = 1e-6
gamma = 0.5
step_size = 4

eps = 1e-9

num_workers = 8

n_data = 1e4

In [4]:
# ----------------------
# Dataset and DataLoader
# ----------------------

path = '../Datasets/Complete_Datasets/Minecraft6_5_5/data/'

files = Dataset.get_filenames(path, int(n_data))

train_filenames, test_filenames = train_test_split(files, test_size = 0.2, random_state=42)

train_data = MinecraftBlockData(path, train_filenames)

test_data = MinecraftBlockData(path, test_filenames)

training_dataloader = DataLoader(train_data, batch_size, shuffle, num_workers=num_workers, collate_fn=custom_collate_binary)

test_dataloader = DataLoader(test_data, batch_size, shuffle, num_workers=num_workers, collate_fn=custom_collate_binary)

retrieved filenames
all data loaded into memory
all data loaded into memory


In [5]:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, eps=eps)
loss_fn = nn.CrossEntropyLoss()
scheduler = StepLR(optimizer, step_size=step_size, gamma=gamma)

In [6]:
epoch_losses = []
validation_losses = []

for epoch in range(num_epochs):
    batch_losses = []
    epoch_loss = 0
    validation_loss = 0
    
    
    for i, data in enumerate(training_dataloader):
        model.train()
        src = data['src'].to(device)
        tgt = data['tgt'].to(device)
        output = model(src, tgt) # Batch Size x Tgt Sequence Length x Vocab Size
        
        # Batch Size x Tgt Sequence Length x Vocab Size --> (Batch Size * Tgt Sequence Length x Vocab Size)
        loss = loss_fn(output.view(-1, vocab_size), tgt.view(-1))
        batch_loss = (float(loss) * batch_size / n_data) * 10000.0 / 0.8#consider 32 batch size and 100000 data size 'standard'
        epoch_loss += batch_loss
        batch_losses.append(batch_loss)
        loss.backward()
        
        optimizer.step()
        optimizer.zero_grad()
        if (i+1) % 4 == 0:
            print(f"{i-3}-{i+1} batches done with loss {(sum(batch_losses[i-3:i]) / 4):.2f}")
        # if i == 20:
        #     scheduler.step()
    
    
    for i, data in enumerate(test_dataloader):
        model.eval()
        src = data['src'].to(device)
        tgt = data['tgt'].to(device)
        with torch.no_grad():
            output = model(src, tgt) # Batch Size x Tgt Sequence Length x Vocab Size
            
            loss = loss_fn(output.view(-1, vocab_size), tgt.view(-1))
            validation_loss += (float(loss) * batch_size / n_data * 10000.0 / 0.2)
        
    epoch_losses.append(epoch_loss)
    validation_losses.append(validation_loss)
    
    print(f'\nEpoch {epoch + 1}, Learning Rate: {scheduler.get_last_lr()[0]}')
    print(f'Epoch {epoch + 1} Training loss: {epoch_loss:.2f}')
    print(f'Epoch {epoch + 1} Validation loss: {validation_loss:.2f}\n')
    if(epoch > 0):
        print(f"Improvements: Train: {(epoch_losses[epoch-1] - epoch_losses[epoch]):.2f} | Test: {(validation_losses[epoch-1] - validation_losses[epoch]):.2f}\n")
    
    scheduler.step()

0-4 batches done with loss 508.23
4-8 batches done with loss 460.32
8-12 batches done with loss 442.27

Epoch 1, Learning Rate: 1e-05
Epoch 1 Training loss: 9146.25
Epoch 1 Validation loss: 80.05

0-4 batches done with loss 414.40
4-8 batches done with loss 409.13
8-12 batches done with loss 409.74

Epoch 2, Learning Rate: 1e-05
Epoch 2 Training loss: 8172.80
Epoch 2 Validation loss: 76.34

Improvements: Train: 973.44 | Test: 3.71

0-4 batches done with loss 401.56
4-8 batches done with loss 388.55
8-12 batches done with loss 377.24

Epoch 3, Learning Rate: 1e-05
Epoch 3 Training loss: 7665.17
Epoch 3 Validation loss: 64.77

Improvements: Train: 507.63 | Test: 11.57

0-4 batches done with loss 361.27
4-8 batches done with loss 351.57
8-12 batches done with loss 339.60

Epoch 4, Learning Rate: 1e-05
Epoch 4 Training loss: 6928.67
Epoch 4 Validation loss: 59.02

Improvements: Train: 736.50 | Test: 5.74

0-4 batches done with loss 328.53
4-8 batches done with loss 327.11
8-12 batches done

In [None]:
#create some sample data for a test forward pass
torch.manual_seed(42)

# batch_sizex5x5x5 tensor with random integers from 0 to vocab size
src_example = torch.randint(0, vocab_size - 1, size=src_shape, dtype=torch.int).view(-1).unsqueeze(0).expand(1,-1).to(device)
tgt_example = torch.cat([torch.tensor([vocab_size-1]).view(-1),torch.randint(0, vocab_size - 1, size=tgt_shape, dtype=torch.int).view(-1)]).unsqueeze(0).expand(1,-1).to(device)
tgt_sos = torch.tensor([vocab_size-1]).unsqueeze(0).expand(1,-1).to(device)

src_example = torch.zeros(src_shape, dtype=torch.int).view(-1).unsqueeze(0).expand(1,-1).to(device)


output = model(src_example, tgt_sos)
print(torch.argmax(output).unsqueeze(0))

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x000001865D1B17E0>
Traceback (most recent call last):
  File "c:\Users\hmhor\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "c:\Users\hmhor\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 1436, in _shutdown_workers
    if self._persistent_workers or self._workers_status[worker_id]:
AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status'


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
with torch.no_grad():
    for i in range(25):
        output = model(src_example,tgt_sos)
        output = torch.argmax(output).unsqueeze(0).unsqueeze(0)
        print(output)
        tgt_sos = torch.cat([tgt_sos, output], dim=1)
print(src_example)
print(tgt_sos)


tensor([[1]], device='cuda:0')
tensor([[4]], device='cuda:0')


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
name = "binary_predictor_initial"
torch.save(model.state_dict(), os.path.join("./models/", name))

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
