In [1]:
import numpy as np
import pickle
import torch
import torch.nn.functional as F

import os
import glob

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

from src.model import SimpleBigramModel
from src.dataloaders import build_loaders
from hparams import Hparams

In [2]:
hparams = Hparams()

### Tokenization experiments

In [3]:
tokenizer, train_loader, _, _ = build_loaders(hparams)

In [4]:
model = SimpleBigramModel(vocab_size = tokenizer.get_vocab_size())
model = model.to('cuda')

In [5]:
sum(p.numel() for p in model.parameters() if p.requires_grad)

100000000

In [6]:
loss_fn = torch.nn.CrossEntropyLoss()
optim = torch.optim.AdamW(model.parameters())

In [7]:
model.train()
for epoch in range(hparams.epochs):
    for x, y in train_loader:
        optim.zero_grad()

        x = x.to('cuda')
        y = y.to('cuda')
        
        logits = model(x)
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        y = y.view(B*T)

        loss = loss_fn(logits, y)

        loss.backward()
        optim.step()
        print(loss)

    print(loss)



tensor(9.6320, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.7782, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.7291, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.6758, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.6926, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.6448, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.7221, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.7490, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.7013, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.6348, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.7284, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.7834, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.6200, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.6571, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.6786, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.6642, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(9.7527, device='cuda:0', grad_fn=

KeyboardInterrupt: 