In [1]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from numpy import sqrt, log
from dataset import TokenTypesDataset

In [2]:
train_dataset = TokenTypesDataset(folder="../tokentype_keywordasitis_data/train")
val_dataset = TokenTypesDataset(folder="../tokentype_keywordasitis_data/validation", train=False, vocabs=(train_dataset.token2idx, train_dataset.idx2token), max_length=train_dataset.max_length)
test_dataset = TokenTypesDataset(folder="../tokentype_keywordasitis_data/test", train=False, vocabs=(train_dataset.token2idx, train_dataset.idx2token), max_length=train_dataset.max_length)
assert val_dataset.vocab_size == train_dataset.vocab_size == test_dataset.vocab_size
assert val_dataset.max_length == train_dataset.max_length == test_dataset.max_length


In [3]:
def generate_square_subsequent_mask(sz, device):
    mask = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_masks(src, pad_idx, device):
    src_seq_len = src.shape[1]

    src_mask = generate_square_subsequent_mask(src_seq_len, device=device)

    # NB: if you use this mask with hf models it should be !=, if vanilla torch => == 
    src_padding_mask = (src != pad_idx)
    return src_mask, src_padding_mask

In [4]:
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from transformers import Phi3Config, Phi3ForCausalLM

def calc_accuracy_and_hitrate_at_k(model: Phi3ForCausalLM, loader, device, k=3):
    
    total_seq_len_in_dataloader = 0
    total_correct_predictions = 0
    total_hits_in_loader = 0
    
    for batch in tqdm(loader):
        src_mask, src_pad_mask = (create_masks(batch, pad_idx=loader.dataset.pad_id, device=device))
        src_mask = src_mask.to(device)
        src_pad_mask = src_pad_mask.to(device)
        batch = batch.to(device)
    
        labels = batch[:, 1:]
        labels_pad_mask = (labels == 0)
        labels_without_pad = labels[~labels_pad_mask]
        logits_without_last = model.forward(input_ids=batch, attention_mask=src_pad_mask).logits[:, :-1, :]
        predictions = logits_without_last.argmax(dim=-1)
        predictions_without_pad = predictions[~labels_pad_mask]

        top_k_predictions = torch.argsort(logits_without_last, dim=-1, descending=True)[:, :, :k][~labels_pad_mask]

    
        total_seq_len_in_batch = labels_without_pad.shape[0]
        total_predicions_len = predictions_without_pad.shape[0]
    
        assert total_seq_len_in_batch == total_predicions_len
    
        total_seq_len_in_dataloader += total_seq_len_in_batch
    
        correct_predictions = (labels_without_pad == predictions_without_pad).float().sum()

        total_hits_in_batch = (top_k_predictions == labels_without_pad.unsqueeze(1)).any(dim=1).float().sum()
        
        total_hits_in_loader += total_hits_in_batch
        total_correct_predictions += correct_predictions
    
    return (total_correct_predictions / total_seq_len_in_dataloader).item(), (total_hits_in_loader / total_seq_len_in_dataloader).item()

def train_epoch(model: Phi3ForCausalLM, optimizer, loss_fn, train_dataloader: DataLoader, device):
    model.train()
    losses = 0

    for src in tqdm(train_dataloader, leave=False):
        src = src.to(device)

        src_mask, src_padding_mask = create_masks(src, pad_idx=train_dataloader.dataset.pad_id, device=device)

        logits = model.forward(input_ids=src, attention_mask=src_padding_mask).logits[:, :-1, :]

        src_out = src[:, 1:]

        optimizer.zero_grad()
        
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), src_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(train_dataloader)


def evaluate(model: Phi3ForCausalLM, loss_fn, val_dataloader, device, k=3):
    model.eval()
    losses = 0

    for src in tqdm(val_dataloader, leave=False):
        src = src.to(device)
        src_mask, src_padding_mask = create_masks(src, pad_idx=val_dataloader.dataset.pad_id, device=device)

        logits = model.forward(input_ids=src, attention_mask=src_padding_mask).logits[:, :-1, :]

        src_out = src[:, 1:]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), src_out.reshape(-1))
        losses += loss.item()

    acc, hitrate = calc_accuracy_and_hitrate_at_k(model, val_dataloader, device, k)

    return losses / len(val_dataloader), acc, hitrate

In [5]:
import torch.nn as nn
from torch.utils.data import DataLoader
from timeit import default_timer as timer
from transformers import Phi3Config, Phi3ForCausalLM

NUM_EPOCHS = 40
BATCH_SIZE = 64




config = Phi3Config(
    vocab_size=train_dataset.vocab_size,
    hidden_size=256,
    intermediate_size=1024,
    num_hidden_layers=2,
    num_attention_heads=4,
    original_max_position_embeddings=512,
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attention_dropout=0.1,
    max_position_embeddings=512,
    pad_token_id=train_dataset.pad_id,
    bos_token_id=train_dataset.bos_id,
    eos_token_id=train_dataset.eos_id,
    use_cache=False,
)

model = Phi3ForCausalLM(config)

torch.manual_seed(42)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

transformer = model.to(device)

for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)


loss_fn = torch.nn.CrossEntropyLoss(ignore_index=train_dataset.pad_id, label_smoothing=0.07)

optimizer = torch.optim.AdamW(transformer.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9, weight_decay=0.01)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=1, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=1, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=1, pin_memory=True)
k = 3

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer, loss_fn, train_loader, device)
    end_time = timer()
    val_loss, val_acc, val_hitrate = evaluate(transformer, loss_fn, val_loader, device, k=k)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, Val ACC: {val_acc:.3f}, Val hitrate@{k}: {val_hitrate:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))

  0%|          | 0/921 [00:00<?, ?it/s]

You are not running the flash-attention implementation, expect numerical differences.


  0%|          | 0/138 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

Epoch: 1, Train loss: 1.477, Val loss: 1.410, Val ACC: 0.757, Val hitrate@3: 0.926, Epoch time = 89.823s


  0%|          | 0/921 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

Epoch: 2, Train loss: 1.384, Val loss: 1.388, Val ACC: 0.762, Val hitrate@3: 0.928, Epoch time = 89.555s


  0%|          | 0/921 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

Epoch: 3, Train loss: 1.366, Val loss: 1.377, Val ACC: 0.763, Val hitrate@3: 0.931, Epoch time = 89.708s


  0%|          | 0/921 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [8]:
transformer.save_pretrained("model")

In [6]:
calc_accuracy_and_hitrate_at_k(transformer, test_loader, device, 5)

  0%|          | 0/229 [00:00<?, ?it/s]

(0.7478238344192505, 0.9565130472183228)

In [7]:
sum(p.numel() for p in transformer.parameters())

2216704

{'SELECT': 4,
 'Identifier': 5,
 'FROM': 6,
 'WHERE': 7,
 'Equals': 8,
 'QuotedIdentifier': 9,
 'GROUP': 10,
 'BY': 11,
 'ORDER': 12,
 'OpeningRoundBracket': 13,
 'NAME': 14,
 'ClosingRoundBracket': 15,
 'DESC': 16,
 'LIMIT': 17,
 'Number': 18,
 'Comma': 19,
 'Asterisk': 20,
 'DISTINCT': 21,
 'HAVING': 22,
 'Greater': 23,
 'AND': 24,
 'MAX': 25,
 'SOURCE': 26,
 'LIKE': 27,
 'TYPE': 28,
 '*': 29,
 'NO': 30,
 'Dot': 31,
 'AS': 32,
 'JOIN': 33,
 'ON': 34,
 'YEAR': 35,
 'UNION': 36,
 'Slash': 37,
 'MIN': 38,
 'ID': 39,
 'GreaterOrEquals': 40,
 '=': 41,
 'BETWEEN': 42,
 'Less': 43,
 'Minus': 44,
 'NotEquals': 45,
 'MATCH': 46,
 'Plus': 47,
 'DIV': 48,
 'EXTRACT': 49,
 'NOT': 50,
 'IN': 51,
 'MONTH': 52,
 'DAY': 53,
 'StringLiteral': 54,
 'Semicolon': 55,
 'INTERSECT': 56,
 'ASC': 57,
 'OR': 58,
 'DATE': 59,
 'EVENTS': 60,
 'EXCEPT': 61,
 '.': 62,
 'CHARACTER': 63,
 'LessOrEquals': 64,
 'POSITION': 65,
 'RANGE': 66,
 'HOST': 67,
 'ROLE': 68,
 'ROLES': 69,
 'EVENT': 70,
 'TIES': 71,
 'LIST': 

In [8]:
from transformers import PreTrainedTokenizerFast

# Create a simple tokenizer
tokenizer = PreTrainedTokenizerFast(tokenizer_file=None)
vocab = train_dataset.token2idx  # Reserve 0, 1, 2 for special tokens

vocab["<pad>"] = 0
vocab["<bos>"] = 1
vocab["<eos>"] = 2

# Initialize the tokenizer
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    vocab=vocab,
    pad_token="<pad>",
    bos_token="<bos>",
    eos_token="<eos>",
)

ValueError: Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece installed to convert a slow tokenizer to a fast one.

In [15]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors
from transformers import PreTrainedTokenizerFast

# Define the vocabulary
vocab = train_dataset.token2idx  # Reserve 0, 1, 2 for special tokens

vocab["<pad>"] = 0
vocab["<bos>"] = 1
vocab["<eos>"] = 2  # Add all unique words

# Create a tokenizer
tokenizer = Tokenizer(models.WordLevel(vocab=vocab, unk_token="<unk>"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Train the tokenizer
trainer = trainers.WordLevelTrainer(vocab=vocab, special_tokens=["<pad>", "<bos>", "<eos>", "<unk>"])
tokenizer.train_from_iterator(vocab, trainer)

# Add post-processing
tokenizer.post_processor = processors.TemplateProcessing(
    single="<bos> $A <eos>",
    pair="<bos> $A <eos> $B:1 <eos>:1",
    special_tokens=[
        ("<bos>", 1),
        ("<eos>", 2),
    ],
)
tokenizer.decoder = decoders.WordPiece()

# Save the tokenizer
tokenizer.save("custom_tokenizer.json")

# Load the tokenizer as a PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast(tokenizer_file="custom_tokenizer.json", pad_token="<pad>", bos_token="<bos>", eos_token="<eos>", model_max_length=512)

Ignored unknown kwargs option vocab


In [23]:
tokenizer.save_pretrained()

TypeError: PreTrainedTokenizerBase.save_pretrained() missing 1 required positional argument: 'save_directory'

In [17]:
tokenizer("SELECT Identifier FROM Identifier WHERE Identifier Equals QuotedIdentifier GROUP BY Identifier ORDER BY Identifier OpeningRoundBracket NAME ClosingRoundBracket DESC LIMIT Number")

{'input_ids': [1, 180, 101, 76, 101, 225, 101, 66, 167, 83, 25, 101, 145, 25, 101, 148, 132, 41, 50, 113, 139, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

'SELECT'

In [21]:
tokenizer.model.save()

AttributeError: 'PreTrainedTokenizerFast' object has no attribute 'model'