In [1]:
from fynesse import access, assess, address

import torch
import torch.nn as nn

import lightning as L

import numpy as np

In [2]:
from config import MODELS, MODELS_EXT, EVENTS, SEEDS

MAX_LEN = 15000

## Mapping

In [3]:
model2layers = assess.eat_pickle('./data/pickle/memo.pickle')

# add <bos> and <eos> markers
for model in model2layers.keys():
    model2layers[model] = ['<bos>'] + model2layers[model] + ['<eos>']
    
# take note of all the tokens that have appeared
all_tokens = set()

for layers in model2layers.values():
    for layer in layers:
        all_tokens.add(layer)

all_tokens = np.array(list(all_tokens))

In [4]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(all_tokens)

In [5]:
def model2seq(model):
    layers = model2layers[model]
    return le.transform(layers)

## Custom Dataset

In [6]:
from torch.utils.data import Dataset, DataLoader, random_split

class TimeseriesDataset(Dataset):

    def _get_timeseries(self):
        # load master dictionary for everything
        master_dict = assess.eat_pickle(f'./data/pickle/master.pickle')
        timeseries = address.make_transformer_features(master_dict, events=self.events, models=self.models, n_samples=500, bin_size=1)
        return timeseries.astype(np.float32)

    def _get_layers(self):
        layers = []
        for model in self.models:
            # max_seq_len is 300?
            seq = model2seq(model)
            seq = np.pad(seq, (0, 300-len(seq)), constant_values=1)
            seq = np.expand_dims(seq, axis=-1)

            # assume 500 samples for each model
            layers += [seq for _ in range(500)]

        return layers

    def __init__(self, models, events):
        self.models = models
        self.events = events

        self.src = self._get_timeseries()
        self.tgt = self._get_layers()

        # should have same number of samples
        assert self.src.shape[0] == len(self.tgt)

    def __len__(self):
        return len(self.src)
    
    def __getitem__(self, idx):
        return self.src[idx], self.tgt[idx]

## Custom Transformer

In [7]:
class TimeSeriesTransformer(nn.Module):
    def __init__(self, tgt_vocab_size, d_model, nhead, num_layers=6, d_ff=2048, dropout=0.1):
        super(TimeSeriesTransformer, self).__init__()
        self.tgt_mask = (1 - torch.triu(torch.ones(300, 300), diagonal=1)).bool()

        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_layers, num_decoder_layers=num_layers, dim_feedforward=d_ff, dropout=dropout)
        self.fc = nn.Linear(d_model, tgt_vocab_size)

    def _generate_self_attention_mask(self, s):
        # mask = (torch.triu(torch.ones(s, s)) == 1).transpose(0, 1)
        # mask = (mask.float().masked_fill(mask == 0, float("-inf")).masked_fill(mask == 1, float(0.0)))
        return (1 - torch.triu(torch.ones(s, s), diagonal=1)).bool()
    
    def forward(self, src, tgt):
        print(src.shape)
        print(tgt.shape)

        if self.tgt_mask is None or self.tgt_mask.size(0) != len(tgt):
            self.tgt_mask = self._generate_self_attention_mask(len(tgt)).to(tgt.device)
        
        src = self.pos_encoder(src)
        tgt = self.tgt_embedding(tgt)

        output = self.transformer(src, tgt, tgt_mask=self.tgt_mask, tgt_is_causal=True)
        logits = self.fc(output)

        return logits
    
    def generate(self, src, tgt, num_new_tokens=1):
        src, tgt = src.detach(), tgt.detach()

        for _ in range(num_new_tokens):
            logits = self(src, tgt)
            p = nn.functional.softmax(logits[:, -1, :], dim=-1)
            new_tokens = torch.multinomial(p, num_samples=1)
            tgt = torch.concatenate([tgt, new_tokens], dim=1).contiguous()

        return input
    
    
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=MAX_LEN):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Arguments:
            x: Tensor, shape ``[S, N, E]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

## Lightning Module

In [8]:
class LitTimeSeriesTransformer(L.LightningModule):
    def __init__(self, tgt_vocab_size, d_model, nhead, num_layers=6, d_ff=2048, dropout=0.1):
        super().__init__()
        self.model = TimeSeriesTransformer(tgt_vocab_size, d_model, nhead, num_layers, d_ff, dropout)

    def forward(self, src, tgt):
        return self.model(src, tgt)
    
    def training_step(self, batch, batch_idx):
        src, tgt = batch
        logits = self(src.permute(1, 0, 2), tgt.permute(1, 0, 2))
        B, L, V = logits.shape
        
        loss = nn.functional.cross_entropy(logits.contiguous().view(B*L, V), tgt.contiguous().view(B*L))

        # logging to TensorBoard (if installed) by default
        self.log("train_loss", loss)

        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.model.parameters(), lr=0.001)
    
    def validation_step(self, batch, batch_idx):
        src, tgt = batch
        logits = self(src, tgt)
        B, L, V = logits.shape
        
        loss = nn.functional.cross_entropy(logits.contiguous().view(B*L, V), tgt.contiguous().view(B*L))

        # logging to TensorBoard (if installed) by default
        self.log("val_loss", loss)

        return loss
    
    def test_step(self, batch, batch_idx):
        src, tgt = batch
        logits = self(src, tgt)
        B, L, V = logits.shape
        
        loss = nn.functional.cross_entropy(logits.contiguous().view(B*L, V), tgt.contiguous().view(B*L))

        # logging to TensorBoard (if installed) by default
        self.log("test_loss", loss)

        return loss

## Training

In [9]:
dataset = TimeseriesDataset(models=MODELS, events=['instructions','cache-misses','cache-references','L1-dcache-load-misses'])

torch.manual_seed(42)
train_set, val_set, test_set = random_split(dataset, [0.6, 0.2, 0.2])

In [10]:
transformer = LitTimeSeriesTransformer(tgt_vocab_size=len(all_tokens), d_model=4, nhead=4)
train_loader = DataLoader(train_set, batch_size=4)



In [11]:
trainer = L.Trainer(max_epochs=1, accelerator="mps", devices=1)
trainer.fit(model=transformer, train_dataloaders=train_loader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/kate/anaconda3/envs/diss/lib/python3.10/site-packages/lightning/pytorch/trainer/configuration_validator.py:74: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.

  | Name  | Type                  | Params
------------------------------------------------
0 | model | TimeSeriesTransformer | 223 K 
------------------------------------------------
223 K     Trainable params
0         Non-trainable params
223 K     Total params
0.892     Total estimated model params size (MB)
/Users/kate/anaconda3/envs/diss/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Epoch 0:   0%|          | 0/225 [00:00<?, ?it/s] torch.Size([11551, 4, 4])
torch.Size([300, 4, 1])
