# Model inspection

In [2]:
import os, sys
sys.path.append(os.path.join(os.path.abspath(''), '../'))

import peewee as pw
from toyDb.databases import ExperimentDb, ShaderDb
from toyDb.utils.Directory import getToyDbRootDir

import matplotlib.pyplot as plt
import numpy as np
import json
from tqdm import tqdm

import transformers
import torch
from copy import deepcopy

ExperimentDb.init_from_default_db()

In [3]:
from misc.HfTracedSpvTokenizer import HfTracedSpvTokenizer

## roberta-base

In [4]:
tokenizer = HfTracedSpvTokenizer(single_entrypoint=False)

config = transformers.RobertaConfig(**{
    "attention_probs_dropout_prob": 0.1,
    "bos_token_id": tokenizer.bos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "layer_norm_eps": 1e-05,
    "max_position_embeddings": 4096,
    "model_type": "roberta",
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "pad_token_id": tokenizer.pad_token_id,
    "type_vocab_size": 1,
    "vocab_size": 40000
})

config.num_labels = 1
config.problem_type = "regression"

model = transformers.RobertaForSequenceClassification(config)

Do some example inference

In [5]:
selectedEnv = ExperimentDb.Environment.select()[0]
print(f"Environment selected: {selectedEnv.node} - {selectedEnv.cpu} - {selectedEnv.gpu} - {selectedEnv.gpu_driver}")

# https://shadertoy.com/view/lllBR7
# https://shadertoy.com/view/3ttBWN
candidateShaders = ['lllBR7', '3ttBWN']
query = ExperimentDb.ImageOnlyExperiment.select(
      ExperimentDb.ImageOnlyExperiment
    ).where(
      # Canonical condition
      ExperimentDb.ImageOnlyExperiment.num_cycles == ExperimentDb.CANONICAL_NUM_CYCLES,
      ExperimentDb.ImageOnlyExperiment.num_trials == ExperimentDb.CANONICAL_NUM_TRIALS,
      ExperimentDb.ImageOnlyExperiment.width == ExperimentDb.CANONICAL_WIDTH,
      ExperimentDb.ImageOnlyExperiment.height == ExperimentDb.CANONICAL_HEIGHT,
      # Inside all_measurable_and_traceable_canonical_shaders
      ExperimentDb.ImageOnlyExperiment.environment_id == selectedEnv,
      ExperimentDb.ImageOnlyExperiment.shader_shadertoy_id.in_(candidateShaders)
    ).order_by(
      ExperimentDb.ImageOnlyExperiment.shader_shadertoy_id
    )

assert(len(query) == 2)

Environment selected: libreliu-GCL-Arch -  Intel(R) Core(TM) i7-10700K CPU @ 3.80GHz - NVIDIA GeForce RTX 3060 - NVIDIA 535.113.01


In [6]:
dataCollator = transformers.data.DataCollatorWithPadding(tokenizer=tokenizer, padding='longest')

batch = []

for exprIdx, expr in enumerate(query):
    results = json.loads(expr.results)
    result_mean = sum(results) / len(results)

    data = {
        "shaderId": expr.shader.shader_id,
        # SPIR-V bytes
        "fragSpv": expr.shader.fragment_spv,
        # SPIR-V bytes
        "traceFragSpv": expr.trace.traced_fragment_spv,
        # float
        "timeMean": result_mean,
        # dict[int, int]
        "bbIdxMap": {int(k): v for k, v in json.loads(expr.trace.bb_idx_map).items()},
        # List[int]
        "bbTraceCounters": json.loads(expr.trace.bb_trace_counters)
    }

    encoded_inputs = tokenizer(
        spvBinaryRepr=data["fragSpv"],
        id2TraceIdxMap=data["bbIdxMap"],
        traceCounters=data["bbTraceCounters"]
    )
    encoded_inputs_stats = {k: len(v) for k, v in encoded_inputs.items()}

    print(f"expr[{exprIdx}] = {encoded_inputs_stats}")

    del encoded_inputs['trace_labels']
    batch.append(encoded_inputs)

collated = dataCollator(batch)
print(collated)

collated_stats = {k: v.shape for k, v in collated.items()}
print(collated_stats)


expr[0] = {'input_ids': 1132, 'trace_labels': 1132, 'attention_mask': 1132, 'position_ids': 1132}
expr[1] = {'input_ids': 1038, 'trace_labels': 1038, 'attention_mask': 1038, 'position_ids': 1038}
{'input_ids': tensor([[ 1001,  2019, 20002,  ..., 20012, 20194,  2253],
        [ 1000,  1000,  1000,  ..., 20024, 20207,  2253]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]]), 'position_ids': tensor([[   0,    1,    2,  ..., 1129, 1130, 1131],
        [   0,    0,    0,  ..., 1035, 1036, 1037]])}
{'input_ids': torch.Size([2, 1132]), 'attention_mask': torch.Size([2, 1132]), 'position_ids': torch.Size([2, 1132])}


In [7]:
# => Use Python debugger here to inspect the model

# model.eval()
# model(**collated)

Perfformer model test:

Configure Perfformer model as roberta-base model and check if we can do MLM / other predictions correctly.

In [8]:
from model.modeling_perfformer import PerfformerForMaskedLM
from model.configuration_perfformer import PerfformerConfig

# point to somewhere you cloned
robertaBaseDir = r"C:\Projects\roberta-playground\roberta-base"
robertaTokenizer = transformers.RobertaTokenizer.from_pretrained(robertaBaseDir)

print(robertaTokenizer("Hello world")["input_ids"])
print(robertaTokenizer(" Hello world")["input_ids"])

[0, 31414, 232, 2]
[0, 20920, 232, 2]


In [9]:
# load as roberta-base
perfformerCfgDict = {
    "architectures": [
      "RobertaForMaskedLM"
    ],
    "attention_probs_dropout_prob": 0.1,
    "bos_token_id": 0,
    "eos_token_id": 2,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "layer_norm_eps": 1e-05,
    "max_position_embeddings": 514,
    "model_type": "roberta",
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "pad_token_id": 1,
    "type_vocab_size": 1,
    "vocab_size": 50265,
    # == The following option differs! ==
    "attention_type": "vanilla",
    # BERT-like
    "position_embedding_type": "absolute-learnable"
}

robertaCfgDict = {
    "architectures": [
      "RobertaForMaskedLM"
    ],
    "attention_probs_dropout_prob": 0.1,
    "bos_token_id": 0,
    "eos_token_id": 2,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "layer_norm_eps": 1e-05,
    "max_position_embeddings": 514,
    "model_type": "roberta",
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "pad_token_id": 1,
    "type_vocab_size": 1,
    "vocab_size": 50265,
    # BERT-like
    "position_embedding_type": "absolute"
}

perfformerCfg = PerfformerConfig(**perfformerCfgDict)
robertaCfg = transformers.RobertaConfig(**robertaCfgDict)

perfformerModel = PerfformerForMaskedLM(perfformerCfg)
robertaModel = transformers.RobertaForMaskedLM(robertaCfg)
# perfformerModel = PerfformerForMaskedLM.from_pretrained(robertaBaseDir, config=cfg)

Examine the model structure!

In [10]:
print(perfformerModel)

PerfformerForMaskedLM(
  (perfformer): PerfformerModel(
    (embeddings): PerfformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): PerfformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x PerfformerLayer(
          (attention): PerfformerAttention(
            (self): PerfformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): PerfformerSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)


In [11]:
print(robertaModel)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

Example evaluation code for MLM:

Method 1:

```python
robertaModelFromPretrained = transformers.RobertaForMaskedLM.from_pretrained(robertaBaseDir)
robertaTokenizer = transformers.RobertaTokenizer.from_pretrained(robertaBaseDir)
robertaPretrainedUnmasker = transformers.pipeline('fill-mask', model=robertaModelFromPretrained, tokenizer=robertaTokenizer)
robertaPretrainedUnmasker("RoBERTa is a model developed by <mask>.")
```

Method 2:

```python
# Make sure to put your model in evaluation mode
perfformerModel.eval()

# Let's test a masked sentence
masked_sentence = "RoBERTa is a model developed by <mask>."

#tokenize the input
input = robertaTokenizer.encode_plus(masked_sentence, return_tensors='pt')

# get output from the model
output = perfformerModel(**input)

# get prediction
predicted_index = torch.argmax(output.logits[0, input['input_ids'][0].tolist().index(robertaTokenizer.mask_token_id)]).item()

# Decode prediction
prediction = robertaTokenizer.decode([predicted_index])

print(f"The masked word is predicted as: {prediction}")
```

## Examine state dict

First, load weight into robertaModel

In [12]:
roberta_state_dict = torch.load(os.path.join(robertaBaseDir, "./pytorch_model.bin"), map_location="cpu")
roberta_state_dict_keys = roberta_state_dict.keys()

for key in roberta_state_dict_keys:
    print(key)

print("")

# Will report missing and unexpected keys
print(robertaModel.load_state_dict(roberta_state_dict, strict=False))
robertaModel.tie_weights()

roberta.embeddings.word_embeddings.weight
roberta.embeddings.position_embeddings.weight
roberta.embeddings.token_type_embeddings.weight
roberta.embeddings.LayerNorm.weight
roberta.embeddings.LayerNorm.bias
roberta.encoder.layer.0.attention.self.query.weight
roberta.encoder.layer.0.attention.self.query.bias
roberta.encoder.layer.0.attention.self.key.weight
roberta.encoder.layer.0.attention.self.key.bias
roberta.encoder.layer.0.attention.self.value.weight
roberta.encoder.layer.0.attention.self.value.bias
roberta.encoder.layer.0.attention.output.dense.weight
roberta.encoder.layer.0.attention.output.dense.bias
roberta.encoder.layer.0.attention.output.LayerNorm.weight
roberta.encoder.layer.0.attention.output.LayerNorm.bias
roberta.encoder.layer.0.intermediate.dense.weight
roberta.encoder.layer.0.intermediate.dense.bias
roberta.encoder.layer.0.output.dense.weight
roberta.encoder.layer.0.output.dense.bias
roberta.encoder.layer.0.output.LayerNorm.weight
roberta.encoder.layer.0.output.LayerNorm

Next: check if the layer weight names can be matched:

In [13]:
perfformer_state_dict = perfformerModel.state_dict()
perfformer_state_dict_keys = [i for i in perfformer_state_dict.keys()]

for key in perfformer_state_dict_keys:
    robertaKeyName = key.replace("perfformer", "roberta")
    if robertaKeyName in roberta_state_dict_keys:
        print(f"{key} -> {robertaKeyName}")
    else:
        print(f"{key} -> None")

perfformer.embeddings.word_embeddings.weight -> roberta.embeddings.word_embeddings.weight
perfformer.embeddings.position_embeddings.weight -> roberta.embeddings.position_embeddings.weight
perfformer.embeddings.token_type_embeddings.weight -> roberta.embeddings.token_type_embeddings.weight
perfformer.embeddings.LayerNorm.weight -> roberta.embeddings.LayerNorm.weight
perfformer.embeddings.LayerNorm.bias -> roberta.embeddings.LayerNorm.bias
perfformer.encoder.layer.0.attention.self.query.weight -> roberta.encoder.layer.0.attention.self.query.weight
perfformer.encoder.layer.0.attention.self.query.bias -> roberta.encoder.layer.0.attention.self.query.bias
perfformer.encoder.layer.0.attention.self.key.weight -> roberta.encoder.layer.0.attention.self.key.weight
perfformer.encoder.layer.0.attention.self.key.bias -> roberta.encoder.layer.0.attention.self.key.bias
perfformer.encoder.layer.0.attention.self.value.weight -> roberta.encoder.layer.0.attention.self.value.weight
perfformer.encoder.layer

Now, do load the weights into my new network

In [14]:
# https://gist.github.com/the-bass/0bf8aaa302f9ba0d26798b11e4dd73e3
from collections import OrderedDict

converted_state_dict = OrderedDict()
old_dict = deepcopy(roberta_state_dict)

for k, v in old_dict.items():
    newK = k.replace("roberta", "perfformer")
    converted_state_dict[newK] = v

print(perfformerModel.load_state_dict(converted_state_dict, strict=False))
perfformerModel.tie_weights()

_IncompatibleKeys(missing_keys=['lm_head.decoder.bias'], unexpected_keys=['perfformer.pooler.dense.weight', 'perfformer.pooler.dense.bias'])


And do some predictions!

In [15]:
perfformerModel.eval()

masked_sentence = "RoBERTa is a model developed by <mask>."
input = robertaTokenizer.encode_plus(masked_sentence, return_tensors='pt')
output = perfformerModel(**input)

predicted_index = torch.argmax(output.logits[0, input['input_ids'][0].tolist().index(robertaTokenizer.mask_token_id)]).item()
prediction = robertaTokenizer.decode([predicted_index])

print(f"The masked word is predicted as: {prediction}")

The masked word is predicted as:  IBM


In [16]:
# Make sure to put your model in evaluation mode
robertaModel.eval()

masked_sentence = "RoBERTa is a model developed by <mask>."
input = robertaTokenizer.encode_plus(masked_sentence, return_tensors='pt')
output = robertaModel(**input)

predicted_index = torch.argmax(output.logits[0, input['input_ids'][0].tolist().index(robertaTokenizer.mask_token_id)]).item()
prediction = robertaTokenizer.decode([predicted_index])

print(f"The masked word is predicted as: {prediction}")

The masked word is predicted as:  IBM


Using the unmasker seems to be way more easy:

In [17]:
robertaUnmasker = transformers.pipeline('fill-mask', model=robertaModel, tokenizer=robertaTokenizer)
robertaUnmasker("RoBERTa is a model developed by <mask>.")

[{'score': 0.14840319752693176,
  'token': 11510,
  'token_str': ' IBM',
  'sequence': 'RoBERTa is a model developed by IBM.'},
 {'score': 0.08577851206064224,
  'token': 1204,
  'token_str': ' Google',
  'sequence': 'RoBERTa is a model developed by Google.'},
 {'score': 0.07717184722423553,
  'token': 3709,
  'token_str': ' Microsoft',
  'sequence': 'RoBERTa is a model developed by Microsoft.'},
 {'score': 0.05991532653570175,
  'token': 6109,
  'token_str': ' NASA',
  'sequence': 'RoBERTa is a model developed by NASA.'},
 {'score': 0.03881894797086716,
  'token': 20124,
  'token_str': ' MIT',
  'sequence': 'RoBERTa is a model developed by MIT.'}]

In [18]:
perfformerUnmasker = transformers.pipeline('fill-mask', model=perfformerModel, tokenizer=robertaTokenizer)
perfformerUnmasker("RoBERTa is a model developed by <mask>.")

[{'score': 0.14840319752693176,
  'token': 11510,
  'token_str': ' IBM',
  'sequence': 'RoBERTa is a model developed by IBM.'},
 {'score': 0.08577851206064224,
  'token': 1204,
  'token_str': ' Google',
  'sequence': 'RoBERTa is a model developed by Google.'},
 {'score': 0.07717184722423553,
  'token': 3709,
  'token_str': ' Microsoft',
  'sequence': 'RoBERTa is a model developed by Microsoft.'},
 {'score': 0.05991532653570175,
  'token': 6109,
  'token_str': ' NASA',
  'sequence': 'RoBERTa is a model developed by NASA.'},
 {'score': 0.03881894797086716,
  'token': 20124,
  'token_str': ' MIT',
  'sequence': 'RoBERTa is a model developed by MIT.'}]

Final check: diff the model!

In [19]:
for (k1, v1), (k2, v2) in zip(perfformerModel.named_parameters(), robertaModel.named_parameters()):
    if k1 == k2 or k1.replace("perfformer", "roberta") == k2:
        print('Diff in weights for layer {}: {}'.format(k1, torch.sum(torch.abs(v1.data - v2.data))))
    else:
        print(f'Key mismatch: {k1} != {k2}')

Diff in weights for layer perfformer.embeddings.word_embeddings.weight: 0.0
Diff in weights for layer perfformer.embeddings.position_embeddings.weight: 0.0
Diff in weights for layer perfformer.embeddings.token_type_embeddings.weight: 0.0
Diff in weights for layer perfformer.embeddings.LayerNorm.weight: 0.0
Diff in weights for layer perfformer.embeddings.LayerNorm.bias: 0.0
Diff in weights for layer perfformer.encoder.layer.0.attention.self.query.weight: 0.0
Diff in weights for layer perfformer.encoder.layer.0.attention.self.query.bias: 0.0
Diff in weights for layer perfformer.encoder.layer.0.attention.self.key.weight: 0.0
Diff in weights for layer perfformer.encoder.layer.0.attention.self.key.bias: 0.0
Diff in weights for layer perfformer.encoder.layer.0.attention.self.value.weight: 0.0
Diff in weights for layer perfformer.encoder.layer.0.attention.self.value.bias: 0.0
Diff in weights for layer perfformer.encoder.layer.0.attention.output.dense.weight: 0.0
Diff in weights for layer perf

Also check out the hidden states!

In [20]:
# inputs = robertaTokenizer("Hello, my dog is cute", return_tensors="pt")
inputs = {
    "input_ids": torch.as_tensor([[10,10,12,5550,2233]]),
    "position_ids": torch.as_tensor([[0, 1, 2, 3, 4]]),
    # "output_attentions": True,
    "output_hidden_states": True
}

outputRoberta = robertaModel(**inputs)
# robertaLogits = outputRoberta.logits
# print(outputRoberta)

outputPerfformer = perfformerModel(**inputs)
# print(outputPerfformer)

def outputDiff(out1, out2):
    for k in out1.keys():
        if isinstance(out1[k], tuple) or isinstance(out1[k], list):
            for idx in range(0, len(out1[k])):
                print(
                    f"out1[{k}][{idx}] == out2[{k}][{idx}]: "
                    f"{torch.sum(torch.abs(out1[k][idx] - out2[k][idx]))}"
                )
        else:
            print(
                f"out1[{k}] == out2[{k}]: "
                f"{torch.sum(torch.abs(out1[k] - out2[k]))}"
            )

outputDiff(outputRoberta, outputPerfformer)

out1[logits] == out2[logits]: 0.0
out1[hidden_states][0] == out2[hidden_states][0]: 0.0
out1[hidden_states][1] == out2[hidden_states][1]: 0.0
out1[hidden_states][2] == out2[hidden_states][2]: 0.0
out1[hidden_states][3] == out2[hidden_states][3]: 0.0
out1[hidden_states][4] == out2[hidden_states][4]: 0.0
out1[hidden_states][5] == out2[hidden_states][5]: 0.0
out1[hidden_states][6] == out2[hidden_states][6]: 0.0
out1[hidden_states][7] == out2[hidden_states][7]: 0.0
out1[hidden_states][8] == out2[hidden_states][8]: 0.0
out1[hidden_states][9] == out2[hidden_states][9]: 0.0
out1[hidden_states][10] == out2[hidden_states][10]: 0.0
out1[hidden_states][11] == out2[hidden_states][11]: 0.0
out1[hidden_states][12] == out2[hidden_states][12]: 0.0


Now let's test the finals...

So seems the perfformer is now working as expected! Hoo-ray!

Let's test the other attention implementations

In [21]:
referenceAttnImpl = 'vanilla'
candidateAttnImpls = [
    'xformers-memeff', 'xformers-memeff-nomask',
    'torch-flash-nomask', 'torch-memeff-nomask'
]

# load as roberta-base
perfformerCfgDict = {
    "architectures": [
      "RobertaForMaskedLM"
    ],
    "attention_probs_dropout_prob": 0.1,
    "bos_token_id": 0,
    "eos_token_id": 2,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "layer_norm_eps": 1e-05,
    "max_position_embeddings": 514,
    "model_type": "roberta",
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "pad_token_id": 1,
    "type_vocab_size": 1,
    "vocab_size": 50265,
    # == The following option differs! ==
    "attention_type": referenceAttnImpl,
    # BERT-like
    "position_embedding_type": "absolute-learnable"
}

refModel = PerfformerForMaskedLM(config=PerfformerConfig(**perfformerCfgDict))
print(f"refModel: {refModel.load_state_dict(converted_state_dict, strict=False)}")
refModel.tie_weights()

candModels = {}
for attnName in candidateAttnImpls:
    perfformerCfgDict["attention_type"] = attnName
    candModels[attnName] = PerfformerForMaskedLM(config=PerfformerConfig(**perfformerCfgDict))

    print(f"candModels[{attnName}]: {candModels[attnName].load_state_dict(converted_state_dict, strict=False)}")
    candModels[attnName].tie_weights()

refModel: _IncompatibleKeys(missing_keys=['lm_head.decoder.bias'], unexpected_keys=['perfformer.pooler.dense.weight', 'perfformer.pooler.dense.bias'])


A matching Triton is not available, some optimizations will not be enabled.
Error caught was: No module named 'triton'


candModels[xformers-memeff]: _IncompatibleKeys(missing_keys=['lm_head.decoder.bias'], unexpected_keys=['perfformer.pooler.dense.weight', 'perfformer.pooler.dense.bias'])
candModels[xformers-memeff-nomask]: _IncompatibleKeys(missing_keys=['lm_head.decoder.bias'], unexpected_keys=['perfformer.pooler.dense.weight', 'perfformer.pooler.dense.bias'])
candModels[torch-flash-nomask]: _IncompatibleKeys(missing_keys=['lm_head.decoder.bias'], unexpected_keys=['perfformer.pooler.dense.weight', 'perfformer.pooler.dense.bias'])
candModels[torch-memeff-nomask]: _IncompatibleKeys(missing_keys=['lm_head.decoder.bias'], unexpected_keys=['perfformer.pooler.dense.weight', 'perfformer.pooler.dense.bias'])


In [22]:
testSentences = [
    "RoBERTa is a model developed by <mask>.",
    "That which we call a <mask>, by any other name, would smell sweet."
]

inputs = [robertaTokenizer.encode_plus(testSentence, return_tensors='pt') for testSentence in testSentences]
print(inputs)

# squeeze or it'll complain, bad!
for inputElem in inputs:
    for k, v in inputElem.items():
        inputElem[k] = v.squeeze()

print(inputs)
print([{k: vv.shape for k, vv in v.items()} for v in inputs])

inputBatch = robertaTokenizer.pad(
    inputs,
    padding='longest',
    pad_to_multiple_of=8,
    return_tensors='pt'
)

print(inputBatch)
print({k: v.shape for k, v in inputBatch.items()})

[{'input_ids': tensor([[    0, 27110, 11126, 38495,    16,    10,  1421,  2226,    30, 50264,
             4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}, {'input_ids': tensor([[    0,  1711,    61,    52,   486,    10, 50264,     6,    30,   143,
            97,   766,     6,    74, 11362,  4045,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}]
[{'input_ids': tensor([    0, 27110, 11126, 38495,    16,    10,  1421,  2226,    30, 50264,
            4,     2]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}, {'input_ids': tensor([    0,  1711,    61,    52,   486,    10, 50264,     6,    30,   143,
           97,   766,     6,    74, 11362,  4045,     4,     2]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}]
[{'input_ids': torch.Size([12]), 'attention_mask': torch.Size([12])}, {'input_ids': torch.Size([18]), 'attention_mask': torch.Size([18])}]
{'inp

In [23]:
refModel.eval()
refOutput = refModel(**inputBatch)

# move to gpu! Could be a view or a copy
def moveTo(dest, batch):
    newBatch = {}
    for k, v in batch.items():
        newBatch[k] = v.to(dest)
    return newBatch

inputBatchGpu = moveTo('cuda', inputBatch)

for attnName in candidateAttnImpls:
    print(f"Testing {attnName}...")
    candModels[attnName].to('cuda')
    candModels[attnName].eval()

    candOutputGpu = candModels[attnName](**inputBatchGpu)
    candOutput = moveTo('cpu', candOutputGpu)

    outputDiff(refOutput, candOutput)
    # print(refOutput)
    # print(candOutput)

Testing xformers-memeff...
out1[logits] == out2[logits]: 12.82652473449707
Testing xformers-memeff-nomask...




out1[logits] == out2[logits]: 3653982.25
Testing torch-flash-nomask...


  context_layer = torch.nn.functional.scaled_dot_product_attention(
  context_layer = torch.nn.functional.scaled_dot_product_attention(
  context_layer = torch.nn.functional.scaled_dot_product_attention(
  context_layer = torch.nn.functional.scaled_dot_product_attention(


RuntimeError: No available kernel.  Aborting execution.

Validate reference output is going to work

In [36]:
mask_pos = [inputBatch['input_ids'][i].tolist().index(robertaTokenizer.mask_token_id) for i in range(0, len(testSentences))]
print(mask_pos)

mask_pos_logits = torch.vstack([refOutput.logits[i, mask_pos[i]] for i in range(0, len(testSentences))])
print(mask_pos_logits)

predicted_index = torch.argmax(mask_pos_logits, dim=1)
prediction = robertaTokenizer.decode(predicted_index)

print(f"The masked word is predicted as: {prediction}")

[9, 6]
tensor([[-3.1715, -3.6989,  4.0094,  ..., -7.1897, -7.6822, -0.5979],
        [-3.6039, -4.2095,  2.5274,  ..., -6.9033, -5.5549, -1.6228]],
       grad_fn=<CatBackward0>)
The masked word is predicted as:  IBM flower


In [37]:
perfformerUnmasker = transformers.pipeline('fill-mask', model=perfformerModel, tokenizer=robertaTokenizer)
perfformerUnmasker("That which we call a <mask>, by any other name, would smell sweet.")

[{'score': 0.0911843329668045,
  'token': 14214,
  'token_str': ' flower',
  'sequence': 'That which we call a flower, by any other name, would smell sweet.'},
 {'score': 0.0380307212471962,
  'token': 17284,
  'token_str': ' potato',
  'sequence': 'That which we call a potato, by any other name, would smell sweet.'},
 {'score': 0.03794030100107193,
  'token': 14099,
  'token_str': ' lemon',
  'sequence': 'That which we call a lemon, by any other name, would smell sweet.'},
 {'score': 0.03546350821852684,
  'token': 3984,
  'token_str': ' wine',
  'sequence': 'That which we call a wine, by any other name, would smell sweet.'},
 {'score': 0.03264541178941727,
  'token': 24410,
  'token_str': ' grape',
  'sequence': 'That which we call a grape, by any other name, would smell sweet.'}]

## The Rotary Embedding 

https://huggingface.co/docs/transformers/notebooks

For test methods see 
- xformers related pull request [here](https://github.com/facebookresearch/xformers/pull/36/files#diff-892ebce2d02f8bfe046f14c60c1754c2e259325d839816c3b8e9286bfabb0b64)
  - tests/test_rotary_embeddings.py

But seems to be buggy? Another resource: 
- https://github.com/EleutherAI/gpt-neox/blob/d8028f8e9d7a3824bee47895c28ee71ae8879234/megatron/model/positional_embeddings.py#L38
- https://github.com/EleutherAI/gpt-neox/blob/d8028f8e9d7a3824bee47895c28ee71ae8879234/megatron/model/transformer.py#L642

```python
        if exists(self.rotary_emb):
            if exists(self.rotary_ndims):
                # partial rotary
                query_rot, query_pass = (
                    query_layer[..., : self.rotary_ndims],
                    query_layer[..., self.rotary_ndims :],
                )
                key_rot, key_pass = (
                    key_layer[..., : self.rotary_ndims],
                    key_layer[..., self.rotary_ndims :],
                )
            else:
                # full rotary
                query_rot, key_rot = query_layer, key_layer

            apply_rotary_fn = (
                apply_rotary_pos_emb_torch if self.bf16 else apply_rotary_pos_emb
            )

            seq_len = key_layer.shape[0]
            offset = 0
            if exists(layer_past) and layer_past.numel() > 0:
                offset = layer_past[0].shape[0]
                seq_len += offset
            cos, sin = self.rotary_emb(value_layer, seq_len=seq_len)
            query_layer, key_layer = apply_rotary_fn(
                query_rot, key_rot, cos, sin, offset=offset
            )

            if exists(self.rotary_ndims):
                query_layer = torch.cat((query_layer, query_pass), dim=-1)
                key_layer = torch.cat((key_layer, key_pass), dim=-1)
```

> Rotary using xformers impl

In [41]:
def rotate_half(x):
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(x, cos, sin):
    # NOTE: This could probably be moved to Triton

    # Handle a possible sequence length mismatch in between q and k
    cos = cos[:, :, : x.shape[-2], :]
    sin = sin[:, :, : x.shape[-2], :]

    return (x * cos) + (rotate_half(x) * sin)

# assumes x: (bsz, num_heads, seq_len, head_size)
def cos_sin_tables(d_model, max_seq_len):
    inv_freq = 1.0 / (10000 ** (torch.arange(0, d_model, 2).float() / d_model))

    t = torch.arange(
        max_seq_len, dtype=torch.float32
    )
    freqs = torch.einsum("i,j->ij", t, inv_freq)
    emb = torch.cat((freqs, freqs), dim=-1)

    cos_cached = emb.cos()[None, None, :, :]
    sin_cached = emb.sin()[None, None, :, :]

    return (cos_cached, sin_cached)

# apply_rotary_pos_emb(q, self._cos_cached, self._sin_cached)

q_test = torch.rand(1, 1, 5, 10)
cos_cached, sin_cached = cos_sin_tables(10, 20)
apply_rotary_pos_emb(q_test, cos_cached, sin_cached)

tensor([[[[ 0.2450,  0.0412,  0.6097,  0.2446,  0.2817,  0.8046,  0.2167,
            0.3141,  0.3358,  0.4200],
          [-0.1789,  0.7276,  0.1119,  0.9259,  0.2643,  0.8379,  0.2453,
            0.4618,  0.3147,  0.7494],
          [-0.3604, -0.0615,  0.3036,  0.5874,  0.7153,  0.6680,  0.3641,
            0.2566,  0.7189,  0.6816],
          [-0.7689,  0.6272,  0.1594,  0.4577,  0.9483, -0.0604,  0.5580,
            0.0189,  0.8505,  0.7860],
          [ 0.2175,  0.6821,  0.3225,  0.9290,  0.2087, -0.4007,  0.5688,
            0.1402,  0.9825,  0.8179]]]])

> Rotary using gpt-neox

https://github.com/EleutherAI/gpt-neox/blob/d8028f8e9d7a3824bee47895c28ee71ae8879234/megatron/model/positional_embeddings.py#L38

In [42]:
def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat(
        (-x2, x1), dim=x1.ndim - 1
    )  # dim=-1 triggers a bug in earlier torch versions


@torch.jit.script
def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0):
    cos, sin = (
        cos[offset : q.shape[0] + offset, ...],
        sin[offset : q.shape[0] + offset, ...],
    )
    return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)

class RotaryEmbedding(torch.nn.Module):
    def __init__(self, dim, max_seq_len, base=10000, precision=torch.half):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None
        self.precision = precision
        self.max_seq_len = max_seq_len
        self.base = base
        self.dim = dim

        # precompute cos_cached, sin_cached in fp32
        cos_cached, sin_cached, inv_freq = self._prepare_cache(
            max_seq_len, precision, base
        )

        self.register_buffer("inv_freq", inv_freq)
        self.cos_cached = cos_cached
        self.sin_cached = sin_cached

    def _prepare_cache(self, seq_len, precision, base):
        # precompute cos_cached, sin_cached in fp32
        inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float() / self.dim))

        t = torch.arange(seq_len).type_as(inv_freq)
        freqs = torch.einsum("i,j->ij", t, inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)

        cos_cached = emb.cos()[:, None, None, :]
        sin_cached = emb.sin()[:, None, None, :]

        return (
            cos_cached.to(precision),
            sin_cached.to(precision),
            inv_freq.to(precision),
        )

    def forward(self, x, seq_dim=0, seq_len=None):
        if seq_len is None:
            seq_len = x.shape[seq_dim]

        assert seq_len <= self.max_seq_len

        if seq_len != self.max_seq_len:
            # y, z, _ = self._prepare_cache(seq_len, self.precision, self.base)
            return (
                self.cos_cached[:seq_len, ...].to(x.device),
                self.sin_cached[:seq_len, ...].to(x.device),
            )
        else:
            return self.cos_cached.to(x.device), self.sin_cached.to(x.device)

# (bsz, num_heads, seq_len, head_size) => (seqlen, bsz, num_heads, head_size)
q_test_correct = q_test.permute(2, 0, 1, 3)
rotary_emb = RotaryEmbedding(10, 50, precision=torch.float32)
cos_table, sin_table = rotary_emb(q_test_correct, seq_dim=0)

apply_rotary_pos_emb(q_test_correct, q_test_correct, cos_table, sin_table)

(tensor([[[[ 0.2450,  0.0412,  0.6097,  0.2446,  0.2817,  0.8046,  0.2167,
             0.3141,  0.3358,  0.4200]]],
 
 
         [[[-0.1789,  0.7276,  0.1119,  0.9259,  0.2643,  0.8379,  0.2453,
             0.4618,  0.3147,  0.7494]]],
 
 
         [[[-0.3604, -0.0615,  0.3036,  0.5874,  0.7153,  0.6680,  0.3641,
             0.2566,  0.7189,  0.6816]]],
 
 
         [[[-0.7689,  0.6272,  0.1594,  0.4577,  0.9483, -0.0604,  0.5580,
             0.0189,  0.8505,  0.7860]]],
 
 
         [[[ 0.2175,  0.6821,  0.3225,  0.9290,  0.2087, -0.4007,  0.5688,
             0.1402,  0.9825,  0.8179]]]]),
 tensor([[[[ 0.2450,  0.0412,  0.6097,  0.2446,  0.2817,  0.8046,  0.2167,
             0.3141,  0.3358,  0.4200]]],
 
 
         [[[-0.1789,  0.7276,  0.1119,  0.9259,  0.2643,  0.8379,  0.2453,
             0.4618,  0.3147,  0.7494]]],
 
 
         [[[-0.3604, -0.0615,  0.3036,  0.5874,  0.7153,  0.6680,  0.3641,
             0.2566,  0.7189,  0.6816]]],
 
 
         [[[-0.7689,  0.6272,  0.1