## Utility functions

In [2]:
from torch import nn

# this prints out the named parameters of a model
def print_named_params(model: nn.Module) -> None:
    for name, param in model.named_parameters():
        print(f"{name}: {param.shape}")

In [23]:
import torch
from copy import deepcopy
import timm
torch.set_printoptions(precision = 6, sci_mode = False)

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [12]:
import sys
sys.path.append('../../pytei')

In [3]:
def get_num_parameters(model: nn.Module) -> int:
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

## GPT 2

In [18]:
from transformers import GPT2Tokenizer, GPT2Model

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
test_model = GPT2Model.from_pretrained('gpt2')

In [10]:
with open("targets", "w") as f:
    for name, param in model.named_parameters():
        print(f"{name}: {param.shape}")
        f.write(f";{name}\n")

wte.weight: torch.Size([50257, 768])
wpe.weight: torch.Size([1024, 768])
h.0.ln_1.weight: torch.Size([768])
h.0.ln_1.bias: torch.Size([768])
h.0.attn.c_attn.weight: torch.Size([768, 2304])
h.0.attn.c_attn.bias: torch.Size([2304])
h.0.attn.c_proj.weight: torch.Size([768, 768])
h.0.attn.c_proj.bias: torch.Size([768])
h.0.ln_2.weight: torch.Size([768])
h.0.ln_2.bias: torch.Size([768])
h.0.mlp.c_fc.weight: torch.Size([768, 3072])
h.0.mlp.c_fc.bias: torch.Size([3072])
h.0.mlp.c_proj.weight: torch.Size([3072, 768])
h.0.mlp.c_proj.bias: torch.Size([768])
h.1.ln_1.weight: torch.Size([768])
h.1.ln_1.bias: torch.Size([768])
h.1.attn.c_attn.weight: torch.Size([768, 2304])
h.1.attn.c_attn.bias: torch.Size([2304])
h.1.attn.c_proj.weight: torch.Size([768, 768])
h.1.attn.c_proj.bias: torch.Size([768])
h.1.ln_2.weight: torch.Size([768])
h.1.ln_2.bias: torch.Size([768])
h.1.mlp.c_fc.weight: torch.Size([768, 3072])
h.1.mlp.c_fc.bias: torch.Size([3072])
h.1.mlp.c_proj.weight: torch.Size([3072, 768])
h.1.

## Mamba

In [7]:
tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
input_ids = tokenizer("Hey how are you doing?", return_tensors= "pt")["input_ids"]

out = model.generate(input_ids, max_new_tokens=10)
print(tokenizer.batch_decode(out))

The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


["Hey how are you doing?\n\nI'm so glad you're here."]


In [44]:
from transformers import MambaConfig, MambaForCausalLM, AutoTokenizer, MambaModel
import torch

In [50]:
tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
# test_model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
test_model = MambaModel(MambaConfig()) # not sure whether this is the pretrained version or not

In [51]:
print_named_params(test_model)

embeddings.weight: torch.Size([50280, 768])
layers.0.norm.weight: torch.Size([768])
layers.0.mixer.A_log: torch.Size([1536, 16])
layers.0.mixer.D: torch.Size([1536])
layers.0.mixer.conv1d.weight: torch.Size([1536, 1, 4])
layers.0.mixer.conv1d.bias: torch.Size([1536])
layers.0.mixer.in_proj.weight: torch.Size([3072, 768])
layers.0.mixer.x_proj.weight: torch.Size([80, 1536])
layers.0.mixer.dt_proj.weight: torch.Size([1536, 48])
layers.0.mixer.dt_proj.bias: torch.Size([1536])
layers.0.mixer.out_proj.weight: torch.Size([768, 1536])
layers.1.norm.weight: torch.Size([768])
layers.1.mixer.A_log: torch.Size([1536, 16])
layers.1.mixer.D: torch.Size([1536])
layers.1.mixer.conv1d.weight: torch.Size([1536, 1, 4])
layers.1.mixer.conv1d.bias: torch.Size([1536])
layers.1.mixer.in_proj.weight: torch.Size([3072, 768])
layers.1.mixer.x_proj.weight: torch.Size([80, 1536])
layers.1.mixer.dt_proj.weight: torch.Size([1536, 48])
layers.1.mixer.dt_proj.bias: torch.Size([1536])
layers.1.mixer.out_proj.weight: 

In [52]:
# import re
# pattern = r".*weight$"

# with open("targets_mamba", "w") as f:
#     for param_name, _ in test_model.named_parameters():
#         if re.match(pattern, param_name):
#             f.write(f"{param_name}\n")

## Test out Pytei on model

In [53]:
from pytei import Injector

model = deepcopy(test_model).to(device)
model.eval()
text = "blahblahblah"
test_input = tokenizer(text, return_tensors='pt').to(device)

with torch.no_grad():
    # error_map_file = "./targets"
    error_map_file = "./targets_mamba"
    injector = Injector(error_map_file, p = 1e-7, device = device, verbose = True, mitigation = 'clip')
    print('----------Error free----------')
    error_free_out = model(**test_input) # gpt
    # error_free_out = model.generate(test_input["input_ids"]) # mamba
    print('Outputs (error-free):', error_free_out)
    
    print('----------Error Injected----------')
    injector.inject(model)
    error_out = model(**test_input) # gpt
    # error_out = model.generate(test_input["input_ids"]) # mamba
    print('Outputs (error-injected):', error_out)

    # print('----------Error Mitigated----------')
    # model = deepcopy(model).to(device)
    # model.eval()
    # injector.inject(model, use_mitigation = True)
    # error_mitig_out = model(**test_input) # gpt
    # error_mitig_out = model.generate(test_input["input_ids"]) # mamba
    # print('Outputs (error-mitigated):', error_mitig_out)

    # injector.save_error_map('../../temp/testmap.pt', sparse = True)
    # injector.load_error_map('../../temp/testmap.pt', sparse = True)

Injector initialized.
Error probability: 1e-07
Data type: torch.float32
Error model: bit
----------Error free----------
Outputs (error-free): MambaOutput(last_hidden_state=tensor([[[-0.405270,  0.465092, -0.649489,  ..., -0.198023, -0.018413,
           1.771331],
         [ 0.556003,  0.831208,  0.456582,  ...,  0.349721, -1.617129,
           0.776332],
         [-0.170831,  0.653865, -1.187167,  ...,  0.261039,  1.592637,
           1.567974],
         [ 0.836614,  1.391737, -0.590649,  ..., -0.068223, -1.537820,
           1.564583],
         [-0.037992,  0.238477, -1.001789,  ...,  0.189070,  1.148284,
           1.399295],
         [ 1.935914,  0.835130, -0.212235,  ..., -0.337746, -1.460929,
           0.724618]]]), cache_params=<transformers.cache_utils.MambaCache object at 0x12c2b3dd0>, hidden_states=None)
----------Error Injected----------
The following parameters have been injected:
dict_keys(['embeddings.weight', 'layers.0.norm.weight', 'layers.0.mixer.conv1d.weight', 'laye

In [43]:
print('----------Difference (RMSE)----------')
rmse = torch.sqrt(torch.mean((error_out.last_hidden_state - error_free_out.last_hidden_state) ** 2)).item() # gpt
print(f"inject & error-free: ", rmse)
rmse = torch.sqrt(torch.mean((error_mitig_out.last_hidden_state - error_free_out.last_hidden_state) ** 2)).item() # gpt
print(f"mitigated & error-free: ", rmse)

----------Difference (RMSE)----------


RuntimeError: The size of tensor a (7) must match the size of tensor b (20) at non-singleton dimension 1

In [None]:
tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
test_model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")

from pytei import Injector

model = deepcopy(test_model).to(device)
model.eval()
text = "blahblahblah"
test_input = tokenizer(text, return_tensors='pt').to(device)

print("model: ", mode.__name__)

with torch.no_grad():
    injector = Injector('./targets', p = 1e-7, device = device, verbose = True, mitigation = 'clip')
    print('----------Error free----------')
    error_free_out = model(**test_input)
    print('Outputs (error-free):', error_free_out)
    
    print('----------Error Injected----------')
    injector.inject(model)
    error_out = model(**test_input)
    print('Outputs (error-injected):', error_out)

    print('----------Error Mitigated----------')
    model = deepcopy(model).to(device)
    model.eval()
    injector.inject(model, use_mitigation = True)
    error_mitig_out = model(**test_input)
    print('Outputs (error-mitigated):', error_mitig_out)