In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from err_gen import error_injection
from utils import evaluate_opt
from quant import quant, dequant
from copy import deepcopy

# --- Experiment setup -------------------------------------------------------
# Checkpoint under test and the device used for perplexity evaluation.
model_id = 'facebook/opt-125m'
device = "cuda"

# Baseline model; later cells quantize it and deep-copy it per trial.
model = AutoModelForCausalLM.from_pretrained(model_id)
# NOTE(review): this copy looks unused in the visible cells -- the sweep cell
# rebinds cp_model before reading it. Confirm before removing.
cp_model = deepcopy(model)

# Tokenize the whole WikiText-2 (raw) test split as one long sequence:
# all rows are joined with blank lines and encoded in a single call.
tokenizer = AutoTokenizer.from_pretrained(model_id)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
test_text = "\n\n".join(dataset["text"])
testenc = tokenizer(test_text, return_tensors="pt")

In [6]:
# Baseline: evaluate the untouched model on the evaluation device.
model.to(device)  # nn.Module.to() moves parameters in place
baseline_ppl = evaluate_opt(model, testenc)
print(f'original model accuracy(ppl): {baseline_ppl}')
# Move the model back to the CPU. As the cell's last expression this also
# echoes the module repr in the output.
model.to("cpu")

original model accuracy(ppl): 27.579069137573242


OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), ep

In [7]:
from err_gen import error_injection
import numpy as np
import gc

# Quantization settings handed to quant(): `bits` is presumably the bit-width
# and `gs` the group size -- TODO confirm against quant.py.
bits, gs = 8, 16
# quant() returns three objects that the sweep cell later passes to dequant();
# `qs` is the collection that the sweep corrupts.
scale, zero, qs = quant(bits, gs, model)


In [8]:
# --- Error-injection sweep --------------------------------------------------
# For each error rate: corrupt a fresh copy of the quantized weights,
# dequantize, load the result into a fresh copy of the model, and report
# perplexity on the test encoding.
err_rate = np.linspace(1, 10, 10) * 1e-3  # 0.001, 0.002, ..., 0.010
loop_cnt = 1  # trials per error rate
cnt = 0       # running count of injections; offsets the seed so each call differs
# Only entries whose last dotted name component is listed here get errors.
target_layer = [
    'q_proj',
    'k_proj',
    'v_proj',
    'out_proj',
    'fc1',
    'fc2',
]

for rate in err_rate:
    print(f'Error rate: {rate}')
    for _ in range(loop_cnt):
        # Work on a copy so every trial starts from the clean quantized weights.
        cp_qs = deepcopy(qs)
        for key in cp_qs:
            if key.split('.')[-1] in target_layer:
                # Take the transposed view once; writes through it land in
                # cp_qs[key]. Only the second half along the first axis of
                # the transposed view is corrupted.
                qw_t = cp_qs[key].T
                half = len(qw_t) // 2
                qw_t[half:] = error_injection(
                    param=qw_t[half:], rate=rate, seed=int(42 + cnt), device='cpu'
                )
                cnt += 1

        q_x_err = dequant(scale, zero, cp_qs, gs, bits)
        cp_model = deepcopy(model)
        # Build the state dict once per model copy instead of once per key.
        # Its tensors share storage with the parameters, so the in-place
        # slice assignment below updates the model's weights.
        state = cp_model.state_dict()
        for key in q_x_err.keys():
            if key.split('.')[-1] != 'lm_head':
                state[key + '.weight'][:] = q_x_err[key]

        print(f'errored qweight model accuracy(ppl): {evaluate_opt(cp_model.to(device), testenc)}')
        # Drop the per-trial copies before the next iteration to limit memory.
        del q_x_err, cp_model, state
        gc.collect()

Error rate: 0.001
errored qweight model accuracy(ppl): 28.728588104248047
Error rate: 0.002
errored qweight model accuracy(ppl): 31.677818298339844
Error rate: 0.003
errored qweight model accuracy(ppl): 33.16647720336914
Error rate: 0.004
errored qweight model accuracy(ppl): 82.16188049316406
Error rate: 0.005
errored qweight model accuracy(ppl): 33.98486328125
Error rate: 0.006
errored qweight model accuracy(ppl): 42.31205749511719
Error rate: 0.007
errored qweight model accuracy(ppl): 50.46514892578125
Error rate: 0.008
errored qweight model accuracy(ppl): 44.27627182006836
Error rate: 0.009000000000000001
errored qweight model accuracy(ppl): 70.84103393554688
Error rate: 0.01
errored qweight model accuracy(ppl): 63.760005950927734
