In [1]:
from torch import nn

# this prints out the named parameters of a model
def print_named_params(model: nn.Module) -> None:
    for name, param in model.named_parameters():
        print(f"{name}: {param.shape}")

def output_targets(model: nn.Module, file: str) -> None:
    with open(file, 'w') as f:
        for name, param in model.named_parameters():
            f.write(f"{name}\n")

In [2]:
import torch
from copy import deepcopy
#import timm
torch.set_printoptions(precision = 6, sci_mode = False)

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [3]:
import sys
sys.path.append('../../pytei')

In [4]:
def get_num_parameters(model: nn.Module) -> int:
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [5]:
from pytei import Injector
def inject_error(model: nn.Module, error_map_file: str, prob) -> nn.Module:
    model_error = deepcopy(model).to(device)
    injector = Injector(error_map_file, p = prob, device = device, verbose = True)
    injector.inject(model_error)
    return model_error

## GPT 2

In [6]:
from transformers import GPT2Tokenizer, GPT2Model
from collections import OrderedDict

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2 = GPT2Model.from_pretrained('gpt2')

def get_modified_state_dictGPT2(model: nn.Module):
    new_state_dict = deepcopy(model.state_dict())
    for key in list(new_state_dict.keys()):
        new_state_dict[f"transformer.{key}"] = new_state_dict.pop(key)
    return new_state_dict

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
print_named_params(gpt2)
output_targets(gpt2, "gpt2_targets")

wte.weight: torch.Size([50257, 768])
wpe.weight: torch.Size([1024, 768])
h.0.ln_1.weight: torch.Size([768])
h.0.ln_1.bias: torch.Size([768])
h.0.attn.c_attn.weight: torch.Size([768, 2304])
h.0.attn.c_attn.bias: torch.Size([2304])
h.0.attn.c_proj.weight: torch.Size([768, 768])
h.0.attn.c_proj.bias: torch.Size([768])
h.0.ln_2.weight: torch.Size([768])
h.0.ln_2.bias: torch.Size([768])
h.0.mlp.c_fc.weight: torch.Size([768, 3072])
h.0.mlp.c_fc.bias: torch.Size([3072])
h.0.mlp.c_proj.weight: torch.Size([3072, 768])
h.0.mlp.c_proj.bias: torch.Size([768])
h.1.ln_1.weight: torch.Size([768])
h.1.ln_1.bias: torch.Size([768])
h.1.attn.c_attn.weight: torch.Size([768, 2304])
h.1.attn.c_attn.bias: torch.Size([2304])
h.1.attn.c_proj.weight: torch.Size([768, 768])
h.1.attn.c_proj.bias: torch.Size([768])
h.1.ln_2.weight: torch.Size([768])
h.1.ln_2.bias: torch.Size([768])
h.1.mlp.c_fc.weight: torch.Size([768, 3072])
h.1.mlp.c_fc.bias: torch.Size([3072])
h.1.mlp.c_proj.weight: torch.Size([3072, 768])
h.1.

In [8]:
gpt2_error = inject_error(gpt2, "gpt2_targets", 1*10**-9)
gpt2_error.eval()
text = "blahblahblah"
test_input = tokenizer(text, return_tensors='pt').to(device)


with torch.no_grad():
    error_out = gpt2_error(**test_input) # gpt

Injector initialized.
Error probability: 1e-09
Data type: torch.float32
Error model: bit
Started allocation
Injecting into wte.weight. (First one takes ~30 seconds)
Injecting into wpe.weight. (First one takes ~30 seconds)
Injecting into h.0.ln_1.weight. (First one takes ~30 seconds)
Injecting into h.0.ln_1.bias. (First one takes ~30 seconds)
Injecting into h.0.attn.c_attn.weight. (First one takes ~30 seconds)
Injecting into h.0.attn.c_attn.bias. (First one takes ~30 seconds)
Injecting into h.0.attn.c_proj.weight. (First one takes ~30 seconds)
Injecting into h.0.attn.c_proj.bias. (First one takes ~30 seconds)
Injecting into h.0.ln_2.weight. (First one takes ~30 seconds)
Injecting into h.0.ln_2.bias. (First one takes ~30 seconds)
Injecting into h.0.mlp.c_fc.weight. (First one takes ~30 seconds)
Injecting into h.0.mlp.c_fc.bias. (First one takes ~30 seconds)
Injecting into h.0.mlp.c_proj.weight. (First one takes ~30 seconds)
Injecting into h.0.mlp.c_proj.bias. (First one takes ~30 seconds

## MAMBA

## Evaluate

In [9]:
from transformers import AutoModelForCausalLM,  AutoTokenizer
from deepeval.models.base_model import DeepEvalBaseLLM
from typing import List
from pydantic import BaseModel
from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.transformers import (
    build_transformers_prefix_allowed_tokens_fn,
)
import transformers
from transformers import pipeline
import json

class GPT2(DeepEvalBaseLLM):
    def __init__(
        self,
        model,
        tokenizer
    ):
        self.model = model
        self.tokenizer = tokenizer

    def load_model(self):
        return self.model

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        # Same as the previous example above
        model = self.load_model()
        pipeline = transformers.pipeline(
            "text-generation",
            model=model,
            tokenizer=self.tokenizer,
            use_cache=True,
            device_map="auto",
            max_new_tokens=100,
            do_sample=True,
            top_k=5,
            num_return_sequences=1,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id,
        )

        # Create parser required for JSON confinement using lmformatenforcer
        parser = JsonSchemaParser(schema.schema())
        prefix_function = build_transformers_prefix_allowed_tokens_fn(
            pipeline.tokenizer, parser
        )

        # Output and load valid JSON
        output_dict = pipeline(prompt, prefix_allowed_tokens_fn=prefix_function)
        output = output_dict[0]["generated_text"][len(prompt) :]
        json_result = json.loads(output)

        # Return valid JSON object according to the schema DeepEval supplied
        return schema(**json_result)

    async def a_generate(self, prompt: str, schema) -> BaseModel:
        return self.generate(prompt, schema)

    # This is optional.
    def batch_generate(self, promtps: List[str]) -> List[str]:
        model = self.load_model()
        device = "cuda" # the device to load the model onto

        model_inputs = self.tokenizer(promtps, return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
        return self.tokenizer.batch_decode(generated_ids)

    def get_model_name(self):
        return "GPT2"

#model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2") # Can be replaced with any huggingface model
#tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")



In [10]:
from transformers import AutoModelForCausalLM, AutoConfig
from deepeval.benchmarks import MMLU
from deepeval.benchmarks.tasks import MMLUTask

'''
def convert_torch_to_huggingface(model: torch.nn.Module):
    config = AutoConfig.from_pretrained("gpt2") # Change to whichever model architecture being used
    hf_model = AutoModelForCausalLM.from_config(config)    
    hf_model.load_state_dict(model.state_dict(), strict=False)
    return hf_model
'''

def convert_torch_to_huggingface_stateGPT2(state_dict):
    config = AutoConfig.from_pretrained("gpt2") # Change to whichever model architecture being used
    hf_model = AutoModelForCausalLM.from_config(config)    
    hf_model.load_state_dict(state_dict, strict=False)
    return hf_model

def evaluate_model_MMLU(model):

    benchmark = MMLU(
        tasks=[MMLUTask.HIGH_SCHOOL_COMPUTER_SCIENCE, MMLUTask.ASTRONOMY],
        n_shots=3
    )

    benchmark.evaluate(model=model)
    return benchmark.task_scores

def convert_to_hf_GPT2(model: nn.Module):
    return convert_torch_to_huggingface_stateGPT2(get_modified_state_dictGPT2(model))
    
def evaluate_model_MMLU_GPT2(model: nn.Module):
    hf_model = convert_to_hf_GPT2(model)
    tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    test_model = GPT2(model=hf_model, tokenizer=tokenizer)
    return evaluate_model_MMLU(test_model)

In [None]:
probability = [2e-9, 4e-9, 6e-9, 8e-9]
for prob in probability:
    while True:
        test_model = inject_error(gpt2, "gpt2_targets", prob)
        try:
            result = evaluate_model_MMLU_GPT2(test_model)
            for i in result.index:
                task = result.loc[i, "Task"]
                score = result.loc[i, "Score"]
                with open(f"results/gpt2_{str(prob)}_{task}", "w") as f:
                    f.write(f"{str(score)}\n") 
            print("Succeeded no nan")
            break
        except:
            print(sys.exc_info())
            print("Got an error")
            pass

Injector initialized.
Error probability: 2e-09
Data type: torch.float32
Error model: bit
Started allocation
Injecting into wte.weight. (First one takes ~30 seconds)
Injecting into wpe.weight. (First one takes ~30 seconds)
Injecting into h.0.ln_1.weight. (First one takes ~30 seconds)
Injecting into h.0.ln_1.bias. (First one takes ~30 seconds)
Injecting into h.0.attn.c_attn.weight. (First one takes ~30 seconds)
Injecting into h.0.attn.c_attn.bias. (First one takes ~30 seconds)
Injecting into h.0.attn.c_proj.weight. (First one takes ~30 seconds)
Injecting into h.0.attn.c_proj.bias. (First one takes ~30 seconds)
Injecting into h.0.ln_2.weight. (First one takes ~30 seconds)
Injecting into h.0.ln_2.bias. (First one takes ~30 seconds)
Injecting into h.0.mlp.c_fc.weight. (First one takes ~30 seconds)
Injecting into h.0.mlp.c_fc.bias. (First one takes ~30 seconds)
Injecting into h.0.mlp.c_proj.weight. (First one takes ~30 seconds)
Injecting into h.0.mlp.c_proj.bias. (First one takes ~30 seconds

Processing high_school_computer_science:   0%|          | 0/100 [00:00<?, ?it/s]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
/var/folders/3g/hc5jtysd42z_vgyjs7k8rp400000gn/T/ipykernel_69610/355104648.py:43: PydanticDeprecatedSince20: The `schema` method is deprecated; use `model_json_schema` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  parser = JsonSchemaParser(schema.schema())
Processing high_school_computer_science:   1%|          | 1/100 [00:03<05:06,  3.10s/it]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing high_school_computer_science:   2%|▏         | 2/100 [00:05<04:43,  2.89s/it]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to

MMLU Task Accuracy (task=high_school_computer_science): 0.2


Generating test split: 100%|██████████| 152/152 [00:00<00:00, 7805.55 examples/s]
Generating validation split: 100%|██████████| 16/16 [00:00<00:00, 4327.93 examples/s]
Generating train split: 100%|██████████| 5/5 [00:00<00:00, 1684.59 examples/s]
Processing astronomy:   0%|          | 0/152 [00:00<?, ?it/s]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
/var/folders/3g/hc5jtysd42z_vgyjs7k8rp400000gn/T/ipykernel_69610/355104648.py:43: PydanticDeprecatedSince20: The `schema` method is deprecated; use `model_json_schema` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  parser = JsonSchemaParser(schema.schema())
Processing astronomy:   1%|          | 1/152 [00:02<07:14,  2.88s/it]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. M

In [68]:
test_model = gpt2
result = evaluate_model_MMLU_GPT2(test_model)
for i in result.index:
    task = result.loc[i, "Task"]
    score = result.loc[i, "Score"]
    with open(f"results/gpt2_{str(0)}_{task}", "w") as f:
        f.write(f"{str(score)}\n")        

Processing high_school_computer_science:   0%|                                                                                                                                                                               | 0/100 [00:00<?, ?it/s]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing high_school_computer_science:   1%|█▋                                                                                                                                                                     | 1/100 [00:02<03:49,  2.32s/it]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing high_school_computer_science:   2%|███▎                                                                                                                                                                   | 2/1

MMLU Task Accuracy (task=high_school_computer_science): 0.23


Processing astronomy:   0%|                                                                                                                                                                                                  | 0/152 [00:00<?, ?it/s]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing astronomy:   1%|█▏                                                                                                                                                                                        | 1/152 [00:02<05:36,  2.23s/it]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing astronomy:   1%|██▍                                                                                                                                                                                       | 2/1

MMLU Task Accuracy (task=astronomy): 0.20394736842105263
Overall MMLU Accuracy: 0.21428571428571427





In [35]:
result = evaluate_model_MMLU_GPT2(gpt2_error)

Processing high_school_computer_science:   0%|                                                                                                                                                                               | 0/100 [00:00<?, ?it/s]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing high_school_computer_science:   1%|█▋                                                                                                                                                                     | 1/100 [00:02<04:29,  2.72s/it]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing high_school_computer_science:   2%|███▎                                                                                                                                                                   | 2/1

MMLU Task Accuracy (task=high_school_computer_science): 0.19


Processing astronomy:   0%|                                                                                                                                                                                                  | 0/152 [00:00<?, ?it/s]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing astronomy:   1%|█▏                                                                                                                                                                                        | 1/152 [00:02<06:26,  2.56s/it]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing astronomy:   1%|██▍                                                                                                                                                                                       | 2/1

MMLU Task Accuracy (task=astronomy): 0.15789473684210525
Overall MMLU Accuracy: 0.17063492063492064





In [40]:
result

Unnamed: 0,Task,Score
0,high_school_computer_science,0.19
1,astronomy,0.157895


In [41]:
result.loc[result["Task"] == "high_school_computer_science"]

NameError: name 'high_school_computer_science' is not defined

In [49]:
len(result)

2

In [63]:
for i in result.index:
    task = result.loc[i, "Task"]
    score = result.loc[i, "Score"]
    with open(f"results/gpt2_{str(prob)}_{task}", "w") as f:
        f.write(f"{str(score)}\n") 
    print("Succeeded no nan")

Succeeded no nan
Succeeded no nan


In [69]:
import matplotlib as plt

ModuleNotFoundError: No module named 'matplotlib'