In [None]:
!pip install deepeval

Collecting deepeval
  Downloading deepeval-1.5.0-py3-none-any.whl.metadata (977 bytes)
Collecting pytest-repeat (from deepeval)
  Downloading pytest_repeat-0.9.3-py3-none-any.whl.metadata (4.9 kB)
Collecting pytest-xdist (from deepeval)
  Downloading pytest_xdist-3.6.1-py3-none-any.whl.metadata (4.3 kB)
Collecting portalocker (from deepeval)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting langchain-openai (from deepeval)
  Downloading langchain_openai-0.2.6-py3-none-any.whl.metadata (2.6 kB)
Collecting ragas (from deepeval)
  Downloading ragas-0.2.4-py3-none-any.whl.metadata (8.0 kB)
Collecting docx2txt~=0.8 (from deepeval)
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tenacity~=8.4.1 (from deepeval)
  Downloading tenacity-8.4.2-py3-none-any.whl.metadata (1.2 kB)
Collecting opentelemetry-api~=1.24.0 (from deepeval)
  Downloading opentelemetry_api-1.24.0-py3-none-any.whl.metadata (1.3 kB)
Collect

In [None]:
!pip install lm-format-enforcer

Collecting lm-format-enforcer
  Downloading lm_format_enforcer-0.10.9-py3-none-any.whl.metadata (17 kB)
Collecting interegular>=0.3.2 (from lm-format-enforcer)
  Downloading interegular-0.3.3-py37-none-any.whl.metadata (3.0 kB)
Downloading lm_format_enforcer-0.10.9-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.9/43.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading interegular-0.3.3-py37-none-any.whl (23 kB)
Installing collected packages: interegular, lm-format-enforcer
Successfully installed interegular-0.3.3 lm-format-enforcer-0.10.9


In [None]:
!pip install -U bitsandbytes



In [None]:
import json
from typing import List, Type

import torch
import transformers
from deepeval.models.base_model import DeepEvalBaseLLM
from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.transformers import (
    build_transformers_prefix_allowed_tokens_fn,
)
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer

class GPT2(DeepEvalBaseLLM):
    """DeepEval adapter around an already-loaded Hugging Face causal LM.

    ``generate`` constrains decoding to the JSON schema of the Pydantic model
    it is given and returns an instance of that model; ``batch_generate``
    performs plain (unconstrained) sampling for a list of prompts.
    """

    def __init__(
        self,
        model,
        tokenizer
    ):
        """Store the pre-loaded model and its matching tokenizer.

        Args:
            model: Hugging Face causal-LM instance used for generation.
            tokenizer: Tokenizer that belongs to ``model``.
        """
        # NOTE(review): the base-class __init__ is not invoked here — confirm
        # that DeepEvalBaseLLM does not require it.
        self.model = model
        self.tokenizer = tokenizer

    def load_model(self):
        """Return the model object supplied at construction time."""
        return self.model

    def generate(self, prompt: str, schema: Type[BaseModel]) -> BaseModel:
        """Generate a completion for ``prompt`` that is valid JSON for ``schema``.

        Args:
            prompt: Text fed to the model.
            schema: Pydantic model *class* describing the expected JSON output.

        Returns:
            An instance of ``schema`` built from the generated JSON.

        Raises:
            json.JSONDecodeError: If the generated text is not parseable JSON
                (for example when it is cut off by ``max_new_tokens``).
        """
        model = self.load_model()
        pipeline = transformers.pipeline(
            "text-generation",
            model=model,
            tokenizer=self.tokenizer,
            use_cache=True,
            device_map="auto",
            max_new_tokens=100,
            do_sample=True,
            top_k=5,
            num_return_sequences=1,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id,
        )

        # Pydantic v2 exposes model_json_schema(); .schema() is the v1 name and
        # is deprecated in v2. Support both so either version works.
        if hasattr(schema, "model_json_schema"):
            json_schema = schema.model_json_schema()
        else:
            json_schema = schema.schema()

        # Create parser required for JSON confinement using lmformatenforcer
        parser = JsonSchemaParser(json_schema)
        prefix_function = build_transformers_prefix_allowed_tokens_fn(
            pipeline.tokenizer, parser
        )

        # The pipeline echoes the prompt at the start of "generated_text";
        # slice it off so only the model's JSON remains.
        output_dict = pipeline(prompt, prefix_allowed_tokens_fn=prefix_function)
        output = output_dict[0]["generated_text"][len(prompt) :]
        json_result = json.loads(output)

        # Return valid JSON object according to the schema DeepEval supplied
        return schema(**json_result)

    async def a_generate(self, prompt: str, schema: Type[BaseModel]) -> BaseModel:
        """Async entry point; delegates to the synchronous ``generate``."""
        return self.generate(prompt, schema)

    # This is optional.
    # The parameter keeps its original (misspelt) name ``promtps`` so existing
    # keyword callers are not broken.
    def batch_generate(self, promtps: List[str]) -> List[str]:
        """Sample one completion per prompt in a single batched forward pass.

        Args:
            promtps: Prompts to complete.

        Returns:
            The newly generated text for each prompt, in input order, with the
            prompt itself and special tokens removed.
        """
        if not promtps:
            return []

        model = self.load_model()
        # Fall back to CPU instead of crashing when no GPU is present.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Prompts of different lengths must be padded to form one tensor, and
        # some tokenizers (GPT-2 among them) ship without a pad token.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        # Decoder-only models continue from the last position, so pad on the left.
        self.tokenizer.padding_side = "left"

        model_inputs = self.tokenizer(
            promtps, return_tensors="pt", padding=True
        ).to(device)
        model.to(device)

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=100,
            do_sample=True,
            pad_token_id=self.tokenizer.pad_token_id,
        )

        # generate() returns prompt + continuation; keep only the continuation
        # so the result matches what ``generate`` returns for a single prompt.
        prompt_length = model_inputs["input_ids"].shape[1]
        new_token_ids = generated_ids[:, prompt_length:]
        return self.tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)

    def get_model_name(self):
        """Return the display name of this model."""
        return "GPT2"

# One checkpoint id shared by the model and the tokenizer so the two cannot
# drift apart. Can be replaced with any Hugging Face causal-LM checkpoint.
MODEL_NAME = "openai-community/gpt2"

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Wrap the loaded model so DeepEval benchmarks can drive it.
gpt2 = GPT2(model=model, tokenizer=tokenizer)


In [None]:
from deepeval.benchmarks import MMLU
from deepeval.benchmarks.tasks import MMLUTask

# Available benchmarks include MMLU, HellaSwag, BigBench, TruthfulQA, DROP, HumanEval, GSM8K.

# Choose the MMLU tasks to run, then build the benchmark with those tasks and shots.
mmlu_tasks = [MMLUTask.HIGH_SCHOOL_COMPUTER_SCIENCE, MMLUTask.ASTRONOMY]
benchmark = MMLU(tasks=mmlu_tasks, n_shots=3)

# Run the evaluation (swap `gpt2` for whichever model is under test)
# and print the overall score.
benchmark.evaluate(model=gpt2)
print(benchmark.overall_score)

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/9 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Processing high_school_computer_science:   0%|          | 0/100 [00:00<?, ?it/s]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing high_school_computer_science:   1%|          | 1/100 [00:11<18:33, 11.24s/it]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing high_school_computer_science:   2%|▏         | 2/100 [00:18<14:49,  9.08s/it]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing high_school_computer_science:   3%|▎         | 3/100 [00:24<12:16,  7.59s/it]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing high_school_computer_science:   4%|▍         | 4/100 [00:30<1

MMLU Task Accuracy (task=high_school_computer_science): 0.23


Generating test split:   0%|          | 0/152 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Processing astronomy:   0%|          | 0/152 [00:00<?, ?it/s]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing astronomy:   1%|          | 1/152 [00:05<13:49,  5.49s/it]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing astronomy:   1%|▏         | 2/152 [00:11<14:11,  5.68s/it]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing astronomy:   2%|▏         | 3/152 [00:16<13:34,  5.46s/it]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing astronomy:   3%|▎         | 4/152 [00:23<14:29,  5.87s/it]Hardware accelerator e.g. GPU is available in the environment, but no `device` 

MMLU Task Accuracy (task=astronomy): 0.21710526315789475
Overall MMLU Accuracy: 0.2222222222222222
0.2222222222222222





In [None]:
from transformers import AutoModelForCausalLM, AutoConfig

def convert_torch_to_huggingface(model: torch.nn.Module, config_name: str = "gpt2"):
    """Copy a torch module's weights into a freshly built Hugging Face causal LM.

    Args:
        model: Source module whose ``state_dict()`` is loaded into the new model.
            Its parameter names and shapes must match the target architecture,
            otherwise ``load_state_dict`` raises.
        config_name: Identifier passed to ``AutoConfig.from_pretrained`` to pick
            the target architecture. Defaults to ``"gpt2"``.

    Returns:
        The Hugging Face model created from the config, carrying ``model``'s weights.
    """
    config = AutoConfig.from_pretrained(config_name)
    # from_config builds the architecture only; the weights come from `model`.
    hf_model = AutoModelForCausalLM.from_config(config)
    hf_model.load_state_dict(model.state_dict())
    return hf_model
