In [2]:
%pip install deepeval

Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install lm-format-enforcer

Note: you may need to restart the kernel to use updated packages.


In [4]:
%pip install jsonschema

Note: you may need to restart the kernel to use updated packages.


In [6]:
%pip install -U bitsandbytes

Note: you may need to restart the kernel to use updated packages.


In [17]:
from transformers import AutoModelForCausalLM,  AutoTokenizer
from deepeval.models.base_model import DeepEvalBaseLLM
from typing import List
from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.transformers import build_transformers_prefix_allowed_tokens_fn
from transformers import pipeline
import json

class GPT2(DeepEvalBaseLLM):
    """DeepEval model wrapper around an already-loaded Hugging Face causal LM.

    ``generate`` constrains decoding with lm-format-enforcer so that the model
    output parses as JSON matching the pydantic schema supplied by the caller.

    Args:
        model: An instantiated causal language model (e.g. from
            ``AutoModelForCausalLM.from_pretrained``).
        tokenizer: The tokenizer that matches ``model``.
    """

    def __init__(
        self,
        model,
        tokenizer
    ):
        self.model = model
        self.tokenizer = tokenizer
        # Text-generation pipeline; built on first use and then reused so it
        # is not reconstructed for every single prompt.
        self._text_generator = None

    def load_model(self):
        """Return the model object that was passed to the constructor."""
        return self.model

    def _get_text_generator(self):
        """Build the text-generation pipeline once and cache it on the instance."""
        if getattr(self, "_text_generator", None) is None:
            # Uses the module-level ``pipeline`` import. The result is bound
            # to a different name on purpose: a local called ``pipeline``
            # would shadow the factory function.
            # NOTE(review): the model is already instantiated, so
            # ``device_map`` presumably has no effect here -- confirm.
            self._text_generator = pipeline(
                "text-generation",
                model=self.load_model(),
                tokenizer=self.tokenizer,
                use_cache=True,
                device_map="auto",
                max_new_tokens=100,
                do_sample=True,
                top_k=5,
                num_return_sequences=1,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.eos_token_id,
            )
        return self._text_generator

    # Annotations are quoted because ``BaseModel`` is not imported in this
    # file; unquoted they would be evaluated when the class body runs.
    def generate(self, prompt: str, schema: "type[BaseModel]") -> "BaseModel":
        """Generate a completion for ``prompt`` constrained to ``schema``.

        Args:
            prompt: Text prompt passed to the text-generation pipeline.
            schema: Pydantic model class describing the required JSON output.

        Returns:
            An instance of ``schema`` built from the generated JSON.

        Raises:
            json.JSONDecodeError: If the generated text is not valid JSON.
        """
        text_generator = self._get_text_generator()

        # Prefer the pydantic v2 API; fall back to the v1 ``schema()`` method.
        if hasattr(schema, "model_json_schema"):
            json_schema = schema.model_json_schema()
        else:
            json_schema = schema.schema()

        # Create parser required for JSON confinement using lmformatenforcer
        parser = JsonSchemaParser(json_schema)
        prefix_function = build_transformers_prefix_allowed_tokens_fn(
            text_generator.tokenizer, parser
        )

        # The pipeline output is sliced by the prompt length so that only the
        # newly generated text is parsed as JSON.
        output_dict = text_generator(prompt, prefix_allowed_tokens_fn=prefix_function)
        output = output_dict[0]["generated_text"][len(prompt) :]
        json_result = json.loads(output)

        # Return valid JSON object according to the schema DeepEval supplied
        return schema(**json_result)

    async def a_generate(self, prompt: str, schema) -> "BaseModel":
        """Async entry point; delegates to the synchronous ``generate``."""
        return self.generate(prompt, schema)

    # This is optional.
    def batch_generate(self, promtps: List[str]) -> List[str]:
        """Generate completions for several prompts in one forward pass.

        Args:
            promtps: Prompts to complete. (The parameter name is kept as-is
                for backward compatibility with keyword callers.)

        Returns:
            One decoded string per prompt, in input order. Decoding does not
            strip special tokens, so padding tokens may appear in the text.
        """
        if not promtps:
            return []

        import torch  # local import: only this method needs it directly

        model = self.load_model()
        # Fall back to CPU instead of failing when no GPU is present.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        tokenizer = self.tokenizer
        # Padding needs a pad token; reuse EOS when the tokenizer has none,
        # mirroring the ``pad_token_id`` choice made for the pipeline.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Pad on the left so generation continues directly from each prompt's
        # last real token; restore the tokenizer's setting afterwards.
        previous_side = tokenizer.padding_side
        tokenizer.padding_side = "left"
        try:
            model_inputs = tokenizer(
                promtps, return_tensors="pt", padding=True
            ).to(device)
        finally:
            tokenizer.padding_side = previous_side

        model.to(device)

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=100,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
        )
        return tokenizer.batch_decode(generated_ids)

    def get_model_name(self):
        """Return the display name reported for this model."""
        return "GPT2"

# Checkpoint id used for both the model and its tokenizer; it can be replaced
# with any Hugging Face causal-LM checkpoint. Keeping it in one place ensures
# the model and tokenizer always come from the same checkpoint.
MODEL_ID = "openai-community/gpt2"

model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Wrap the loaded model/tokenizer pair in the DeepEval-compatible class.
gpt2 = GPT2(model=model, tokenizer=tokenizer)


In [18]:
from deepeval.benchmarks import MMLU
from deepeval.benchmarks.tasks import MMLUTask

# Available benchmarks include MMLU, HellaSwag, BigBench, TruthfulQA, DROP, HumanEval, GSM8K

# Configure the MMLU benchmark: which tasks to run and the n_shots setting
benchmark = MMLU(
    n_shots=3,
    tasks=[
        MMLUTask.HIGH_SCHOOL_COMPUTER_SCIENCE,
        MMLUTask.ASTRONOMY,
    ],
)

# Run the evaluation; pass a different wrapped model in place of 'gpt2' to test it
benchmark.evaluate(model=gpt2)
print(benchmark.overall_score)

Processing high_school_computer_science:   0%|                                                                                                                                                                               | 0/100 [00:00<?, ?it/s]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing high_school_computer_science:   1%|█▋                                                                                                                                                                     | 1/100 [00:02<03:37,  2.20s/it]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing high_school_computer_science:   2%|███▎                                                                                                                                                                   | 2/1

MMLU Task Accuracy (task=high_school_computer_science): 0.22


Generating test split:   0%|          | 0/152 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Processing astronomy:   0%|                                                                                                                                                                                                  | 0/152 [00:00<?, ?it/s]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing astronomy:   1%|█▏                                                                                                                                                                                        | 1/152 [00:02<05:40,  2.25s/it]Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Processing astronomy:   1%|██▍                                                                                                                                                                                       | 2/1

MMLU Task Accuracy (task=astronomy): 0.21710526315789475
Overall MMLU Accuracy: 0.21825396825396826
0.21825396825396826





In [25]:
# Show the per-task scores stored on the benchmark object (one row per evaluated task).
print(benchmark.task_scores)

                           Task     Score
0  high_school_computer_science  0.220000
1                     astronomy  0.217105


In [None]:
from transformers import AutoModelForCausalLM, AutoConfig

def convert_torch_to_huggingface(model: "torch.nn.Module", architecture: str = "gpt2"):
    """Copy the weights of a plain PyTorch module into a Hugging Face causal LM.

    Args:
        model: Source module whose ``state_dict()`` is loaded into the new
            Hugging Face model. The annotation is quoted because ``torch`` is
            not imported in this file.
        architecture: Name or path handed to ``AutoConfig.from_pretrained`` to
            select the target architecture. Defaults to ``"gpt2"``; change it
            to whichever model architecture is being used.

    Returns:
        A model created by ``AutoModelForCausalLM.from_config`` with
        ``model``'s weights loaded into it.
    """
    config = AutoConfig.from_pretrained(architecture)
    # from_config builds the architecture without pretrained weights; the
    # weights come solely from the source module below.
    hf_model = AutoModelForCausalLM.from_config(config)
    # load_state_dict is called with its defaults, so the source module's
    # parameter names must match those of the chosen architecture.
    hf_model.load_state_dict(model.state_dict())
    return hf_model
