## Bechmarking using DeepEval

In [None]:
import wandb
from transformers import AutoModelForCausalLM, AutoTokenizer
from deepeval.models.base_model import DeepEvalBaseLLM

### Step 1: Initialize Model Class (required to conduct bechmarking with DeepEval )

In [1]:
import wandb
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize wandb
wandb.login()

# Load the latest model artifact from wandb
run = wandb.init(project="master_thesis_math_lm")
artifact = run.use_artifact("master_thesis_math_lm/gpt2-math/gpt2-math-model:v0", type="model")
artifact_dir = artifact.download()  # This will download the model to a local path

# Load the model from the downloaded path
model = AutoModelForCausalLM.from_pretrained(artifact_dir)
tokenizer = AutoTokenizer.from_pretrained(artifact_dir)

print("Model loaded successfully from Weights & Biases!")

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/konstantinreicheneder/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkreicheneder[0m ([33mkreicheneder-copenhagen-business-school[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Downloading large artifact gpt2-math-model:v0, 479.31MB. 8 files... 
[34m[1mwandb[0m:   8 of 8 files downloaded.  
Done. 0:0:

Model loaded successfully from Weights & Biases!


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

class FineTunedGPT2:
    def __init__(self, model, tokenizer):
        """Initialize with preloaded model and tokenizer."""
        self.model = model
        self.tokenizer = tokenizer

        # Move model to the appropriate device
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def generate(self, prompt: str, max_length=100) -> str:
        """Generates text from the fine-tuned GPT-2 model."""
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        outputs = self.model.generate(**inputs, max_new_tokens=max_length, do_sample=True)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

# Now, instead of reloading from wandb, pass your already loaded model & tokenizer
gpt2_model = FineTunedGPT2(model, tokenizer)

# Test text generation
#print(gpt2_model.generate("Tell me a math joke"))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Tell me a math joke or problem with physics? Let's think about it!

The math joke.

I don't know how they get on.

What do your friends say about it?

# 1.

### Eulal and he are in love at the moment, the latter loves to watch. He goes to the cinema to buy popcorn and he notices that he was asked if there is any movie with the same title or the same director.

#### Explanation:



### Benchmarking on GSM8K

In [4]:
from deepeval.benchmarks import GSM8K

# Define benchmark with n_problems and shots
benchmark = GSM8K(
    n_problems=10,
    n_shots=3,
    enable_cot=True
)

# Replace 'mistral_7b' with your own custom model
benchmark.evaluate(model=gpt2_model)
print(benchmark.overall_score)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Generating train split: 100%|██████████| 7473/7473 [00:00<00:00, 175430.73 examples/s]
Generating test split: 100%|██████████| 1319/1319 [00:00<00:00, 478447.37 examples/s]
Processing 10 problems:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing 10 problems:  10%|█         | 1/10 [00:02<00:22,  2.51s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-e

Overall GSM8K Accuracy: 0.0
0.0



