## Benchmarking using DeepEval

### BASIC Step 1: Initialize Model Class (required to conduct benchmarking with DeepEval)

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from deepeval.models.base_model import DeepEvalBaseLLM

class Mistral7B(DeepEvalBaseLLM):
    """DeepEval-compatible wrapper around a causal language model and its tokenizer.

    The wrapper exposes the methods this notebook relies on for benchmarking:
    ``load_model``, ``generate``, ``a_generate`` and ``get_model_name``.

    Args:
        model: An already-loaded causal LM exposing ``generate`` and ``to``
            (the notebook passes an ``AutoModelForCausalLM`` instance).
        tokenizer: The tokenizer matching ``model``.
        device: Device the model and the encoded prompt are moved to before
            generation. Defaults to ``"cuda"``; pass ``"cpu"`` (or another
            device string) on machines without a CUDA GPU.
    """

    def __init__(
        self,
        model,
        tokenizer,
        device="cuda"
    ):
        # NOTE(review): the base-class __init__ is intentionally not called,
        # exactly as before — confirm this is still fine for the installed
        # deepeval version.
        self.model = model
        self.tokenizer = tokenizer
        self.device = device

    def load_model(self):
        """Return the model object that was handed to the constructor."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Sample up to 100 new tokens for ``prompt`` and return only the completion.

        The decoded text excludes the echoed prompt and any special tokens, so
        callers receive just the model's answer.
        """
        model = self.load_model()

        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
        model.to(self.device)

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=100,
            do_sample=True,
            # Spell out the padding token explicitly instead of relying on the
            # library's implicit fallback (which logs a message on every call).
            pad_token_id=self.tokenizer.eos_token_id,
        )

        # The generated sequence starts with the prompt tokens; slice them off
        # so the prompt is not returned as part of the answer.
        prompt_length = model_inputs["input_ids"].shape[1]
        return self.tokenizer.decode(
            generated_ids[0][prompt_length:],
            skip_special_tokens=True,
        )

    async def a_generate(self, prompt: str) -> str:
        """Async entry point; delegates to the synchronous ``generate`` (it blocks)."""
        return self.generate(prompt)

    def get_model_name(self):
        """Return the display name used for this model."""
        return "Mistral 7B"

# Single source of truth for the checkpoint used by both the weights and the tokenizer.
MODEL_ID = "mistralai/Mistral-7B-v0.1"

model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Wrap the loaded pair so it can be passed to DeepEval benchmarks.
mistral_7b = Mistral7B(model=model, tokenizer=tokenizer)
#print(mistral_7b.generate("Write me a joke"))

Downloading shards:   0%|          | 0/2 [01:44<?, ?it/s]


KeyboardInterrupt: 

### Step 1 (Alternative Set-Up): Initialize Model Class (required to conduct benchmarking with DeepEval)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from deepeval.models.base_model import DeepEvalBaseLLM

class Mistral7B(DeepEvalBaseLLM):
    """DeepEval-compatible wrapper with an input-length cap and prompt stripping.

    Args:
        model: An already-loaded causal LM exposing ``generate`` and ``to``
            (the notebook passes an ``AutoModelForCausalLM`` instance).
        tokenizer: The tokenizer matching ``model``.
        max_input_length: Maximum number of prompt tokens fed to the model;
            longer prompts are cut down to this many leading tokens.
        max_new_tokens: Maximum number of tokens sampled per call.
        device: Device the model and the encoded prompt are moved to before
            generation. Defaults to ``"cuda"``; pass ``"cpu"`` (or another
            device string) on machines without a CUDA GPU.
    """

    def __init__(
        self,
        model,
        tokenizer,
        max_input_length=4096,  # Mistral's context window
        max_new_tokens=1024,    # Generous output length
        device="cuda"
    ):
        self.model = model
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_new_tokens = max_new_tokens
        self.device = device

    def load_model(self):
        """Return the model object that was handed to the constructor."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Sample a completion for ``prompt`` and return only the newly generated text.

        Prompts longer than ``max_input_length`` tokens are truncated to their
        first ``max_input_length`` tokens and a warning is printed.
        """
        model = self.load_model()

        # Tokenize without truncation so the real prompt length can be checked
        # below. NOTE(review): model_max_length is set on the shared tokenizer
        # on every call, as before; it presumably only influences the
        # tokenizer's own length warning — confirm.
        self.tokenizer.model_max_length = self.max_input_length
        encoded = self.tokenizer(prompt, return_tensors="pt", truncation=False).to(self.device)

        # Work on local tensors rather than assigning attributes on the
        # encoding object: attribute assignment would not update the
        # underlying mapping entries (encoded["input_ids"] would stay stale).
        input_ids = encoded["input_ids"]
        attention_mask = encoded["attention_mask"]

        # Check if input exceeds the configured maximum.
        prompt_tokens = input_ids.shape[1]
        if prompt_tokens > self.max_input_length:
            print(f"Warning: Input length ({prompt_tokens} tokens) exceeds maximum ({self.max_input_length}). Truncating.")
            # NOTE(review): this keeps the *beginning* of the prompt; for
            # few-shot prompts the actual question sits at the end — confirm
            # that head-truncation is what is wanted.
            input_ids = input_ids[:, :self.max_input_length]
            attention_mask = attention_mask[:, :self.max_input_length]

        model.to(self.device)

        generated_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=self.max_new_tokens,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id  # Ensure proper padding
        )

        # Extract only the generated text (not including the prompt).
        input_length = input_ids.shape[1]
        generated_text = self.tokenizer.decode(
            generated_ids[0][input_length:],
            skip_special_tokens=True
        )

        return generated_text

    async def a_generate(self, prompt: str) -> str:
        """Async entry point; delegates to the synchronous ``generate`` (it blocks)."""
        return self.generate(prompt)

    def get_model_name(self):
        """Return the display name used for this model."""
        return "Mistral 7B"

# Load the pretrained weights and tokenizer from one checkpoint id, then wrap
# them in the DeepEval-compatible class defined above in this cell.
MODEL_ID = "mistralai/Mistral-7B-v0.1"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
mistral_7b = Mistral7B(model=model, tokenizer=tokenizer)

### Benchmarking on GSM8K

In [None]:
from deepeval.benchmarks import GSM8K

# Benchmark settings collected in one place: number of problems, number of
# shots, and whether chain-of-thought is enabled.
gsm8k_options = dict(n_problems=10, n_shots=3, enable_cot=True)
benchmark = GSM8K(**gsm8k_options)

# Replace 'mistral_7b' with your own custom model
benchmark.evaluate(model=mistral_7b)
print(benchmark.overall_score)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Generating train split: 100%|██████████| 7473/7473 [00:00<00:00, 175430.73 examples/s]
Generating test split: 100%|██████████| 1319/1319 [00:00<00:00, 478447.37 examples/s]
Processing 10 problems:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing 10 problems:  10%|█         | 1/10 [00:02<00:22,  2.51s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-e

Overall GSM8K Accuracy: 0.0
0.0



