This notebook is created to test how current open source state-of-the-art LLMs perform for the multilingual-chatbot-arena dataset.

The experiment's constraints are as follows:

1. Model's inference will be held using an NVIDIA GeForce RTX 4060. Therefore the GPU computing is limited. Recall that is most likely that the current pretrained models with not perform very well on this challenge's data. The benchmarked LLMs must hold around 7-9B parameters in order for the hardware to handle inferencing/fine tuning.
2. For the fine tuning of LLMs. The best course of action will be to use QLORA. Due to hardware constraints.
3. There are an abundant number of capable open-source LLMs. In this Demo we will be benchmarking 3 model's families: Qwen2.5, Llama 3.X and gemini.
4. Dataset for the experiment: training set.
5. Performance metric: Accuracy (For the whole dataset, how many prompts what's the proportion of accurately predicted answers).

# Batch processing pipelines

In [1]:
import sys
import pathlib
root_repo_directory = pathlib.Path().resolve().parent.__str__()
sys.path.append(root_repo_directory)
from multilingual_chatbot_arena import initialize
import datasets_creator.src.constants as c
import datasets_creator.src.utils as utils
import pandas as pd
from fire import Fire
from pydantic import BaseModel
from typing import List,Optional,Dict,Union,Any
import pathlib
import numpy as np
import pickle
from dataclasses import dataclass
import re
import requests

import os
import opik
from loguru import logger
initialize()

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoConfig, BitsAndBytesConfig
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from transformers.pipelines.pt_utils import KeyDataset

from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score
from tqdm import tqdm

from collections import defaultdict
import time

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns

[32m2025-02-19 11:43:46.675[0m | [1mINFO    [0m | [36mmultilingual_chatbot_arena[0m:[36minitialize[0m:[36m13[0m - [1mInitializing env vars...[0m
[32m2025-02-19 11:43:46.676[0m | [1mINFO    [0m | [36mmultilingual_chatbot_arena[0m:[36minitialize[0m:[36m18[0m - [1mLoading environment variables from: /home/kevinmg96/Kaggle competitions/WSDM Cup/multilingual-chatbot-arena/.env[0m
  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#colors
black = mcolors.CSS4_COLORS["black"]
lime = mcolors.CSS4_COLORS["lime"]
aqua = mcolors.CSS4_COLORS["aqua"]
magenta = mcolors.CSS4_COLORS["magenta"]
red = mcolors.CSS4_COLORS["red"]
grey = mcolors.CSS4_COLORS["grey"]
orange = mcolors.CSS4_COLORS["orangered"]
gold = mcolors.CSS4_COLORS["gold"]
blue = mcolors.CSS4_COLORS["blue"]
indigo = mcolors.CSS4_COLORS["indigo"]

## Setting configuration arguments for the whole script

In [2]:
dict_vals_datatype = Union[list[int],int]
@dataclass
class InferenceArgs:
    comet_dataset_name : str
    comet_dataset_description : str
    comet_prompt_template_name : str
    comet_prompt_version : str
    model_name : str
    max_new_tokens : int
    batch_size : Optional[int] = None
    cache : Optional[str] = None
    
    



@dataclass
class InferenceFile:

    data : list[dict[str,dict_vals_datatype]]
    inference_dataset : str

    def save(self,file_path, file_name,include_version = False):
        utils.to_pickle(self.data,file_path,file_name,include_version)


config = InferenceArgs(
    comet_dataset_name="multilingual-chatbot-arena-v0.2.1-train",
    comet_dataset_description="Challenge: WSDM CUP. Curated-smal-dataset - version0.2.1 - Training set 1.",
    comet_prompt_template_name = 'Prompt_template_wsdm_cup_1',
    comet_prompt_version = "v2.1.0",
    model_name="unsloth/DeepSeek-R1-Distill-Qwen-1.5B-bnb-4bit",
    cache="quantized KV cache: quanto",
    max_new_tokens= 2500,
    batch_size=2)



## Loading dataset from Comet ML

In [3]:
opik_client = opik.Opik(project_name=os.environ['COMET_PROJECT_NAME'],
        workspace=os.environ['COMET_WORKSPACE'],api_key=os.environ['COMET_API_KEY'])


## Loading prompt

In [None]:
PROMPT_NAME = "Prompt_template_wsdm_cup_1"
comet_prompt = opik_client.get_prompt(PROMPT_NAME)

## Dataset's Classes Distribution

In [None]:
df_winner_series = df["answer"].value_counts()
df_winner_series = pd.DataFrame({
    "answer" : df_winner_series.keys(),
    "proportion" : df_winner_series.values / df.shape[0]
})

In [None]:
fig,ax = plt.subplots(1,1,figsize= (5,5))
sns.barplot(data=df_winner_series,x="answer",y="proportion",ax=ax,color=magenta)


#ax.tick_params(axis='x',labelrotation=90,labelsize=12)
ax.set_xlabel("Winner model",fontsize=10)
ax.set_ylabel("Proportion",fontsize=10)
ax.set_title("Winner Model's Proportion in Training Set",fontsize=10)

In [None]:
diff_percentage = ((df_winner_series['proportion'][0] - df_winner_series['proportion'][1]) / df_winner_series['proportion'][1]) * 100

print(f"Difference in classes distribution : {diff_percentage} %")

In [None]:
opik_client.search_traces()

In [None]:

# Create a trace
trace = opik_client.trace(
    name="my_trace-2",
    input="Hello, how are you?",
    output={"response": "Comment ça va?"}
)

opik_client.log_traces_feedback_scores(
    scores=[
        {"id": trace.id, "name": "overall_quality", "value": 0.85, "reason": "The response was helpful and accurate."},
        {"id": trace.id, "name": "coherence", "value": 0.75},
        {"id" : trace.id, "name" : "correctness", "value" : 1.0}
    ]
)


# Add a span
trace.span(
    name="Add prompt template",
    input={"text": "Hello, how are you?", "prompt_template": "Translate the following text to French: {text}"},
    output={"text": "Translate the following text to French: hello, how are you?"}
)

# Add an LLM call
trace.span(
    name="llm_call",
    type="llm",
    input={"prompt": "Translate the following text to French: hello, how are you?"},
    output={"response": "Comment ça va?"}
)

# End the trace
trace.end()

## Creating custom dataset for the input dataset.

In [4]:
class ChatbotDataset(Dataset):
    def __init__(self,data : pd.DataFrame):
        """

        Args:
            data pd.DataFrame : data from dataset,
            comet_id Optional[str] : dataset's name id from comet ML
            prompt_id Optional[str] : prompt's template id from comet ML
        """
        self.data = data
          


    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        #get either a single data point or a pandas Dataframe window of data points
        data_window = self.data.iloc[idx]    

        return data_window.to_dict()
          

## Setting Pytorch Dataloader

In [5]:
class ChatbotDataloader(DataLoader):
    def __init__(self, tokenizer :  PreTrainedTokenizer | PreTrainedTokenizerFast, **kwargs):
        self.tokenizer = tokenizer
        
        kwargs["collate_fn"] = self.chatbot_collate
        super().__init__(**kwargs)

    
    def chatbot_collate(self,batch):
        """Custom collate function to teach the Dataloader class how to parse the batches into an llm friendly format
        Args:
            original_batch : List of batch elements with len -> batch_size. Each list's element strictly follows 
            the format inside __getitem__ from Dataset class. 
        
        """
        prompts,answers,languages,records_id = [],[],[],[]

        
        for dic in batch:
            if self.tokenizer.chat_template: #tokenizer has chat template
                
                prompt_messages = [
                    {"role": "system", "content": c.SYSTEM_TEMPLATE},
                    {"role" : "user", "content" : dic["prompt"]}
                ]

                try:
                    prompt_text  = self.tokenizer.apply_chat_template(
                        prompt_messages,
                        tokenize=False,
                        add_generation_prompt=True,
                    )
                except Exception:
                    # chat template does not support system role

                    prompt_messages = [
                    {"role": "user", "content": c.SYSTEM_TEMPLATE},
                    {"role" : "assistant" , "content" : "Ok"},
                    {"role" : "user", "content" : dic["prompt"]}
                    ]

                    prompt_text  = self.tokenizer.apply_chat_template(
                        prompt_messages,
                        tokenize=False,
                        add_generation_prompt=True,
                    )              

            else:
                prompt_text = """
                {system}{prompt}
                """.format(system=c.SYSTEM_TEMPLATE,prompt=dic['prompt'])

                
            answers.append(dic['answer'])
            prompts.append(prompt_text)
            languages.append(dic['language'])
            records_id.append(dic['id'])


        #tokenize batch of prompts and answers
        prompt_tokenize = self.tokenizer(prompts,
                padding='longest',truncation=True,return_tensors="pt")

        return {
            "inputs" : prompt_tokenize, #Dict[str,torch.Tensor]
            "prompts" : prompts, #list[str],
            "labels" : answers, #list[str]
            "languages" : languages, #list[str]
            "records_id" : records_id, #list[str]
            "longest_seq" : prompt_tokenize["input_ids"].shape[1] #int
        }

# Model Inference

## Setting inference pipeline

DeepSeek models: post generated tokens processing. <br>
These models were trained to think about the given prompt, assess its contents, and finally respond to the
question.
Thus, I'll be clearing its thought process from the answer, which is under the tag: think/think

In [None]:
   
def postprocess_generated_output_deepseek(pattern : str,outputs : list[str]) -> list[Any]:
    processed = []
    for output in outputs:
        match = re.search(pattern, output)
        if match:
            processed.append(output[match.end():])
        else:
            processed.append(output)
    return processed
    
    


In [8]:
@torch.inference_mode()
@torch.no_grad()
def model_inference(dataset_name,model,dataloader,config : InferenceArgs,
        resume : Optional[dict[str,dict_vals_datatype]] = None) -> dict[str,dict_vals_datatype]:
    """
    Retrieves two lists, the first list specifies the LLM's decisions per record, on which response was more humanly
    seen. The other specifies the challenge's ground truth.

    Args:
        model : HuggingFace Pretrained LLM.
    """
    

    """     if resume:
        global_output_winners = resume['predictions']
        global_answers = resume['answers']
        resume_idx = resume['last_idx'] + 1 """
    
    global_output = []

    deepseek_model_pattern = r'DeepSeek'
    try:

        i = 0
        for i,batch in enumerate(tqdm(dataloader,desc=f"Dataset : {dataset_name} - Model Inference")):
            #if i < resume_idx:
            #    continue

            # Let's send current batch into model device

            inputs= batch["inputs"].to(model.device)

            logger.info(f"Batch: {i}. Max Batch Input tokens size : {inputs['input_ids'].shape[1]}")


            #forward batch of input tokens into the model, get output token ids
            output_token_ids  = model.generate(
                **inputs,
                max_new_tokens=config.max_new_tokens,
                #do_sample = False,
                #cache_implementation = "quantized",
                #cache_config= {"nbits" : 4, "backend" : "quanto"}
            )

            output_token_ids = output_token_ids.detach().cpu()

            #Remove prompt from generated response
            
            output_token_ids = [output_token_ids[i,batch["longest_seq"]:]  for i in range(
                output_token_ids.shape[0])]

            #Decode batch's output
            #list[config.batch_size]
            batch_decoded_responses = dataloader.tokenizer.batch_decode(output_token_ids, skip_special_tokens=True)

            #if re.search(deepseek_model_pattern, config.model_name):
            #    think_pattern = "</think>"
            #    batch_decoded_responses = postprocess_generated_output_deepseek(think_pattern,batch_decoded_responses)
                

            
            #store batch predictions and dataset's metadata

            batch.update({
                'predictions' : batch_decoded_responses,
                'dataset_name' : [dataset_name] * config.batch_size,
                'comet_prompt_template_name' : [config.comet_prompt_template_name] * config.batch_size,
                "comet_prompt_version" : [config.comet_prompt_version] * config.batch_size
            })




            #transform batch dict, dict[collumn para, list] to store a DS where each element is a record,
            #i.e. list[record in batch]

            def from_batch_to_records_struct(batch):
                """
                Args:
                    batch: dict of parameters, where each param is constituted of a data struct of size batch
                returns:
                    list of records inside batch
                """
                return [
                    {
                       'dataset_name' : batch['dataset_name'][i],
                       'record_id' : batch['records_id'][i],
                       'prompt_template_name' : batch['comet_prompt_template_name'][i],
                       "prompt_version" : batch['comet_prompt_version'][i],
                       'prompt' : batch['prompts'][i],
                       'prediction' : batch['predictions'][i],
                       'label' : batch['labels'][i],
                       'language' : batch['languages'][i]
                    }
                    for i in range(config.batch_size)
                ]
            
            #store record outputs in global container
            global_output.extend(from_batch_to_records_struct(batch))


            #clear GPU cache
            torch.cuda.empty_cache()                
    except KeyboardInterrupt as k:
        print(k)
    except Exception as e:
        print(e)
    finally:
        return {
            'output' : global_output,
            'last_idx' : i
        }



In [10]:
def inference_pipeline(opik_client,model, tokenizer, num_datasets : int,config : InferenceArgs,
                       resume : Optional[list[dict[str,dict_vals_datatype]]] = None):
    
    global_ouput = []

    """     resume_dataset_id = 0
    resume_last_dict = None
    if resume:
        global_ouput = resume
        resume_dataset_id = len(resume)
        resume_last_dict = resume.pop() """



    
    for dataset_id in range(1,num_datasets+1):
        """         if dataset_id < resume_dataset_id:
            continue """

        #get dataset from commet ML
        dataset_name = f"{config.comet_dataset_name}-{dataset_id}"
        dataset = opik_client.get_or_create_dataset(dataset_name).to_pandas()

        #construct Dataset and Dataloader
        dataset = ChatbotDataset(dataset)
        dataloader = ChatbotDataloader(tokenizer=tokenizer,dataset=dataset,batch_size=config.batch_size)

        #run inference per dataset inside function...
        output = model_inference(dataset_name,model,dataloader,config)

        #store outputs from current dataset in the specified project from comet ML

        def store_results_in_project_comet_ml(output):
            for i,record in enumerate(output):
                trace_dict = {
                    "comet_dataset_name" : record['dataset_name'],
                    "comet_prompt_template_name" : record['prompt_template_name'],
                    "comet_prompt_version" : record['prompt_version']
                }
                
                trace = opik_client.trace(
                    name=f"record_results:{record['record_id']}",
                    metadata=trace_dict
                )

                # Add llm call
                trace.span(
                    name="llm call",
                    input={'prompt' : record['prompt']},
                    output={'response' : record['prediction']},
                    metadata={'model' : config.model_name, 'label' : record['label'], 'language' : record['language']}
                )

                trace.end()

        store_results_in_project_comet_ml(output['output'])
        
        """         n = len(dataset)

        if resume_last_dict:
            outputs = model_inference(dataset_id,dataset,config,server_client,
                                      resume_last_dict)
            resume_last_dict = None
        else:
            outputs = model_inference(dataset_id,dataset,config,server_client) """

        global_ouput.append(output)

        """         if outputs['last_idx'] < n - 1:
            print(f"Error during batch datasets inferencing...")
            return global_ouput """
    return global_ouput


        
        

# Model Inference Execution

### Model: Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4 inference pipeline for dataset: v0

Loading model & tokenizer

In [None]:
quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
)

tokenizer2 = AutoTokenizer.from_pretrained(
"Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4",padding_side="left",legacy=False)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4",
    device_map="auto",
    torch_dtype="auto",
    attn_implementation="flash_attention_2",
    use_cache=False
)


In [None]:
tokenizer2

In [None]:
model

In [None]:
tokenizer2 = AutoTokenizer.from_pretrained(
config.model_name,padding_side="left",legacy=False)
model = AutoModelForCausalLM.from_pretrained(
    config.model_name,
    device_map="auto"
)
model.eval()

In [None]:
tokenizer2

In [None]:
tokenizer2.special_tokens_map

In [None]:
tokenizer2.apply_chat_template()

In [None]:
tokenizer.chat_template.

Execute Inference Pipeline...

In [None]:
output  = inference_pipeline(opik_client,model,tokenizer,1,config)

### Model: Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4 inference pipeline for dataset: v0

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
config.model_name,padding_side="left",legacy=False)
model = AutoModelForCausalLM.from_pretrained(
    config.model_name,
    device_map="auto"
)
model.eval()

Execute Inference Pipeline...

In [None]:
output  = inference_pipeline(opik_client,model,tokenizer,1,config)

### Model: TJUNLP/FuxiTranyu-8B-SFT inference pipeline for dataset: v0

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
config.model_name,padding_side="left",legacy=False)
model = AutoModelForCausalLM.from_pretrained(
    config.model_name,
    device_map="auto",
    trust_remote_code=True
)
model.eval()

In [None]:
output  = inference_pipeline(opik_client,model,tokenizer,1,config)

### Model: unsloth/gemma-2b-it-bnb-int4 inference pipeline for dataset: v0

In [None]:
from unsloth import FastLanguageModel

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = config.model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

Add special tokens and resize vocabulary

In [None]:
special_tokens_dict = {'additional_special_tokens': ['<|im_start|>','<|im_end|>']}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

Set Chat template

In [None]:
JINJA_TEMPLATE = '{%- if tools %}\n    {{- \'<|im_start|>system\\n\' }}\n    {%- if messages[0][\'role\'] == \'system\' %}\n        {{- messages[0][\'content\'] }}\n    {%- else %}\n        {{- \'You are Gemma. You are a helpful assistant.\' }}\n    {%- endif %}\n    {{- "\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}\n    {%- for tool in tools %}\n        {{- "\\n" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}\n{%- else %}\n    {%- if messages[0][\'role\'] == \'system\' %}\n        {{- \'<|im_start|>system\\n\' + messages[0][\'content\'] + \'<|im_end|>\\n\' }}\n    {%- else %}\n        {{- \'<|im_start|>system\\nYou are Gemma. You are a helpful assistant.<|im_end|>\\n\' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}\n        {{- \'<|im_start|>\' + message.role + \'\\n\' + message.content + \'<|im_end|>\' + \'\\n\' }}\n    {%- elif message.role == "assistant" %}\n        {{- \'<|im_start|>\' + message.role }}\n        {%- if message.content %}\n            {{- \'\\n\' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- \'\\n<tool_call>\\n{"name": "\' }}\n            {{- tool_call.name }}\n            {{- \'", "arguments": \' }}\n            {{- tool_call.arguments | tojson }}\n            {{- \'}\\n</tool_call>\' }}\n        {%- endfor %}\n        {{- \'<|im_end|>\\n\' }}\n    {%- elif message.role == "tool" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}\n            {{- \'<|im_start|>user\' }}\n        {%- endif %}\n        {{- \'\\n<tool_response>\\n\' }}\n        {{- message.content }}\n        {{- \'\\n</tool_response>\' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}\n            {{- \'<|im_end|>\\n\' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- \'<|im_start|>assistant\\n\' }}\n{%- endif %}\n'

tokenizer.chat_template = JINJA_TEMPLATE

In [None]:
output  = inference_pipeline(opik_client,model,tokenizer,1,config)

### Model: unsloth/DeepSeek-R1-Distill-Qwen-1.5B-bnb-4bit inference pipeline for dataset: v0

In [11]:
from unsloth import FastLanguageModel

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = config.model_name,
    #max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.9: Fast Qwen2 patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA GeForce RTX 4060 Laptop GPU. Max memory: 7.996 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536, padding_idx=151654)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear4bit(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear4bit(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear4bit(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear4bit(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear4bit(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear4bit(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((153

In [12]:
output  = inference_pipeline(opik_client,model,tokenizer,1,config)

Dataset : multilingual-chatbot-arena-v0.2.1-train-1 - Model Inference:   0%|          | 0/234 [00:00<?, ?it/s][32m2025-02-19 11:48:11.211[0m | [1mINFO    [0m | [36m__main__[0m:[36mmodel_inference[0m:[36m33[0m - [1mBatch: 0. Max Batch Input tokens size : 3843[0m
Dataset : multilingual-chatbot-arena-v0.2.1-train-1 - Model Inference:   0%|          | 1/234 [01:42<6:37:04, 102.25s/it][32m2025-02-19 11:49:53.458[0m | [1mINFO    [0m | [36m__main__[0m:[36mmodel_inference[0m:[36m33[0m - [1mBatch: 1. Max Batch Input tokens size : 2732[0m
Dataset : multilingual-chatbot-arena-v0.2.1-train-1 - Model Inference:   1%|          | 2/234 [03:16<6:16:02, 97.25s/it] [32m2025-02-19 11:51:27.206[0m | [1mINFO    [0m | [36m__main__[0m:[36mmodel_inference[0m:[36m33[0m - [1mBatch: 2. Max Batch Input tokens size : 955[0m
Dataset : multilingual-chatbot-arena-v0.2.1-train-1 - Model Inference:   1%|▏         | 3/234 [04:50<6:09:12, 95.90s/it][32m2025-02-19 11:53:01.499[0m | [

CUDA driver error: out of memory


### Model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B inference pipeline for dataset v0

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
config.model_name,padding_side="left",legacy=False)
model = AutoModelForCausalLM.from_pretrained(
    config.model_name,
    device_map="auto",
    trust_remote_code=True
)
model.eval()

Execute inference pipeline

In [None]:
output  = inference_pipeline(opik_client,model,tokenizer,1,config)

## Model: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B inference pipeline for dataset: v5

### Connect to beam cloud GPU rental services

In [None]:
from beam import endpoint


@endpoint(gpu="A100-40")
def handler():
    # Prints the available GPU drivers
    import subprocess
    print(subprocess.check_output(["nvidia-smi"], shell=True))

    return {"gpu":"true"}

# Models Benchmark

In this section I am going to compare the performance of the different open source models to a variety of different datasets.

Metrics:
* Accurracy
* Precision
* Recall
* f1_score

In [40]:
pattern = r'best response[\s\S]*[ab]{1}'
s = "best response is response a"
re.search(pattern,s)

<re.Match object; span=(0, 27), match='best response is response a'>

In [61]:
label2id = {
    'model_a' : 1,
    'model_b' : 0
}
id2label = {
    1 : 'model_a',
    0 : 'model_b'
}


In [None]:
def parse_output_llm(response) -> Union[str,int]:
    """
    Retrieves a list specifying which of the two paired models in each training record adheres the best
    to human responses.

    Args:
        responses List[str]: Batch of LLM's responses.
    """
    #if model_abAB or Reponse Aa reponse bB are the last chars in response, then i would keep those, if not then ill keep the first
    #occurrence
    expression = r'</think>'#r'([mM]odel[_\s][abAB][\W]?)|([rR]esponse [abAB][\W])'#r'model.+[a,b,A,B]'


    #Extract pattern from response
    match = re.search(expression, response) 
    #print(matches)

    def get_model_winner(match,response : str) -> Union[str,int]:
        """
        Extract which model's reponse is better from input_response
        """
        #now lets extract the last match based on regexp
        #if the last match corresponds to the last chars in response, then it will be the output to processed
        #if not, then probably the first match is going to be the best model

        def eval_string(string):
            if 'a' in string:
                return 1
            return 0 #response B / model_b

        if not match:
            return response
        
        response = response[match.end():].strip().lower()

        #conditions on finding model's response:
        #1.- find tag: <ans>answer here</ans> or **answer here**
        #2.- find: Response [ABab] ... better
        #3.- best reponse ... [ab]
        #4.- ans ... response [ab]
        #5.- model_[ab] or _[ab]


        pattern = r'(<ans>|\*\*){1}[\s\S]*(</ans>|\*\*){1}'

        match = re.search(pattern,response)

        if match:
            return eval_string(response[match.start():match.end()])

        #try finding pattern 2

        pattern = r'response[\s]*[ab]{1}[\s\S]*better'

        match = re.search(pattern,response)

        if match:
            pattern1 = r'response[\s]*[ab]{1}'
            response1 = response[match.start():match.end()]
            match1 = re.search(pattern1,response1)
            return eval_string(response1[match1.start():match1.end()])
                
        #pattern 3

        pattern = r'best response[\s\S]*[ab]{1}'

        match = re.search(pattern,response)

        if match:
            pattern1 = r'response[\s]*[ab]{1}'
            response1 = response[match.start():match.end()]
            match1 = re.search(pattern1,response1)
            return eval_string(response1[match1.start():match1.end()])
        
        #pattern 4

        pattern = r'ans[\s]*response [ab]{1}'

        match = re.search(pattern,response)

        if match:
            pattern1 = r'response [ab]{1}'
            response1 = response[match.start():match.end()]
            match1 = re.search(pattern1,response1)
            return eval_string(response1[match1.start():match1.end()])
        
        #pattern 5

        pattern = r'(model)?_[ab]{1}'

        match = re.search(pattern,response)

        if match:
            return eval_string(response[match.start():match.end()])
        
        #if patter not found return answer
        return response


    return get_model_winner(match,response)
    






In [None]:
@dataclass
class ModelPerformance:
    accuracy : float
    precision : Optional[float] = None
    recall : Optional[float] = None
    f1_score : Optional[float] = None

In [59]:
def benchmark_model(opik_client):
    # get traces
    traces = opik_client.search_traces()
    data = defaultdict(lambda: defaultdict(list))

    for i,trace in enumerate(traces):
        span = opik_client.search_spans(trace_id=trace.id)[0]
        label: int = label2id[span.metadata["label"]]
        
        prediction = span.output["response"]
        #regexp prediction to extract best model's response
        prediction = parse_output_llm(prediction)

        if isinstance(prediction,str):
            struct_name = 'think_pattern_not_found'
        else:
            struct_name = 'correct'

        data[struct_name]["predictions"].append(prediction)
        data[struct_name]["labels"].append(label)

    return data
                
    


    

        

## Model results: Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4 inference pipeline for dataset: v0

In [None]:
opik_client = opik.Opik(project_name='inference-wsdm-cup-v0-qwen2.5-1.5b-int4',
        workspace=os.environ['COMET_WORKSPACE'],api_key=os.environ['COMET_API_KEY'])

In [None]:
data_qwen_2_5_instruct_1_5b_int4_v0 = benchmark_model(opik_client)


performance_qwen_2_5_instruct_1_5b_int4_v0 = ModelPerformance(accuracy=accuracy_score(data_qwen_2_5_instruct_1_5b_int4_v0["labels"],
                                                                        data_qwen_2_5_instruct_1_5b_int4_v0["predictions"]))

In [None]:
performance_qwen_2_5_instruct_1_5b_int4_v0

## Model results: Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4 inference pipeline for dataset: v0

In [None]:
opik_client = opik.Opik(project_name='inference-wsdm-cup-v0-qwen2.5-coder-7b-instruct-int4',
        workspace=os.environ['COMET_WORKSPACE'],api_key=os.environ['COMET_API_KEY'])

In [None]:
data_qwen_2_5_coder_7b_int4_v0 = benchmark_model(opik_client)


performance_qwen_2_5_coder_7b_int4_v0 = ModelPerformance(accuracy=accuracy_score(data_qwen_2_5_coder_7b_int4_v0["labels"],
                                                                        data_qwen_2_5_coder_7b_int4_v0["predictions"]))

In [None]:
performance_qwen_2_5_coder_7b_int4_v0

## Model results: DeepSeek-R1-Distill-Qwen-1.5B-bnb-4bit inference pipeline for dataset: v0

In [None]:
opik_client = opik.Opik(project_name=os.environ['COMET_PROJECT_NAME'],
        workspace=os.environ['COMET_WORKSPACE'],api_key=os.environ['COMET_API_KEY'])


In [None]:
data_deepseek_qwen_v0 = benchmark_model(opik_client)
data_deepseek_qwen_v0["labels"].pop()
data_deepseek_qwen_v0["labels"].pop()
data_deepseek_qwen_v0["predictions"].pop()
data_deepseek_qwen_v0["predictions"].pop()

performance_deepseek_qwen_v0 = ModelPerformance(accuracy=accuracy_score(data_deepseek_qwen_v0["labels"],data_deepseek_qwen_v0["predictions"]))



In [None]:
performance_deepseek_qwen_v0

In [None]:
arr = np.array(data_deepseek_qwen_v0['predictions'])
_,count = np.unique(arr,return_counts=True)
count / arr.shape[0] * 100

In [None]:
arr = np.array(data_deepseek_qwen_v0['labels'])
_,count = np.unique(arr,return_counts=True)
count / arr.shape[0] * 100

## Model results: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B inference pipeline for dataset: v0.2.1

In [27]:
opik_client = opik.Opik(project_name=os.environ['COMET_PROJECT_NAME'],
        workspace=os.environ['COMET_WORKSPACE'],api_key=os.environ['COMET_API_KEY'])

In [62]:
data_deepseek_r1_qwen1_5b_v0_2_1 = benchmark_model(opik_client)


""" performance_qwen_2_5_coder_7b_int4_v0 = ModelPerformance(accuracy=accuracy_score(data_qwen_2_5_coder_7b_int4_v0["labels"],
                                                                        data_qwen_2_5_coder_7b_int4_v0["predictions"])) """

' performance_qwen_2_5_coder_7b_int4_v0 = ModelPerformance(accuracy=accuracy_score(data_qwen_2_5_coder_7b_int4_v0["labels"],\n                                                                        data_qwen_2_5_coder_7b_int4_v0["predictions"])) '

In [63]:
data_deepseek_r1_qwen1_5b_v0_2_1['think_pattern_not_found']

defaultdict(list,
            {'predictions': ['**Megvétösek és megválasítás:**\n\n**1. Az integrált ökológia és AQAL modell:**\n   - Az integrált ökológia fontosságát és hatékony működése érdekében fontos az egyéni, kollektív, belső és környezett dimenziók. A háromszintű szervezett felületében a stratosiai, operatív és hatósági szintek együttműködés hozzájárul az hatékony működés, amelyet a természetvédelmi integrált folyamok megfelelős megfelelősításával biztosítjuk.\n\n**2. Az "Ökológiai megtérés" kapcsolatok:**\n   - Az "ökológiai megtérés" szükségességét hangszó a természetvédelmi folyamatok megőrzésére, a különböző szervezetek közötti együttműködés, és a természetvédelmi integrált folyamok megfelelős megfelelősításával. A kapcsolatok egészében azok az ismeret, ismeret, helyelet, fajok, résztép, értéke, törtelet, dövöny, szervezés, halmaz, függetiszt, ismeret, hely, ismeret, törtelet, dövöny, szervezés, halmaz, függetiszt, ismeret, hely, ismeret, törtelet, dövöny, szervezés, halma

In [67]:
predictions =data_deepseek_r1_qwen1_5b_v0_2_1['correct']['predictions']
labels = data_deepseek_r1_qwen1_5b_v0_2_1['correct']['labels']

In [68]:
preds_arr = np.array(predictions)
np.unique(preds_arr,return_counts=True)

(array([0, 1]), array([ 13, 213]))

In [70]:
labels_arr = np.array(labels)
np.unique(labels_arr,return_counts=True)

(array([0, 1]), array([ 97, 129]))

In [71]:
accuracy_score(labels_arr,preds_arr)

0.5398230088495575