This notebook is created to test how current open source state-of-the-art LLMs perform for the multilingual-chatbot-arena dataset.

The experiment's constraints are as follows:

1. Model's inference will be held using an NVIDIA GeForce RTX 4060. Therefore the GPU computing is limited. Recall that is most likely that the current pretrained models with not perform very well on this challenge's data. The benchmarked LLMs must hold around 7-9B parameters in order for the hardware to handle inferencing/fine tuning.
2. For the fine tuning of LLMs. The best course of action will be to use QLORA. Due to hardware constraints.
3. There are an abundant number of capable open-source LLMs. In this Demo we will be benchmarking 3 model's families: Qwen2.5, Llama 3.X and gemini.
4. Dataset for the experiment: training set.
5. Performance metric: Accuracy (For the whole dataset, how many prompts what's the proportion of accurately predicted answers).

# Batch processing workloads

## Loading challenge's  data from Comet ML

In [1]:
import sys
import pathlib
root_repo_directory = pathlib.Path().resolve().parent.__str__()
sys.path.append(root_repo_directory)
from multilingual_chatbot_arena import initialize
import datasets_creator.src.constants as c
import datasets_creator.src.utils as utils
import pandas as pd
from fire import Fire
from pydantic import BaseModel
from typing import List,Optional,Dict,Union
import pathlib
import numpy as np
import pickle
from dataclasses import dataclass
import re
import requests

import os
import opik
from loguru import logger
initialize()

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoConfig
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from transformers.pipelines.pt_utils import KeyDataset

from sklearn.metrics import accuracy_score,confusion_matrix
from tqdm import tqdm

from collections import defaultdict
import time

* 'fields' has been removed
[32m2025-01-24 08:09:49.389[0m | [1mINFO    [0m | [36mmultilingual_chatbot_arena[0m:[36minitialize[0m:[36m13[0m - [1mInitializing env vars...[0m
[32m2025-01-24 08:09:49.390[0m | [1mINFO    [0m | [36mmultilingual_chatbot_arena[0m:[36minitialize[0m:[36m18[0m - [1mLoading environment variables from: /home/kevinmg96/Kaggle competitions/WSDM Cup/multilingual-chatbot-arena/.env[0m
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
opik_client = opik.Opik(project_name=os.environ['COMET_PROJECT_NAME'],
        workspace=os.environ['COMET_WORKSPACE'],api_key=os.environ['COMET_API_KEY'])

In [6]:
opik_client.search_traces()

[TracePublic(id='019498ae-d271-7cde-bde6-e1bbdf0ab224', project_id='01948f5c-3138-7b41-8061-5b1f11b296e4', name='my_trace-2', start_time=datetime.datetime(2025, 1, 24, 14, 19, 41, 554018, tzinfo=TzInfo(UTC)), end_time=datetime.datetime(2025, 1, 24, 14, 19, 41, 555282, tzinfo=TzInfo(UTC)), input={'user_question': 'Hello, how are you?'}, output={'response': 'Comment ça va?'}, metadata=None, tags=None, error_info=None, usage=None, created_at=datetime.datetime(2025, 1, 24, 14, 19, 43, 631524, tzinfo=TzInfo(UTC)), last_updated_at=datetime.datetime(2025, 1, 24, 14, 19, 43, 720692, tzinfo=TzInfo(UTC)), created_by='kevinmedinag96', last_updated_by='kevinmedinag96', feedback_scores=[FeedbackScorePublic(name='coherence', category_name=None, value=0.75, reason=None, source='sdk', created_at=datetime.datetime(2025, 1, 24, 14, 19, 43, 622745, tzinfo=TzInfo(UTC)), last_updated_at=datetime.datetime(2025, 1, 24, 14, 19, 43, 622745, tzinfo=TzInfo(UTC)), created_by='kevinmedinag96', last_updated_by='kev

In [8]:

# Create a trace
trace = opik_client.trace(
    name="my_trace-2",
    input="Hello, how are you?",
    output={"response": "Comment ça va?"}
)

opik_client.log_traces_feedback_scores(
    scores=[
        {"id": trace.id, "name": "overall_quality", "value": 0.85, "reason": "The response was helpful and accurate."},
        {"id": trace.id, "name": "coherence", "value": 0.75},
        {"id" : trace.id, "name" : "correctness", "value" : 1.0}
    ]
)


# Add a span
trace.span(
    name="Add prompt template",
    input={"text": "Hello, how are you?", "prompt_template": "Translate the following text to French: {text}"},
    output={"text": "Translate the following text to French: hello, how are you?"}
)

# Add an LLM call
trace.span(
    name="llm_call",
    type="llm",
    input={"prompt": "Translate the following text to French: hello, how are you?"},
    output={"response": "Comment ça va?"}
)

# End the trace
trace.end()

In [3]:
label2id = {
    'model_a' : 1,
    'model_b' : 2
}
id2label = {
    1 : 'model_a',
    2 : 'model_b'
}


## Creating custom dataset for the imported data.

In [4]:
class ChatbotDataset(Dataset):
    def __init__(self,data : pd.DataFrame):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        #get either a single data point or a pandas Dataframe window of data points
        data_window = self.data.iloc[idx]    

        return data_window.to_dict()
    

        

# Model Inference Setup (Inference API server version)

In [5]:
from huggingface_hub import InferenceClient
import openai



This section models inferencing pipelines for each of the benchmark models 

In [6]:
def custom_accurracy_metric(predictions,labels):
    """
    """
    unmatched_idxs = [] #incorrectly predicted records idxs
    accurracy = 0
    for i,(pred,lab) in enumerate(zip(predictions,labels)):
        if pred == lab:
            accurracy += 1
        else:
            unmatched_idxs.append(i)

    return accurracy / len(predictions), unmatched_idxs


def get_model_winner(matches) -> int:
    """
    Extract which model's reponse is better from input_response
    """
    for match in matches:
        if 'a' in match:
            return 1
        elif 'b' in match:
            return 2
    return 3

def parse_output_llm(response) -> int:
    """
    Retrieves a list specifying which of the two paired models in each training record adheres the best
    to human responses.

    Args:
        responses List[str]: Batch of LLM's responses.
    """
    pattern = r'model_[ab]'


    #Extract pattern from response
    match = re.search(pattern, response)        
    
    return get_model_winner(match.group(0))




In [7]:
dict_vals_dataype = Union[list[int],int]
def model_inference(id,dataset,config,client,resume : Optional[dict[str,dict_vals_dataype]] = None) -> dict[str,dict_vals_dataype]:
    """
    Retrieves two lists, the first list specifies the LLM's decisions per record, on which response was more humanly
    seen. The other specifies the challenge's ground truth.

    Args:
        model : HuggingFace Pretrained LLM.
    """
    global_output_winners = []
    global_answers = []
    resume_idx = 0
    if resume:
        global_output_winners = resume['predictions']
        global_answers = resume['answers']
        resume_idx = resume['last_idx'] + 1
    
    try:
        i = 0
        for i,record in enumerate(tqdm(dataset,desc=f"Dataset : {id} - Model Inference")):

            if i < resume_idx:
                continue

            if i % 15 == 0:
                time.sleep(60.)
                

            """             messages = [
                {'role' : 'system', 'content' : c.SYSTEM_TEMPLATE},
                {'role' : 'user',     'content' : record['prompt'] }] """
            
            PROMPT = """
            {system}{prompt}
            """.format(system=c.SYSTEM_TEMPLATE,prompt=record['prompt'])

            server_response = None
            try:
                """ server_response = client.chat.completions.create(
                        model= config.model_name,
                        messages=messages,
                        max_tokens=50,
                        stream = False
                    )
                response: str = server_response.choices[0].message.content """

                """ input = { "messages": messages }
                server_response = requests.post(f"{CLOUDFARE_URL}{config.model_name}", headers=HEADER, json=input).json()
                response = server_response['result']['response'] """

                server_response = client.generate_content(
                    contents= PROMPT,
                    stream = False)
                
                response = server_response.text

            except Exception as e:
                print("Something has gone wrong requesting from API server...")
                print(e)
            finally:
                if not server_response:
                    response = 'c'

        
            global_output_winners.append(parse_output_llm(response))
            global_answers.append(label2id[record['answer']])
            
    except KeyboardInterrupt as k:
        print(k)
    finally:
        return {
            'predictions' : global_output_winners,
            'answers' : global_answers,
            'last_idx' : i
        }



# Config Inference Arguments and saving results into files

In [8]:
@dataclass
class InferenceArgs:
    model_name : str
    inference_file_path : str = root_repo_directory
    batch_size : Optional[int] = None
    



@dataclass
class InferenceFile:

    data : list[dict[str,dict_vals_dataype]]
    inference_dataset : str

    def save(self,file_path, file_name,include_version = False):
        utils.to_pickle(self.data,file_path,file_name,include_version)





In [9]:
def inference_pipeline(opik_client, num_datasets : int,config : InferenceArgs,server_client,
                       resume : Optional[list[dict[str,dict_vals_dataype]]] = None):
    global_ouputs = []

    resume_dataset_id = 0
    resume_last_dict = None
    if resume:
        global_ouputs = resume
        resume_dataset_id = len(resume)
        resume_last_dict = resume.pop()



    
    for dataset_id in range(1,num_datasets+1):
        if dataset_id < resume_dataset_id:
            continue

        #get dataset from commet
        dataset = opik_client.get_or_create_dataset(f"multilingual-chatbot-arena-validation-complete-{dataset_id}").to_pandas()

        #construct Dataset object
        dataset = ChatbotDataset(dataset)
        n = len(dataset)

        if resume_last_dict:
            outputs = model_inference(dataset_id,dataset,config,server_client,
                                      resume_last_dict)
            resume_last_dict = None
        else:
            outputs = model_inference(dataset_id,dataset,config,server_client)

        global_ouputs.append(outputs)

        if outputs['last_idx'] < n - 1:
            print(f"Error during batch datasets inferencing...")
            return global_ouputs
    return global_ouputs


        
        

# Together AI

In [46]:
from together import Together
client = Together(api_key=os.environ['TOGETHER_AI_API_KEY'])

## meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo

In [47]:
config = InferenceArgs(
    model_name="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    inference_file_path = root_repo_directory + c.SLASH + 'datasets_creator/data/inference'
)

In [48]:
inference_dataset = 'challenge_complete_inference-2'
try:
    outputs  = inference_pipeline(opik_client,49,config,client,outputs)
finally:
    out_groq_file = InferenceFile(inference_dataset=inference_dataset,data=outputs)
    out_groq_file.save(config.inference_file_path + c.SLASH + config.model_name,inference_dataset)


Dataset : 1 - Model Inference: 100%|██████████| 988/988 [02:36<00:00,  6.33it/s] 
Dataset : 2 - Model Inference: 100%|██████████| 988/988 [03:28<00:00,  4.74it/s]
Dataset : 3 - Model Inference: 100%|██████████| 988/988 [03:32<00:00,  4.65it/s]
Dataset : 4 - Model Inference:   8%|▊         | 75/988 [00:15<03:12,  4.75it/s]

Something has gone wrong requesting from API server...
Error code: 402 - {"message": "Credit limit exceeded. Please navigate to https://api.together.xyz/settings/billing to upgrade your plan.", "type_": "credit_limit"}
Error during batch datasets inferencing...





# Groq

In [17]:
from groq import Groq

client = Groq(
    api_key=os.environ["GROQ_API_KEY"],
)


## llama-3.1-8b-instant

In [18]:
config = InferenceArgs(
    model_name="llama-3.1-8b-instant",
    inference_file_path = root_repo_directory + c.SLASH + 'datasets_creator/data/inference'
)

In [40]:
inference_dataset = 'challenge_complete_inference-2'
try:
    outputs  = inference_pipeline(opik_client,49,config,client,outputs)
finally:
    out_groq_file = InferenceFile(inference_dataset=inference_dataset,data=outputs)
    out_groq_file.save(config.inference_file_path + c.SLASH + config.model_name,inference_dataset)


Dataset : 1 - Model Inference:  26%|██▌       | 255/988 [03:35<10:18,  1.19it/s]  

Something has gone wrong requesting from API server...
Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.1-8b-instant` in organization `org_01jj4kakh5fkrah8srqqd2abrw` service tier `on_demand` on : Limit 500000, Used 496731, Requested 4544. Please try again in 3m40.2668s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': '', 'code': 'rate_limit_exceeded'}}
Error during batch datasets inferencing...





# Google AI Studio

In [11]:
import google.generativeai as genai

genai.configure(api_key=os.environ['GOOGLE_API_KEY'])


## models/gemini-1.5-flash-8b

In [12]:
config = InferenceArgs(
    model_name="models/gemini-1.5-flash-8b",
    inference_file_path = root_repo_directory + c.SLASH + 'datasets_creator/data/inference'
)

In [13]:

model = genai.GenerativeModel(config.model_name)


In [17]:
root_repo_directory + '/datasets_creator/data/inference/models/gemini-1.5-flash-8b/challenge_complete_inference.pkl'

'/home/kevinmg96/Kaggle competitions/WSDM Cup/multilingual-chatbot-arena/datasets_creator/data/inference/models/gemini-1.5-flash-8b/challenge_complete_inference.pkl'

In [44]:
with open(root_repo_directory + '/datasets_creator/data/inference/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo/challenge_complete_inference-2.pkl', 'rb') as f:
    outputs = pickle.load(f)

In [46]:
outputs

{'predictions': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'answers': [1,
  1,
  1,
  2,
  1,
  2,
  1,
  1,
  1,
  2,
  2,
  2,
  1,
  2,
  1,
  2,
  2,
  1,
  2,
  2,
  1,
  1,
  2,
  1,
  1,
  2,
  2,
  1,
  2,
  2,
  2,
  2,
  2,
  1,
  2,
  2,
  1,
  1,
  1,
  2,
  1,
  1,
  2,
  2,
  2,
  1,
  1,
  2,
  2,
  2,
  1,
  1,
  2,
  2,
  1,
  1,
  2,
  2,
  2,
  1,
  2,
  2,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  2,
  1,
  1,
  1,
  1],
 'last_idx': 74}

In [47]:
inference_dataset = 'challenge_complete_inference'
try:
    outputs  = inference_pipeline(opik_client,49,config,model)
finally:
    out_groq_file = InferenceFile(inference_dataset=inference_dataset,data=outputs)
    out_groq_file.save(config.inference_file_path + c.SLASH + config.model_name,inference_dataset)

Dataset : 1 - Model Inference:   5%|▍         | 45/988 [03:59<1:23:41,  5.33s/it]


Error during batch datasets inferencing...





# Cloudfare Workers AI

In [21]:
CLOUDFARE_URL = "https://api.cloudflare.com/client/v4/accounts/5b297e1ca90d051d39f0c1851824f0ad/ai/run/"
HEADER = {'Authorization' : f"Bearer {os.environ['CLOUDFARE_API_KEY']}"}

## meta/llama-3.3-70b-instruct-fp8-fast

In [17]:
config = InferenceArgs(
    model_name="@cf/meta/llama-3.3-70b-instruct-fp8-fast",
    inference_file_path = root_repo_directory + c.SLASH + 'datasets_creator/data/inference'
)

In [None]:
inference_dataset = 'validation_dataset_1'
out_val_1  = model_inference(validation_dataset_1,config,client)
out_val_1_file = InferenceFile(inference_dataset=inference_dataset,**out_val_1)
out_val_1_file.save(config.inference_file_path + c.SLASH + config.model_name,inference_dataset)

## Qwen/Qwen2.5-72B-Instruct - SambaNovaCloud

In [13]:
client = openai.OpenAI(
    api_key=os.environ['SAMBANOVA_API_KEY'],
    base_url="https://api.sambanova.ai/v1",
)

In [14]:
config = InferenceArgs(
    model_name="Qwen2.5-Coder-32B-Instruct",
    batch_size=8
)

In [None]:
output_winners_validation_dataset_1  = model_inference(validation_dataset_1,config,client)
output_winners_validation_dataset_2  = model_inference(validation_dataset_2,config,client)
output_winners_validation_dataset_3  = model_inference(validation_dataset_3,config,client)

In [None]:
raise ValueError()

In [None]:
custom_accurracy_metric(output_winners,label_winners)

### Model pipeline inference test. Task: Text Generation

In [6]:
class CustomPipelineDataset(Dataset):
    def __init__(self,data : pd.DataFrame):
        self.data = data
    
    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        #get either a single data point or a pandas Dataframe window of data points
        data_window = self.data.iloc[idx]    

        return data_window['prompt']
    
train_pipeline_dataset = CustomPipelineDataset(data)

        

In [None]:
pipe = pipeline(task='zero-shot-classification',model=config.model_name,device_map='auto')
pipe.model.config.id2label = id2label
pipe.model.config.label2id = label2id

In [None]:
def zero_shot_classification_inference(pipe,dataset : Dataset):
    try:
        prediction_results = defaultdict(int) #predicted model : count

        prediction_results = []
        for i,prediction_dict in enumerate(tqdm(pipe(dataset,candidate_labels= ['model_a','model_b']),
                                                 total=len(dataset))):

            best_model_id = np.argmax(prediction_dict['scores']) + 1
            prediction_results.append(best_model_id)

    except KeyboardInterrupt:
        print(f"Inference pipeline is interrumpted...")
    finally:
        return {'predictions' : prediction_results, 'current_id' : i+1}
    

predictions =   zero_shot_classification_inference(pipe,train_pipeline_dataset)

    



In [None]:
predictions

## meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8

Currently not functional. Model text generation does not align with prompt's requirements

In [None]:
config = InferenceArgs(
    model_name="meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8",
    batch_size=8
)

tokenizer = AutoTokenizer.from_pretrained(
config.model_name,padding_side="left",legacy=False)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [None]:
print(f"Model : {config.model_name} max context length : {tokenizer.model_max_length}")

### Dataloader

In [12]:
arguments = {
    "dataset" :train_dataset,
    "batch_size" : config.batch_size
}

llama_3_2_1b_spinquant_4int_dataloader = ChatbotDataloader(
    tokenizer,**arguments
)

### Loading model into VRAM

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    "/home/kevinmg96/Kaggle competitions/WSDM Cup/multilingual-chatbot-arena/models/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8/hf_weights", 
    device_map="auto"
)
model.eval()

In [None]:
with torch.no_grad():
    my_batch = next(iter(llama_3_2_1b_spinquant_4int_dataloader))
    my_batch_input = my_batch["inputs"].to("cuda")

    #logits = model(**my_batch_input).logits

    output_ids  = model.generate(
        **my_batch_input,
        max_new_tokens=512
    )

    output_ids = output_ids.detach().cpu()

    response = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    print(response)
    output_ids = [output_ids[i,my_batch["longest_seq"]:]  for i in range(
            output_ids.shape[0])]
    response = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    print(response)

    print(parse_output_llm(response))
    print(my_batch["labels"])
    print(parse_output_llm(my_batch["labels"]))

