This notebook is created to test how current open source state-of-the-art LLMs perform for the multilingual-chatbot-arena dataset.

The experiment's constraints are as follows:

1. Model inference will run on an NVIDIA GeForce RTX 4060, so GPU compute is limited. Keep in mind that the current pretrained models will most likely not perform very well on this challenge's data. The benchmarked LLMs must have around 7-9B parameters for the hardware to handle inference/fine-tuning.
2. For fine-tuning the LLMs, the best course of action is to use QLoRA, due to the hardware constraints.
3. There is an abundant number of capable open-source LLMs. In this demo we will benchmark three model families: Qwen2.5, Llama 3.X and Gemini.
4. Dataset for the experiment: training set.
5. Performance metric: accuracy (over the whole dataset, the proportion of prompts whose answer is predicted correctly).

# Batch processing workloads

## Loading challenge's  data from Comet ML

In [23]:
import sys
import pathlib
root_repo_directory = pathlib.Path().resolve().parent.__str__()
sys.path.append(root_repo_directory)
from multilingual_chatbot_arena import initialize
import datasets_creator.src.constants as c
import datasets_creator.src.utils as utils
import pandas as pd
from fire import Fire
from pydantic import BaseModel
from typing import List,Optional,Dict,Union
import pathlib
import numpy as np
import pickle
from dataclasses import dataclass
import re
import requests

import os
import opik
from loguru import logger
initialize()

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoConfig
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from transformers.pipelines.pt_utils import KeyDataset

from sklearn.metrics import accuracy_score,confusion_matrix
from tqdm import tqdm

from collections import defaultdict

[32m2025-01-21 13:32:15.407[0m | [1mINFO    [0m | [36mmultilingual_chatbot_arena[0m:[36minitialize[0m:[36m13[0m - [1mInitializing env vars...[0m
[32m2025-01-21 13:32:15.408[0m | [1mINFO    [0m | [36mmultilingual_chatbot_arena[0m:[36minitialize[0m:[36m18[0m - [1mLoading environment variables from: /home/kevinmg96/Kaggle competitions/WSDM Cup/multilingual-chatbot-arena/.env[0m


In [2]:
# Numeric ids used for the two candidate models throughout the notebook.
label2id = dict(model_a=1, model_b=2)

# Reverse lookup, derived from label2id so the two mappings cannot drift apart.
id2label = {model_id: model_label for model_label, model_id in label2id.items()}

# Matches "model_" followed by any single character inside the ground-truth answer sentence.
model_human_preferance_pattern = r'model_.'

In [3]:
# Fetch the three validation splits of the challenge data from Comet (Opik) and convert each to pandas.
# Requires COMET_WORKSPACE and COMET_API_KEY to be set in the environment.
client = opik.Opik(workspace=os.environ['COMET_WORKSPACE'],api_key=os.environ['COMET_API_KEY'])
dataset_comet = client.get_or_create_dataset("multilingual-chatbot-arena-validation-1")
data_validation_1 = dataset_comet.to_pandas()
dataset_comet = client.get_or_create_dataset("multilingual-chatbot-arena-validation-2")
data_validation_2 = dataset_comet.to_pandas()
dataset_comet = client.get_or_create_dataset("multilingual-chatbot-arena-validation-3")
data_validation_3 = dataset_comet.to_pandas()

def get_answer_id(x):
    """Map a row's ground-truth ``answer`` sentence to its numeric model id.

    The first ``model_<char>`` token found in ``x.answer`` is looked up in ``label2id``.
    """
    found = re.search(model_human_preferance_pattern,x.answer)
    return label2id[found.group(0)]

# Add the numeric ground-truth winner (1 = model_a, 2 = model_b) to each validation split.
# Note: this adds a column to the loaded frames in place.
data_validation_1['answer_model_id'] = data_validation_1.apply(get_answer_id,axis=1)
data_validation_2['answer_model_id'] = data_validation_2.apply(get_answer_id,axis=1)
data_validation_3['answer_model_id'] = data_validation_3.apply(get_answer_id,axis=1)

In [4]:
# Preview the first rows of the first validation split.
data_validation_1.head()

Unnamed: 0,answer,language,prompt,id,answer_model_id
0,Best model is model_b based on its human prefe...,Chinese,\nYou are an expert in assesing LLM's model re...,01945b2b-7183-7e0d-bafe-7ddb2f496d16,2
1,Best model is model_a based on its human prefe...,Russian,\nYou are an expert in assesing LLM's model re...,01945b2b-7182-70b9-84d2-1e794292d8e0,1
2,Best model is model_b based on its human prefe...,Chinese,\nYou are an expert in assesing LLM's model re...,01945b2b-7181-7fd4-88bc-6304ae7d015b,2
3,Best model is model_a based on its human prefe...,English,\nYou are an expert in assesing LLM's model re...,01945b2b-7180-7dda-82bb-cf64466d9b07,1
4,Best model is model_a based on its human prefe...,Chinese,\nYou are an expert in assesing LLM's model re...,01945b2b-717f-7c43-96f4-1b4d180881cb,1


Remove the system message declaration from the prompt column.

In [5]:
# System instructions that prefix every stored prompt. The text is used both to split the stored
# prompts and as the system message sent to the models, so it must stay character-for-character
# identical (typos included) for the split in del_system_sentence to find it.
system_substring = """\nYou are an expert in assesing LLM's model response based on a prompt. I will give you an input prompt (**prompt**) with two different responses coming from fellow LLM models; the first model's response is called **response_a** and second model's response is **response_b**. You can find the previous information after the double slashes (//), respecting the correct title based on the proper input.Your task is to assess the content of each response based on its quality and human's language similarity, then choose the model's response which adheres best to the given guidelines.\nYour response must obey the following format: "Best model is model_[] based on its human preferability response for the input prompt.". You will substitute "[]" with either "a" if you think **response_a** is better than **response_b**, or "b" otherwise."""

In [6]:
def del_system_sentence(x):
    """Return the row's prompt without the leading system instructions.

    ``x.prompt`` is split on ``system_substring`` and the final piece is kept, so a prompt
    that does not contain the instructions comes back unchanged.
    """
    *_, remainder = x.prompt.split(system_substring)
    return remainder


# Store the prompt body (system instructions removed) next to the original prompt in each split.
data_validation_1['instructionless_prompt'] = data_validation_1.apply(del_system_sentence,axis=1)
data_validation_2['instructionless_prompt'] = data_validation_2.apply(del_system_sentence,axis=1)
data_validation_3['instructionless_prompt'] = data_validation_3.apply(del_system_sentence,axis=1)


## Creating custom dataset for the imported data.

In [7]:
class ChatbotDataset(Dataset):
    """Map-style dataset that exposes the rows of a pandas DataFrame as dictionaries."""

    def __init__(self,data : pd.DataFrame):
        self.data = data

    def __len__(self):
        # One item per DataFrame row.
        return self.data.shape[0]

    def __getitem__(self, idx):
        # Positional lookup: an int yields one row as {column: value}; a slice yields
        # the selected window as {column: {index: value}}.
        return self.data.iloc[idx].to_dict()
    
# Wrap each validation split so it can be iterated record by record.
validation_dataset_1 = ChatbotDataset(data_validation_1)
validation_dataset_2 = ChatbotDataset(data_validation_2)
validation_dataset_3 = ChatbotDataset(data_validation_3)

        

## Setting Custom Dataloader

In [8]:


class ChatbotDataloader(DataLoader):
    """DataLoader that collates dataset records into tokenized, LLM-ready batches.

    The tokenizer is kept on the instance and ``chatbot_collate`` is always installed as
    ``collate_fn``; every other keyword argument is forwarded to ``DataLoader`` untouched.
    """

    def __init__(self, tokenizer :  PreTrainedTokenizer | PreTrainedTokenizerFast, **kwargs):
        self.tokenizer = tokenizer
        # A caller-supplied collate_fn is overridden on purpose.
        super().__init__(**{**kwargs, "collate_fn": self.chatbot_collate})

    def _render_prompt(self, record):
        """Build the prompt text for one record.

        With a chat template, the system instructions and the instruction-less prompt are
        rendered through it; without one, the stored full prompt is used as is.
        """
        if not self.tokenizer.chat_template:
            return record["prompt"]

        conversation = [
            {"role": "system", "content": system_substring},
            {"role" : "user", "content" : record["instructionless_prompt"]}
        ]
        return self.tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True,
        )

    def chatbot_collate(self,batch):
        """Collate a list of dataset records into one model-ready batch.

        Args:
            batch : List of records (len == batch_size), each in the format returned by the
            dataset's ``__getitem__``.

        Returns:
            Dict with the tokenized prompts (``inputs``), the raw answer strings (``labels``),
            the record languages (``languages``) and the padded prompt length (``longest_seq``).
        """
        prompts = [self._render_prompt(record) for record in batch]
        answers = [record["answer"] for record in batch]
        languages = [record["language"] for record in batch]

        # Prompts are padded to the longest one in the batch; answers stay as plain text.
        prompt_tokenize = self.tokenizer(
            prompts,
            padding='longest',
            truncation=True,
            return_tensors="pt",
        )

        return {
            "inputs" : prompt_tokenize, #Dict[str,torch.Tensor]
            "labels" : answers, #List[str]
            "languages" : languages, #List[str]
            "longest_seq" : prompt_tokenize["input_ids"].shape[1] #int
        }

# Model Inference Setup (Inference API server version)

In [9]:
from huggingface_hub import InferenceClient
import openai



This section models inferencing pipelines for each of the benchmark models 

In [10]:
def custom_accurracy_metric(predictions,labels):
    """
    """
    unmatched_idxs = [] #incorrectly predicted records idxs
    accurracy = 0
    for i,(pred,lab) in enumerate(zip(predictions,labels)):
        if pred == lab:
            accurracy += 1
        else:
            unmatched_idxs.append(i)

    return accurracy / len(predictions), unmatched_idxs


def get_model_winner(matches) -> int:
    """Decide which model a parsed LLM verdict names.

    Args:
        matches: Text fragments captured from the LLM response, inspected in order.

    Returns:
        1 for model_a, 2 for model_b, 3 when no fragment names a model.
    """
    # An explicit "model_a"/"model_b" token wins. Checking only for the letters would
    # misread e.g. " is model_b because it has a clearer answer " as model_a.
    explicit_verdict = re.compile(r'model_([ab])\b')

    for match in matches:
        found = explicit_verdict.search(match) if isinstance(match, str) else None
        if found:
            return 1 if found.group(1) == 'a' else 2

        # Legacy fallback for fragments without an explicit token (e.g. "model_[b]").
        if 'a' in match:
            return 1
        if 'b' in match:
            return 2
    return 3

def parse_output_llm(response) -> Union[int, List[int]]:
    """Extract the winning model id from one LLM response or from a batch of them.

    Args:
        response: A single response string, or a list/tuple of response strings.

    Returns:
        For a string: the id from ``get_model_winner`` (1 = model_a, 2 = model_b,
        3 = no verdict found). For a list/tuple: a list with one id per element.
        Any other input (e.g. ``None`` for an empty completion) yields 3.
    """
    # Batch input: parse every element on its own.
    if isinstance(response, (list, tuple)):
        return [parse_output_llm(single_response) for single_response in response]

    # re.findall would raise TypeError on non-strings; treat them as "no verdict".
    if not isinstance(response, str):
        return 3

    pattern = r'Best model(.*?)based on its human preferability response'

    # Capture the text between the fixed parts of the required answer sentence.
    matches = re.findall(pattern, response,re.DOTALL)

    return get_model_winner(matches)




In [49]:

def model_inference(dataset,config,client,resume : Optional[dict] = None) -> dict[str,Union[list[int],int]]:
    """Ask a chat-completions model to judge every record and collect the parsed winners.

    Args:
        dataset: Iterable of records; each must provide ``record['instructionless_prompt']``.
        config: Object whose ``model_name`` is sent as the ``model`` of each request.
        client: Client exposing ``client.chat.completions.create(...)``.
        resume: Optional result of a previous call (``{'predictions', 'last_idx'}``).
            Records up to and including ``last_idx`` are skipped and new predictions are
            appended to a copy of the earlier ones (the caller's list is not modified).

    Returns:
        Dict with ``predictions`` (one id per processed record, see ``parse_output_llm``)
        and ``last_idx`` (index of the last processed record; ``-1`` when nothing has been
        processed yet, so that resuming starts at record 0).

    A failed request is recorded as "no verdict" and the loop continues. A
    ``KeyboardInterrupt`` stops the loop and returns the partial result. Any other
    exception propagates instead of being silently discarded.
    """
    # Contains no verdict sentence, so parse_output_llm maps it to 3 ("no verdict").
    fallback_response = 'c'

    global_output_winners = []
    resume_idx = 0
    if resume:
        global_output_winners = list(resume['predictions'])
        resume_idx = resume['last_idx'] + 1

    # Start from the last already-processed record so an early interrupt keeps the progress.
    current_id = resume_idx - 1

    try:
        for i,record in enumerate(tqdm(dataset,desc="Training set - Model Inference")):
            if i < resume_idx:
                continue

            messages = [
                {'role' : 'system', 'content' : system_substring},
                {'role' : 'user',     'content' : record['instructionless_prompt'] }]

            # Reset per record so a failure can never reuse the previous record's response.
            response = fallback_response
            try:
                server_response = client.chat.completions.create(
                        model= config.model_name,
                        messages=messages,
                        max_tokens=50,
                        stream = False
                    )
                content = server_response.choices[0].message.content
                if content is not None:
                    response = content
                # Disabled alternative (raw HTTP call to Cloudflare Workers AI):
                # input = { "messages": messages }
                # server_response = requests.post(f"{CLOUDFARE_URL}{config.model_name}", headers=HEADER, json=input).json()
                # response = server_response['result']['response']

            except Exception as e:
                # Best effort: report the failure and record "no verdict" for this record.
                print("Something has gone wrong requesting from API server...")
                print(e)

            global_output_winners.append(parse_output_llm(response))
            current_id = i
    except KeyboardInterrupt as k:
        # Manual stop: fall through and hand back what has been collected so far.
        print(k)

    return {
        'predictions' : global_output_winners,
        'last_idx' : current_id
    }



# Config Inference Arguments and saving results into files

In [12]:
@dataclass
class InferenceArgs:
    """Settings for one benchmark run."""
    # Model identifier; sent as the `model` of each request and used as the model to load.
    model_name : str
    # Base directory for saved inference results; defaults to the repository root.
    inference_file_path : str = root_repo_directory
    # Batch size handed to the DataLoader; unused by the API-based inference loop.
    batch_size : Optional[int] = None
    



@dataclass
class InferenceFile:
    """Result of one inference run, ready to be pickled to disk."""

    predictions : list[int]
    last_idx : int
    inference_dataset : str

    def save(self,file_path, file_name,include_version = False):
        """Pickle the run as a one-element list holding a dict of the three fields.

        All arguments are passed through to ``utils.to_pickle``.
        """
        field_names = ('predictions', 'last_idx', 'inference_dataset')
        payload = {field_name: getattr(self, field_name) for field_name in field_names}
        utils.to_pickle([payload],file_path,file_name,include_version)





# Together AI

In [13]:
# Together AI client; requires TOGETHER_AI_API_KEY in the environment.
# Note: rebinds the global `client` that earlier held the Opik client.
from together import Together
client = Together(api_key=os.environ['TOGETHER_AI_API_KEY'])

## meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo

In [14]:
# Run settings for the Together AI model; results go under datasets_creator/data/inference.
config = InferenceArgs(
    model_name="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    inference_file_path = root_repo_directory + c.SLASH + 'datasets_creator/data/inference'
)

In [None]:

# Run inference on validation split 1 and pickle the result under
# <inference_file_path>/<model_name>/ (the model name contains a "/", so this is a nested directory).
inference_dataset = 'validation_dataset_1'
out_val_1  = model_inference(validation_dataset_1,config,client)
out_val_1_file = InferenceFile(inference_dataset=inference_dataset,**out_val_1)
out_val_1_file.save(config.inference_file_path + c.SLASH + config.model_name,inference_dataset)

# The string literal below is disabled code: the same run for splits 2 and 3. It is never executed.
""" inference_dataset = 'validation_dataset_2'
out_val_2  = model_inference(validation_dataset_2,config,client)
out_val_2_file = InferenceFile(inference_dataset=inference_dataset,**out_val_2)
out_val_2_file.save(config.inference_file_path + c.SLASH + config.model_name,inference_dataset)

inference_dataset = 'validation_dataset_3'
out_val_3  = model_inference(validation_dataset_3,config,client)
out_val_3_file = InferenceFile(inference_dataset=inference_dataset,**out_val_3)
out_val_3_file.save(config.inference_file_path + c.SLASH + config.model_name,inference_dataset) """

In [None]:
# Resume split 2 from an earlier partial result, then run split 3 from scratch.
# NOTE(review): no earlier live cell assigns `out_val_2` (the only earlier assignment sits inside a
# string literal), so on a fresh kernel this raises NameError — confirm the intended run order.
# These results are not saved to disk here.
new_out_val_2  = model_inference(validation_dataset_2,config,client,out_val_2)
out_val_3  = model_inference(validation_dataset_3,config,client)

# Groq

In [46]:
# Groq client; requires GROQ_API_KEY in the environment. Rebinds the global `client`.
from groq import Groq

client = Groq(
    api_key=os.environ["GROQ_API_KEY"],
)


## llama-3.1-8b-instant

In [45]:
# Run settings for the Groq-hosted model; results go under datasets_creator/data/inference.
config = InferenceArgs(
    model_name="llama-3.1-8b-instant",
    inference_file_path = root_repo_directory + c.SLASH + 'datasets_creator/data/inference'
)

In [None]:
# Run inference on validation split 2 with the current `client`/`config` and pickle the result.
inference_dataset = 'validation_dataset_2'
out_val_2  = model_inference(validation_dataset_2,config,client)
out_val_2_file = InferenceFile(inference_dataset=inference_dataset,**out_val_2)
out_val_2_file.save(config.inference_file_path + c.SLASH + config.model_name,inference_dataset)


In [None]:
# Run inference on validation split 3 with the current `client`/`config` and pickle the result.
inference_dataset = 'validation_dataset_3'
out_val_3  = model_inference(validation_dataset_3,config,client)
out_val_3_file = InferenceFile(inference_dataset=inference_dataset,**out_val_3)
out_val_3_file.save(config.inference_file_path + c.SLASH + config.model_name,inference_dataset)

# Cloudflare Workers AI

In [21]:
# Base URL and auth header for raw HTTP calls to Cloudflare Workers AI; the model name is appended to the URL.
# Requires CLOUDFARE_API_KEY in the environment.
# NOTE(review): the account id is hard-coded in the URL — consider reading it from the environment as well.
CLOUDFARE_URL = "https://api.cloudflare.com/client/v4/accounts/5b297e1ca90d051d39f0c1851824f0ad/ai/run/"
HEADER = {'Authorization' : f"Bearer {os.environ['CLOUDFARE_API_KEY']}"}

## meta/llama-3.3-70b-instruct-fp8-fast

In [17]:
# Run settings for the Cloudflare-hosted model; results go under datasets_creator/data/inference.
config = InferenceArgs(
    model_name="@cf/meta/llama-3.3-70b-instruct-fp8-fast",
    inference_file_path = root_repo_directory + c.SLASH + 'datasets_creator/data/inference'
)

In [None]:
# Run inference on validation split 1 and pickle the result.
# NOTE(review): this section only defines CLOUDFARE_URL/HEADER, which model_inference does not use;
# `client` is whichever client cell ran last — confirm it can serve the "@cf/..." model name.
inference_dataset = 'validation_dataset_1'
out_val_1  = model_inference(validation_dataset_1,config,client)
out_val_1_file = InferenceFile(inference_dataset=inference_dataset,**out_val_1)
out_val_1_file.save(config.inference_file_path + c.SLASH + config.model_name,inference_dataset)

## Qwen/Qwen2.5-72B-Instruct - SambaNovaCloud

In [13]:
# OpenAI-compatible client pointed at the SambaNova endpoint; requires SAMBANOVA_API_KEY.
# Rebinds the global `client`.
client = openai.OpenAI(
    api_key=os.environ['SAMBANOVA_API_KEY'],
    base_url="https://api.sambanova.ai/v1",
)

In [14]:
# Run settings for the SambaNova-hosted model.
# NOTE(review): the section heading names Qwen/Qwen2.5-72B-Instruct but the configured model is
# Qwen2.5-Coder-32B-Instruct — confirm which one is intended.
# No inference_file_path is given, so it falls back to the repository root.
config = InferenceArgs(
    model_name="Qwen2.5-Coder-32B-Instruct",
    batch_size=8
)

In [None]:
# Run inference on all three validation splits; each result is a dict with 'predictions' and 'last_idx'.
# The results are kept in memory only (nothing is saved here).
output_winners_validation_dataset_1  = model_inference(validation_dataset_1,config,client)
output_winners_validation_dataset_2  = model_inference(validation_dataset_2,config,client)
output_winners_validation_dataset_3  = model_inference(validation_dataset_3,config,client)

In [None]:
# Always raises; presumably a deliberate stop so "Run All" does not continue into the cells below — confirm.
raise ValueError()

In [None]:
# Accuracy of the predictions against the ground truth.
# NOTE(review): `output_winners` and `label_winners` are not defined in any visible cell; the runs above
# store their results under other names — confirm which predictions/labels should be compared.
custom_accurracy_metric(output_winners,label_winners)

### Model pipeline inference test. Task: Text Generation

In [6]:
class CustomPipelineDataset(Dataset):
    """Map-style dataset that yields only the ``prompt`` value of each DataFrame row."""

    def __init__(self,data : pd.DataFrame):
        self.data = data

    def __len__(self):
        # One item per DataFrame row.
        return len(self.data.index)

    def __getitem__(self, idx):
        # Positional lookup: an int yields that row's prompt; a slice yields the
        # 'prompt' column of the selected window.
        selected = self.data.iloc[idx]
        return selected['prompt']
    
# Dataset of raw prompts for the pipeline test.
# NOTE(review): `data` is not defined in any visible cell (the loaded frames are data_validation_1..3) — confirm.
train_pipeline_dataset = CustomPipelineDataset(data)

        

In [None]:
# Build a zero-shot-classification pipeline for the currently configured model and
# overwrite the model config's label maps with the notebook's model_a/model_b ids.
# NOTE(review): zero-shot-classification presumably expects an NLI-style model and label map — confirm
# that overriding id2label/label2id is compatible with the chosen model.
pipe = pipeline(task='zero-shot-classification',model=config.model_name,device_map='auto')
pipe.model.config.id2label = id2label
pipe.model.config.label2id = label2id

In [None]:
def zero_shot_classification_inference(pipe,dataset : Dataset):
    """Classify every dataset item as model_a or model_b with a zero-shot pipeline.

    Args:
        pipe: Callable pipeline; ``pipe(dataset, candidate_labels=...)`` must yield one dict
            per item holding aligned ``labels`` and ``scores`` lists.
        dataset: Sized dataset of input texts.

    Returns:
        Dict with ``predictions`` (1 = model_a, 2 = model_b, one per processed item) and
        ``current_id`` (number of items processed). A ``KeyboardInterrupt`` returns the
        partial result; any other exception propagates.
    """
    candidate_labels = ['model_a','model_b']
    prediction_results = []

    try:
        for prediction_dict in tqdm(pipe(dataset,candidate_labels=candidate_labels),
                                    total=len(dataset)):
            # The pipeline reorders labels by score, so the position of the best score is
            # not the model id: look up the winning label first, then map it to its id.
            best_position = int(np.argmax(prediction_dict['scores']))
            best_label = prediction_dict['labels'][best_position]
            prediction_results.append(candidate_labels.index(best_label) + 1)

    except KeyboardInterrupt:
        print("Inference pipeline is interrumpted...")

    return {'predictions' : prediction_results, 'current_id' : len(prediction_results)}
    

# Run the zero-shot pipeline over the prompt-only dataset.
predictions =   zero_shot_classification_inference(pipe,train_pipeline_dataset)

    



In [None]:
# Display the pipeline result dict.
predictions

## meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8

Currently not functional. Model text generation does not align with prompt's requirements

In [None]:
# Run settings and tokenizer for the local SpinQuant Llama model.
config = InferenceArgs(
    model_name="meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8",
    batch_size=8
)

# Left padding keeps the generated tokens contiguous after the prompt in batched generation.
tokenizer = AutoTokenizer.from_pretrained(
config.model_name,padding_side="left",legacy=False)

# Reuse the EOS token for padding when the tokenizer has no pad token. Adding a brand-new
# '[PAD]' token would grow the vocabulary beyond the model's embedding matrix, which this
# notebook never resizes.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Report the maximum sequence length the tokenizer is configured for.
print(f"Model : {config.model_name} max context length : {tokenizer.model_max_length}")

### Dataloader

In [12]:
# DataLoader arguments: the dataset to batch and the batch size from the run config.
# NOTE(review): `train_dataset` is not defined in any visible cell (the datasets built above are
# validation_dataset_1..3) — confirm which dataset is intended.
arguments = {
    "dataset" :train_dataset,
    "batch_size" : config.batch_size
}

# Batches are tokenized with the tokenizer loaded for this model.
llama_3_2_1b_spinquant_4int_dataloader = ChatbotDataloader(
    tokenizer,**arguments
)

### Loading model into VRAM

In [None]:
# Load the locally stored weights and switch the model to evaluation mode.
# NOTE(review): the path is an absolute, machine-specific location and does not use config.model_name;
# consider building it from root_repo_directory — confirm the repository layout first.
model = AutoModelForCausalLM.from_pretrained(
    "/home/kevinmg96/Kaggle competitions/WSDM Cup/multilingual-chatbot-arena/models/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8/hf_weights", 
    device_map="auto"
)
model.eval()

In [None]:
# Smoke test: generate for one batch and compare the parsed verdicts with the ground truth.
with torch.no_grad():
    my_batch = next(iter(llama_3_2_1b_spinquant_4int_dataloader))
    my_batch_input = my_batch["inputs"].to("cuda")

    #logits = model(**my_batch_input).logits

    output_ids  = model.generate(
        **my_batch_input,
        max_new_tokens=512
    )

    output_ids = output_ids.detach().cpu()

    # Full decoded sequences (prompt + completion).
    response = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    print(response)

    # Keep only the generated part: every prompt is padded to "longest_seq" tokens,
    # so the new tokens start at that offset.
    output_ids = [output_ids[i,my_batch["longest_seq"]:]  for i in range(
            output_ids.shape[0])]
    response = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    print(response)

    # `response` and the labels are lists of strings; parse them one string at a time
    # (passing the whole list to the regex search raises TypeError).
    print([parse_output_llm(single_response) for single_response in response])
    print(my_batch["labels"])
    print([parse_output_llm(single_label) for single_label in my_batch["labels"]])

