#### Learning what this work is doing

In [1]:
import torch
import sklearn
import numpy as np
import pandas as pd
import time

from transformers import AutoTokenizer, LlamaModel, LlamaForSequenceClassification, BitsAndBytesConfig
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
from torch.cuda.amp import autocast
from threading import Thread

In [2]:
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

if (not torch.cuda.is_available()):
    print("GPU not availabe")

MODEL_NAME = "../llama3/Meta-Llama-3-8B/"
WEIGHTS_PATH = "../llama3/Meta-Llama-3-8B/path-to-finetuned-model"
MAX_LENGTH=256
BATCH_SIZE=4
DEVICE = torch.device("cuda")

### Prepare Data

In [3]:
test = pd.read_csv('../lmsys-chatbot-arena/test.csv')
sample_sub = pd.read_csv('../lmsys-chatbot-arena/sample_submission.csv')

# reformat the input texts
def process(input_str):
    # remove the [ and ] and the begin and end of text
    stripped_str = input_str.strip('[]')
    # split the string with ",", then remove "
    sentences = [s.strip('"') for s in stripped_str.split('","')]
    # combine the strings with space
    return ' '.join(sentences)

In [6]:
test.loc[:,'prompt']=test['prompt'].apply(process)
test.loc[:,'response_a']=test['response_a'].apply(process)
test.loc[:,'response_b']=test['response_b'].apply(process)
display(sample_sub)
display(test.head(5))

test['text'] = 'User prompt: ' + test['prompt'] +  '\n\nModel A :\n' + test['response_a'] +'\n\n--------\n\nModel B:\n'  + test['response_b']
print(test['text'][0])

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,136060,0.333333,0.333333,0.333333
1,211333,0.333333,0.333333,0.333333
2,1233961,0.333333,0.333333,0.333333


Unnamed: 0,id,prompt,response_a,response_b
0,136060,"I have three oranges today, I ate an orange ye...",You have two oranges today.,You still have three oranges. Eating an orange...
1,211333,You are a mediator in a heated political debat...,Thank you for sharing the details of the situa...,Mr Reddy and Ms Blue both have valid points in...
2,1233961,How to initialize the classification head when...,When you want to initialize the classification...,To initialize the classification head when per...


User prompt: I have three oranges today, I ate an orange yesterday. How many oranges do I have?

Model A :
You have two oranges today.

--------

Model B:
You still have three oranges. Eating an orange yesterday does not affect the number of oranges you have today.


### tokenize

In [11]:
tokenizer = AutoTokenizer.from_pretrained("../llama3/Meta-Llama-3-8B/")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_size = 'right'
tokenizer.add_eos_token = True
tokens = tokenizer(test['text'].tolist(), padding='max_length', max_length=MAX_LENGTH, truncation=True, return_tensors='pt')
INPUT_IDS = tokens['input_ids'].to(DEVICE, dtype=torch.int32)
ATTENTION_MASKS = tokens['attention_mask'].to(DEVICE, dtype=torch.int32)

input_ids_cpu = [tensor.cpu().tolist() for tensor in INPUT_IDS]
attention_mask_cpu = [tensor.cpu().tolist() for tensor in ATTENTION_MASKS]

data = pd.DataFrame()
data['INPUT_IDS'] = input_ids_cpu
data['ATTENTION_MASK'] = attention_mask_cpu
data[:2]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Unnamed: 0,INPUT_IDS,ATTENTION_MASK
0,"[128000, 1502, 10137, 25, 358, 617, 2380, 8513...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[128000, 1502, 10137, 25, 1472, 527, 264, 6903...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


## Load Model & weights

In [14]:
bnb_config = BitsAndBytesConfig(
    load_in_8bit = True,
    bnb_8bit_compute_dtype = torch.float16,
    bnb_8bit_use_double_quant=False)

# load model to GPU
device_single = torch.device('cuda')
base_model = LlamaForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels = 3,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    device_map='cuda')

base_model.config.pad_token_id=tokenizer.pad_token_id



Unused kwargs: ['bnb_8bit_compute_dtype', 'bnb_8bit_use_double_quant']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ../llama3/Meta-Llama-3-8B/ and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Load weights

In [None]:
# Lora configuration
peft_config = LoraConfig(
    r=1,
    lora_alpha = 2,
    lora_dropout = 0.1,
    bias = 'none',
    inference_mode = True,
    task_type = TaskType.SEQ_CLS,
    target_modules = ['o_proj','v_proj']
)

# get peft
model = get_peft_model(base_model, peft_config).to(device_single)
model.load_state_dict(torch.load(WEIGHTS_PATH), strict = False)
model.eval()

model.print_trainable_parameters()


#### Inference

In [None]:
import gc
gc.collect()

# define the inference function (the original design has two gpu models, which requires a function to handle inference)
def inference(df, model, device, batch_size = BATCH_SIZE):
    input_ids = torch.tensor(df['INPUT_IDS'].values.tolist(), dtype=torch.long)
    attention_mask = torch.tensor(df['ATTENTION_MASKS'].values.tolist(), dtype=torch.long)
    generated_class_a = []
    generated_class_b = []
    generated_class_c = []

    model.eval()
    for start_idx in range(0, len(df), batch_size):
        # consider when the training reaches the end of training
        end_idx = min(start_idx + batch_size, len(df))
        # get the batched input id and attention mask
        batch_input_ids = input_ids[start_idx:end_idx].to(device)
        batch_attention_mask = attention_mask[start_idx:end_idx].to(device)
        # get the inference output
        with torch.no_grad():
            with autocast():
                ouputs = model(
                    input_ids = batch_input_ids,
                    attention_mask = batch_attention_mask
                )
        # apply softmax to get the probabilities
        probabilities = torch.softmax(outputs.logits, dim = -1).cpu().numpy()
        # append probabilities to the results
        generated_class_a.extend(probabilities[:, 0])
        generated_class_b.extend(probabilities[:, 0])
        generated_class_c.extend(probabilities[:, 0])

    df['winner_model_a']=generated_class_a
    df['winner_model_b']=generated_class_b
    df['winner_tie']=generated_class_c

    torch.cuda.empty_cache()
    return df

st = time.time()

N_SAMPLE = len(data)

# inference
# May need to use a larger gpu to test the inference function with finetuned model. 
# For now, continued to the next section

## LGBM + tfdif
learn how the tfdif is used in this inference. Can test train with this feature?