## Import related packages

In [2]:
import os
import gc # garbage collection. help with memory management
from time import time
import random
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import torch
import transformers
from sklearn.metrics import accuracy_score

# finetuning related modules
from transformers import AutoTokenizer, LlamaModel, LlamaForSequenceClassification
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
# end

import torch.nn.functional as F

# TPU related, not used
"""
import torch_xla.debug.profiler as xp
import torch_xla.core.xla_model as xm
import torch_xla.experimental.xla_sharding as xs
import torch_xla.runtime as xr

xr.use_spmd()

from torch_xla.experimental.xla_sharded_tensor import XLAShardedTensor
from torch_xla.experimental.xla_sharding import Mesh
from spmd_util import partition_module
"""

tqdm.pandas()

print(f'Torch Version: {torch.__version__}')

Torch Version: 2.3.1+cu121


# Configs

In [3]:
class CFG:
    NUM_EPOCHS = 1
    BATCH_SIZE = 2
    DROPOUT = 0.05
    MODEL_NAME = "../llama3/Meta-Llama-3-8B/"
    SEED = 2024
    MAX_LENGTH = 512
    NUM_WARMUP_STEPS = 128
    LR_MAX = 5E-5
    NUM_LABELS = 3
    LORA_RANK = 2
    LORA_ALPHA = 8
    LORA_MODULES = ['o_proj', 'v_proj']

if torch.cuda.is_available():
    DEVICE = 'cuda'
    print('GPU is used')
else:
    DEVICE = 'cpu'
    print('CPU is used')

GPU is used


In [4]:
torch.cuda.empty_cache()

In [5]:
# set seed to ensure reproducibility
def set_seeds(seed):
    os.environ['PYTHONHASHSEED']=str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seeds(seed=CFG.SEED)

## Tokenizer

In [6]:
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_size = 'right'
tokenizer.add_eos_token = True
tokenizer.tokenize("shuo bu shuo!")

tokenizer.save_pretrained('tokenizer')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\tokenizer.json')

In [7]:
# utility function giving token length
# only takes data frame input?
def get_token_lengths(texts):
    # tokenize and receive inputs_ids for each text
    inputs_ids = tokenizer(texts.tolist(),return_tensors='np')['input_ids']
    # input_ids = tokenizer(texts.tolist(), return_tensors='np')['input_ids']
    # return length of inputs_ids for each text
    return [len(t) for t in inputs_ids]

# Prepare train

In [8]:
train_df = pd.read_csv('../lmsys-chatbot-arena/train.csv')
def process(input_str):
    # remove the [ and ] and the begin and end of text
    stripped_str = input_str.strip('[]')
    # split the string with ",", then remove "
    sentences = [s.strip('"') for s in stripped_str.split('","')]
    # combine the strings with space
    return ' '.join(sentences)

In [9]:
## Check
before_process = train_df.loc[1,'prompt']
after_process = train_df['prompt'].apply(process)
# train.loc[:,'prompt'] = train['prompt'].apply(process)
print(before_process)
print('-'*50)
print(after_process.loc[1])

["What is the difference between marriage license and marriage certificate?","How can I get both of them as quick as possible in California ","What is the minimal time to get them? 1 day or 10 days?"]
--------------------------------------------------
What is the difference between marriage license and marriage certificate? How can I get both of them as quick as possible in California  What is the minimal time to get them? 1 day or 10 days?


In [10]:
# do the process for all data
train_df.loc[:, 'prompt'] = train_df['prompt'].apply(process)
train_df.loc[:, 'response_a'] = train_df['response_a'].apply(process)
train_df.loc[:, 'response_b'] = train_df['response_b'].apply(process)

# Drop 'Null' for training
condition = (train_df.response_a=='null') & (train_df.response_b=='null')
indexes = train_df[condition].index
# print(indexes)
train_df.drop(indexes, inplace=True)
train_df.reset_index(inplace=True, drop=True)

print(f"Total {len(indexes)} Null reponse rows dropped")
print('Total train samples: ', len(train_df))

Total 19 Null reponse rows dropped
Total train samples:  57458


In [11]:
train_df.head(5)

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,Is it morally right to try to have a certain p...,The question of whether it is morally right to...,"As an AI, I don't have personal beliefs or opi...",1,0,0
1,53567,koala-13b,gpt-4-0613,What is the difference between marriage licens...,A marriage license is a legal document that al...,A marriage license and a marriage certificate ...,0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,explain function calling. how would you call a...,Function calling is the process of invoking or...,Function calling is the process of invoking a ...,0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,How can I create a test set for a very rare ca...,Creating a test set for a very rare category c...,When building a classifier for a very rare cat...,1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,What is the best way to travel from Tel-Aviv t...,The best way to travel from Tel Aviv to Jerusa...,The best way to travel from Tel-Aviv to Jerusa...,0,1,0


In [12]:
# Show the conversation
train_df['text']='User prompt: '+train_df['prompt']+ '\n\nModel A:\n'+train_df['response_a']+'\n\n--------\n\nModel B:\n'+train_df['response_b']
print(train_df['text'][4])

User prompt: What is the best way to travel from Tel-Aviv to Jerusalem? Car? Bus? Plane?

Model A:
The best way to travel from Tel Aviv to Jerusalem depends on your personal preference and the availability of transportation options. All the options you have mentioned are valid options, but here are some details to help you make your decision:\n\n*   By car: Traveling by car is the quickest way to get from Tel Aviv to Jerusalem, as the distance between the two cities is only about 60 kilometers (37 miles). It takes around 45 minutes to drive from Tel Aviv to Jerusalem by car, depending on the traffic.\n*   By bus: There are several bus lines that run from Tel Aviv to Jerusalem, and the journey takes around 1 hour and 30 minutes by bus. The buses are comfortable and reliable, and they offer a scenic view of the beautiful Israeli countryside.\n*   By plane: There are no direct flights from Tel Aviv to Jerusalem, so you need to take a flight from Tel Aviv's Ben Gurion International Airport

In [13]:
type(train_df['text'])

pandas.core.series.Series

In [14]:
# Train only 50% train dataset
train = train_df[:int(len(train_df)/2)]
texts = train['text']
train.loc[:,'token_count'] = get_token_lengths(texts)

# prepare label for model
train.loc[:, 'label']=np.argmax(train[['winner_model_a','winner_model_b','winner_tie']].values, axis=1)
display(train.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.loc[:,'token_count'] = get_token_lengths(texts)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.loc[:, 'label']=np.argmax(train[['winner_model_a','winner_model_b','winner_tie']].values, axis=1)


Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,text,token_count,label
0,30192,gpt-4-1106-preview,gpt-4-0613,Is it morally right to try to have a certain p...,The question of whether it is morally right to...,"As an AI, I don't have personal beliefs or opi...",1,0,0,User prompt: Is it morally right to try to hav...,1206,0
1,53567,koala-13b,gpt-4-0613,What is the difference between marriage licens...,A marriage license is a legal document that al...,A marriage license and a marriage certificate ...,0,1,0,User prompt: What is the difference between ma...,1393,1
2,65089,gpt-3.5-turbo-0613,mistral-medium,explain function calling. how would you call a...,Function calling is the process of invoking or...,Function calling is the process of invoking a ...,0,0,1,User prompt: explain function calling. how wou...,664,2
3,96401,llama-2-13b-chat,mistral-7b-instruct,How can I create a test set for a very rare ca...,Creating a test set for a very rare category c...,When building a classifier for a very rare cat...,1,0,0,User prompt: How can I create a test set for a...,1008,0
4,198779,koala-13b,gpt-3.5-turbo-0314,What is the best way to travel from Tel-Aviv t...,The best way to travel from Tel Aviv to Jerusa...,The best way to travel from Tel-Aviv to Jerusa...,0,1,0,User prompt: What is the best way to travel fr...,479,1


In [15]:
train.label.value_counts()

label
0    10092
1     9837
2     8800
Name: count, dtype: int64

In [16]:
# token Count
display(train['token_count'].describe().to_frame())

Unnamed: 0,token_count
count,28729.0
mean,729.613526
std,768.325978
min,18.0
25%,288.0
50%,563.0
75%,900.0
max,15428.0


In [17]:
# get length of tokens which covers 90% of data, we'll still take 1024 length
np.percentile(train['token_count'],90)

1399.2000000000007

## Tokenize

In [18]:
CFG.MAX_LENGTH
tokenizer("I have an apple",padding='max_length',max_length=2,truncation=True)

{'input_ids': [128000, 40], 'attention_mask': [1, 1]}

In [19]:
# tokenize data
tokens = tokenizer(
    train['text'].tolist(),
    padding='max_length',
    max_length=CFG.MAX_LENGTH,
    truncation=True,
    return_tensors='np'
)

# Input IDs are teh token IDs
INPUT_IDS = tokens['input_ids']
# Attention Masks to Ignore Padding Tokens
ATTENTION_MASKS = tokens['attention_mask']
# Label of Texts
LABELS = train[['winner_model_a','winner_model_b','winner_tie']].values

print(f'INPUT_IDS shape: {INPUT_IDS.shape}, ATTENTION_MASKS shape: {ATTENTION_MASKS.shape}')
print(f'LABELS shape: {LABELS.shape}')

INPUT_IDS shape: (28729, 512), ATTENTION_MASKS shape: (28729, 512)
LABELS shape: (28729, 3)


In [20]:
def train_dataset(batch_size):
    N_SAMPLES = LABELS.shape[0]
    IDXS = np.arange(N_SAMPLES - (N_SAMPLES % batch_size))
    while True:
        # Shuffle Indices
        np.random.shuffle(IDXS)
        # Iterate Over All Indices Once
        for idxs in IDXS.reshape(-1, batch_size):
            input_ids = torch.tensor(INPUT_IDS[idxs]).to(DEVICE)
            attention_mask = torch.tensor(ATTENTION_MASKS[idxs]).to(DEVICE)
            labels = torch.tensor(LABELS[idxs]).to(DEVICE) # Multi-label

            # yield returns a returns a generator object to 
            # the one who calls the function which contains 
            # yield, instead of simply returning a value
            yield input_ids, attention_mask, labels
            
TRAIN_DATASET = train_dataset(CFG.BATCH_SIZE)

        

## Load Model

In [21]:
# Load model for classification with 3 target label
base_model = LlamaForSequenceClassification.from_pretrained(
    CFG.MODEL_NAME,
    num_labels=CFG.NUM_LABELS,
    torch_dtype = torch.bfloat16)

base_model.config.pretraining_tp = 1

# Assign Padding TOKEN
base_model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ../llama3/Meta-Llama-3-8B/ and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
print(tokenizer.pad_token_id)

128001


## Low-Rank Adaption [LORA]

In [23]:
CFG.LORA_RANK

2

In [24]:
lora_config = LoraConfig(
    r=CFG.LORA_RANK, # the dimension of the low-rank matrices
    lora_alpha = CFG.LORA_ALPHA, # scaling factor for LoRA activations vs pre-trained weight activations
    lora_dropout = CFG.DROPOUT,
    bias='none',
    inference_mode=False,
    task_type = TaskType.SEQ_CLS, # refer to https://github.com/huggingface/peft/blob/v0.8.2/src/peft/utils/peft_types.py#L68-L73 for the TaskType Class
    target_modules = CFG.LORA_MODULES # Only use Output and Values Projection
)

In [25]:
# Exciting!!!!!!!!!!!!!!!!!!!!!!!!
# Create LoRa Model
model = get_peft_model(base_model, lora_config)
# Trainable Parameters
model.print_trainable_parameters()

trainable params: 864,256 || all params: 7,505,801,216 || trainable%: 0.0115


In [26]:
# Number of TPU Nodes ???????????????????????????????????
# num_devices = xr.global_runtime_device_count()
# mesh_shape = (1, num_devices, 1)
# device_ids = np.array(range(num_devices))
# mesh = Mesh(device_ids, mesh_shape, ('dp', 'fsdp', 'mp'))
# # distribute model
# partition_module(model, mesh)

# print(f'num_devices: {num_devices}')

In [27]:
# verify teh trainable layers
MODEL_LAYERS_ROWS = []
TRAINABLE_PARAMS = []
N_TRAINABLE_PARAMS = 0

for name, param in model.named_parameters():
    # Layer Parameter Count
    n_parameters = int(torch.prod(torch.tensor(param.shape)))
    # Only Trainable Layers
    if param.requires_grad:
        # Add Layer Information
        MODEL_LAYERS_ROWS.append({
            'param': n_parameters,
            'name': name,
            'dtype': param.data.dtype,
        })
        # Append Trainable Parameter
        TRAINABLE_PARAMS.append({'params': param})
        # Add Number of Trainable Parameters
        N_TRAINABLE_PARAMS += n_parameters

display(pd.DataFrame(MODEL_LAYERS_ROWS))

print(f"""
===============================
N_TRAINABLE_PARAMS: {N_TRAINABLE_PARAMS:,}
N_TRAINABLE_LAYERS: {len(TRAINABLE_PARAMS)}
===============================
""")

Unnamed: 0,param,name,dtype
0,8192,base_model.model.model.layers.0.self_attn.v_pr...,torch.bfloat16
1,2048,base_model.model.model.layers.0.self_attn.v_pr...,torch.bfloat16
2,8192,base_model.model.model.layers.0.self_attn.o_pr...,torch.bfloat16
3,8192,base_model.model.model.layers.0.self_attn.o_pr...,torch.bfloat16
4,8192,base_model.model.model.layers.1.self_attn.v_pr...,torch.bfloat16
...,...,...,...
124,8192,base_model.model.model.layers.31.self_attn.v_p...,torch.bfloat16
125,2048,base_model.model.model.layers.31.self_attn.v_p...,torch.bfloat16
126,8192,base_model.model.model.layers.31.self_attn.o_p...,torch.bfloat16
127,8192,base_model.model.model.layers.31.self_attn.o_p...,torch.bfloat16



N_TRAINABLE_PARAMS: 864,256
N_TRAINABLE_LAYERS: 129



## Training

In [28]:
# learning rate and optimizer?
N_SAMPLES = len(train)
STEPS_PER_EPOCH = N_SAMPLES // CFG.BATCH_SIZE

OPTIMIZER=torch.optim.AdamW(model.parameters(), lr = CFG.LR_MAX)

# Cosine Learning Rate with Warmup
lr_scheduler = transformers.get_cosine_schedule_with_warmup(
    optimizer=OPTIMIZER,
    num_warmup_steps=CFG.NUM_WARMUP_STEPS,
    num_training_steps=STEPS_PER_EPOCH * CFG.NUM_EPOCHS
)

print(f'BATCH_SIZE: {CFG.BATCH_SIZE}, N_SAMPLES: {N_SAMPLES}, STEPS_PER_EPOCH: {STEPS_PER_EPOCH}')

BATCH_SIZE: 2, N_SAMPLES: 28729, STEPS_PER_EPOCH: 14364


In [29]:
# Set the data type for the optimizer's state(e.g., momentum buffers)
for state in OPTIMIZER.state.values():
    for k,v in state.items():
        if isinstance(v, torch.Tensor) and state[k].dtype is not torch.float32:
            state[v] = v.to(dtype=torch.float32)
            

In [30]:
input_ids, attention_mask, labels = next(TRAIN_DATASET)
print(f'input_ids shape: {input_ids.shape}, dtype: {input_ids.dtype}')
print(f'attention_mask shape: {attention_mask.shape}, dtype: {attention_mask.dtype}')
print(f'labels shape: {labels.shape}, dtype: {labels.dtype}')

input_ids shape: torch.Size([2, 512]), dtype: torch.int32
attention_mask shape: torch.Size([2, 512]), dtype: torch.int32
labels shape: torch.Size([2, 3]), dtype: torch.int64


In [31]:
model.to(DEVICE)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
              (v_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=1024, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=2, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=2, out_features=1024, bias=False)
                )
                (lo

In [32]:
%%time
# Dummy Prediction
print(input_ids.get_device(), attention_mask.get_device())
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

print(f'logits: {outputs.logits}, dtype: {outputs.logits.dtype}')

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


0 0


  attn_output = torch.nn.functional.scaled_dot_product_attention(


logits: tensor([[-0.1641,  2.9062,  4.1562],
        [-0.7070,  4.3125, -0.4434]], device='cuda:0', dtype=torch.bfloat16), dtype: torch.bfloat16
CPU times: total: 9.55 s
Wall time: 20.6 s


In [33]:
# put model in train_mode
model.train()

# loss function, cross entropy
LOSS_FN = torch.nn.CrossEntropyLoss().to(dtype=torch.float32)

In [34]:
CFG.NUM_EPOCHS
STEPS_PER_EPOCH

14364

In [None]:
st = time()
warnings.filterwarnings("error")
METRICS={
    'loss':[],
    'accuracy': {'y_true':[], 'y_pred':[]}
}

for epoch in tqdm(range(CFG.NUM_EPOCHS)):
    ste = time()
    for step in range(STEPS_PER_EPOCH):
        # Zero out gradients
        OPTIMIZER.zero_grad()

        # get batch
        input_ids, attention_mask, labels = next(TRAIN_DATASET)

        # forward pass
        outputs = model(input_ids = input_ids, attention_mask = attention_mask)

        # logits float32
        logits = outputs.logits.to(dtype=torch.float32)

        # backward pass
        loss = LOSS_FN(logits, labels.to(dtype=torch.float32))
        loss.backward()

        # optimizer step
        OPTIMIZER.step()

        # update learning rate scheduler
        lr_scheduler.step()

        METRICS['loss'].append(float(loss))
        METRICS['accuracy']['y_true'] += labels.squeeze().tolist()
        METRICS['accuracy']['y_pred'] += torch.argmax(F.softmax(logits,dim=-1), dim=1).cpu().tolist()

        if (step+1)%200 == 0:
            metrics = 'mu_loss: {:.3f}'.format(np.mean(METRICS['loss']))
            metrics += ', step_loss: {:.3f}'.format(METRICS['loss'][-1])
            metrics += ', mu_auc: {:,3f}'.format(accuracy_score(torch.argmax(torch.tensor(METRICS['accuracy']['y_true']), axis=-1), \
                        METRICS['accuracy']['y_pred']))
            lr = OPTIMIZER.param_groupps[0]['lr']
            print(f'{epoch+1:02}/{CFG.NUM_EPOCHS:02} | {step+1:04}/{STEPS_PER_EPOCH} lr: {lr:.2E}, {metrics}', end='')
            print(f'\nSteps per epoch: {step+1} complete | Time elapsed: {time() - st}')

    print(f'\nEpoch {epoch+1} Completed | Total time for epoch: {time()-ste} ')
    

  0%|          | 0/1 [00:00<?, ?it/s]