In [17]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from sklearn.model_selection import train_test_split


In [18]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("token")

login(token=secret_value_0)

In [19]:
import wandb

user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wanb_key")
wandb.login(key=secret_value_0)

# Initialize W&B run
wandb.init(
    project="my-awesome-project", 
    name="Jina_trainer")



In [20]:
dataset_path = '/kaggle/input/your-dataset-folder/'
df = pd.read_csv('/kaggle/input/llm-classification-finetuning/train.csv')

In [21]:
def process_text(text):
    stripped_text = text.strip('[]')
    sentences = stripped_text.split('","') 
    strip_sentences = [s.strip('"') for s in sentences]
    
    return ' '.join(strip_sentences)

# Apply the function to the first 5 rows of the dataframe
df['prompt'] = df['prompt'].apply(process_text)
df['response_a'] = df['response_a'].apply(process_text)
df['response_b'] = df['response_b'].apply(process_text)
df.head()
# Original text: ["Is it morally right to try to have a certain percentage of females on managerial positions?","OK, does pineapple belong on a pizza? Relax and give me fun answer."]
# Stripped text: "Is it morally right to try to have a certain percentage of females on managerial positions?","OK, does pineapple belong on a pizza? Relax and give me fun answer."
# Split sentences: ['"Is it morally right to try to have a certain percentage of females on managerial positions?', 'OK, does pineapple belong on a pizza? Relax and give me fun answer."']
# Strip sentences: Is it morally right to try to have a certain percentage of females on managerial positions? OK, does pineapple belong on a pizza? Relax and give me fun answer.

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,Is it morally right to try to have a certain p...,The question of whether it is morally right to...,"As an AI, I don't have personal beliefs or opi...",1,0,0
1,53567,koala-13b,gpt-4-0613,What is the difference between marriage licens...,A marriage license is a legal document that al...,A marriage license and a marriage certificate ...,0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,explain function calling. how would you call a...,Function calling is the process of invoking or...,Function calling is the process of invoking a ...,0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,How can I create a test set for a very rare ca...,Creating a test set for a very rare category c...,When building a classifier for a very rare cat...,1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,What is the best way to travel from Tel-Aviv t...,The best way to travel from Tel Aviv to Jerusa...,The best way to travel from Tel-Aviv to Jerusa...,0,1,0


In [22]:
df['labels'] = df[['winner_model_a', 'winner_model_b', 'winner_tie']].dot([1,2,0])
df.drop(['winner_model_a', 'winner_model_b', 'winner_tie'], axis=1, inplace=True)

In [23]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

train_df, temp_df = train_test_split(df, test_size=0.9, stratify=df['labels'])


# Split % for training, rest (%) as temp
_, val_df = train_test_split(temp_df, test_size=1/9, stratify=temp_df["labels"])


In [10]:
# !pip install peft datasets accelerate bitsandbytes


In [26]:
max_length = 1024
spread_max_length = True

def tokenize(
    tokenizer, prompt, response_a, response_b, max_length=max_length, spread_max_length=spread_max_length
):
    tokenizer.pad_token = tokenizer.eos_token or '[PAD]'  # Ensure padding token is set
    tokenizer.padding_side = "right" 
    
    prompt = ['<prompt>:' + p for p in prompt]
    response_a = ['<response_a>:' + res_a for res_a in response_a]
    response_b = ['<response_b>:' + res_b for res_b in response_b]
    
    if spread_max_length:
        prompt = tokenizer(prompt, truncation=True, max_length=max_length//3, padding=True, return_tensors='pt')['input_ids']
        response_a = tokenizer(response_a, truncation=True, max_length=max_length//3, padding=True, return_tensors='pt')['input_ids']
        response_b = tokenizer(response_b, truncation=True, max_length=max_length//3, padding=True, return_tensors='pt')['input_ids']

        input_ids = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
        
        attention_mask = [[1]*len(i) + [0]*(max_length - len(i)) for i in input_ids]
    else:
        text = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]

        tokenized = tokenizer(text, max_length=max_length, truncation=True, padding=max_length)
        input_ids = tokenized['input_ids']
        attention_mask = tokenized['attention_mask']
        
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
    }


In [27]:
# Tokenize the training and validation datasets
def tokenize_batch(batch):
    return tokenize(tokenizer, batch['prompt'], batch['response_a'], batch['response_b'])

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df[['prompt','response_a', 'response_b','labels']])
val_dataset = Dataset.from_pandas(val_df[['prompt','response_a', 'response_b', 'labels']])

# Apply the tokenizer to the datasets using the `map` function
train_dataset = train_dataset.map(tokenize_batch, batched=True)
val_dataset = val_dataset.map(tokenize_batch, batched=True)

Map:   0%|          | 0/5747 [00:00<?, ? examples/s]

Map:   0%|          | 0/5748 [00:00<?, ? examples/s]

In [28]:
train_dataset.shape

(5747, 7)

In [3]:
import bitsandbytes as bnb
print(bnb.__version__)

0.45.1


In [2]:
pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.1-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Downloading bitsandbytes-0.45.1-py3-none-manylinux_2_24_x86_64.whl (69.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.1
Note: you may need to restart the kernel to use updated packages.


In [14]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig



# Configure 4-bit quantization for QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enable 4-bit quantization
    bnb_4bit_use_double_quant=True,  # Use double quantization for better precision
    bnb_4bit_quant_type="nf4",  # Use 4-bit NormalFloat quantization
    bnb_4bit_compute_dtype=torch.float16  # Compute dtype for 4-bit tensors
)

# Load the LLaMA model with 4-bit quantization
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=bnb_config,
    device_map="auto"  # Automatically map model layers to available devices (e.g., GPU)
)

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [29]:
from peft import get_peft_model, LoraConfig, TaskType

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Task type (causal language modeling)
    r=8,  # Rank of the low-rank matrices
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout for LoRA layers
    target_modules=["q_proj", "v_proj"],  # Target modules to apply LoRA
)

# Apply QLoRA to the model
model = get_peft_model(model, lora_config)

In [30]:
from sklearn.metrics import accuracy_score

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)  # Convert logits to predicted class indices
    return {"accuracy": accuracy_score(labels, predictions)}

In [32]:
from transformers import  Trainer, TrainingArguments

# Remove the original 'text' column (we only need the tokenized output)
# train_dataset = train_dataset.remove_columns(['text'])
# val_dataset = val_dataset.remove_columns(['text'])


# Define the TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",      
    eval_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,              
    per_device_train_batch_size=4,   
    per_device_eval_batch_size=4,    
    num_train_epochs=3, 
    report_to="wandb",
    weight_decay=0.01,
    run_name="Jina_trainer_2",
    fp16=True,
)

# Initialize the Trainer

trainer = Trainer(
    model=model,                  
    args=training_args,              
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,                 
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()
wandb.finish()


  trainer = Trainer(


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
model.save_pretrained("./fine-tuned-llama-qlora")
tokenizer.save_pretrained("./fine-tuned-llama-qlora")