## Fine-tune Llama 3 for Sentiment Analysis

## Installations and imports

In [1]:
!pip install -q -U torch --index-url https://download.pytorch.org/whl/cu117
!pip install -q -U -i https://pypi.org/simple/ bitsandbytes
!pip install -q -U transformers=="4.40.0"
!pip install -q -U accelerate
!pip install -q -U datasets
!pip install -q -U trl
!pip install -q -U peft
!pip install -q -U tensorboard

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
trl 0.17.0 requires transformers>=4.46.0, but you have transformers 4.40.0 which is incompatible.


In [2]:
import os

# Expose only GPU 0 to CUDA and switch off the Hugging Face tokenizers'
# internal parallelism for this process.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0",
    "TOKENIZERS_PARALLELISM": "false",
})

In [3]:
import warnings

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings(action="ignore")

In [4]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer, SFTConfig
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [5]:
# Report which PyTorch build the kernel picked up.
print("pytorch version {}".format(torch.__version__))

pytorch version 2.5.1+cu121


In [6]:
# Use the first CUDA device when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"working on {device}")

working on cuda:0


In [7]:
# Switch off the flash and memory-efficient scaled-dot-product-attention backends.
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)

## Preparing the data

In [8]:
filename = "../dataset/all-data.csv"

# Column names are supplied explicitly, so the first line of the file is read as
# data; undecodable bytes are replaced instead of aborting the load.
df = pd.read_csv(filename,
                 names=["sentiment", "text"],
                 encoding="utf-8", encoding_errors="replace")

# Per-class split: 300 training rows and 300 test rows for each sentiment.
train_parts = []
test_parts = []
for sentiment in ["positive", "neutral", "negative"]:
    train, test = train_test_split(df[df.sentiment == sentiment],
                                   train_size=300,
                                   test_size=300,
                                   random_state=42)
    train_parts.append(train)
    test_parts.append(test)

# Training rows are shuffled; test rows stay grouped by class.
X_train = pd.concat(train_parts).sample(frac=1, random_state=10)
X_test = pd.concat(test_parts)

# Rows used by neither split form the pool for the evaluation set. A single
# index-membership mask replaces the former per-row list rebuild (quadratic) and
# selects the same rows in the same order.
used_idx = X_train.index.union(X_test.index)
eval_idx = df.index[~df.index.isin(used_idx)]
X_eval = df.loc[eval_idx]
# 50 rows per class, drawn with replacement.
X_eval = (X_eval
          .groupby('sentiment', group_keys=False)
          .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))
# The original row labels are only needed for the exclusion step above.
X_train = X_train.reset_index(drop=True)

def generate_prompt(data_point):
    """Build the training prompt for one labelled headline.

    The prompt is the fixed instruction text, a blank line, then the headline
    in square brackets followed by " = " and its sentiment label.

    Args:
        data_point: mapping-like row supporting ``["text"]`` and
            ``["sentiment"]`` lookups.

    Returns:
        The prompt string with leading and trailing whitespace stripped; the
        interior lines keep the indentation of the literal below.
    """
    return f"""
            Analyze the sentiment of the news headline enclosed in square brackets, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{data_point["text"]}] = {data_point["sentiment"]}
            """.strip()

def generate_test_prompt(data_point):
    """Build the inference prompt for one headline, with the label left open.

    Same instruction text as the training prompt, but nothing follows the
    equals sign.

    Args:
        data_point: mapping-like row supporting a ``["text"]`` lookup.

    Returns:
        The prompt string. Because of the final ``.strip()``, the space after
        "=" is removed, so the returned prompt ends with "=".
    """
    return f"""
            Analyze the sentiment of the news headline enclosed in square brackets, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{data_point["text"]}] = """.strip()

def _to_prompt_frame(frame, render):
    """Render every row of `frame` with `render` into a one-column ("text") frame."""
    return pd.DataFrame(frame.apply(render, axis=1), columns=["text"])


# Training and evaluation prompts include the label.
X_train = _to_prompt_frame(X_train, generate_prompt)
X_eval = _to_prompt_frame(X_eval, generate_prompt)

# Keep the gold labels before X_test is replaced by its label-free prompts.
y_true = X_test["sentiment"]
X_test = _to_prompt_frame(X_test, generate_test_prompt)

train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)

In [9]:
def evaluate(y_true, y_pred):
    """Print accuracy, per-class accuracy, a classification report and a confusion matrix.

    Sentiment strings are encoded as negative=0, neutral=1, positive=2. The
    string 'none' and any unrecognised value are counted as neutral (1).

    Args:
        y_true: iterable of gold sentiment strings.
        y_pred: iterable of predicted sentiment strings, same length as y_true.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    mapping = {'positive': 2, 'neutral': 1, 'none':1, 'negative': 0}

    def encode(values):
        # Unknown labels fall back to neutral (1).
        return np.array([mapping.get(value, 1) for value in values], dtype=int)

    y_true = encode(y_true)
    y_pred = encode(y_pred)

    if y_true.size == 0:
        raise ValueError("evaluate() needs at least one sample")
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred differ in length: {y_true.size} vs {y_pred.size}"
        )

    # Overall accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Accuracy restricted to the samples of each gold class, in ascending label order
    for label in np.unique(y_true):
        in_class = y_true == label
        accuracy = accuracy_score(y_true[in_class], y_pred[in_class])
        print(f'Accuracy for label {label}: {accuracy:.3f}')

    # Classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)

    # Confusion matrix with rows/columns fixed to the three encoded classes
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
    print('\nConfusion Matrix:')
    print(conf_matrix)

## Testing the model without fine-tuning

In [10]:
# Checkpoint under test. Alternatives kept for reference:
#   "llama-3/transformers/8b-chat-hf/1", "google/gemma-7b"
model_name = "meta-llama/Meta-Llama-3-8B"

compute_dtype = torch.float16

# 4-bit NF4 quantisation, double quantisation off, float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=compute_dtype,
    device_map=device,
)

model.config.pretraining_tp = 1
model.config.use_cache = False

max_seq_length = 512 #2048
# NOTE(review): tokenizers normally take their length limit as `model_max_length`;
# confirm that passing `max_seq_length` here has any effect.
tokenizer = AutoTokenizer.from_pretrained(model_name, max_seq_length=max_seq_length)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id

Loading checkpoint shards: 100%|██████████| 4/4 [00:21<00:00,  5.38s/it]


In [11]:
def predict(test, model, tokenizer):
    """Generate one new token per test prompt and map it to a sentiment label.

    Args:
        test: currently unused. NOTE(review): the prompts are read from the
            notebook-level `X_test` frame, not from this argument. This is kept
            as-is because existing calls pass an object other than the prompt
            frame; switch to `test` once all call sites pass `X_test`.
        model: model handed to the text-generation pipeline.
        tokenizer: tokenizer handed to the text-generation pipeline.

    Returns:
        A list with one entry per row of `X_test`: "positive", "negative" or
        "neutral" when that word occurs in the text after the last "=" of the
        generated output (checked in that order), otherwise "none".
    """
    # The pipeline does not depend on the prompt, so build it once instead of
    # once per row.
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=1,
                    do_sample=False)

    y_pred = []
    for i in tqdm(range(len(X_test))):
        prompt = X_test.iloc[i]["text"]
        result = pipe(prompt)
        # Everything after the last "=" is treated as the model's answer.
        answer = result[0]['generated_text'].split("=")[-1]
        for label in ("positive", "negative", "neutral"):
            if label in answer:
                y_pred.append(label)
                break
        else:
            y_pred.append("none")
    return y_pred

In [12]:
# Pass the prompt frame itself; `test` is only a loop variable left over from the
# per-class split and holds the last class's raw rows.
y_pred = predict(X_test, model, tokenizer)

  0%|          | 0/900 [00:00<?, ?it/s]Device set to use cuda:0
  0%|          | 1/900 [00:00<09:37,  1.56it/s]Device set to use cuda:0
  0%|          | 2/900 [00:00<05:18,  2.82it/s]Device set to use cuda:0
  0%|          | 3/900 [00:00<03:54,  3.82it/s]Device set to use cuda:0
  0%|          | 4/900 [00:01<03:14,  4.61it/s]Device set to use cuda:0
  1%|          | 5/900 [00:01<02:52,  5.20it/s]Device set to use cuda:0
  1%|          | 6/900 [00:01<02:40,  5.58it/s]Device set to use cuda:0
  1%|          | 7/900 [00:01<02:31,  5.88it/s]Device set to use cuda:0
  1%|          | 8/900 [00:01<02:26,  6.07it/s]Device set to use cuda:0
  1%|          | 9/900 [00:01<02:22,  6.27it/s]Device set to use cuda:0
  1%|          | 10/900 [00:02<02:19,  6.38it/s]Device set to use cuda:0
  1%|          | 11/900 [00:02<02:17,  6.48it/s]Device set to use cuda:0
  1%|▏         | 12/900 [00:02<02:15,  6.57it/s]Device set to use cuda:0
  1%|▏         | 13/900 [00:02<02:13,  6.64it/s]Device set to use cud

In [13]:
# Score the predictions of the model before fine-tuning.
evaluate(y_true=y_true, y_pred=y_pred)

Accuracy: 0.439
Accuracy for label 0: 0.297
Accuracy for label 1: 0.057
Accuracy for label 2: 0.963

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.30      0.45       300
           1       0.15      0.06      0.08       300
           2       0.42      0.96      0.59       300

    accuracy                           0.44       900
   macro avg       0.50      0.44      0.37       900
weighted avg       0.50      0.44      0.37       900


Confusion Matrix:
[[ 89  90 121]
 [  6  17 277]
 [  1  10 289]]


## Fine-tuning

In [15]:
from sklearn.metrics import (accuracy_score, 
                             recall_score, 
                             precision_score, 
                             f1_score)

from transformers import EarlyStoppingCallback, IntervalStrategy

def compute_metrics(p):
    """Compute accuracy, precision, recall and F1 from per-class scores.

    Args:
        p: pair ``(predictions, labels)``. `predictions` must be a 2-D array
            whose second axis holds one score per class; `labels` holds the
            matching integer class ids.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision, recall and F1 use binary averaging when there are at most
        two classes and macro averaging otherwise.

    Raises:
        ValueError: if `predictions` is not 2-D.
    """
    pred, labels = p
    pred = np.asarray(pred)
    if pred.ndim != 2:
        raise ValueError(
            f"compute_metrics expects 2-D class scores, got an array with {pred.ndim} dimension(s)"
        )

    # scikit-learn's default average="binary" raises on multiclass targets, so
    # use it only when the score matrix has at most two class columns.
    average = "binary" if pred.shape[1] <= 2 else "macro"

    pred = np.argmax(pred, axis=1)
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average=average)
    precision = precision_score(y_true=labels, y_pred=pred, average=average)
    f1 = f1_score(y_true=labels, y_pred=pred, average=average)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [25]:
# Directory that receives the trained model and the tokenizer files.
output_dir = "trained_weigths"

# LoRA adapters (rank 64, alpha 16, no dropout, no bias terms) on the attention
# and MLP projection layers of the causal LM.
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# Training hyper-parameters, held in their own variable so they can be inspected
# separately from the trainer.
sft_config = SFTConfig(
    output_dir=output_dir,                    # directory to save and repository id
    num_train_epochs=5,                       # number of training epochs
    per_device_train_batch_size=1,            # batch size per device during training
    gradient_accumulation_steps=8,            # number of steps before performing a backward/update pass
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,                         # log every 25 steps
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=False,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="tensorboard",                  # report metrics to tensorboard
    dataset_text_field="text",                # column of train_data holding the prompt
    max_seq_length=max_seq_length,
    packing=False,
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    },
    # Evaluation during training is switched off. Options that were tried and
    # left disabled: evaluation_strategy="steps", eval_steps=25,
    # load_best_model_at_end=True, metric_for_best_model='accuracy'.
)

# Likewise disabled on the trainer: eval_dataset=eval_data,
# compute_metrics=compute_metrics,
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)].
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_data,
    peft_config=peft_config,
    processing_class=tokenizer,
)

Converting train dataset to ChatML: 100%|██████████| 900/900 [00:00<00:00, 24988.57 examples/s]
Adding EOS to train dataset: 100%|██████████| 900/900 [00:00<00:00, 50758.70 examples/s]
Tokenizing train dataset: 100%|██████████| 900/900 [00:00<00:00, 4812.27 examples/s]
Truncating train dataset: 100%|██████████| 900/900 [00:00<?, ? examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [26]:
# Run the fine-tuning loop of the SFT trainer; the value returned by train() is
# the last expression of the cell and is therefore shown as its output.
trainer.train()

Step,Training Loss
25,1.7341
50,0.9367
75,0.8718
100,0.8291
125,0.8242
150,0.7414
175,0.6817
200,0.6951
225,0.6725
250,0.4802


TrainOutput(global_step=560, training_loss=0.550455518918378, metrics={'train_runtime': 2672.1974, 'train_samples_per_second': 1.684, 'train_steps_per_second': 0.21, 'total_flos': 1.753310971170816e+16, 'train_loss': 0.550455518918378})

In [27]:
# Write the trained model and the tokenizer files side by side into `output_dir`.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

('trained_weigths\\tokenizer_config.json',
 'trained_weigths\\special_tokens_map.json',
 'trained_weigths\\tokenizer.json')

In [28]:
%load_ext tensorboard
%tensorboard --logdir logs/runs

Launching TensorBoard...

## Testing

In [29]:
# Repeat inference on the held-out prompts after fine-tuning and score the result.
# X_test is passed explicitly; `test` is only a loop variable left over from the
# per-class split.
y_pred = predict(X_test, model, tokenizer)
evaluate(y_true, y_pred)

  0%|          | 0/900 [00:00<?, ?it/s]Device set to use cuda:0
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
DynamicCache + torch.export is tested on torch 2.6.0+ and may not work on earlier versions.
  0%|          | 1/900 [00:00<04:04,  3.67it/s]Device set to use cuda:0
  0%|          | 2/900 [00:00<03:27,  4.33it/s]Device set to use cuda:0
  0%|          | 3/900 [00:00<03:28,  4.31it/s]Device set to use cuda:0
  0%|          | 4/900 [00:00<03:25,  4.36it/s]Device set to use cuda:0
  1%|          | 5/900 [00:01<03:34,  4.17it/s]Device set to use cuda:0
  1%|          | 6/900 [00:01<03:33,  4.18it/s]Device set to use cuda:0
  1%|          | 7/900 [00:01<03:26,  4.33it/s]Device set to use cuda:0
  1%|          | 8/900 [00:01<03:16,  4.53it/s]Device set to use cuda:0
  1%|          | 9/900 [00:02<03:06,  4.78it/s]Device set to use cuda:0
  1%|          | 10/900 [00:02<02:59,  4.97it/s]Device set to use cuda:0
  1%|          | 11/900 [00:02<02:

Accuracy: 0.867
Accuracy for label 0: 0.947
Accuracy for label 1: 0.827
Accuracy for label 2: 0.827

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.95      0.95       300
           1       0.79      0.83      0.81       300
           2       0.86      0.83      0.84       300

    accuracy                           0.87       900
   macro avg       0.87      0.87      0.87       900
weighted avg       0.87      0.87      0.87       900


Confusion Matrix:
[[284  15   1]
 [ 14 248  38]
 [  3  49 248]]





The following code creates a pandas DataFrame called `evaluation` containing the text, true labels, and predicted labels from the test set, and saves it to a CSV file. This is especially useful for understanding the errors that the fine-tuned model makes and for getting insights into how to improve the prompt.

In [30]:
# Prompt, gold label and predicted label side by side, one row per test sample.
evaluation_columns = {
    'text': X_test["text"],
    'y_true': y_true,
    'y_pred': y_pred,
}
evaluation = pd.DataFrame(evaluation_columns)
evaluation.to_csv("llama_test_predictions.csv", index=False)