## Fine-tune Gemma 7B it for Sentiment Analysis

In [1]:
%pip install -q -U torch --index-url https://download.pytorch.org/whl/cu117

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install -q -U transformers==4.38.2
%pip install -q accelerate==0.32.0
%pip install -q -i https://pypi.org/simple/ bitsandbytes
%pip install -q -U datasets==2.16.1
%pip install -q -U trl==0.7.11
%pip install -q -U peft==0.10.0

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

import torch
import torch.nn as nn

import transformers
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from datasets import Dataset
from peft import LoraConfig, PeftConfig
import bitsandbytes as bnb
from trl import SFTTrainer

from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [6]:
print(f"transformers=={transformers.__version__}")

transformers==4.38.2


In [7]:
# model_name = "/kaggle/input/gemma/transformers/7b-it/1"
model_name = "google/gemma-7b"

compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

max_seq_length = 2048
tokenizer = AutoTokenizer.from_pretrained(model_name, max_seq_length=max_seq_length)
EOS_TOKEN = tokenizer.eos_token

Loading checkpoint shards: 100%|██████████| 4/4 [00:21<00:00,  5.32s/it]


In [8]:
filename = "../dataset/all-data.csv"

df = pd.read_csv(filename, 
                 names=["sentiment", "text"],
                 encoding="utf-8", encoding_errors="replace")

X_train = list()
X_test = list()
for sentiment in ["positive", "neutral", "negative"]:
    train, test  = train_test_split(df[df.sentiment==sentiment], 
                                    train_size=300,
                                    test_size=300, 
                                    random_state=42)
    X_train.append(train)
    X_test.append(test)

X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_test = pd.concat(X_test)

eval_idx = [idx for idx in df.index if idx not in list(train.index) + list(test.index)]
X_eval = df[df.index.isin(eval_idx)]
X_eval = (X_eval
          .groupby('sentiment', group_keys=False)
          .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))
X_train = X_train.reset_index(drop=True)

def generate_prompt(data_point):
    return f"""generate_prompt
            Analyze the sentiment of the news headline enclosed in square brackets, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative"

            [{data_point["text"]}] = {data_point["sentiment"]}
            """.strip() + EOS_TOKEN

def generate_test_prompt(data_point):
    return f"""
            Analyze the sentiment of the news headline enclosed in square brackets, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative"

            [{data_point["text"]}] = 

            """.strip()

X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1), 
                       columns=["text"])
X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1), 
                      columns=["text"])

y_true = X_test.sentiment
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)

In [9]:
def evaluate(y_true, y_pred):
    labels = ['positive', 'neutral', 'negative']
    mapping = {'positive': 2, 'neutral': 1, 'none':1, 'negative': 0}
    def map_func(x):
        return mapping.get(x, 1)
    
    y_true = np.vectorize(map_func)(y_true)
    y_pred = np.vectorize(map_func)(y_pred)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')
    
    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels
    
    for label in unique_labels:
        label_indices = [i for i in range(len(y_true)) 
                         if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')
        
    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [10]:
def predict(X_test, model, tokenizer):
    y_pred = []
    for i in tqdm(range(len(X_test))):
        prompt = X_test.iloc[i]["text"]
        input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(**input_ids, max_new_tokens=1, temperature=0.0)
        result = tokenizer.decode(outputs[0])
        answer = result.split("=")[-1].lower()
        if "positive" in answer:
            y_pred.append("positive")
        elif "negative" in answer:
            y_pred.append("negative")
        elif "neutral" in answer:
            y_pred.append("neutral")
        else:
            y_pred.append("none")
    return y_pred

In [11]:
y_pred = predict(X_test, model, tokenizer)

100%|██████████| 900/900 [02:29<00:00,  6.01it/s]


In [12]:
evaluate(y_true, y_pred)

Accuracy: 0.673
Accuracy for label 0: 0.777
Accuracy for label 1: 0.337
Accuracy for label 2: 0.907

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.78      0.84       300
           1       0.58      0.34      0.43       300
           2       0.58      0.91      0.71       300

    accuracy                           0.67       900
   macro avg       0.69      0.67      0.66       900
weighted avg       0.69      0.67      0.66       900


Confusion Matrix:
[[233  51  16]
 [ 17 101 182]
 [  5  23 272]]


In [13]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",],
)

training_arguments = TrainingArguments(
    output_dir="logs",
    num_train_epochs=5,
    gradient_checkpointing=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=False,
    evaluation_strategy='steps',
    eval_steps = 112,
    eval_accumulation_steps=1,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    args=training_arguments,
    packing=False,
)

Map: 100%|██████████| 900/900 [00:00<00:00, 6236.79 examples/s]
Map: 100%|██████████| 150/150 [00:00<00:00, 21127.87 examples/s]


In [14]:
# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained("../models/trained-gemma-model")

  4%|▍         | 25/560 [02:04<44:29,  4.99s/it]

{'loss': 1.696, 'grad_norm': 1.0175307989120483, 'learning_rate': 0.00019989290417745542, 'epoch': 0.22}


  9%|▉         | 50/560 [04:08<42:05,  4.95s/it]

{'loss': 0.8742, 'grad_norm': 0.8318800330162048, 'learning_rate': 0.00019818290217987587, 'epoch': 0.44}


 13%|█▎        | 75/560 [06:12<40:21,  4.99s/it]

{'loss': 0.8119, 'grad_norm': 0.6794980764389038, 'learning_rate': 0.00019442240532420587, 'epoch': 0.67}


 18%|█▊        | 100/560 [08:16<37:51,  4.94s/it]

{'loss': 0.7751, 'grad_norm': 0.694873034954071, 'learning_rate': 0.00018868994947865883, 'epoch': 0.89}


                                                 
 20%|██        | 112/560 [09:50<37:03,  4.96s/it]

{'eval_loss': 5.030557632446289, 'eval_runtime': 34.2263, 'eval_samples_per_second': 4.383, 'eval_steps_per_second': 0.555, 'epoch': 1.0}


 22%|██▏       | 125/560 [10:56<37:28,  5.17s/it]  

{'loss': 0.7189, 'grad_norm': 0.6813368797302246, 'learning_rate': 0.00018110525376921862, 'epoch': 1.11}


 27%|██▋       | 150/560 [13:04<42:26,  6.21s/it]

{'loss': 0.6234, 'grad_norm': 0.6721622943878174, 'learning_rate': 0.00017182672031282296, 'epoch': 1.33}


 31%|███▏      | 175/560 [15:45<33:10,  5.17s/it]

{'loss': 0.574, 'grad_norm': 0.7719905376434326, 'learning_rate': 0.00016104812607797202, 'epoch': 1.56}


 36%|███▌      | 200/560 [17:50<29:52,  4.98s/it]

{'loss': 0.5714, 'grad_norm': 0.5848060250282288, 'learning_rate': 0.00014899457596139729, 'epoch': 1.78}


                                                 
 40%|████      | 224/560 [20:23<28:19,  5.06s/it]

{'eval_loss': 3.614361047744751, 'eval_runtime': 32.4832, 'eval_samples_per_second': 4.618, 'eval_steps_per_second': 0.585, 'epoch': 1.99}


 40%|████      | 225/560 [20:28<1:22:31, 14.78s/it]

{'loss': 0.565, 'grad_norm': 0.6519268751144409, 'learning_rate': 0.0001359178015985163, 'epoch': 2.0}


 45%|████▍     | 250/560 [22:34<25:53,  5.01s/it]  

{'loss': 0.3378, 'grad_norm': 0.7876952290534973, 'learning_rate': 0.00012209090408937971, 'epoch': 2.22}


 49%|████▉     | 275/560 [24:39<23:59,  5.05s/it]

{'loss': 0.3342, 'grad_norm': 0.6421417593955994, 'learning_rate': 0.0001078026504353325, 'epoch': 2.44}


 54%|█████▎    | 300/560 [26:44<21:48,  5.03s/it]

{'loss': 0.3402, 'grad_norm': 0.9286759495735168, 'learning_rate': 9.335144280211066e-05, 'epoch': 2.67}


 58%|█████▊    | 325/560 [28:50<19:36,  5.00s/it]

{'loss': 0.3567, 'grad_norm': 0.621614396572113, 'learning_rate': 7.903908655793224e-05, 'epoch': 2.89}


                                                 
 60%|██████    | 336/560 [30:15<18:15,  4.89s/it]

{'eval_loss': 3.6823933124542236, 'eval_runtime': 31.461, 'eval_samples_per_second': 4.768, 'eval_steps_per_second': 0.604, 'epoch': 2.99}


 62%|██████▎   | 350/560 [31:24<17:30,  5.00s/it]

{'loss': 0.2661, 'grad_norm': 0.7258069515228271, 'learning_rate': 6.516448723761315e-05, 'epoch': 3.11}


 67%|██████▋   | 375/560 [33:27<15:07,  4.91s/it]

{'loss': 0.2052, 'grad_norm': 0.917387843132019, 'learning_rate': 5.2017408068077064e-05, 'epoch': 3.33}


 71%|███████▏  | 400/560 [35:29<12:56,  4.85s/it]

{'loss': 0.1863, 'grad_norm': 0.6624898314476013, 'learning_rate': 3.987241842583983e-05, 'epoch': 3.56}


 76%|███████▌  | 425/560 [37:33<11:20,  5.04s/it]

{'loss': 0.2016, 'grad_norm': 0.7086446285247803, 'learning_rate': 2.8983159609539646e-05, 'epoch': 3.78}


                                                 
 80%|████████  | 448/560 [40:02<09:29,  5.08s/it]

{'eval_loss': 4.016356468200684, 'eval_runtime': 32.6009, 'eval_samples_per_second': 4.601, 'eval_steps_per_second': 0.583, 'epoch': 3.98}


 80%|████████  | 450/560 [40:12<21:52, 11.93s/it]

{'loss': 0.204, 'grad_norm': 0.4562527537345886, 'learning_rate': 1.9577047683638873e-05, 'epoch': 4.0}


 85%|████████▍ | 475/560 [42:18<07:05,  5.01s/it]

{'loss': 0.1426, 'grad_norm': 0.6521354913711548, 'learning_rate': 1.1850524021436337e-05, 'epoch': 4.22}


 89%|████████▉ | 500/560 [44:23<05:01,  5.02s/it]

{'loss': 0.1434, 'grad_norm': 0.7630910277366638, 'learning_rate': 5.964952737136353e-06, 'epoch': 4.44}


 94%|█████████▍| 525/560 [46:28<02:56,  5.06s/it]

{'loss': 0.1407, 'grad_norm': 0.4852912425994873, 'learning_rate': 2.043250686804865e-06, 'epoch': 4.67}


 98%|█████████▊| 550/560 [48:33<00:50,  5.01s/it]

{'loss': 0.1367, 'grad_norm': 0.3765665888786316, 'learning_rate': 1.6732041875354709e-07, 'epoch': 4.89}


                                                 
100%|██████████| 560/560 [49:55<00:00,  5.35s/it]


{'eval_loss': 4.173113822937012, 'eval_runtime': 32.1804, 'eval_samples_per_second': 4.661, 'eval_steps_per_second': 0.59, 'epoch': 4.98}
{'train_runtime': 2995.6084, 'train_samples_per_second': 1.502, 'train_steps_per_second': 0.187, 'train_loss': 0.4581622526049614, 'epoch': 4.98}


In [15]:
# Save trained model
trainer.model.save_pretrained("../models/trained-gemma-model")

In [16]:
%load_ext tensorboard
%tensorboard --logdir logs/runs

Reusing TensorBoard on port 6006 (pid 7432), started 3 days, 6:03:21 ago. (Use '!kill 7432' to kill it.)

In [17]:
y_pred = predict(X_test, model, tokenizer)
evaluate(y_true, y_pred)

100%|██████████| 900/900 [03:00<00:00,  4.98it/s]

Accuracy: 0.876
Accuracy for label 0: 0.933
Accuracy for label 1: 0.867
Accuracy for label 2: 0.827

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.93      0.95       300
           1       0.79      0.87      0.83       300
           2       0.88      0.83      0.85       300

    accuracy                           0.88       900
   macro avg       0.88      0.88      0.88       900
weighted avg       0.88      0.88      0.88       900


Confusion Matrix:
[[280  18   2]
 [  9 260  31]
 [  2  50 248]]





In [18]:
evaluation = pd.DataFrame({'text': X_test["text"], 
                           'y_true':y_true, 
                           'y_pred': y_pred},
                         )
evaluation.to_csv("gemma_test_predictions.csv", index=False)