# 파인튜닝 최적화하기

In [2]:
!pip install pynvml
!pip install datasets



# Investigating Fine-Tuning Techniques

In [3]:
from datasets import load_dataset
from transformers import (
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
import torch.nn.functional as F
import torch
from pynvml import (
    nvmlInit,
    nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetMemoryInfo,
    nvmlDeviceGetCount,
    nvmlDeviceGetName,
)
import random

# GPU Benchmarking Utilities

In [4]:
def print_gpu_utilization():
    """Print the name and used memory (in MB) of every GPU reported by NVML.

    Does nothing when ``torch.cuda.is_available()`` is false. After the
    report, ``torch.cuda.empty_cache()`` is called.
    """
    if not torch.cuda.is_available():
        return

    # Imported locally because the notebook's import cell does not bring
    # this name into scope.
    from pynvml import nvmlShutdown

    nvmlInit()
    try:
        for i in range(nvmlDeviceGetCount()):
            handle = nvmlDeviceGetHandleByIndex(i)
            info = nvmlDeviceGetMemoryInfo(handle)
            name = nvmlDeviceGetName(handle)
            # Depending on the pynvml release the name may come back as bytes;
            # decode it so the output reads the same either way.
            if isinstance(name, bytes):
                name = name.decode("utf-8", errors="replace")
            print("Device", i, ":", name)
            print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Pair every nvmlInit() with nvmlShutdown(), even if a query fails.
        nvmlShutdown()
    torch.cuda.empty_cache()


def print_summary(result):
    """Print the runtime and throughput of a finished training run.

    Args:
        result: Object exposing a ``metrics`` mapping with the keys
            ``train_runtime`` and ``train_samples_per_second`` (in this
            notebook, the value returned by ``trainer.train()``).

    The two timing lines are printed on every device, so CPU runs are
    summarised too; the GPU memory report is added only when CUDA is available.
    """
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    if torch.cuda.is_available():
        print_gpu_utilization()

In [5]:
# Parallel accumulators filled by the training cells below:
# each run appends its display name to `labels` and its train() output to `results`.
labels, results = [], []

In [6]:
# Pick the compute device: CUDA when PyTorch reports one, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
# DEVICE = 'cuda:1'
print("Using device", DEVICE)

# Report GPU memory before and after placing a tiny tensor on the device.
if torch.cuda.is_available():
    print_gpu_utilization()
    torch.ones((1, 1)).to(DEVICE)
    print_gpu_utilization()

Using device cpu


# Load Model and Tokenizer

In [7]:
# Tokenizer for the bert-base-cased checkpoint, with model_max_length set to 512
# and cache_dir pointing at ./cache/.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-cased",
    model_max_length=512,
    cache_dir="./cache/",
)
# tokenizer.pad_token_id = tokenizer.eos_token_id

from transformers import BertConfig, BertForSequenceClassification

# Alternative kept for reference: build the classifier from an explicit BertConfig.
# config = BertConfig.from_pretrained("bert-base-uncased")
# config.max_position_embeddings = 512

# model = BertForSequenceClassification.from_pretrained("bert-base-uncased", config=config)

# Two-label sequence classifier loaded from the same checkpoint, then moved to DEVICE.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased", num_labels=2
)
model = model.to(DEVICE)
model.config.use_cache = False
# model.config.pad_token_id = tokenizer.pad_token_id

# Show GPU memory after the model has been placed on the device.
if torch.cuda.is_available():
    print_gpu_utilization()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Data Prep

In [8]:
# Use the first 500 training and first 200 test examples of tweet_eval/offensive.
# split = ["train[:1%]", "test[:1%]"]
split = ["train[:500]", "test[:200]"]
raw_train, raw_test = load_dataset("tweet_eval", "offensive", split=split)

# Peek at the data: dataset size, then text, label and token ids of example 2.
print(len(raw_train))
print(raw_train[2]["text"])
print(raw_train[2]["label"])
print(tokenizer.encode(raw_train[2]["text"]))


def _tokenize_fixed_length(batch):
    """Tokenize a batch of texts with truncation and padding="max_length"."""
    return tokenizer(batch["text"], truncation=True, padding="max_length")


# Both splits go through the same tokenization, applied batch-wise.
train = raw_train.map(_tokenize_fixed_length, batched=True)
test = raw_test.map(_tokenize_fixed_length, batched=True)

500
@user Get him some line help. He is gonna be just fine. As the game went on you could see him progressing more with his reads. He brought what has been missing. The deep ball presence. Now he just needs a little more time
0
[101, 137, 4795, 3949, 1140, 1199, 1413, 1494, 119, 1124, 1110, 6100, 1129, 1198, 2503, 119, 1249, 1103, 1342, 1355, 1113, 1128, 1180, 1267, 1140, 5070, 1158, 1167, 1114, 1117, 9568, 119, 1124, 1814, 1184, 1144, 1151, 3764, 119, 1109, 1996, 3240, 2915, 119, 1986, 1119, 1198, 2993, 170, 1376, 1167, 1159, 102]


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

# Vanilla Training

In [9]:
# Baseline TrainingArguments settings; later cells modify this dict in place.
default_args = dict(
    output_dir="testing_hyper_params",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    evaluation_strategy="epoch",
    report_to="none",
)

training_args = TrainingArguments(**default_args)
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train,
    eval_dataset=test,
)
# Evaluate once before any training; as the cell's last expression,
# the returned metrics become the cell output.
trainer.evaluate()

{'eval_loss': 0.6930201053619385,
 'eval_runtime': 7.7484,
 'eval_samples_per_second': 25.812,
 'eval_steps_per_second': 3.226}

In [10]:
# Run the baseline training pass and record it under the 'Vanilla' label.
result = trainer.train()
results.append(result)
labels.append('Vanilla')
print_summary(result)

Epoch,Training Loss,Validation Loss
1,1.3399,1.393136


Checkpoint destination directory testing_hyper_params/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


# Compiling the model using torch.compile

In [11]:
# compiled_model = torch.compile(model)

# trainer = Trainer(
#     model=compiled_model,
#     train_dataset=train,
#     eval_dataset=test,
#     tokenizer=tokenizer,
#     args=training_args,
# )
# trainer.evaluate()

In [12]:
# result = trainer.train()
# labels.append('Compiled')
# results.append(result)
# print_summary(result)

# Adjust Batch size to 4

In [13]:
# Reload the pretrained checkpoint, replacing the model used by the previous run.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased", num_labels=2
)
model = model.to(DEVICE)
model.config.use_cache = False

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Raise the per-device batch size to 4; every other setting is left as it was.
default_args.update(per_device_train_batch_size=4)
print(default_args)

training_args = TrainingArguments(**default_args)
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train,
    eval_dataset=test,
)

# Train and record the run under its display label.
result = trainer.train()
results.append(result)
labels.append('Batch Size = 4')
print_summary(result)

{'output_dir': 'testing_hyper_params', 'num_train_epochs': 1, 'per_device_train_batch_size': 4, 'evaluation_strategy': 'epoch', 'report_to': 'none'}


Epoch,Training Loss,Validation Loss


# Gradient Accumulation (Adjust Effective Batch size to 4)

In [None]:
# Reload the pretrained checkpoint, replacing the model used by the previous run.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased", num_labels=2
)
model = model.to(DEVICE)
model.config.use_cache = False

In [None]:
# Batch size back to 1, with gradients accumulated over 4 steps
# (1 x 4 = the effective batch size of 4 named in the section heading).
default_args.update(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
)
print(default_args)

training_args = TrainingArguments(**default_args)
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train,
    eval_dataset=test,
)

# Train and record the run under its display label.
result = trainer.train()
results.append(result)
labels.append('Gradient Accumulation = 4')
print_summary(result)

# Mixed Precision

In [None]:
# Reload the pretrained checkpoint, replacing the model used by the previous run.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased", num_labels=2
)
model = model.to(DEVICE)
model.config.use_cache = False

In [None]:
# Turn on fp16 and reset accumulation and batch size to 1 so that only the
# precision setting differs from the baseline arguments.
# NOTE(review): fp16=True is presumably rejected by TrainingArguments when no
# CUDA device is present — confirm before running this cell on CPU.
default_args.update(
    fp16=True,
    gradient_accumulation_steps=1,
    per_device_train_batch_size=1,
)
# default_args["gradient_checkpointing"] = False

print(default_args)
training_args = TrainingArguments(**default_args)
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train,
    eval_dataset=test,
)

# Train and record the run under its display label.
result = trainer.train()
results.append(result)
labels.append('Mixed Precision')
print_summary(result)

# Dynamic Padding & Uniform Length Batching

In [None]:
# Reload the pretrained checkpoint, replacing the model used by the previous run.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased", num_labels=2
)
model = model.to(DEVICE)
model.config.use_cache = False

In [None]:
# Padding moves from tokenization time to batch time via a padding collator.
# NOTE(review): the section heading also mentions uniform length batching, but
# no length-grouping option (e.g. group_by_length) is set here — confirm intent.
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")


def _tokenize_unpadded(batch):
    """Tokenize a batch of texts with truncation only (no padding argument)."""
    return tokenizer(batch["text"], truncation=True)


# Re-tokenize both splits without padding; `train` and `test` are rebound here.
train = raw_train.map(_tokenize_unpadded, batched=True)
test = raw_test.map(_tokenize_unpadded, batched=True)
default_args["fp16"] = False

print(default_args)
training_args = TrainingArguments(**default_args)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train,
    eval_dataset=test,
    data_collator=collate_fn,
)

# Train and record the run under its display label.
result = trainer.train()
results.append(result)
labels.append('Dynamic Padding')
print_summary(result)

# Choosing params for my GPUs

In [None]:
# Combined configuration: fp16 on, gradient checkpointing off,
# per-device batch size 4 and 4 gradient accumulation steps.
default_args.update(
    fp16=True,
    gradient_checkpointing=False,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
)


In [None]:
# Reload the pretrained checkpoint, replacing the model used by the previous run.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased", num_labels=2
)
model = model.to(DEVICE)
model.config.use_cache = False

print(default_args)
training_args = TrainingArguments(**default_args)

# `train`, `test` and `collate_fn` are whatever the earlier cells last bound
# them to (the dynamic-padding cell when the notebook is run top to bottom).
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train,
    eval_dataset=test,
    data_collator=collate_fn,
)

# Train and record the run under its display label.
result = trainer.train()
results.append(result)
labels.append('BS=4 + Grad. Accum = 4 + MP + DP')
print_summary(result)

In [None]:
# Check that the two accumulators hold the same number of entries.
(len(labels), len(results))

In [None]:
# Display the most recently recorded training result.
results[-1]

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Training runtime of every recorded run, in the order the runs were appended.
values1 = [r.metrics['train_runtime'] for r in results]
# values2 = [r.metrics['train_samples_per_second'] for r in results]

# Hard-coded memory readings, one entry per run, each written as a sum of two numbers.
# NOTE(review): presumably copied by hand from print_gpu_utilization() output (MB)
# of a two-GPU run — confirm, and refresh whenever the experiments are re-run.
memory_used_mb = [1744+4271, 3818+5787, 1940+4275, 1946+4326, 1258+4117, 1632+ 4159]  # memory
# NOTE(review): 24 * 1024 looks like a 24 GB card expressed in MB — confirm.
TOTAL_GPU_MEMORY_MB = 24 * 1024
# Memory as a percentage of the reference total.
values2 = [(v / TOTAL_GPU_MEMORY_MB * 100) for v in memory_used_mb]

# The memory list is maintained by hand, so check it against the recorded runs
# up front instead of failing inside matplotlib with a shape-mismatch error.
if not (len(labels) == len(values1) == len(values2)):
    raise ValueError(
        f"Cannot plot: {len(labels)} labels, {len(values1)} runtimes and "
        f"{len(values2)} memory readings; all three must have the same length."
    )

barWidth = 0.3

# x positions: runtime bars at 0..n-1, memory bars shifted right by one bar width
r1 = np.arange(len(values1))
r2 = [x + barWidth for x in r1]

# create figure and axis objects
fig, ax1 = plt.subplots(figsize=(14, 10))  # Increase the figure size here

# left axis: runtime bars
color = 'darkblue'
ax1.set_xlabel('Parameter Combination', fontweight='bold', fontsize=16)
ax1.set_ylabel('Total Seconds to run', color=color, fontweight='bold', fontsize=16)
ax1.bar(r1, values1, color=color, width=barWidth, edgecolor='grey', label='Key1', hatch='\\')  # Adding hatch pattern
ax1.tick_params(axis='y', labelcolor=color)
# centre each tick between the two bars of its group
ax1.set_xticks([r + barWidth/2 for r in range(len(values1))])
ax1.set_xticklabels(labels, rotation=30, ha='right', fontsize=16)  # Rotate labels less and align right

# right axis (shares the x-axis): memory bars
ax2 = ax1.twinx()
color = 'red'
ax2.set_ylabel('GPU Memory % Used', color=color, fontweight='bold', fontsize=14)  # we already handled the x-label with ax1
ax2.bar(r2, values2, color=color, width=barWidth, edgecolor='grey', label='Key2', hatch='//')  # Adding hatch pattern
ax2.tick_params(axis='y', labelcolor=color)
ax2.grid(False)
ax1.grid(False)

plt.title('Comparing Open-source Training Hyperparameters', fontsize=20)
fig.tight_layout(pad=2.0)  # Increase padding

# Save the plot as a PNG
fig.savefig('Training_Comparison.png', dpi=1000, bbox_inches='tight')

plt.show()
