<a href="https://colab.research.google.com/github/khnhenriette/ProjectADL/blob/math-medium/notebooks/medium_fine_tune_math_hpo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Hyper-Parameter optimization: Fine tune gpt2-medium for basic math tasks

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers datasets
!pip install torch
!pip install optuna


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

Use the dataset math_dataset.json that includes 10000 simple math examples of the form "89 minus 84 equals 5" using addition, subtraction, multiplication and division -- ensure dataset is uploaded to Google Colab before running

In [3]:
import pandas as pd
from datasets import Dataset

# Load the CSV dataset
df = pd.read_csv('math_dataset.csv')  # Update this path if using Google Drive

# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split dataset into training and validation sets
split_dataset = dataset.train_test_split(test_size=0.1)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(eval_dataset)}")


Training examples: 9000
Validation examples: 1000


In [4]:
print(train_dataset[4])

{'math_problem': '49 times 79 equals 3871'}


### Use the Huggin Face Trainer to fine-tune the available gpt2-medium to perform better on the simple math tasks

In [5]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# Load GPT-2 Medium tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Start by finding right max_length for padding


In [6]:
# Check the longest sequence in your dataset
max_tokenized_length = max(len(tokenizer(example)['input_ids']) for example in dataset['math_problem'])
print(f"Maximum tokenized length: {max_tokenized_length}")


Maximum tokenized length: 8


In [7]:

# Set the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# choose max_length slightly higher than longest sequence in dataset
max_length = 32

# Tokenize the dataset
def tokenize_function(examples):
    tokenized_inputs = tokenizer(
        examples["math_problem"],
        truncation=True,
        padding="max_length",
        max_length=max_length
    )
    # Labels are the same as input_ids for causal language modeling
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs


In [8]:
# ensure padding and masking are done correctly

testing = tokenizer("3 plus 65 equals 68", padding="max_length", max_length=10)
print(testing)


{'input_ids': [18, 5556, 6135, 21767, 8257, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]}


In [9]:
# tokenize dataset

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [10]:
# mask eos tokens in labels to avoid their prediction in final outcome

def mask_eos_in_labels(example):
    example["labels"] = [token if token != tokenizer.eos_token_id else -100 for token in example["labels"]]
    return example

train_dataset = train_dataset.map(mask_eos_in_labels)
eval_dataset = eval_dataset.map(mask_eos_in_labels)


print(train_dataset[0])
print(eval_dataset[0])


Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'math_problem': '42 plus 65 equals 107', 'input_ids': [3682, 5556, 6135, 21767, 16226, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [3682, 5556, 6135, 21767, 16226, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]}
{'math_problem': '83 times 14 equals 1162', 'input_ids': [5999, 1661, 1478, 21767, 1367, 5237, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'label

Train with hyperparameter optimization using optuna

In [None]:
import optuna
from transformers import TrainingArguments, Trainer

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    batch_size = trial.suggest_categorical("batch_size", [4, 8, 16])
    num_epochs = trial.suggest_int("num_epochs", 1, 5)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./gpt2_finetuned_optuna",
        evaluation_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        num_train_epochs=num_epochs,
        save_steps=500,
        save_total_limit=2,
        logging_dir="./logs",
        logging_steps=50,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )

    # Train the model and evaluate on validation set
    trainer.train()
    eval_results = trainer.evaluate()

    # Use validation loss as the optimization metric
    return eval_results["eval_loss"]

# Run the hyperparameter search
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10, show_progress_bar=True)
print("Tested learning rates:", [trial.params["learning_rate"] for trial in study.trials])

# Best hyperparameters
print("Best hyperparameters:", study.best_params)


[I 2024-11-27 18:45:16,762] A new study created in memory with name: no-name-aaab440e-f7cf-444c-a8cc-7748707a21fd


  0%|          | 0/10 [00:00<?, ?it/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,2.4911,2.397176
2,2.4693,2.325046
3,2.3904,2.276838
4,2.3993,2.248079
5,2.3502,2.238085


[I 2024-11-27 19:14:32,657] Trial 0 finished with value: 2.2380847930908203 and parameters: {'learning_rate': 1.3024524206379397e-06, 'batch_size': 16, 'num_epochs': 5}. Best is trial 0 with value: 2.2380847930908203.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.1528,2.082261


[I 2024-11-27 19:21:01,471] Trial 1 finished with value: 2.082261323928833 and parameters: {'learning_rate': 8.80437443303844e-06, 'batch_size': 16, 'num_epochs': 1}. Best is trial 1 with value: 2.082261323928833.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.1434,1.988409
2,1.9959,1.915849
3,1.9274,1.869949
4,1.9082,1.846482
5,1.8965,1.831837


[I 2024-11-27 20:23:10,142] Trial 2 finished with value: 1.8318371772766113 and parameters: {'learning_rate': 7.2377078649501456e-06, 'batch_size': 4, 'num_epochs': 5}. Best is trial 2 with value: 1.8318371772766113.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.8097,1.83049


[I 2024-11-27 20:31:56,655] Trial 3 finished with value: 1.8304897546768188 and parameters: {'learning_rate': 1.4650457024789522e-06, 'batch_size': 8, 'num_epochs': 1}. Best is trial 3 with value: 1.8304897546768188.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.8519,1.835168
2,1.7768,1.828081
3,1.7588,1.819652
4,1.7996,1.804654
5,1.8568,1.800438


[I 2024-11-27 21:30:16,614] Trial 4 finished with value: 1.8004378080368042 and parameters: {'learning_rate': 2.924875140850028e-06, 'batch_size': 4, 'num_epochs': 5}. Best is trial 4 with value: 1.8004378080368042.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.7067,1.818013
2,1.6618,1.821672
3,1.6695,1.815099


[I 2024-11-27 22:06:46,042] Trial 5 finished with value: 1.8150991201400757 and parameters: {'learning_rate': 1.4606046405416623e-06, 'batch_size': 4, 'num_epochs': 3}. Best is trial 4 with value: 1.8004378080368042.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.7077,1.829992


[I 2024-11-27 22:18:40,674] Trial 6 finished with value: 1.829992413520813 and parameters: {'learning_rate': 1.1381137735171453e-05, 'batch_size': 4, 'num_epochs': 1}. Best is trial 4 with value: 1.8004378080368042.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.7034,1.836629
2,1.6317,1.830623
3,1.6728,1.818369


[I 2024-11-27 22:54:06,685] Trial 7 finished with value: 1.8183691501617432 and parameters: {'learning_rate': 1.4795418030208947e-06, 'batch_size': 4, 'num_epochs': 3}. Best is trial 4 with value: 1.8004378080368042.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss


In [None]:
# fine tune using best HPs

best_hyperparams = study.best_params

training_args = TrainingArguments(
    output_dir="./gpt2_finetuned_hpo",
    evaluation_strategy="epoch",
    learning_rate=best_hyperparams["learning_rate"],
    per_device_train_batch_size=best_hyperparams["batch_size"],
    num_train_epochs=best_hyperparams["num_epochs"],
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

trainer.train()


Save the fine-tuned model for future use

In [None]:
# Save the model and tokenizer
model.save_pretrained("./gpt2_finetuned_hpo")
tokenizer.save_pretrained("./gpt2_finetuned_hpo")


In [None]:
!zip -r gpt2_finetuned_hpo.zip ./gpt2_finetuned_hpo


In [None]:
!mv gpt2_math_finetuned_new.zip /content/drive/MyDrive/


In [None]:
# try out the model

input_text = "76 plus 24 equals"
inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0]))


"""

# Generate text
input_text = "76 plus 24 equals"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

# Generate with adjusted parameters
outputs = model.generate(
    **inputs,
    max_new_tokens=10,  # Limit the number of tokens generated
    pad_token_id=tokenizer.pad_token_id,  # Ensure padding is correctly handled
    eos_token_id=tokenizer.eos_token_id,  # Set an explicit EOS token
)

# Decode and remove `<|endoftext|>` manually
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
print(decoded_output)

"""

In [None]:
# verify tokens are set consistently

print(f"Pad token ID: {tokenizer.pad_token_id}")
print(f"EOS token ID: {tokenizer.eos_token_id}")
