<a href="https://colab.research.google.com/github/khnhenriette/ProjectADL/blob/math-medium/notebooks/medium_fine_tune_math_hpo_small.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Hyper-Parameter optimization: Fine tune gpt2-medium for basic math tasks

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers datasets
!pip install torch
!pip install optuna


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

Use the dataset math_dataset.json that includes 20000 simple math examples of the form "89 minus 84 equals 5" using addition, subtraction, multiplication and division -- ensure dataset is uploaded to Google Colab before running

In [3]:
import pandas as pd
from datasets import Dataset

# Load the CSV dataset
df = pd.read_csv('math_dataset.csv')  # Update this path if using Google Drive

# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Shuffle the dataset
dataset = dataset.shuffle(seed=42)  # Ensure reproducibility with a fixed seed

# Split into training (5000 examples) and the rest
train_dataset = dataset.select(range(5000))
remaining_dataset = dataset.select(range(5000, len(dataset)))

# Split the remaining dataset into validation (500 examples) and test (500 examples)
eval_dataset = remaining_dataset.select(range(500))
test_dataset = remaining_dataset.select(range(500, 1000))

# Print dataset sizes
print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(eval_dataset)}")
print(f"Test examples: {len(test_dataset)}")


Training examples: 5000
Validation examples: 500
Test examples: 500


In [4]:
print(train_dataset[4])

{'math_problem': '13 minus 26 equals -13'}


### Use the Hugging Face Trainer to fine-tune the available gpt2-medium to perform better on the simple math tasks

In [5]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# Load GPT-2 Medium tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Start by finding right max_length for padding


In [6]:
# Check the longest sequence in your dataset
max_tokenized_length = max(len(tokenizer(example)['input_ids']) for example in dataset['math_problem'])
print(f"Maximum tokenized length: {max_tokenized_length}")


Maximum tokenized length: 8


In [7]:

# Set the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# choose max_length slightly higher than longest sequence in dataset
max_length = 32

# Tokenize the dataset
def tokenize_function(examples):
    tokenized_inputs = tokenizer(
        examples["math_problem"],
        truncation=True,
        padding="max_length",
        max_length=max_length
    )
    # Labels are the same as input_ids for causal language modeling
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs


In [8]:
# ensure padding and masking are done correctly

testing = tokenizer("3 plus 65 equals 68", padding="max_length", max_length=10)
print(testing)


{'input_ids': [18, 5556, 6135, 21767, 8257, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]}


In [13]:
# tokenize dataset

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [14]:
# mask eos tokens in labels to avoid their prediction in final outcome

def mask_eos_in_labels(example):
    example["labels"] = [token if token != tokenizer.eos_token_id else -100 for token in example["labels"]]
    return example

train_dataset = train_dataset.map(mask_eos_in_labels)
eval_dataset = eval_dataset.map(mask_eos_in_labels)
test_dataset = test_dataset.map(mask_eos_in_labels)


print(train_dataset[0])
print(eval_dataset[0])
print(test_dataset[0])

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

{'math_problem': '77 times 26 equals 2002', 'input_ids': [3324, 1661, 2608, 21767, 6244, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [3324, 1661, 2608, 21767, 6244, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]}
{'math_problem': '41 divided by 94 equals 0.44', 'input_ids': [3901, 9086, 416, 10048, 21767, 657, 13, 2598, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'label

Train with hyperparameter optimization using optuna

In [11]:
import optuna
from transformers import TrainingArguments, Trainer

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    batch_size = trial.suggest_categorical("batch_size", [4, 8, 16])
    num_epochs = trial.suggest_int("num_epochs", 1, 5)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./gpt2_finetuned_optuna",
        evaluation_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        num_train_epochs=num_epochs,
        save_steps=500,
        save_total_limit=2,
        logging_dir="./logs",
        logging_steps=50,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )

    # Train the model and evaluate on validation set
    trainer.train()
    eval_results = trainer.evaluate()

    # Use validation loss as the optimization metric
    return eval_results["eval_loss"]

# Run the hyperparameter search
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10, show_progress_bar=True)
print("Tested learning rates:", [trial.params["learning_rate"] for trial in study.trials])

# Best hyperparameters
print("Best hyperparameters according to HPO:", study.best_params)



[I 2024-12-06 18:32:27,916] A new study created in memory with name: no-name-08ef3c77-d776-4a1c-b497-702f2c8cfd8c


  0%|          | 0/10 [00:00<?, ?it/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,2.2017,2.1293
2,2.0601,2.055498


[I 2024-12-06 18:37:13,969] Trial 0 finished with value: 2.055497646331787 and parameters: {'learning_rate': 1.1793664936074914e-05, 'batch_size': 4, 'num_epochs': 2}. Best is trial 0 with value: 2.055497646331787.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.0654,2.157407
2,1.9305,2.014508
3,1.8005,1.947562
4,1.7139,1.924625
5,1.5497,1.93551


[I 2024-12-06 18:47:41,206] Trial 1 finished with value: 1.9355098009109497 and parameters: {'learning_rate': 5.2451161530399926e-05, 'batch_size': 4, 'num_epochs': 5}. Best is trial 1 with value: 1.9355098009109497.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.6526,2.034667
2,1.4837,2.0411
3,1.3635,2.066992


[I 2024-12-06 18:53:59,638] Trial 2 finished with value: 2.0669918060302734 and parameters: {'learning_rate': 5.48231492011936e-05, 'batch_size': 4, 'num_epochs': 3}. Best is trial 1 with value: 1.9355098009109497.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.3416,2.066382
2,1.3306,2.078532
3,1.3733,2.078708
4,1.3315,2.090487
5,1.2932,2.095432


[I 2024-12-06 18:59:45,215] Trial 3 finished with value: 2.0954315662384033 and parameters: {'learning_rate': 5.300314773918238e-06, 'batch_size': 8, 'num_epochs': 5}. Best is trial 1 with value: 1.9355098009109497.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.2336,2.250298
2,1.1024,2.270977


[I 2024-12-06 19:03:57,246] Trial 4 finished with value: 2.2709765434265137 and parameters: {'learning_rate': 3.742461233896312e-05, 'batch_size': 4, 'num_epochs': 2}. Best is trial 1 with value: 1.9355098009109497.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.1973,2.235416
2,1.2732,2.242908
3,1.2392,2.24465


[I 2024-12-06 19:07:23,354] Trial 5 finished with value: 2.24465012550354 and parameters: {'learning_rate': 2.5870143142934004e-06, 'batch_size': 8, 'num_epochs': 3}. Best is trial 1 with value: 1.9355098009109497.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.789,2.539587
2,1.0358,2.406805
3,1.2152,2.319804
4,1.2086,2.307731


[I 2024-12-06 19:15:49,401] Trial 6 finished with value: 2.3077313899993896 and parameters: {'learning_rate': 6.65174239816731e-06, 'batch_size': 4, 'num_epochs': 4}. Best is trial 1 with value: 1.9355098009109497.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.3357,2.238823
2,1.3044,2.221638
3,1.2711,2.284343
4,1.1861,2.3319
5,1.1012,2.365633


[I 2024-12-06 19:21:43,809] Trial 7 finished with value: 2.365633249282837 and parameters: {'learning_rate': 3.9060403786337064e-05, 'batch_size': 8, 'num_epochs': 5}. Best is trial 1 with value: 1.9355098009109497.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.6828,2.738264
2,0.8747,2.657483
3,1.0352,2.545393
4,1.0766,2.488992
5,1.0698,2.479054


[I 2024-12-06 19:33:33,955] Trial 8 finished with value: 2.4790537357330322 and parameters: {'learning_rate': 5.102478663560619e-06, 'batch_size': 4, 'num_epochs': 5}. Best is trial 1 with value: 1.9355098009109497.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.7976,2.67977


[I 2024-12-06 19:34:56,890] Trial 9 finished with value: 2.679769515991211 and parameters: {'learning_rate': 2.3325870504045444e-06, 'batch_size': 8, 'num_epochs': 1}. Best is trial 1 with value: 1.9355098009109497.
Tested learning rates: [1.1793664936074914e-05, 5.2451161530399926e-05, 5.48231492011936e-05, 5.300314773918238e-06, 3.742461233896312e-05, 2.5870143142934004e-06, 6.65174239816731e-06, 3.9060403786337064e-05, 5.102478663560619e-06, 2.3325870504045444e-06]
Best hyperparameters according to HPO: {'learning_rate': 5.2451161530399926e-05, 'batch_size': 4, 'num_epochs': 5}


In [12]:
stop_execution()

NameError: name 'stop_execution' is not defined

Check the HPO outcomes and choose the best hyperparameters: the code above does not take into account overfitting so make sure to choose well!

In [None]:
best_params = {'learning_rate': , 'batch_size': , 'num_epochs': }

In [15]:
# fine tune using best HPs

best_hyperparams = study.best_params

training_args = TrainingArguments(
    output_dir="./gpt2_finetuned_hpo_small",
    evaluation_strategy="epoch",
    learning_rate=best_hyperparams["learning_rate"],
    per_device_train_batch_size=best_hyperparams["batch_size"],
    num_train_epochs=best_hyperparams["num_epochs"],
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.9935,2.667672
2,1.1252,2.459021
3,1.1838,2.457612
4,1.1313,2.478989
5,1.0549,2.514518


TrainOutput(global_step=6250, training_loss=1.0241088479614258, metrics={'train_runtime': 648.2968, 'train_samples_per_second': 38.563, 'train_steps_per_second': 9.641, 'total_flos': 1451094835200000.0, 'train_loss': 1.0241088479614258, 'epoch': 5.0})

In [16]:
# verify tokens are set consistently

print(f"Pad token ID: {tokenizer.pad_token_id}")
print(f"EOS token ID: {tokenizer.eos_token_id}")


Pad token ID: 50256
EOS token ID: 50256


In [17]:
# Try out the model

# Function to predict the next token
def predict_next_token(input_text):
    # Tokenize and move input to GPU
    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
    # Generate only the next token
    outputs = model.generate(
        **inputs,
        max_new_tokens=1,  # Generate only one token
        pad_token_id=tokenizer.pad_token_id,  # Ensure padding is handled
        eos_token_id=tokenizer.eos_token_id  # Set EOS token explicitly
    )
    # Decode and return the output
    return tokenizer.decode(outputs[0])

# Test cases
input_texts = [
    "76 plus 24 equals",
    "80 minus 24 equals",
    "4 times 10 equals",
    "40 divided by 5 equals"
]

# Generate and print the next token for each input
for input_text in input_texts:
    next_token = predict_next_token(input_text)
    print(f"Input: {input_text}\nNext Token: {next_token}\n")


Input: 76 plus 24 equals
Next Token: 76 plus 24 equals 100

Input: 80 minus 24 equals
Next Token: 80 minus 24 equals 52

Input: 4 times 10 equals
Next Token: 4 times 10 equals 20

Input: 40 divided by 5 equals
Next Token: 40 divided by 5 equals 10



In [18]:
# get performance estiamtion in terms of accuracy

from sklearn.metrics import accuracy_score

# Load tokenized test dataset (example format; replace with your actual data)
# Each entry is a dictionary with 'input_ids' and 'labels', and padding tokens are set to -100 in 'labels'.
test_data = test_dataset
print(test_data[0])

# Prepare data for evaluation
prompts = []
correct_token_dec = []

for item in test_data:
    # Find the last non-padding token (label not equal to -100)
    last_valid_index = len(item['labels']) - 1 - item['labels'][::-1].index(-100)
    prompts.append(item['input_ids'][:last_valid_index])  # Input up to but not including the last token
    correct_token_dec.append(item['labels'][tokenizer.decode(last_valid_index)])  # The last valid label decoded into output

# Evaluate accuracy for predicting the last valid token
predicted_tokens = []
for prompt in prompts:
    next_token = predict_next_token(prompt)
    predicted_tokens.append(next_token)

# check everything is stored correctly -- should be two numbers (ideally the same)
print(correct_token_dec[0])
print(predicted_tokens[0])

# Calculate accuracy
accuracy = accuracy_score(correct_token_dec, predicted_tokens)
accuracy_percentage = accuracy * 100

# Display results
print(f"Accuracy for predicting the last valid token: {accuracy_percentage:.2f}%")



{'math_problem': '29 plus 3 equals 32', 'input_ids': [1959, 5556, 513, 21767, 3933, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [1959, 5556, 513, 21767, 3933, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]}


TypeError: list indices must be integers or slices, not str

Save the fine-tuned model for future use

In [None]:
# Save the model and tokenizer
model.save_pretrained("./gpt2_finetuned_hpo_small")
tokenizer.save_pretrained("./gpt2_finetuned_hpo_small")


In [None]:
!zip -r gpt2_finetuned_hpo_small.zip ./gpt2_finetuned_hpo_small


In [None]:
!mv gpt2_finetuned_hpo_small.zip /content/drive/MyDrive/
