<a href="https://colab.research.google.com/github/khnhenriette/ProjectADL/blob/math-medium/notebooks/medium_fine_tune_math_hpo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Fine tune gpt2-medium for basic math tasks

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers datasets
!pip install torch


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

Use the dataset math_dataset.json that includes 10000 simple math examples of the form "89 minus 84 equals 5" using addition, subtraction, multiplication and division -- ensure dataset is uploaded to Google Colab before running

In [3]:
import pandas as pd
from datasets import Dataset

# Load the CSV dataset
df = pd.read_csv('math_dataset.csv')  # Update this path if using Google Drive

# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split dataset into training and validation sets
split_dataset = dataset.train_test_split(test_size=0.1)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(eval_dataset)}")


Training examples: 9000
Validation examples: 1000


In [4]:
print(train_dataset[4])

{'math_problem': '87 minus 9 equals 78'}


### Use the Huggin Face Trainer to fine-tune the available gpt2-medium to perform better on the simple math tasks

In [6]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

# Load GPT-2 Medium tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Start by finding right max_length for padding


In [7]:
# Check the longest sequence in your dataset
max_tokenized_length = max(len(tokenizer(example)['input_ids']) for example in dataset['math_problem'])
print(f"Maximum tokenized length: {max_tokenized_length}")


Maximum tokenized length: 8


In [8]:

# Set the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# choose max_length slightly higher than longest sequence in dataset
max_length = 32

# Tokenize the dataset
def tokenize_function(examples):
    tokenized_inputs = tokenizer(
        examples["math_problem"],
        truncation=True,
        padding="max_length",
        max_length=max_length
    )
    # Labels are the same as input_ids for causal language modeling
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs


In [9]:
# ensure padding and masking are done correctly

testing = tokenizer("3 plus 65 equals 68", padding="max_length", max_length=10)
print(testing)


{'input_ids': [18, 5556, 6135, 21767, 8257, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]}


In [10]:
# tokenize dataset

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [11]:
# mask eos tokens in labels to avoid their prediction in final outcome

def mask_eos_in_labels(example):
    example["labels"] = [token if token != tokenizer.eos_token_id else -100 for token in example["labels"]]
    return example

train_dataset = train_dataset.map(mask_eos_in_labels)
eval_dataset = eval_dataset.map(mask_eos_in_labels)


print(train_dataset[0])
print(eval_dataset[0])


Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'math_problem': '2 plus 74 equals 76', 'input_ids': [17, 5556, 8915, 21767, 8684, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [17, 5556, 8915, 21767, 8684, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]}
{'math_problem': '10 divided by 73 equals 0.14', 'input_ids': [940, 9086, 416, 8854, 21767, 657, 13, 1415, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [940, 

In [12]:

# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2_math_finetuned",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,
    push_to_hub=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Start training
trainer.train()


  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,2.0484,2.01829
2,1.9003,1.901672
3,1.7842,1.812512
4,1.628,1.767855
5,1.5401,1.752768


TrainOutput(global_step=5625, training_loss=1.846523647732205, metrics={'train_runtime': 2233.1031, 'train_samples_per_second': 20.151, 'train_steps_per_second': 2.519, 'total_flos': 2611970703360000.0, 'train_loss': 1.846523647732205, 'epoch': 5.0})

Save the fine-tuned model for future use

In [13]:
# Save the model and tokenizer
model.save_pretrained("./gpt2_math_finetuned")
tokenizer.save_pretrained("./gpt2_math_finetuned")


('./gpt2_math_finetuned/tokenizer_config.json',
 './gpt2_math_finetuned/special_tokens_map.json',
 './gpt2_math_finetuned/vocab.json',
 './gpt2_math_finetuned/merges.txt',
 './gpt2_math_finetuned/added_tokens.json')

In [14]:
!zip -r gpt2_math_finetuned_new.zip ./gpt2_math_finetuned


  adding: gpt2_math_finetuned/ (stored 0%)
  adding: gpt2_math_finetuned/tokenizer_config.json (deflated 55%)
  adding: gpt2_math_finetuned/special_tokens_map.json (deflated 74%)
  adding: gpt2_math_finetuned/merges.txt (deflated 53%)
  adding: gpt2_math_finetuned/checkpoint-5500/ (stored 0%)
  adding: gpt2_math_finetuned/checkpoint-5500/rng_state.pth (deflated 25%)
  adding: gpt2_math_finetuned/checkpoint-5500/tokenizer_config.json (deflated 55%)
  adding: gpt2_math_finetuned/checkpoint-5500/special_tokens_map.json (deflated 74%)
  adding: gpt2_math_finetuned/checkpoint-5500/trainer_state.json (deflated 82%)
  adding: gpt2_math_finetuned/checkpoint-5500/merges.txt (deflated 53%)
  adding: gpt2_math_finetuned/checkpoint-5500/vocab.json (deflated 68%)
  adding: gpt2_math_finetuned/checkpoint-5500/model.safetensors (deflated 7%)
  adding: gpt2_math_finetuned/checkpoint-5500/training_args.bin (deflated 51%)
  adding: gpt2_math_finetuned/checkpoint-5500/scheduler.pt (deflated 55%)
  adding

In [15]:
!mv gpt2_math_finetuned_new.zip /content/drive/MyDrive/


In [16]:
# try out the model

# Generate text
input_text = "76 plus 24 equals"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

# Generate with adjusted parameters
outputs = model.generate(
    **inputs,
    max_new_tokens=10,  # Limit the number of tokens generated
    pad_token_id=tokenizer.pad_token_id,  # Ensure padding is correctly handled
    eos_token_id=tokenizer.eos_token_id,  # Set an explicit EOS token
)

# Decode and remove `<|endoftext|>` manually
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
print(decoded_output)



76 plus 24 equals 104 equals 132 176 equals 14816 equals 14844


In [17]:
# verify tokens are set consistently

print(f"Pad token ID: {tokenizer.pad_token_id}")
print(f"EOS token ID: {tokenizer.eos_token_id}")


Pad token ID: 50256
EOS token ID: 50256
