<a href="https://colab.research.google.com/github/khnhenriette/ProjectADL/blob/math-medium/notebooks/medium_fine_tune_math.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Fine tune gpt2-medium for basic math tasks

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers datasets
!pip install torch


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

Use the dataset math_dataset.json that includes 10000 simple math examples of the form "89 minus 84 equals 5" using addition, subtraction, multiplication and division -- ensure dataset is uploaded to Google Colab before running

In [3]:
import pandas as pd
from datasets import Dataset

# Load the CSV dataset
df = pd.read_csv('math_dataset.csv')  # Update this path if using Google Drive

# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split dataset into training and validation sets
split_dataset = dataset.train_test_split(test_size=0.1)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(eval_dataset)}")


Training examples: 9000
Validation examples: 1000


In [4]:
print(train_dataset[4])

{'math_problem': '66 times 17 equals 1122'}


### Use the Huggin Face Trainer to fine-tune the available gpt2-medium to perform better on the simple math tasks

Start by finding right max_length for padding


In [12]:
# Check the longest sequence in your dataset
max_tokenized_length = max(len(tokenizer(example)['input_ids']) for example in dataset['math_problem'])
print(f"Maximum tokenized length: {max_tokenized_length}")


Maximum tokenized length: 8


In [11]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

# Load GPT-2 Medium tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

# Set the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# choose max_length slightly higher than longest sequence in dataset
max_length = 32

# Tokenize the dataset
def tokenize_function(examples):
    tokenized_inputs = tokenizer(
        examples["math_problem"],
        truncation=True,
        padding="max_length",
        max_length=max_length
    )
    # Labels are the same as input_ids for causal language modeling
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs


In [13]:
# ensure padding and masking are done correctly

testing = tokenizer("3 plus 65 equals 68", padding="max_length", max_length=10)
print(testing)


{'input_ids': [18, 5556, 6135, 21767, 8257, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]}


In [14]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

print(train_dataset[0])
print(eval_dataset[0])


Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'math_problem': '58 divided by 84 equals 0.69', 'input_ids': [3365, 9086, 416, 9508, 21767, 657, 13, 3388, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [3365, 9086, 416, 9508, 21767, 657, 13, 3388, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]}
{'math_problem': '87 minus 1 equals 86', 'input_ids': [5774, 20208, 352, 21767, 9849, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [15]:

# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2_math_finetuned",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,
    push_to_hub=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Start training
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.3262,0.327398
2,0.3006,0.305502
3,0.2811,0.290017


TrainOutput(global_step=3375, training_loss=0.3266779351411042, metrics={'train_runtime': 1473.3078, 'train_samples_per_second': 18.326, 'train_steps_per_second': 2.291, 'total_flos': 1567182422016000.0, 'train_loss': 0.3266779351411042, 'epoch': 3.0})

Save the fine-tuned model for future use

In [16]:
# Save the model and tokenizer
model.save_pretrained("./gpt2_math_finetuned")
tokenizer.save_pretrained("./gpt2_math_finetuned")


('./gpt2_math_finetuned/tokenizer_config.json',
 './gpt2_math_finetuned/special_tokens_map.json',
 './gpt2_math_finetuned/vocab.json',
 './gpt2_math_finetuned/merges.txt',
 './gpt2_math_finetuned/added_tokens.json')

In [17]:
!zip -r gpt2_math_finetuned_new.zip ./gpt2_math_finetuned


  adding: gpt2_math_finetuned/ (stored 0%)
  adding: gpt2_math_finetuned/checkpoint-3375/ (stored 0%)
  adding: gpt2_math_finetuned/checkpoint-3375/rng_state.pth (deflated 25%)
  adding: gpt2_math_finetuned/checkpoint-3375/tokenizer_config.json (deflated 55%)
  adding: gpt2_math_finetuned/checkpoint-3375/special_tokens_map.json (deflated 74%)
  adding: gpt2_math_finetuned/checkpoint-3375/trainer_state.json (deflated 81%)
  adding: gpt2_math_finetuned/checkpoint-3375/merges.txt (deflated 53%)
  adding: gpt2_math_finetuned/checkpoint-3375/vocab.json (deflated 68%)
  adding: gpt2_math_finetuned/checkpoint-3375/model.safetensors (deflated 7%)
  adding: gpt2_math_finetuned/checkpoint-3375/training_args.bin (deflated 51%)
  adding: gpt2_math_finetuned/checkpoint-3375/scheduler.pt (deflated 56%)
  adding: gpt2_math_finetuned/checkpoint-3375/config.json (deflated 52%)
  adding: gpt2_math_finetuned/checkpoint-3375/generation_config.json (deflated 24%)
  adding: gpt2_math_finetuned/checkpoint-33

In [18]:
!mv gpt2_math_finetuned_new.zip /content/drive/MyDrive/


In [26]:
# try out the model

# Generate text
input_text = "76 plus 24 equals"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

# Generate with adjusted parameters
outputs = model.generate(
    **inputs,
    max_new_tokens=10,  # Limit the number of tokens generated
    pad_token_id=tokenizer.pad_token_id,  # Ensure padding is correctly handled
    eos_token_id=tokenizer.eos_token_id,  # Set an explicit EOS token
)

# Decode and remove `<|endoftext|>` manually
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
print(decoded_output)



76 plus 24 equals 102


In [23]:
# verify tokens are set consistently

print(f"Pad token ID: {tokenizer.pad_token_id}")
print(f"EOS token ID: {tokenizer.eos_token_id}")


Pad token ID: 50256
EOS token ID: 50256
