In [1]:
!pip install transformers peft datasets




In [2]:
!pip install accelerate -U



In [3]:
pip install transformers[torch]



In [4]:
# This cell makes sure modules are auto-loaded when you change external python files
%load_ext autoreload
%autoreload 2

In [5]:
# Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Direct to your assignment folder.
%cd /content/drive/MyDrive/project-m2-2024-jim

/content/drive/MyDrive/project-m2-2024-jim


In [7]:
from datasets import load_dataset

dataset = load_dataset("sail/symbolic-instruction-tuning")

# Print dataset information
print(dataset)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 796464
    })
    validation: Dataset({
        features: ['input', 'output'],
        num_rows: 4077
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 74026
    })
})


In [8]:
# Sample a subset of the training data (e.g., 10%)
train_subset = dataset['train'].shuffle(seed=42).select(range(0, len(dataset['train']) // 40))

# Print the subset information
print(train_subset)

Dataset({
    features: ['input', 'output'],
    num_rows: 19911
})


In [9]:
import os

# Create the directory if it does not exist
output_dir = "data/sft_train_data"
os.makedirs(output_dir, exist_ok=True)

# Save the train subset to the specified directory
train_subset.save_to_disk(output_dir)

print(f"Train subset saved to {output_dir}")


Saving the dataset (0/1 shards):   0%|          | 0/19911 [00:00<?, ? examples/s]

Train subset saved to data/sft_train_data


In [10]:
from transformers import GPT2Tokenizer

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Set pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token

def preprocess_data(example):
    inputs = tokenizer(example["input"], padding="max_length", truncation=True, max_length=1024, return_tensors="pt")
    labels = tokenizer(example["output"], padding="max_length", truncation=True, max_length=1024, return_tensors="pt").input_ids
    labels[labels == tokenizer.pad_token_id] = -100  # Set padding tokens to -100 to ignore them in the loss
    return {"input_ids": inputs.input_ids.squeeze(), "attention_mask": inputs.attention_mask.squeeze(), "labels": labels.squeeze()}

# Apply preprocessing to the train subset
train_dataset = train_subset.map(preprocess_data, remove_columns=["input", "output"])


In [12]:
validate_subset = dataset['validation'].shuffle(seed=42).select(range(0, len(dataset['validation']) // 40))
print(validate_subset)

import os

# Create the directory if it does not exist
output_dir = "data/sft_validate_data"
os.makedirs(output_dir, exist_ok=True)

# Save the train subset to the specified directory
validate_subset.save_to_disk(output_dir)

print(f"Validate subset saved to {output_dir}")


Dataset({
    features: ['input', 'output'],
    num_rows: 101
})


Saving the dataset (0/1 shards):   0%|          | 0/101 [00:00<?, ? examples/s]

Validate subset saved to data/sft_validate_data


In [13]:
validate_dataset = validate_subset.map(preprocess_data, remove_columns=["input", "output"])

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

In [14]:
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

# LoRA configuration
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['c_attn', 'c_proj', 'mlp.c_fc', 'mlp.c_proj']  # Targeting GPT-2 specific modules
)

# Load the model
model_name = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)
model = get_peft_model(model, peft_config)

# Training arguments for the first fine-tuning
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    max_steps=200,
    save_strategy="no",
    logging_steps=10,
    output_dir="./model_output_first",
    optim="adamw_torch",
)

# Trainer setup for the first fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    tokenizer=tokenizer,
)

# Train the model on the first dataset
trainer.train()


max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
10,11.3436
20,11.4672
30,11.1965
40,10.9686
50,10.6525
60,10.3665
70,9.6116
80,9.2923
90,9.1296
100,8.7635


TrainOutput(global_step=200, training_loss=9.570018730163575, metrics={'train_runtime': 982.6575, 'train_samples_per_second': 3.256, 'train_steps_per_second': 0.204, 'total_flos': 1718654651596800.0, 'train_loss': 9.570018730163575, 'epoch': 0.16070711128967458})

In [15]:
# Save the model and tokenizer after the first fine-tuning
model_output_dir_first = "./model_output_first"
model.save_pretrained(model_output_dir_first)
tokenizer.save_pretrained(model_output_dir_first)




('./model_output_first/tokenizer_config.json',
 './model_output_first/special_tokens_map.json',
 './model_output_first/vocab.json',
 './model_output_first/merges.txt',
 './model_output_first/added_tokens.json')

In [16]:
dataset2 = load_dataset("meta-math/MetaMathQA")
print(dataset2)

DatasetDict({
    train: Dataset({
        features: ['type', 'query', 'original_question', 'response'],
        num_rows: 395000
    })
})


In [17]:
# Sample a subset of the training data (e.g., 10%)
train_subset2 = dataset2['train'].shuffle(seed=42).select(range(0, len(dataset2['train']) // 20))

# Print the subset information
print(train_subset2)



Dataset({
    features: ['type', 'query', 'original_question', 'response'],
    num_rows: 19750
})


In [20]:
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer

# Split the dataset into training and validation sets
train_test_split = train_subset2.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

# Save the train and validation splits
train_dataset.save_to_disk("data/sft2/train_data")
val_dataset.save_to_disk("data/sft2/val_data")

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Set pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token

def preprocess_data(example):
    inputs = tokenizer(example["query"], padding="max_length", truncation=True, max_length=1024, return_tensors="pt")
    labels = tokenizer(example["response"], padding="max_length", truncation=True, max_length=1024, return_tensors="pt").input_ids
    labels[labels == tokenizer.pad_token_id] = -100  # Set padding tokens to -100 to ignore them in the loss
    return {"input_ids": inputs.input_ids.squeeze(), "attention_mask": inputs.attention_mask.squeeze(), "labels": labels.squeeze()}

# Apply preprocessing to the datasets
train_dataset = train_dataset.map(preprocess_data, remove_columns=["type", "query", "original_question", "response"])
val_dataset = val_dataset.map(preprocess_data, remove_columns=["type", "query", "original_question", "response"])

# Save the preprocessed train and validation splits
train_dataset.save_to_disk("preprocessed_train_subset2")
val_dataset.save_to_disk("preprocessed_val_subset2")


Saving the dataset (0/1 shards):   0%|          | 0/17775 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1975 [00:00<?, ? examples/s]

Map:   0%|          | 0/17775 [00:00<?, ? examples/s]

Map:   0%|          | 0/1975 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/17775 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1975 [00:00<?, ? examples/s]

In [21]:
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

# Load the model fine-tuned on the first dataset
model_output_dir_first = "./model_output_first"
model = AutoModelForCausalLM.from_pretrained(model_output_dir_first)
model = get_peft_model(model, peft_config)

# Training arguments for the second fine-tuning
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    max_steps=200,
    save_strategy="no",
    logging_steps=10,
    output_dir="./model_output_second",
    optim="adamw_torch",
)

# Trainer setup for the second fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model on the second dataset
trainer.train()

# Save the model and tokenizer after the second fine-tuning
model_output_dir_second = "./model_output_second"
model.save_pretrained(model_output_dir_second)
tokenizer.save_pretrained(model_output_dir_second)


max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
10,10.6433
20,10.2057
30,9.2851
40,8.3425
50,7.5408
60,7.0957
70,6.8634
80,6.6099
90,6.4557
100,6.365




('./model_output_second/tokenizer_config.json',
 './model_output_second/special_tokens_map.json',
 './model_output_second/vocab.json',
 './model_output_second/merges.txt',
 './model_output_second/added_tokens.json')