In [37]:
import os
import torch
import random
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
import tensorflow as tf
import accelerate

# Pick the best available accelerator: CUDA first, then Apple's MPS backend,
# and fall back to the CPU when neither is present.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using CUDA device: {torch.cuda.get_device_name(0)}")
elif torch.backends.mps.is_available():
    # torch.backends.mps is the documented availability check; unlike
    # torch.mps.is_available() it is also present on older PyTorch releases.
    device = torch.device("mps")
    print("Using MPS (Apple Silicon GPU)")
else:
    device = torch.device("cpu")
    print("No GPU available, using CPU")

print(f"Using device: {device}")

Using MPS (Apple Silicon GPU)
Using device: mps


In [38]:
# Checkpoint to continue pre-training from.
# Other sizes: "gpt2-medium", "gpt2-large", "gpt2-xl"
model_name = "gpt2"

# Tokenizer: reuse the end-of-sequence token as the padding token
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Model: mirror the same choice in its config (pad id := EOS id)
model = GPT2LMHeadModel.from_pretrained(model_name)
model.config.pad_token_id = model.config.eos_token_id

def load_and_prepare_openwebmath(fraction=0.001, seed=None):
    """Load OpenWebMath and return a random subset of its train split.

    Args:
        fraction: Share of the train split to keep, in the range (0, 1];
            e.g. 0.001 keeps 0.1% of the examples.
        seed: Optional seed for the index sampling. When given, a private
            ``random.Random(seed)`` draws the indices so the same subset is
            selected on every run. When None, the module-level ``random``
            state is used.

    Returns:
        The selected subset, or None when validation, loading or sampling
        fails. The error is printed instead of raised, so callers are
        expected to check for None.
    """
    try:
        # Reject unusable fractions before the (expensive) full download.
        if not 0 < fraction <= 1:
            raise ValueError(f"fraction must be in (0, 1], got {fraction!r}")

        print(f"Loading {fraction * 100:.1f}% of the OpenWebMath dataset...")
        # Load the full dataset (non-streaming)
        dataset = load_dataset("open-web-math/open-web-math")
        train_split = dataset["train"]
        # Print column names to understand the structure
        print(f"Dataset columns: {train_split.column_names}")

        # Work out how many examples the requested fraction corresponds to
        train_size = len(train_split)
        subset_size = int(train_size * fraction)
        if subset_size < 1:
            # int() truncates, so a very small fraction would select nothing
            raise ValueError(
                f"fraction {fraction!r} selects no examples from {train_size} rows"
            )

        # Draw distinct random indices, reproducibly when a seed is supplied
        rng = random if seed is None else random.Random(seed)
        indices = rng.sample(range(train_size), subset_size)

        # Select the subset
        subset_dataset = train_split.select(indices)
        print(f"Dataset loaded with {len(subset_dataset)} examples ({fraction * 100:.1f}% of full dataset)")
        return subset_dataset
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None
        
# Process the dataset for training
def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch into fixed-length sequences.

    Relies on the module-level ``tokenizer``. Sequences are truncated and
    padded so that each one is exactly 512 tokens long.

    Args:
        examples: Mapping of column name to values; must contain 'text'.

    Returns:
        Whatever the tokenizer returns for the batch of texts.

    Raises:
        KeyError: If the batch has no 'text' column.
    """
    if 'text' in examples:
        tokenizer_options = {
            "truncation": True,
            "padding": "max_length",
            "max_length": 512,
        }
        return tokenizer(examples["text"], **tokenizer_options)

    # No text to tokenize: report which columns were actually present
    raise KeyError(f"'text' column not found. Available columns: {list(examples.keys())}")

dataset_fraction = 0.001  # Change this to adjust the fraction of data to use

# Seed the module-level RNG before sampling so that a fresh
# "Restart & Run All" draws the same random subset every time.
random.seed(42)
dataset = load_and_prepare_openwebmath(fraction=dataset_fraction)

Loading 0.1% of the OpenWebMath dataset...
Dataset columns: ['url', 'text', 'date', 'metadata']
Dataset loaded with 6315 examples (0.1% of full dataset)


In [39]:
# Start from a known state: if tokenization fails below, later cells see
# None instead of a stale tokenized_dataset left over from an earlier run.
tokenized_dataset = None

if dataset is not None:
    # Double-check column names before tokenization
    print(f"Columns in dataset before tokenization: {dataset.column_names}")

    # Verify 'text' column exists and show a sample
    if 'text' not in dataset.column_names:
        print("WARNING: 'text' column not found in dataset!")
    elif len(dataset) > 0:
        # Index the row first: dataset[0] reads a single example, whereas
        # dataset['text'] addresses the whole column just to show one sample.
        print(f"Sample text from first example: {dataset[0]['text'][:100]}...")

    # Tokenize the dataset
    try:
        # Drop every original column (including 'text') once the batch has
        # been tokenized, so only the tokenizer's outputs remain regardless
        # of which extra columns the source dataset carries.
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=dataset.column_names,
        )

        print(f"Tokenization successful. Final columns: {tokenized_dataset.column_names}")
        print(f"Dataset size: {len(tokenized_dataset)} examples")
    except Exception as e:
        # Keep the notebook's report-and-continue behaviour, but make the
        # failure visible downstream by leaving tokenized_dataset as None.
        tokenized_dataset = None
        print(f"Error during tokenization: {e}")
else:
    print("Dataset loading failed, cannot proceed with tokenization.")

Columns in dataset before tokenization: ['url', 'text', 'date', 'metadata']
Sample text from first example: ## #StackBounty: #logistic #normal-distribution #expected-value Expectation of Inverse Logit of Norm...


Map: 100%|██████████| 6315/6315 [00:06<00:00, 916.12 examples/s]

Tokenization successful. Final columns: ['input_ids', 'attention_mask']
Dataset size: 6315 examples





In [40]:
# Training configuration, grouped by concern
training_args = TrainingArguments(
    # --- Output and checkpointing ---
    output_dir="./gpt2-math-continued",
    overwrite_output_dir=True,
    save_steps=10000,
    save_total_limit=2,
    # --- Schedule and batch sizing ---
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,   # 4 x 4 = 16 sequences per optimizer step per device
    # --- Optimizer ---
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    # --- Precision ---
    fp16=torch.cuda.is_available(),  # Mixed precision is enabled only when CUDA is present
    # --- Logging and evaluation ---
    prediction_loss_only=True,
    logging_dir="./logs",            # Where log files are written
    logging_steps=500,               # Emit a log entry every 500 steps
)

In [41]:
# Batch collator for causal language modeling: mlm=False turns off the
# masked-LM objective, which GPT-2 does not use.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [43]:
# Bring the model, its training configuration, the tokenized data and the
# batch collator together in a Trainer.
trainer = Trainer(
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    args=training_args,
    model=model,
)

# Run the training loop
print("Starting training...")
trainer.train()

# Write the trained weights and the tokenizer files into the same directory
# so the pair can be reloaded together later.
model_save_path = "./gpt2-math-continued-final"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")

Starting training...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


KeyboardInterrupt: 

In [None]:
# Example of loading and using the continued pre-trained model
def test_model(test_prompt="The integral of x^2 is", max_length=100):
    """Reload the saved model and print a sampled continuation of a prompt.

    Reads the module-level ``model_save_path`` and ``device``.

    Args:
        test_prompt: Text to continue. Defaults to a short math prompt.
        max_length: Value passed to ``generate`` as ``max_length``.

    Returns:
        The decoded generated text (also printed).
    """
    print("\nTesting the model with a sample prompt...")

    # Load the saved model and tokenizer. GPT2TokenizerFast is the tokenizer
    # class this notebook imports and saves with; GPT2Tokenizer is never
    # imported here, so referencing it would raise NameError.
    loaded_model = GPT2LMHeadModel.from_pretrained(model_save_path)
    loaded_tokenizer = GPT2TokenizerFast.from_pretrained(model_save_path)
    loaded_model.to(device)

    # Tokenize the prompt and move the tensors to the model's device
    inputs = loaded_tokenizer(test_prompt, return_tensors="pt").to(device)

    # Generate output. Unpacking `inputs` forwards the attention mask along
    # with the input ids, and do_sample=True is required for `temperature`
    # to have any effect (without it generation is greedy).
    outputs = loaded_model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
        pad_token_id=loaded_tokenizer.eos_token_id,
    )

    # Decode and print the output
    generated_text = loaded_tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Input: {test_prompt}")
    print(f"Generated: {generated_text}")
    return generated_text

# Uncomment to test the model after training
# test_model()