In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:


# Step 1: Install Required Libraries
!pip install -q transformers datasets accelerate

# Step 2: Imports
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import pandas as pd

# Step 3: Device Setup (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Step 4: Load Dataset
train_df = pd.read_csv("/content/training_data.csv")  # Make sure to upload your CSV file here

train_df = train_df.rename(columns={
    "choice1": "choice_0",
    "choice2": "choice_1",
    "choice3": "choice_2",
    "choice4": "choice_3"
})

# Step 5: Dataset Preprocessing
def format_example(example):
    return f"Question: {example['question']}\nOptions:\nA. {example['choice_0']}\nB. {example['choice_1']}\nC. {example['choice_2']}\nD. {example['choice_3']}"

train_texts = train_df.apply(format_example, axis=1).tolist()
dataset = Dataset.from_dict({"text": train_texts})

# Step 6: Tokenization Function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

# Step 7: Define Model and Variants
model_checkpoint = "gpt2-medium"
variants = {
    "model-1": {"batch_size": 1, "learning_rate": 1e-5, "num_epochs": 1},
    "model-2": {"batch_size": 1, "learning_rate": 5e-5, "num_epochs": 3},
    "model-3": {"batch_size": 1, "learning_rate": 1e-4, "num_epochs": 5},
    "model-4": {"batch_size": 1, "learning_rate": 3e-5, "num_epochs": 2},
    "model-5": {"batch_size": 1, "learning_rate": 8e-5, "num_epochs": 4}
}

# Step 8: Load Tokenizer and Model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
model.gradient_checkpointing_enable()
model = model.to(device)  # Move model to GPU

# Step 9: Tokenize Dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Step 10: Training Loop
for variant_name, params in variants.items():
    print(f"\nTraining {model_checkpoint} with {variant_name} settings...", flush=True)

    output_dir = f"/content/drive/MyDrive/{model_checkpoint.replace('/', '_')}_{variant_name}_model"


    training_args = TrainingArguments(
        output_dir=output_dir,
        save_total_limit=1,
        save_steps=1000000,   # Effectively disables intermediate saving
        save_strategy="no",
        logging_dir=f"{output_dir}/logs",
        learning_rate=params["learning_rate"],
        per_device_train_batch_size=params["batch_size"],
        gradient_accumulation_steps=8,   # Simulate larger batch size
        gradient_checkpointing=True,
        fp16=True,  # Mixed precision for faster GPU training
        num_train_epochs=params["num_epochs"],
        weight_decay=0.01,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()

    # Save Model and Tokenizer
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    print(f"Training and saving complete for {model_checkpoint} with {variant_name} variant.\n", flush=True)

print("All model variants training completed!")


✅ Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/13830 [00:00<?, ? examples/s]


Training gpt2-medium with low settings...


  trainer = Trainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,2.3947
1000,2.2915
1500,2.2316


✅ Training and saving complete for gpt2-medium with low variant.


Training gpt2-medium with medium settings...


  trainer = Trainer(


Step,Training Loss
500,2.1417
1000,2.1257
1500,2.0753
2000,1.9036
2500,1.7658
3000,1.7292
3500,1.6953
4000,1.537
4500,1.5364
5000,1.5293


✅ Training and saving complete for gpt2-medium with medium variant.

🎯 All model variants training completed!


In [3]:
!python --version


Python 3.11.12
