In [1]:
# Step 1: Install Required Libraries
!pip install -q transformers datasets accelerate

# Step 2: Imports
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import pandas as pd

# Step 3: Device Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Step 4: Load Dataset
train_df = pd.read_csv("/kaggle/input/training-data/training_data.csv")

train_df = train_df.rename(columns={
    "choice1": "choice_0",
    "choice2": "choice_1",
    "choice3": "choice_2",
    "choice4": "choice_3"
})

# Step 5: Format Dataset
def format_example(example):
    return f"Question: {example['question']}\nOptions:\nA. {example['choice_0']}\nB. {example['choice_1']}\nC. {example['choice_2']}\nD. {example['choice_3']}"

train_texts = train_df.apply(format_example, axis=1).tolist()
dataset = Dataset.from_dict({"text": train_texts})

# Step 6: Tokenization Function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

# Step 7: Define Model Checkpoint & Variants
model_checkpoint = "EleutherAI/gpt-neo-125M"

variants = {
    "model-1":      {"batch_size": 1, "learning_rate": 1e-5, "num_epochs": 1, "accum_steps": 8},
    "model-2":   {"batch_size": 2, "learning_rate": 3e-5, "num_epochs": 2, "accum_steps": 4},
    "model-3":     {"batch_size": 2, "learning_rate": 5e-5, "num_epochs": 3, "accum_steps": 4},
    "model-4": {"batch_size": 2, "learning_rate": 2e-5, "num_epochs": 5, "accum_steps": 4},
    "model-5":     {"batch_size": 4, "learning_rate": 1e-4, "num_epochs": 2, "accum_steps": 2},
}

# Step 8: Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.pad_token = tokenizer.eos_token

# Step 9: Tokenize Dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_dataset = tokenized_dataset.with_format("torch")

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Step 10: Training Loop
for variant_name, params in variants.items():
    print(f"\n🚀 Training {model_checkpoint} with **{variant_name}** variant...", flush=True)

    output_dir = f"/kaggle/working/{model_checkpoint.replace('/', '_')}_{variant_name}_model"

    # Fresh model for each run
    model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
    model.gradient_checkpointing_enable()
    model = model.to(device)

    training_args = TrainingArguments(
        output_dir=output_dir,
        save_total_limit=1,
        save_steps=1000000,
        save_strategy="no",
        logging_dir=f"{output_dir}/logs",
        logging_steps=1000000,
        per_device_train_batch_size=params["batch_size"],
        gradient_accumulation_steps=params["accum_steps"],
        learning_rate=params["learning_rate"],
        num_train_epochs=params["num_epochs"],
        gradient_checkpointing=True,
        fp16=True,
        weight_decay=0.01,
        report_to="none",
        dataloader_num_workers=2,
        warmup_steps=50,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()

    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    print(f"Finished training & saving for {variant_name}!\n", flush=True)

print("All GPT-Neo 125M variants trained successfully!")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

2025-04-29 04:57:21.469776: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745902641.672702      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745902641.731446      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


✅ Using device: cuda


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Map:   0%|          | 0/13830 [00:00<?, ? examples/s]


🚀 Training EleutherAI/gpt-neo-125M with **neo_low** variant...


config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss


✅ Finished training & saving for neo_low!


🚀 Training EleutherAI/gpt-neo-125M with **neo_medium** variant...


  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


✅ Finished training & saving for neo_medium!


🚀 Training EleutherAI/gpt-neo-125M with **neo_high** variant...


  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

✅ Finished training & saving for neo_high!


🚀 Training EleutherAI/gpt-neo-125M with **neo_extended** variant...


  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

✅ Finished training & saving for neo_extended!


🚀 Training EleutherAI/gpt-neo-125M with **neo_fast** variant...


  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


✅ Finished training & saving for neo_fast!

🎯 All GPT-Neo 125M variants trained successfully!


In [3]:
!zip -r file.zip /kaggle/working

  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/.virtual_documents/ (stored 0%)
  adding: kaggle/working/EleutherAI_gpt-neo-125M_neo_extended_model/ (stored 0%)
  adding: kaggle/working/EleutherAI_gpt-neo-125M_neo_extended_model/special_tokens_map.json (deflated 74%)
  adding: kaggle/working/EleutherAI_gpt-neo-125M_neo_extended_model/tokenizer.json (deflated 82%)
  adding: kaggle/working/EleutherAI_gpt-neo-125M_neo_extended_model/tokenizer_config.json (deflated 55%)
  adding: kaggle/working/EleutherAI_gpt-neo-125M_neo_extended_model/vocab.json

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 59%)
  adding: kaggle/working/EleutherAI_gpt-neo-125M_neo_extended_model/training_args.bin (deflated 52%)
  adding: kaggle/working/EleutherAI_gpt-neo-125M_neo_extended_model/model.safetensors (deflated 8%)
  adding: kaggle/working/EleutherAI_gpt-neo-125M_neo_extended_model/config.json (deflated 59%)
  adding: kaggle/working/EleutherAI_gpt-neo-125M_neo_extended_model/generation_config.json (deflated 24%)
  adding: kaggle/working/EleutherAI_gpt-neo-125M_neo_extended_model/merges.txt (deflated 53%)
  adding: kaggle/working/EleutherAI_gpt-neo-125M_neo_low_model/ (stored 0%)
  adding: kaggle/working/EleutherAI_gpt-neo-125M_neo_low_model/special_tokens_map.json (deflated 74%)
  adding: kaggle/working/EleutherAI_gpt-neo-125M_neo_low_model/tokenizer.json (deflated 82%)
  adding: kaggle/working/EleutherAI_gpt-neo-125M_neo_low_model/tokenizer_config.json (deflated 55%)
  adding: kaggle/working/EleutherAI_gpt-neo-125M_neo_low_model/vocab.json (deflated 59%)
  adding: kaggle/working/Ele

In [4]:
!ls

EleutherAI_gpt-neo-125M_neo_extended_model
EleutherAI_gpt-neo-125M_neo_fast_model
EleutherAI_gpt-neo-125M_neo_high_model
EleutherAI_gpt-neo-125M_neo_low_model
EleutherAI_gpt-neo-125M_neo_medium_model
file.zip


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [5]:
from IPython.display import FileLink
FileLink(r'file.zip')