In [1]:
!pip install -qU transformers accelerate bitsandbytes peft trl datasets evaluate ai2-olmo

In [None]:
import os
import torch
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
import hf_olmo # pip install ai2-olmo
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
import math

In [3]:
model_name = "allenai/OLMo-1B"
olmo = AutoModelForCausalLM.from_pretrained(model_name)

In [4]:
tokenizer = tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
tokenizer.pad_token = tokenizer.eos_token

In [6]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

# block_size = tokenizer.model_max_length #or 1024 or 2048
block_size = 128
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [7]:
def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [8]:
trainer = Trainer(
    model=olmo,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

olmo.eval()

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


OLMoForCausalLM(
  (model): Olmo(
    (transformer): ModuleDict(
      (wte): Embedding(50304, 2048)
      (emb_drop): Dropout(p=0.0, inplace=False)
      (ln_f): LayerNorm()
      (blocks): ModuleList(
        (0-15): 16 x OlmoSequentialBlock(
          (dropout): Dropout(p=0.0, inplace=False)
          (act): SwiGLU()
          (attn_out): Linear(in_features=2048, out_features=2048, bias=False)
          (ff_out): Linear(in_features=8192, out_features=2048, bias=False)
          (rotary_emb): RotaryEmbedding()
          (attn_norm): LayerNorm()
          (ff_norm): LayerNorm()
          (att_proj): Linear(in_features=2048, out_features=6144, bias=False)
          (ff_proj): Linear(in_features=2048, out_features=16384, bias=False)
        )
      )
      (ff_out): Embedding(50304, 2048)
    )
  )
)

In [1]:
siqa_data = load_dataset("text", data_files={"train": "./data/train_siqa.txt", "validation":"./data/val_siqa.txt"})
siqa_tokenized_datasets = siqa_data.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

In [None]:
siqa_datasets = siqa_tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

In [None]:
siqa_eval_results = trainer.evaluate(siqa_datasets['validation'])
print(f"Olmo Perplexity for SIQA: {math.exp(siqa_eval_results['eval_loss']):.2f}")

In [None]:
piqa_data =  load_dataset("text", data_files={"train": "./data/train_piqa.txt", "validation":"./data/val_piqa.txt"})
piqa_tokenized_datasets = piqa_data.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

In [None]:
piqa_datasets = piqa_tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

In [None]:
piqa_eval_results = trainer.evaluate(piqa_datasets['validation'])
print(f"Olmo Perplexity for PIQA: {math.exp(piqa_eval_results['eval_loss']):.2f}")

In [9]:
all_datas = load_dataset("text", data_files={"train": "./data/train.txt", "validation":"./data/val.txt"})

In [10]:
all_tokenized_datasets = all_datas.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

In [11]:
all_datasets = all_tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

In [12]:
model_name = 'olmo_finetuned'
finetune_training_args = TrainingArguments(
    f"{model_name}",
    evaluation_strategy = "epoch",
    num_train_epochs=5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
    fp16=True
)

finetune_trainer = Trainer(
    model=olmo,
    args=finetune_training_args,
    train_dataset=all_datasets['train'],
    eval_dataset=all_datasets['validation'],
)

finetune_trainer.train()

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,1.6063,1.641503
2,1.2243,1.485916
3,0.9027,1.42676
4,0.6175,1.465981
5,0.3836,1.598534


Checkpoint destination directory olmo_finetuned/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory olmo_finetuned/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=53245, training_loss=1.0071997929183583, metrics={'train_runtime': 55114.7069, 'train_samples_per_second': 123.65, 'train_steps_per_second': 0.966, 'total_flos': 5.619804590435205e+18, 'train_loss': 1.0071997929183583, 'epoch': 5.0})

In [13]:
finetune_trainer.save_model("clubbed_finetuned_model")

In [None]:
finetuned_results = finetune_trainer.evaluate()

In [None]:
print(f"Finetuned Model Perplexity: {math.exp(finetuned_results['eval_loss']):.2f}")