In [1]:
!pip install -qU transformers accelerate bitsandbytes peft trl datasets evaluate ai2-olmo

In [None]:
import os

# Restrict the process to GPU 0. This is done before torch is imported so the
# setting is guaranteed to be in place before any CUDA initialisation happens.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import torch

# Use the (single visible) GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
import hf_olmo # pip install ai2-olmo
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
import math

In [3]:
# Identifier of the base checkpoint; the same name is reused when loading the tokenizer.
model_name = "allenai/OLMo-1B"
# Load the causal language model for that checkpoint.
olmo = AutoModelForCausalLM.from_pretrained(model_name)

In [4]:
# Load the tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably needed so the data collator can pad batches — confirm
# whether this tokenizer already ships with its own pad token.
tokenizer.pad_token = tokenizer.eos_token

In [6]:
def tokenize_function(examples):
    """Run the notebook-level ``tokenizer`` over the ``"text"`` column of a batch.

    Args:
        examples: Mapping that provides a ``"text"`` entry.

    Returns:
        Whatever the tokenizer returns for that entry.
    """
    texts = examples["text"]
    return tokenizer(texts)

# Length (in tokens) of the fixed-size chunks built by group_texts.
# Alternatives considered: tokenizer.model_max_length, or 2048.
block_size = 1024

# Collator for causal language modelling (masked-LM mode switched off).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [7]:
def group_texts(examples, chunk_size=None):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Every column of ``examples`` (column name -> list of per-example lists) is
    flattened into one long list and cut into consecutive chunks of equal
    length. The tail that does not fill a whole chunk is dropped, so a batch
    shorter than one chunk produces empty columns.

    Args:
        examples: Batch mapping; it must contain an ``"input_ids"`` column.
        chunk_size: Length of each chunk. When ``None`` (the default, which is
            what a plain ``Dataset.map(group_texts, ...)`` call uses) the
            notebook-level ``block_size`` is used.

    Returns:
        dict with the same columns as ``examples`` in chunked form, plus a
        ``"labels"`` column that is a shallow copy of the chunked ``"input_ids"``.
    """
    from itertools import chain

    if chunk_size is None:
        chunk_size = block_size

    # chain.from_iterable flattens in linear time, whereas sum(lists, [])
    # re-copies the growing accumulator on every addition.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Round down to a whole number of chunks; the remainder is discarded.
    total_length = (total_length // chunk_size) * chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [8]:
# Evaluation-only Trainer around the base model. No TrainingArguments are
# supplied here, so the Trainer runs with its default settings.
trainer = Trainer(model=olmo, tokenizer=tokenizer, data_collator=data_collator)

# Put the model in evaluation mode; as the last expression of the cell, the
# returned module is also displayed.
olmo.eval()

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


OLMoForCausalLM(
  (model): Olmo(
    (transformer): ModuleDict(
      (wte): Embedding(50304, 2048)
      (emb_drop): Dropout(p=0.0, inplace=False)
      (ln_f): LayerNorm()
      (blocks): ModuleList(
        (0-15): 16 x OlmoSequentialBlock(
          (dropout): Dropout(p=0.0, inplace=False)
          (act): SwiGLU()
          (attn_out): Linear(in_features=2048, out_features=2048, bias=False)
          (ff_out): Linear(in_features=8192, out_features=2048, bias=False)
          (rotary_emb): RotaryEmbedding()
          (attn_norm): LayerNorm()
          (ff_norm): LayerNorm()
          (att_proj): Linear(in_features=2048, out_features=6144, bias=False)
          (ff_proj): Linear(in_features=2048, out_features=16384, bias=False)
        )
      )
      (ff_out): Embedding(50304, 2048)
    )
  )
)

In [10]:
# Plain-text SIQA splits, read with the "text" loader.
siqa_files = {"train": "train_siqa.txt", "validation": "val_siqa.txt"}
siqa_data = load_dataset("text", data_files=siqa_files)

# Tokenize both splits in batches and drop the raw "text" column afterwards.
siqa_tokenized_datasets = siqa_data.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

In [11]:
# Regroup the tokenized SIQA examples into fixed-size chunks, 1000 examples per batch.
siqa_datasets = siqa_tokenized_datasets.map(group_texts, batched=True, batch_size=1000, num_proc=4)

In [12]:
# Evaluate the model on the SIQA validation split and report perplexity,
# computed as exp of the evaluation loss.
siqa_validation = siqa_datasets['validation']
siqa_eval_results = trainer.evaluate(siqa_validation)
print(f"Olmo Perplexity for SIQA: {math.exp(siqa_eval_results['eval_loss']):.2f}")

Olmo Perplexity for SIQA: 7.81


In [9]:
# Plain-text PIQA splits, read with the "text" loader.
piqa_files = {"train": "train_piqa.txt", "validation": "val_piqa.txt"}
piqa_data = load_dataset("text", data_files=piqa_files)

# Tokenize both splits in batches and drop the raw "text" column afterwards.
piqa_tokenized_datasets = piqa_data.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

Map (num_proc=4):   0%|          | 0/16116 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1838 [00:00<?, ? examples/s]

In [10]:
# Regroup the tokenized PIQA examples into fixed-size chunks, 1000 examples per batch.
piqa_datasets = piqa_tokenized_datasets.map(group_texts, batched=True, batch_size=1000, num_proc=4)

Map (num_proc=4):   0%|          | 0/16116 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1838 [00:00<?, ? examples/s]

In [11]:
# Evaluate the model on the PIQA validation split and report perplexity,
# computed as exp of the evaluation loss.
piqa_validation = piqa_datasets['validation']
piqa_eval_results = trainer.evaluate(piqa_validation)
print(f"Olmo Perplexity for PIQA: {math.exp(piqa_eval_results['eval_loss']):.2f}")

Olmo Perplexity for PIQA: 5.98


In [28]:
# Combined training / validation corpus stored as plain-text files under ./data.
all_files = {"train": "./data/train.txt", "validation": "./data/val.txt"}
all_datas = load_dataset("text", data_files=all_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [31]:
# Tokenize both splits of the combined corpus and drop the raw "text" column afterwards.
all_tokenized_datasets = all_datas.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

Map (num_proc=4):   0%|          | 0/3595609 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/605575 [00:00<?, ? examples/s]

In [32]:
# Regroup the tokenized combined corpus into fixed-size chunks, 1000 examples per batch.
all_datasets = all_tokenized_datasets.map(group_texts, batched=True, batch_size=1000, num_proc=4)

Map (num_proc=4):   0%|          | 0/3595609 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/605575 [00:00<?, ? examples/s]

In [33]:
# NOTE(review): this rebinds `model_name`, which earlier held the checkpoint id
# passed to from_pretrained; re-running those earlier cells after this one
# would pick up 'olmo_finetuned' instead. Consider a dedicated variable.
model_name = 'olmo_finetuned'

# Fine-tuning configuration: evaluate once per epoch, 10 epochs, batch size 4
# per device for both training and evaluation.
# NOTE(review): the first positional argument is presumably the output
# directory, and newer transformers releases may spell `evaluation_strategy`
# as `eval_strategy` — confirm against the installed version.
finetune_training_args = TrainingArguments(
    model_name,
    evaluation_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
)

# Trainer for fine-tuning the same `olmo` instance on the combined corpus.
# Unlike the evaluation-only trainer, no tokenizer or data collator is passed.
finetune_trainer = Trainer(
    model=olmo,
    args=finetune_training_args,
    train_dataset=all_datasets['train'],
    eval_dataset=all_datasets['validation'],
)

# Start the fine-tuning run.
finetune_trainer.train()

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 