# Training Phi-2 from scratch

In [None]:
import os
# Enumerate GPUs in PCI bus order and expose only device 0 to this process.
# These must be set before any CUDA-using library initializes.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "0",
    }
)

In [None]:
# !pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
# !pip install -q datasets bitsandbytes einops wandb
# !pip install --upgrade datasets

In [None]:
# ! pip install transformers==4.28.0
# ! pip install --upgrade transformers

In [None]:
from datasets import load_dataset
import os 

# Hub identifier of the corpus used for training.
dataset_name = "togethercomputer/RedPajama-Data-1T-Sample"

# Load the "train" split with 16 worker processes, caching files locally.
dataset_train = load_dataset(
    dataset_name,
    split="train",
    cache_dir="dataset/RedPajama-Data-1T-Sample",
    num_proc=16,
)

#### Set up the tokenizer

In [None]:
import torch
from transformers import  AutoTokenizer

model_name = "microsoft/phi-2"

# Reuse the tokenizer published alongside the Phi-2 checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
# Pad with the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token


#### Set up the configuration and create the model

In [None]:
from transformers import AutoModelForCausalLM, AutoConfig

model_name = "microsoft/phi-2"

# Build the Phi-2 architecture from its configuration only, so every parameter
# is freshly (randomly) initialized and no pretrained weights are downloaded.
# `from_pretrained` would load the released checkpoint, and calling
# `init_weights()` afterwards is not a reliable way to discard it (recent
# transformers versions skip modules already marked as initialized).
phi2_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
model_phi2_scratch = AutoModelForCausalLM.from_config(
    phi2_config,
    trust_remote_code=True,
    # Pin float32 weights explicitly; the checkpoint config may advertise a
    # half-precision dtype, which fp16 mixed-precision training cannot unscale.
    torch_dtype=torch.float32,
)


In [None]:
# CUDA_VISIBLE_DEVICES is set to "0" at the top of this notebook, so exactly one
# GPU is visible and its index is 0; 'cuda:1' would be an invalid device ordinal.
model_phi2_scratch = model_phi2_scratch.to('cuda:0')

In [None]:
from transformers import TrainingArguments

# --- Training hyperparameters -------------------------------------------------
output_dir = "./results"
per_device_train_batch_size = 2
gradient_accumulation_steps = 8  # 2 * 8 = 16 sequences per optimizer step
optim = "paged_adamw_32bit"
save_steps = 5000
logging_steps = 1
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 5000
warmup_ratio = 0.03
# The plain "constant" schedule takes no warmup argument, so warmup_ratio would
# be silently ignored; "constant_with_warmup" ramps up over the first
# warmup_ratio * max_steps steps and then holds the rate constant.
lr_scheduler_type = "constant_with_warmup"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    #group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    #gradient_checkpointing=True,
    report_to="tensorboard"
)

In [None]:
from datasets import  DatasetDict

# Wrap the training split in a DatasetDict so it can be mapped by split name.
# (Optional subsampling: dataset_train.shuffle().select(range(50000)))
raw_datasets = DatasetDict({"train": dataset_train})

# Number of tokens in every training sequence.
context_length = 256

def tokenize(element):
    """Tokenize a batch of texts into fixed-length chunks.

    The texts in ``element["text"]`` are tokenized with truncation at
    ``context_length`` tokens; overflowing tokens are returned as additional
    chunks. Only chunks whose length equals ``context_length`` are kept, so
    shorter trailing pieces are discarded.

    Returns:
        A dict with a single key ``"input_ids"`` holding the kept chunks.
    """
    encoded = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    full_chunks = [
        ids
        for n_tokens, ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == context_length
    ]
    return {"input_ids": full_chunks}


# Tokenize in batches and drop the original columns, leaving only the
# "input_ids" produced by `tokenize`.
train_columns = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets.map(
    tokenize,
    batched=True,
    remove_columns=train_columns,
)

In [None]:
from transformers import Trainer, DataCollatorForLanguageModeling, DataCollator

# Causal language modelling: mlm=False disables masked-token corruption.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Bundle the model, data, collator and hyperparameters into a Trainer.
trainer = Trainer(
    args=training_arguments,
    model=model_phi2_scratch,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
)

In [None]:
# Cast every submodule whose qualified name contains "norm" to float32.
# nn.Module.to() converts in place, so the return value is not needed.
for layer_name, layer in trainer.model.named_modules():
    if "norm" not in layer_name:
        continue
    layer.to(torch.float32)

In [None]:
# Start training; the model is intended to begin from randomly initialized weights.
trainer.train()