In [None]:
# Create a conda environment named "hf" with Python 3.10, resolving packages from the conda-forge channel
conda create --name hf python=3.10 -c conda-forge

In [None]:
# install torch with the correct cuda version, check nvcc --version
!pip install torch --extra-index-url https://download.pytorch.org/whl/cu118 --upgrade
# install Hugging Face Libraries and additional dependencies
!pip install "transformers==4.33.1" "datasets==2.14.5" "accelerate==0.22.0" "evaluate==0.4.0" "peft==0.5.0" tensorboard packaging --upgrade
# install deepspeed and ninja for jit compilations of kernels
!pip install "deepspeed==0.10.3" ninja --upgrade
# install additional Flash Attention
!pip install flash-attn --no-build-isolation --upgrade


In [None]:
# Log in to the Hugging Face Hub from the command line.
# YOUR_TOKEN is a placeholder: substitute your own access token, and do not commit
# or share the notebook with a real token left in this cell.
!huggingface-cli login --token YOUR_TOKEN


In [None]:
{
  "instruction": "What is world of warcraft",
  "context": "",
  "response": "World of warcraft is a massive online multi player role playing game. It was released in 2004 by bizarre entertainment"
}


In [None]:
from datasets import load_dataset
from random import randrange

# Load the "train" split of the databricks-dolly-15k dataset from the Hugging Face Hub
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

# Report the number of records, then show one record picked at random
print(f"dataset size: {len(dataset)}")
print(dataset[randrange(len(dataset))])
# Expected first line of output:
# dataset size: 15011


In [None]:
def format_dolly(sample):
    """Render one Dolly record as a single prompt string.

    Args:
        sample: mapping with the string fields "instruction", "context" and "response".

    Returns:
        The sections "### Instruction", optionally "### Context" (only when the
        context is non-empty) and "### Answer", separated by blank lines.
    """
    sections = [f"### Instruction\n{sample['instruction']}"]
    # the context section is optional: skip it entirely when the field is empty
    if len(sample["context"]) > 0:
        sections.append(f"### Context\n{sample['context']}")
    sections.append(f"### Answer\n{sample['response']}")
    return "\n\n".join(sections)



In [None]:
def format_dolly(sample):
    """Build the training prompt for a single Dolly record.

    NOTE(review): an equivalent `format_dolly` is already defined in the preceding
    cell; this definition shadows it. Consider keeping only one of the two.

    Args:
        sample: mapping with the string fields "instruction", "context" and "response".

    Returns:
        A string made of an "### Instruction" section, an "### Context" section
        (omitted when the context is empty) and an "### Answer" section, joined
        by blank lines.
    """
    instruction_block = f"### Instruction\n{sample['instruction']}"
    raw_context = sample["context"]
    has_context = len(raw_context) > 0
    answer_block = f"### Answer\n{sample['response']}"

    if has_context:
        return f"{instruction_block}\n\n### Context\n{raw_context}\n\n{answer_block}"
    # no context given: instruction is followed directly by the answer
    return f"{instruction_block}\n\n{answer_block}"



In [None]:
from random import randrange

# Sanity check: render one randomly chosen record with the prompt template and print it
print(format_dolly(dataset[randrange(len(dataset))]))


In [None]:
from transformers import AutoTokenizer

# Hub repository id of the model; the tokenizer is loaded from the same id
model_id = "tiiuae/falcon-180B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Use the end-of-sequence token as the padding token
# (presumably because this tokenizer does not define a pad token — verify)
tokenizer.pad_token = tokenizer.eos_token


In [None]:
from random import randint
from itertools import chain
from functools import partial


# template dataset to add prompt to each sample
def template_dataset(sample):
    """Attach the formatted prompt to a record under the "text" key.

    The prompt comes from `format_dolly(sample)` and is terminated with the
    tokenizer's end-of-sequence token. The record is modified in place and
    returned.
    """
    prompt = format_dolly(sample)
    eos = tokenizer.eos_token
    sample["text"] = f"{prompt}{eos}"
    return sample


# apply prompt template per sample; the original columns are dropped so only "text" remains
dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))
# print random sample
# randint includes BOTH endpoints, so the upper bound must be len(dataset) - 1;
# using len(dataset) could index one past the end and raise IndexError
print(dataset[randint(0, len(dataset) - 1)]["text"])

# carry-over buffer: tokens that did not fill a whole chunk in one batch are kept
# here (per column) and prepended to the next batch
remainder = {"input_ids": [], "attention_mask": [], "token_type_ids": []}

def chunk(sample, chunk_length=2048):
    """Pack a batch of tokenized examples into fixed-length chunks.

    All sequences of every column in the batch are concatenated (after the
    leftover tokens from the previous call), then cut into consecutive pieces
    of exactly `chunk_length` tokens. Tokens that do not fill a complete chunk
    are stored in the module-level `remainder` and used by the next call.

    Args:
        sample: batch mapping column name -> list of token lists; must contain
            "input_ids".
        chunk_length: number of tokens per output chunk.

    Returns:
        Mapping with the same columns as `sample`, each a list of chunks of
        length `chunk_length`, plus "labels" (a copy of the "input_ids" chunk
        list). When the batch plus the carried-over tokens is shorter than
        `chunk_length`, every column is an empty list and all tokens are
        carried over to the next call.
    """
    # the carry-over buffer is shared state across calls
    global remainder
    # Flatten each column and prepend what was left over from the previous batch.
    # `.get(k, [])` tolerates columns that were not pre-seeded in `remainder`.
    concatenated_examples = {
        k: remainder.get(k, []) + list(chain.from_iterable(sample[k])) for k in sample.keys()
    }
    # total number of tokens available in this batch (all columns share this length)
    batch_total_length = len(concatenated_examples[list(sample.keys())[0]])

    # Largest multiple of chunk_length that fits. This is 0 when the batch is
    # shorter than one chunk, so it is always defined (previously it was left
    # unassigned in that case, raising UnboundLocalError).
    batch_chunk_length = (batch_total_length // chunk_length) * chunk_length

    # split by chunks of chunk_length
    result = {
        k: [t[i : i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
        for k, t in concatenated_examples.items()
    }
    # keep the tail that did not fill a chunk for the next batch
    remainder = {k: t[batch_chunk_length:] for k, t in concatenated_examples.items()}
    # labels are a copy of the input_ids chunk list
    result["labels"] = result["input_ids"].copy()
    return result


# Step 1: tokenize the "text" column in batches, dropping the original columns
lm_dataset = dataset.map(
    lambda batch: tokenizer(batch["text"]),
    batched=True,
    remove_columns=list(dataset.features),
)
# Step 2: pack the tokenized examples into chunks of 2048 tokens
lm_dataset = lm_dataset.map(partial(chunk, chunk_length=2048), batched=True)

# Report how many packed samples were produced
print(f"Total number of samples: {len(lm_dataset)}")


In [None]:
# Write the processed dataset to the local directory "dolly-processed"
# (the same path is passed to the training script via --dataset_path)
lm_dataset.save_to_disk("dolly-processed")


In [None]:
# Launch run_ds_lora.py through torchrun with 8 processes per node
# (presumably one per GPU — adjust --nproc_per_node to the available hardware).
# Trains tiiuae/falcon-180B on the dataset saved in "dolly-processed" and writes to
# "falcon-180b-lora-fa", using the DeepSpeed config configs/ds_falcon_180b_z3.json.
# The final line is commented out, so --merge_adapters is NOT passed to the script.
!torchrun --nproc_per_node 8 run_ds_lora.py \
  --model_id tiiuae/falcon-180B \
  --dataset_path dolly-processed \
  --output_dir falcon-180b-lora-fa \
  --num_train_epochs 3 \
  --per_device_train_batch_size 1 \
  --learning_rate 4e-3 \
  --gradient_checkpointing True \
  --gradient_accumulation_steps 8 \
  --bf16 True \
  --tf32 True \
  --use_flash_attn True \
  --lr_scheduler_type "constant_with_warmup" \
  --logging_steps 25 \
  --save_steps 100 \
  --save_total_limit 3 \
  --deepspeed configs/ds_falcon_180b_z3.json \
  #--merge_adapters True

