### Training CodeLLaMa using PEFT on C#

Adapted from https://ragntune.com/blog/guide-fine-tuning-code-llama


In [1]:
%pip install -qU peft==0.7.1 transformers==4.36.2 "datasets>=2.6.1" ipywidgets bitsandbytes accelerate python-dotenv

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.13.1+cu116 requires torch==1.12.1, but you have torch 2.1.2 which is incompatible.
torchaudio 0.12.1+cu116 requires torch==1.12.1, but you have torch 2.1.2 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
%reload_ext dotenv
%dotenv

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [17]:
import wandb
%env WANDB_LOG_MODEL="checkpoint"
run = wandb.init(project="codellama-training")



env: WANDB_LOG_MODEL="checkpoint"


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.0166693625501163, max=1.0))…

In [26]:
import torch
from peft import (
  LoraConfig,
  get_peft_model,
  prepare_model_for_kbit_training,
)
from transformers import (
  AutoModelForCausalLM,
  TrainingArguments,
  Trainer,
  CodeLlamaTokenizerFast,
  default_data_collator
)
from datasets import load_dataset
from datetime import datetime

In [27]:
model_repo="codellama"
model_name="CodeLlama-7b"
my_model=f'{model_name}-csharp'
my_repo=f'fasterinnerlooper/{my_model}'

base_model = AutoModelForCausalLM.from_pretrained(
  f'{model_repo}/{model_name}-hf',
  load_in_8bit=True,
  torch_dtype="auto",
  # device_map="cpu",
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [28]:
tokenizer = CodeLlamaTokenizerFast(vocab_file='./tokenizer.model', tokenizer_file='./tokenizer.json')
tokenizer.mask_token = tokenizer.fill_token
tokenizer.pad_token='_<PAD>'
tokenizer.add_tokens([
  '_<PAD>',
  '_<SUF>',
  '_<MID>',
  '_<PRE>'])

4

In [29]:
base_model.gradient_checkpointing_enable()
base_model = prepare_model_for_kbit_training(base_model)

OutOfMemoryError: CUDA out of memory. Tried to allocate 502.00 MiB. GPU 0 has a total capacty of 15.73 GiB of which 149.12 MiB is free. Process 3576751 has 15.58 GiB memory in use. Of the allocated memory 14.40 GiB is allocated by PyTorch, and 258.10 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

: 

: 

In [None]:
config = LoraConfig(
  r=8,
  lora_alpha=32,
  target_modules=[
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
  ],
  lora_dropout=0.1,
  bias="none",
  task_type="CAUSAL_LM"
)
model = get_peft_model(base_model, config)
max_length=150

In [None]:
import datasets

tokenized_train_dataset = datasets.load_from_disk('train_dataset')
tokenized_eval_dataset = datasets.load_from_disk('eval_dataset')

In [None]:
from random import randint

shards = 1000
sharded_train_dataset = tokenized_train_dataset.shard(num_shards=shards, index=randint(0,shards-1))
sharded_eval_dataset = tokenized_eval_dataset.shard(num_shards=shards, index=randint(0,shards-1))

In [None]:
def split_rows(examples):
  outputs={'input_ids':[], 'attention_mask': [], 'labels': []}
  for x in examples['input_ids']:
    for y in range(0, 4096, max_length):
      outputs['input_ids']+=[x[y:y+max_length]]
      outputs['attention_mask']+=[1 * max_length]
      outputs['labels']+=[x[y:y+max_length]]
  return outputs

split_train_dataset = sharded_train_dataset.map(split_rows, batched=True)
split_eval_dataset = sharded_eval_dataset.map(split_rows, batched=True)

In [None]:
def pad_rows(examples):
  outputs = {'input_ids':[], 'attention_mask':[]}
  for example in examples['input_ids']:
    dif = max_length-len(example)
    fill=[tokenizer.pad_token_id for _ in range(dif)]
    outputs['input_ids']+=torch.tensor([example+fill])
    outputs['attention_mask']+=torch.tensor([[1 for _ in range(len(example))]+[0 for _ in range(len(fill))]])
  outputs['labels']=outputs['input_ids'].copy()
  return outputs

padded_train_dataset = split_train_dataset.map(pad_rows, batched=True)
padded_eval_dataset = split_eval_dataset.map(pad_rows, batched=True)

Map:   0%|          | 0/4284 [00:00<?, ? examples/s]

Map:   0%|          | 0/476 [00:00<?, ? examples/s]

In [None]:
tokenizer.add_bos_token=True
tokenizer.add_eos_token=True

In [None]:
batch_size = 128
per_device_train_batch_size = 8
gradient_accumulation_steps = batch_size // per_device_train_batch_size
output_dir = "csharp-codellama"

args = TrainingArguments(
  report_to="wandb",
  run_name=f"{my_model}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
  per_device_train_batch_size=per_device_train_batch_size,
  per_device_eval_batch_size=per_device_train_batch_size,
  gradient_accumulation_steps=gradient_accumulation_steps,
  weight_decay=0.01,
  # warmup_steps=2,
  # max_steps=10,
  # learning_rate=2e-4,
  # fp16=True,
  # logging_steps=10,
  # optim="adamw_bnb_8bit",
  output_dir=output_dir,
  evaluation_strategy='epoch',
  eval_steps=100,
  save_total_limit=2,
  # load_best_model_at_end=False,
  save_steps=10,
  resume_from_checkpoint=True
)

In [None]:
model = model.to("cpu")
trainer = Trainer(
  model=model,
  train_dataset=padded_train_dataset,
  eval_dataset=padded_eval_dataset,
  args=args,
  data_collator=default_data_collator,
)
model.config.use_cache=False
trainer.train()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
if wandb.run is not None:
  wandb.finish()

0,1
eval/loss,█▂▁
eval/runtime,▃█▁
eval/samples_per_second,▆▁█
eval/steps_per_second,▇▁█
train/epoch,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇███
train/global_step,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇███
train/learning_rate,██▇▇▆▆▆▅▅▅▄▄▃▃▃▂▂▁▁
train/loss,█▆▅▅▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,4.1453
eval/runtime,291.795
eval/samples_per_second,29.363
eval/steps_per_second,3.67
train/epoch,2.96
train/global_step,198.0
train/learning_rate,0.0
train/loss,4.0956
train/total_flos,2.76037122392064e+16
train/train_loss,4.72826


In [None]:
model.push_to_hub(repo_id=my_model)

adapter_model.safetensors:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/fasterinnerlooper/bloomz-csharp/commit/02262038c82842fcbf6ab0d7983264f231268ce1', commit_message='Upload model', commit_description='', oid='02262038c82842fcbf6ab0d7983264f231268ce1', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from huggingface_hub import ModelCardData, ModelCard

card_data = ModelCardData(language='en', license='mit', library_name='pytorch')
card = ModelCard.from_template(
    card_data,
    model_id=my_repo
    model_description=f"{model_name} trained on lcc_csharp, using PEFT",
    developers="Shafiq Jetha",
)
card.push_to_hub(repo_id=my_repo)

CommitInfo(commit_url='https://huggingface.co/fasterinnerlooper/bloomz-csharp/commit/e4c337cdef81e0000ea0ac20c067e0f705f4d339', commit_message='Upload README.md with huggingface_hub', commit_description='', oid='e4c337cdef81e0000ea0ac20c067e0f705f4d339', pr_url=None, pr_revision=None, pr_num=None)