In [1]:
!python -m venv training-with-peft
!source training-with-peft/bin/activate

In [3]:
%pip install -qU peft==0.7.1 transformers==4.36.2 datasets>=2.6.1 ipywidgets bitsandbytes accelerate

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
from huggingface_hub import login

!git config --global credential.helper store
login(token='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', add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
from transformers import (
  AutoModelForCausalLM,
  AutoTokenizer,
  default_data_collator,
  get_linear_schedule_with_warmup
)
from peft import (
  PromptTuningInit,
  PromptTuningConfig,
  TaskType,
  PeftModelForCausalLM,
  prepare_model_for_kbit_training
)
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

In [5]:
base_model_id="codellama/CodeLlama-7b-hf"
model_id="CodeLlama-7b-csharp"
model_name_or_path='./fasterinnerlooper/CodeLlama-7b-hf'
peft_config = PromptTuningConfig(
  task_type=TaskType.CAUSAL_LM,
  prompt_tuning_init=PromptTuningInit.TEXT,
  num_virtual_tokens=8,
  prompt_tuning_init_text='Complete the following code snippet:',
  tokenizer_name_or_path=base_model_id,
)

device="cuda"
max_length=32
lr=3e-2
num_epochs=5
batch_size=8
shards=10000

In [6]:
from datasets import load_from_disk
# train_dataset=load_from_disk("./train_dataset/")
# eval_dataset=load_from_disk("./eval_dataset/")
padded_train_dataset=load_from_disk('./padded-train-dataset/')
padded_eval_dataset=load_from_disk('./padded-eval-dataset/')

In [7]:
tokenizer=AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
if tokenizer.pad_token_id is None:
  tokenizer.pad_token_id=tokenizer.eos_token_id

In [8]:
def split_rows(examples):
  outputs={'input_ids':[], 'attention_mask': [], 'labels': []}
  for x in examples['input_ids']:
    for y in range(0, 4096, max_length):
      outputs['input_ids']+=[x[y:y+max_length]]
      outputs['attention_mask']+=[1 * max_length]
      outputs['labels']+=[x[y:y+max_length]]
  return outputs

# split_train_dataset = train_dataset.map(split_rows, batched=True)
# split_eval_dataset = eval_dataset.map(split_rows, batched=True)

In [9]:
def pad_rows(examples):
  outputs = {'input_ids':[], 'attention_mask':[]}
  for example in examples['input_ids']:
    dif = max_length-len(example)
    fill=[tokenizer.pad_token_id for _ in range(dif)]
    outputs['input_ids']+=torch.tensor([example+fill])
    outputs['attention_mask']+=torch.tensor([[1 for _ in range(len(example))]+[0 for _ in range(len(fill))]])
  outputs['labels']=outputs['input_ids'].copy()
  return outputs

# padded_train_dataset = split_train_dataset.map(pad_rows, batched=True)
# padded_eval_dataset = split_eval_dataset.map(pad_rows, batched=True)

In [10]:
from random import randint

training_visited=[]
eval_visited=[]

t = randint(0, shards-1)
training_visited.append(t)
sharded_train_dataset = padded_train_dataset.shard(shards, t)
e = randint(0, shards-1)
eval_visited.append(e)
sharded_eval_dataset = padded_eval_dataset.shard(shards, e)

In [11]:
train_dataloader = DataLoader(
  sharded_train_dataset,
  collate_fn=default_data_collator,
  batch_size=batch_size,
  pin_memory=True
)
eval_dataloader = DataLoader(
  sharded_eval_dataset,
  collate_fn=default_data_collator,
  batch_size=batch_size,
  pin_memory=True
)

In [12]:
# model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(
  base_model_id,
  load_in_8bit=True,
  torch_dtype=torch.float16,
  device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
model=PeftModelForCausalLM.from_pretrained(model, model_name_or_path, config=peft_config)

In [14]:
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler=get_linear_schedule_with_warmup(
  optimizer=optimizer,
  num_warmup_steps=0,
  num_training_steps=len(train_dataloader) * num_epochs
)

In [16]:
prepare_model_for_kbit_training(model)

for epoch in range(num_epochs):
  model.train()
  total_loss=0
  train_tqdm = tqdm(train_dataloader)
  for step, batch in enumerate(train_tqdm):
    batch={k: v.to(device) for k,v in batch.items()}
    outputs=model(**batch)
    loss = outputs.loss
    total_loss+=loss.detach().float()
    loss.backward()
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    train_epoch_loss=total_loss/len(train_dataloader)
    train_ppl=torch.exp(train_epoch_loss)
    train_tqdm.set_description(f"(train) epoch #{epoch}: ppl:{train_ppl.item():.4f} train loss:{train_epoch_loss:.4f}")

  model.eval()
  eval_loss=0
  eval_preds=[]
  eval_tqdm = tqdm(eval_dataloader)
  for step, batch in enumerate(eval_tqdm):
    batch={k:v.to(device) for k,v in batch.items()}
    with torch.no_grad():
      outputs=model(**batch)
    loss=outputs.loss
    eval_loss+=loss.detach().float()
    eval_preds.extend(
      tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
    )

    eval_epoch_loss=eval_loss/len(eval_dataloader)
    eval_ppl=torch.exp(eval_epoch_loss)
    eval_tqdm.set_description(f"(eval) epoch #{epoch}: ppl:{eval_ppl.item():.4f} train loss:{eval_epoch_loss:.4f}")

  model.save_pretrained(model_id)
  # print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

  0%|          | 0/245 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
(train) epoch #0: ppl:32.1104 train loss:3.4692: 100%|██████████| 245/245 [15:16<00:00,  3.74s/it]
(eval) epoch #0: ppl:34.5932 train loss:3.5437: 100%|██████████| 28/28 [00:37<00:00,  1.35s/it]
(train) epoch #1: ppl:32.0757 train loss:3.4681: 100%|██████████| 245/245 [15:18<00:00,  3.75s/it]
(eval) epoch #1: ppl:34.5932 train loss:3.5437: 100%|██████████| 28/28 [00:34<00:00,  1.22s/it]
(train) epoch #2: ppl:4.5716 train loss:1.5199:  44%|████▎     | 107/245 [06:42<08:35,  3.73s/it]

: 

In [None]:
from peft import PeftConfig, PeftModelForCausalLM
from transformers import AutoModelForCausalLM
import torch

model_id = "codellama/CodeLlama-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
  model_id,
  load_in_8bit=True,
  torch_dtype=torch.float16,
  device_map="auto")
config = PeftConfig.from_pretrained("./fasterinnerlooper/CodeLlama-7b-hf/")
model=PeftModelForCausalLM.from_pretrained(model,'./fasterinnerlooper/CodeLlama-7b-hf/', config=config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from transformers import AutoModelForCausalLM, OPTForCausalLM, AutoTokenizer
# from peft import PeftConfig
# import torch

# model_id = "codellama/CodeLlama-7b-hf"
# adapter_model_id = "./fasterinnerlooper/CodeLlama-7b-hf/"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# text = "Hello"
# inputs = tokenizer(text, return_tensors="pt")

# model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True, torch_dtype=torch.float16, device_map="auto")
# peft_config = PeftConfig.from_pretrained(adapter_model_id)

# to initiate with random weights
# peft_config.init_lora_weights = False

# model.add_adapter(peft_config)
# model.enable_adapters()
# output = model.generate(**inputs)

Streaming from https://huggingface.co/docs/transformers/generation_strategies#streaming

In [None]:
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained('codellama/CodeLlama-7b-hf')
inputs = tokenizer(
  """_<PRE>
    public class DataReader(IConsole console)
    {
        public IConsole Console { get; } = console;

        public async Task ProcessFileAsync(string filename)
        {
            var tempFile = Path.GetRandomFileName();
_<SUF>
                {
                    fileProgressBar.Refresh(i, $"Reading row group {i + 1}");
                    var table = await reader.ReadAsTableAsync(rowGroupIndex: i);
                    var bar = new ProgressBar(this.Console, table.Count);
                    progressBars.Add(bar);
                    tasks.Add(Task.Factory.StartNew((obj) =>
                                        {
                                            var taskData = obj as MyTaskData;
                                            var table = taskData.Table;
                                            var i = taskData.Index;
                                            totalFileProgressBar.Max += table.Count;
                                            foreach (var row in table)
                                            {
_<MID>
""",
return_tensors="pt")

In [None]:
from transformers import TextStreamer
device = "cuda"

with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    streamer = TextStreamer(tokenizer)
    _ = model.generate(
        **inputs,
        # input_ids=inputs["input_ids"],
        # attention_mask=inputs["attention_mask"],
        max_new_tokens=50,
        streamer=streamer
        # eos_token_id=3
    )
    # print(outputs)
    # print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> _<PRE>
    public class DataReader(IConsole console)
    {
        public IConsole Console { get; } = console;

        public async Task ProcessFileAsync(string filename)
        {
            var tempFile = Path.GetRandomFileName();
_<SUF>
                {
                    fileProgressBar.Refresh(i, $"Reading row group {i + 1}");
                    var table = await reader.ReadAsTableAsync(rowGroupIndex: i);
                    var bar = new ProgressBar(this.Console, table.Count);
                    progressBars.Add(bar);
                    tasks.Add(Task.Factory.StartNew((obj) =>
                                        {
                                            var taskData = obj as MyTaskData;
                                            var table = taskData.Table;
                                            var i = taskData.Index;
                                            totalFileProgressBar.Max += table.Count;
                                            foreach (v



                                               _<PRE>
                                           bar.Dispose _<PRE>
                

OutOfMemoryError: CUDA out of memory. Tried to allocate 98.00 MiB. GPU 0 has a total capacty of 15.89 GiB of which 98.69 MiB is free. Process 1424091 has 15.79 GiB memory in use. Of the allocated memory 12.26 GiB is allocated by PyTorch, and 3.03 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF