# Fine-tune SantaCoder on YAML
Sources:
* https://github.com/loubnabnl/santacoder-finetuning
* https://colab.research.google.com/drive/1UMjeXHwOldpLnWjdm1499o2IYy0RgeTw?usp=sharing


### Install required libraries

In [1]:
!pip install transformers datasets ipywidgets torch
!pip install accelerate -U
!pip install -U bitsandbytes

Collecting transformers
  Downloading transformers-4.39.0-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m69.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m282.9 MB/s[0m eta [36m0:00:00[0m
Collecting regex!=2019.12.17
  Downloading regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m773.4/773.4 kB[0m [31m281.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.19.3
  Downloading huggingface_hub-0.21.4-py3-none-any.whl (346 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m346.4/346.4 kB[0m [31m278.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.4.1
  Downloading safetensors-0.4.2-cp39-cp39-manylinux_2_17_x86_64.manyl

### Log in to HuggingFace to access The Stack

In [2]:
import os
from huggingface_hub import login

# notebook_login() would be preferred, but it was not prompting properly,
# so the access token is read from the HUGGINGFACE_TOKEN environment variable.
hf_token = os.environ.get('HUGGINGFACE_TOKEN')
login(token=hf_token)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /opt/app-root/src/.cache/huggingface/token
Login successful


### Import required libraries

In [3]:
import torch
from dataclasses import dataclass
from datasets import load_dataset
from torch.utils.data.dataloader import DataLoader
from tqdm.notebook import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    logging,
    set_seed
)
from dataset import ConstantLengthDataset

### Specify the pre-trained model and dataset

In [4]:
# Identifier of the pre-trained checkpoint; passed to from_pretrained() for both
# the tokenizer and the model.
model_id = "bigcode/santacoder"
# Dataset identifier and the data directory within it; both are passed to load_dataset().
dataset_id = "bigcode/the-stack-dedup"
data_dir = "data/yaml"

### Load the tokenizer

In [5]:
# Load the tokenizer that belongs to the checkpoint named by model_id.
tokenizer = AutoTokenizer.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# BitsAndBytesConfig configures bitsandbytes quantization for models loaded through Hugging Face Transformers.
# Quantized loading reduces the memory needed to hold the model weights.
# NOTE(review): bnb_config is only referenced from commented-out code in the "Load the model" cell,
# so it currently has no effect on the model that is actually loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Load the model weights in a 4-bit quantized format to reduce memory usage.
    bnb_4bit_use_double_quant=True,  # Nested ("double") quantization: the quantization constants are themselves quantized to save extra memory.
    bnb_4bit_quant_type="nf4",  # Use the 4-bit NormalFloat (NF4) data type for the quantized weights.
    bnb_4bit_compute_dtype=torch.bfloat16  # Carry out computation in bfloat16.
)

### Load the model

In [7]:
# Load the pre-trained checkpoint named by model_id; no quantization_config or device_map is passed.
# trust_remote_code=True permits Transformers to run model code shipped in the checkpoint
# repository, so it should only be used with repositories you trust.
# NOTE(review): use_cache=False is presumably set because gradient checkpointing is enabled
# in the training arguments — confirm.
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, use_cache=False)


# Alternative load call kept for reference (its quantization_config argument is itself commented out):
# model = AutoModelForCausalLM.from_pretrained(model_id, 
#                                                   # quantization_config=bnb_config, 
#                                                   device_map="auto")

  return self.fget.__get__(instance, owner)()


### Load the dataset
Use the first 60% of the dataset

In [8]:
# Load the first 60% of the 'train' split from the data_dir subset of dataset_id.
dataset = load_dataset(dataset_id, data_dir=data_dir, split='train[:60%]')

Resolving data files:   0%|          | 0/64 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/39 [00:00<?, ?it/s]

### Calculate characters per token

In [9]:
# Estimate the average number of characters per token from the first `examples` rows.
examples = 500
total_characters = 0
total_tokens = 0

# zip() against range(examples) stops the iteration after `examples` rows.
sample_rows = zip(range(examples), dataset)
for _, example in tqdm(sample_rows, total=examples):
    text = example['content']
    total_characters += len(text)
    total_tokens += len(tokenizer(text).tokens())

characters_per_token = total_characters / total_tokens
print(characters_per_token)

  0%|          | 0/500 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3514 > 2048). Running this sequence through the model will result in indexing errors


2.4527346671628827


### Create a test split

In [10]:
# Hold out 0.5% of the rows as a validation set.
# The result is bound to a new name so that `dataset` keeps referring to the un-split
# data; rebinding `dataset` here made this cell (and the sampling cell above it)
# fail when re-run.
# shuffle=False splits without shuffling the rows first.
dataset_splits = dataset.train_test_split(test_size=0.005, shuffle=False, seed=555)
train_ds = dataset_splits["train"]
valid_ds = dataset_splits["test"]
len(train_ds), len(valid_ds)

(3153402, 15847)

In [11]:
# Shuffle the training rows with a fixed seed before wrapping them.
train_ds = train_ds.shuffle(seed=555)

# Settings shared by the training and validation wrappers.
packing_kwargs = dict(seq_length=1024, chars_per_token=characters_per_token)

# The training wrapper is created with infinite=True, the validation wrapper with infinite=False.
train_dataset = ConstantLengthDataset(tokenizer, train_ds, infinite=True, **packing_kwargs)
valid_dataset = ConstantLengthDataset(tokenizer, valid_ds, infinite=False, **packing_kwargs)

### Define the training arguments

In [12]:
# Training configuration handed to the Trainer.
# With per_device_train_batch_size=1 and gradient_accumulation_steps=4, gradients from
# 4 batches of one sequence are accumulated per optimizer step on each device.
# eval_steps=500 and save_steps=500 with max_steps=1000 mean evaluation and checkpointing
# happen at steps 500 and 1000.
training_args = TrainingArguments(
        # output_dir="santacoder-finetuned-the-stack-yaml",
        output_dir="santacoder-finetuned-alanstack-yaml",  # directory for checkpoints and the saved model
        dataloader_drop_last=True,
        gradient_checkpointing=True,
        gradient_accumulation_steps=4,
        optim="adafactor",
        evaluation_strategy="steps",
#        max_steps=5000,
        max_steps=1000,  # shortened run; the 5000-step setting is kept above for reference
        eval_steps=500,
        save_steps=500,
        logging_steps=10,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        learning_rate=5e-5,
        lr_scheduler_type="cosine",
        warmup_steps=100,
        weight_decay=0.05,
        fp16=False,
        push_to_hub=False  # nothing is pushed during training; the upload is done in a later cell
)

### Train

In [13]:
# Set the training wrapper's start_iteration attribute to 0 before training.
# NOTE(review): presumably this resets the dataset's iteration counter — verify against
# ConstantLengthDataset in dataset.py.
train_dataset.start_iteration = 0

In [14]:
# Build the Trainer from the model, the training arguments and the two
# constant-length datasets (training and validation).
trainer = Trainer(
    # tokenizer=tokenizer,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

# The former `dataloader_config = DataLoaderConfiguration(...)` statement was removed:
# DataLoaderConfiguration is never imported in this notebook (NameError on a fresh
# kernel) and the resulting object was never passed to the Trainer.


In [15]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss,Validation Loss
500,0.1915,1.557147
1000,0.6231,1.401176


TrainOutput(global_step=1000, training_loss=1.3540308369100094, metrics={'train_runtime': 15415.4534, 'train_samples_per_second': 0.259, 'train_steps_per_second': 0.065, 'total_flos': 2.5061788483584e+16, 'train_loss': 1.3540308369100094, 'epoch': 1.0})

In [16]:
trainer.save_model()  # Writes the model files (config.json etc.) that are needed to run generation tests.

In [None]:
# trainer.push_to_hub("Ansible-Model/santacoder-finetuned-alanstack-yaml")

In [4]:
from huggingface_hub import upload_file
from huggingface_hub import Repository
from huggingface_hub import upload_folder

# Upload the whole local output folder to the model repository.
# NOTE(review): the recorded upload also pushed checkpoint state (optimizer.pt,
# scheduler.pt, rng_state.pth) and TensorBoard event files — confirm that is intended.
upload_folder(
    folder_path="santacoder-finetuned-alanstack-yaml",
    repo_id="Ansible-Model/santacoder-finetuned-alanstack-yaml",
    # repo_type="space",
)
# upload_file(
#     "santacoder-finetuned-alanstack-yaml/config.json",
#     path_in_repo="config.json",
#     repo_id="Ansible-Model/santacoder-finetuned-alanstack-yaml",
# )

# repo = Repository("santacoder-finetuned-alanstack-yaml",
#                   clone_from="Ansible-Model/santacoder-finetuned-alanstack-yaml")

# repo.git_pull()
# repo.git_add()
# repo.git_commit()
# repo.git_push()


model.safetensors:   0%|          | 0.00/4.60G [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/627 [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/5.63M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Upload 16 LFS files:   0%|          | 0/16 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/4.60G [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/5.63M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/627 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.60G [00:00<?, ?B/s]

events.out.tfevents.1711028071.llm-collection-workbench-0.451.0:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

events.out.tfevents.1711028638.llm-collection-workbench-0.253.0:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

events.out.tfevents.1711032994.llm-collection-workbench-0.146.0:   0%|          | 0.00/6.50k [00:00<?, ?B/s]

events.out.tfevents.1711033639.llm-collection-workbench-0.244.0:   0%|          | 0.00/27.0k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Ansible-Model/santacoder-finetuned-alanstack-yaml/commit/5439c2e5879da981e27b61c8e78ea12cbe8e0cec', commit_message='Upload folder using huggingface_hub', commit_description='', oid='5439c2e5879da981e27b61c8e78ea12cbe8e0cec', pr_url=None, pr_revision=None, pr_num=None)