# Fine-tune SantaCoder on YAML
Sources:
* https://github.com/loubnabnl/santacoder-finetuning
* https://colab.research.google.com/drive/1UMjeXHwOldpLnWjdm1499o2IYy0RgeTw?usp=sharing



### Install required libraries

In [1]:
!pip install transformers datasets torch
!pip install huggingface_hub
!pip install accelerate -U
!pip install transformers[torch]

Collecting transformers
  Downloading transformers-4.39.1-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m69.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m290.5 MB/s[0m eta [36m0:00:00[0m
Collecting regex!=2019.12.17
  Downloading regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m773.4/773.4 kB[0m [31m286.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14
  Downloading tokenizers-0.15.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m151.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.19.3
  Downloading huggingface_hub-0.21.4-

### Log in to HuggingFace to access The Stack

In [2]:
import os
from huggingface_hub import login
token=os.environ.get('HUGGINGFACE_TOKEN')
login(token)


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /opt/app-root/src/.cache/huggingface/token
Login successful


### Import required libraries

In [3]:
import torch
from dataclasses import dataclass
from datasets import load_dataset
from torch.utils.data import IterableDataset
from torch.utils.data.dataloader import DataLoader
from tqdm.notebook import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    logging,
    set_seed
)
from dataset import ConstantLengthDataset

### Specify the original pre-trained model and fine-tuned model

In [4]:
pretrained_id = "bigcode/santacoder"
model_id = "santacoder-finetuned-alanstack-yaml"

### Load the tokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Load the model

In [6]:
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             trust_remote_code=True, 
                                             use_cache=False)

### Create a minimal Ansible dataset from text file

In [7]:
dataset_dict = load_dataset("text", 
                            data_files="ansible-examples-ec2.txt",
                            sample_by="paragraph")
dataset = dataset_dict['train']

### Calculate characters per token

In [8]:
examples, total_characters, total_tokens = 500, 0, 0

for _, example in tqdm(zip(range(examples), iter(dataset)), total=examples):
    total_characters += len(example['text'])
    total_tokens += len(tokenizer(example['text']).tokens())

characters_per_token = total_characters / total_tokens
print(characters_per_token)

  0%|          | 0/500 [00:00<?, ?it/s]

2.9357311320754715


### Create a test split

In [9]:
dataset = dataset.train_test_split(test_size=0.1)
train_ds = dataset["train"]
valid_ds = dataset["test"]
len(train_ds), len(valid_ds)

(15, 2)

In [10]:
train_ds = train_ds.shuffle(seed=555)

train_dataset = ConstantLengthDataset(
        tokenizer, train_ds, infinite=True, seq_length=1024, chars_per_token=characters_per_token, content_field="text" 
    )
valid_dataset = ConstantLengthDataset(
        tokenizer, valid_ds, infinite=False, seq_length=1024, chars_per_token=characters_per_token, content_field="text"  
    )

### Define the training arguments

In [11]:
training_args = TrainingArguments(
        output_dir="santacoder-finetuned-alanstack-ec2",
        dataloader_drop_last=True,
        gradient_checkpointing=True,
        gradient_accumulation_steps=4,
        optim="adafactor",
        evaluation_strategy="steps",
        # max_steps=5000,
        max_steps=100,
        # max_steps=10,#work version
        eval_steps=500,
        save_steps=500,
        logging_steps=10,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=5e-5,
        lr_scheduler_type="cosine",
        warmup_steps=100,
        weight_decay=0.05,
        fp16=False,
        push_to_hub=True
)

### Train

In [12]:
train_dataset.start_iteration = 0

In [13]:
trainer = Trainer(
    # tokenizer=tokenizer,
    model=model, 
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset
    # train_dataset=train_ds,
    # eval_dataset=valid_ds
   
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [14]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=100, training_loss=0.09472570978105069, metrics={'train_runtime': 1209.4817, 'train_samples_per_second': 1.323, 'train_steps_per_second': 0.083, 'total_flos': 1.00247153934336e+16, 'train_loss': 0.09472570978105069, 'epoch': 1.0})

In [15]:
trainer.save_model() # this creates the config.json file etc you need to run generation tests

events.out.tfevents.1711159033.llm-back-project-workbench-0.394.0:   0%|          | 0.00/7.48k [00:00<?, ?B/s]

events.out.tfevents.1711158562.llm-back-project-workbench-0.389.0:   0%|          | 0.00/5.07k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.60G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

In [16]:
# trainer.push_to_hub("Ansible-Model/santacoder-finetuned-alanstack-ec2")

In [17]:
model.push_to_hub("Ansible-Model/santacoder-finetuned-alanstack-ec2")


# base_tokenizer.push_to_hub("Llama-2-7b-hf_finetuned_finance_jupyter")


README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.60G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Ansible-Model/santacoder-finetuned-alanstack-ec2/commit/85a483304617813463b35919060fe20877b28aa7', commit_message='Upload model', commit_description='', oid='85a483304617813463b35919060fe20877b28aa7', pr_url=None, pr_revision=None, pr_num=None)