# Fine-tune SantaCoder on YAML
Sources:
* https://github.com/loubnabnl/santacoder-finetuning
* https://colab.research.google.com/drive/1UMjeXHwOldpLnWjdm1499o2IYy0RgeTw?usp=sharing


### Install required libraries

In [1]:
!pip install transformers datasets ipywidgets torch
!pip install accelerate -U

Collecting transformers
  Downloading transformers-4.38.2-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m67.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m284.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.19.3
  Downloading huggingface_hub-0.21.4-py3-none-any.whl (346 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m346.4/346.4 kB[0m [31m273.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14
  Downloading tokenizers-0.15.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m217.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting regex!=2019.12.17
  Downloading regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.ma

### Log in to HuggingFace to access The Stack

In [2]:
import os
from huggingface_hub import login
# Note: Prefer notebook_login() but this was not prompting properly
login(token=os.environ.get('HUGGINGFACE_TOKEN'))
#print(token)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /opt/app-root/src/.cache/huggingface/token
Login successful


### Import required libraries

In [3]:
import torch
from dataclasses import dataclass
from datasets import load_dataset
from torch.utils.data.dataloader import DataLoader
from tqdm.notebook import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    logging,
    set_seed
)
from dataset import ConstantLengthDataset

### Specify the pre-trained model and dataset

In [4]:
model_id = "bigcode/santacoder"
dataset_id = "bigcode/the-stack-dedup"
data_dir = "data/yaml"

### Load the tokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Load the model

In [6]:
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, use_cache=False)

  return self.fget.__get__(instance, owner)()


### Load the dataset
Use the first 10% of the dataset

In [7]:
dataset = load_dataset(dataset_id, data_dir=data_dir, split='train[:10%]')

Resolving data files:   0%|          | 0/64 [00:00<?, ?it/s]

### Calculate characters per token

In [8]:
examples, total_characters, total_tokens = 500, 0, 0

for _, example in tqdm(zip(range(examples), iter(dataset)), total=examples):
    total_characters += len(example['content'])
    total_tokens += len(tokenizer(example['content']).tokens())

characters_per_token = total_characters / total_tokens
print(characters_per_token)

  0%|          | 0/500 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3514 > 2048). Running this sequence through the model will result in indexing errors


2.4527346671628827


### Create a test split

In [9]:
dataset = dataset.train_test_split(test_size=0.005, shuffle=False, seed=555)
train_ds = dataset["train"]
valid_ds = dataset["test"]
len(train_ds), len(valid_ds)

(525566, 2642)

In [10]:
train_ds = train_ds.shuffle(seed=555)

train_dataset = ConstantLengthDataset(
        tokenizer, train_ds, infinite=True, seq_length=1024, chars_per_token=characters_per_token 
    )
valid_dataset = ConstantLengthDataset(
        tokenizer, valid_ds, infinite=False, seq_length=1024, chars_per_token=characters_per_token
    )

### Define the training arguments

In [11]:
training_args = TrainingArguments(
        output_dir="santacoder-finetuned-the-stack-yaml",
        dataloader_drop_last=True,
        gradient_checkpointing=True,
        gradient_accumulation_steps=4,
        optim="adafactor",
        evaluation_strategy="steps",
#        max_steps=5000,
        max_steps=1000,
        eval_steps=500,
        save_steps=500,
        logging_steps=10,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        learning_rate=5e-5,
        lr_scheduler_type="cosine",
        warmup_steps=100,
        weight_decay=0.05,
        fp16=False,
        push_to_hub=False
)

In [18]:
import torch
import os

print("PyTorch version:", torch.__version__)

# Check if CUDA is available
if torch.cuda.is_available():
    print("CUDA is available.")
    print("Number of GPU:", torch.cuda.device_count())
    for i in range(torch.cuda.device_count()):
        print(f"Current Device : {torch.cuda.current_device}")
        print(f"Device {i}: {torch.cuda.get_device_name(i)}")
        print(torch.cuda.get_device_properties(i))
        print(f"Memory GB: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.2f} GB")
        print(f"GPU Allocated: {torch.cuda.memory_allocated(i) / 1024 ** 3:.2f} GB")
        print(f"GPU Cached:    {torch.cuda.memory_reserved(i) / 1024 ** 3:.2f} GB")
        
        
else:
    print("CUDA is not available. No GPU detected.")


print(torch.cuda._get_device_index)
print(torch.cuda.current_device)

#torch.cuda.set_device(3)


CUDA_VISIBLE_DEVICES = os.environ.get('CUDA_VISIBLE_DEVICES')
print(CUDA_VISIBLE_DEVICES)


PyTorch version: 2.0.1+cu118
CUDA is available.
Number of GPU: 1
Current Device : <function current_device at 0x7fe603715a60>
Device 0: NVIDIA A10G
_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)
Memory GB: 21.98 GB
GPU Allocated: 4.31 GB
GPU Cached:    10.17 GB
<function _get_device_index at 0x7fe6038949d0>
<function current_device at 0x7fe603715a60>
2


In [19]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Fri Mar  8 09:13:35 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.14              Driver Version: 550.54.14      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A10G                    On  |   00000000:00:1B.0 Off |                    0 |
|  0%   27C    P0             66W /  300W |   21346MiB /  23028MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA A10G                    On  |   00

### Train

In [14]:
train_dataset.start_iteration = 0

In [15]:
trainer = Trainer(
    #tokenizer=tokenizer,
    model=model, args=training_args, 
    train_dataset=train_dataset, 
    eval_dataset=valid_dataset
)

In [16]:
trainer.train()

Step,Training Loss,Validation Loss
500,0.3629,1.449033
1000,1.1211,1.312178


TrainOutput(global_step=1000, training_loss=1.4783561820983886, metrics={'train_runtime': 4912.1649, 'train_samples_per_second': 0.814, 'train_steps_per_second': 0.204, 'total_flos': 2.5061788483584e+16, 'train_loss': 1.4783561820983886, 'epoch': 1.0})

In [17]:
trainer.save_model() # this creates the config.json file etc you need to run generation tests