### Connect to Google Drive

In [None]:
# Mount Google Drive inside the Colab VM so the cloned repository stored
# there can be reached through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd drive/MyDrive/GitHub/finetuning-llm/

/content/drive/MyDrive/GitHub/finetuning-llm


In [None]:
pwd

'/content/drive/MyDrive/GitHub/finetuning-llm'

### Install Dependencies

In [None]:
!pip install datasets
!pip install trl
!pip install bitsandbytes
!pip install evaluate
!pip install rouge_score



# Optimised Code

In [None]:
# Make the repository root importable so that `src.*` modules resolve.
import sys

# Same location as the directory mounted and entered above ("MyDrive", no space).
REPO_ROOT = '/content/drive/MyDrive/GitHub/finetuning-llm/'

# Guard against stacking duplicate entries when the cell is re-run.
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

In [None]:
# Third-party / stdlib modules used by later cells (device and CPU counts, env vars).
import torch
import os

# Project-local training helpers from the cloned repository.
# NOTE(review): BatchSizeCallback and MetricsLoggingCallback are not referenced
# in the cells visible here.
from src.train.callbacks import BatchSizeCallback, MetricsLoggingCallback
from src.train.finetune_helpers import ModelArguments, ScriptArguments
# NOTE(review): wildcard import; presumably the source of `FinetuneCausalLM`
# and `data_collator` used below — confirm and import those names explicitly.
from src.train.finetune_causallm import *

# Must be set before the W&B run is created for it to take effect.
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints

In [None]:
# Quantisation-related arguments.
quantization_settings = dict(
    use_4bit=True,
    use_nested_quant=True,
    bnb_4bit_compute_dtype="bfloat16",
    bnb_4bit_quant_dtype="nf4",
)

# LoRA adapter arguments.
lora_settings = dict(
    lora_r=8,
    lora_alpha=128,
    lora_dropout=0.1,
)

# Base checkpoint plus the quantisation and LoRA settings above.
model_args = ModelArguments(
    model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    **quantization_settings,
    **lora_settings,
)

# Dataset and sequence handling.
data_settings = dict(
    dataset_name="4DR1455/finance_questions",
    max_seq_length=512,
    packing=False,
    label_names=['labels'],
    dataloader_num_workers=10,
)

# Batch sizes, optimiser and learning-rate schedule.
optimisation_settings = dict(
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # auto_find_batch_size=True,      (disabled alternative)
    # gradient_accumulation_steps=4,  (disabled alternative)
    learning_rate=2e-5,
    weight_decay=0.01,
    optim='adafactor',
    lr_scheduler_type='cosine',
    warmup_steps=100,
    bf16=True,
    num_train_epochs=3,
    # max_steps=5,                    (disabled alternative)
)

# Evaluation, checkpointing and experiment tracking.
# save_steps and eval_steps are kept equal; load_best_model_at_end is enabled.
tracking_settings = dict(
    eval_strategy='steps',
    eval_steps=100,
    save_steps=100,
    logging_steps=10,
    load_best_model_at_end=True,
    save_safetensors=True,
    report_to="wandb",
    run_name="TinyLlama-1.1B-Chat-Finance-v1.3",
)

script_args = ScriptArguments(
    **data_settings,
    **optimisation_settings,
    **tracking_settings,
)


In [None]:
# Module handle used to reach the `data_collator` factory without relying on
# the bare name, which this cell rebinds below.
import src.train.finetune_causallm as finetune_causallm

# Build the fine-tuning driver from the model and script arguments.
causallm_llm = FinetuneCausalLM(model_args=model_args, script_args=script_args)

# Tokenizer, PEFT configuration and the model built from that configuration.
tokenizer = causallm_llm.get_tokenizer()
peft_config = causallm_llm.get_peft_config()
peft_model = causallm_llm.get_model(peft_config)

# Calling the factory through its module keeps this cell re-runnable: the
# previous `data_collator = data_collator(...)` overwrote the factory with its
# own result, so a second run called the collator instance instead.
# The variable name is unchanged because the training cell passes `data_collator`.
data_collator = finetune_causallm.data_collator(tokenizer=tokenizer)

# Tokenised train / eval / test splits; sample=False as in the original run.
train_set, eval_set, test_set = causallm_llm.tokenize_split_dataset(sample=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


In [None]:
# Launch fine-tuning with the PEFT model, PEFT config and data collator
# prepared in the setup cell.
causallm_llm.train(peft_model=peft_model, peft_config=peft_config, data_collator=data_collator)

  trainer = SFTTrainer(


Tokenizing train dataset:   0%|          | 0/43149 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2244 > 2048). Running this sequence through the model will result in indexing errors


Tokenizing train dataset:   0%|          | 0/43149 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/5394 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/5394 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Currently logged in as: [33mleonsunwl[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss,Validation Loss,R1,R2,Rl,Rlsum
100,1.6743,1.775566,0.4446,0.1973,0.3642,0.3868
200,1.6758,1.74911,0.4488,0.2018,0.3689,0.3912
300,1.6979,1.739718,0.45,0.2028,0.3702,0.3924
400,1.6636,1.735,0.4504,0.2033,0.3707,0.3927
500,1.64,1.731941,0.4511,0.2041,0.3715,0.3934
600,1.6973,1.729613,0.4507,0.204,0.3713,0.3931
700,1.6549,1.728103,0.4507,0.2043,0.3713,0.3928
800,1.6385,1.727345,0.4509,0.2045,0.3716,0.393
900,1.6976,1.727093,0.4508,0.2045,0.3714,0.3928
1000,1.6638,1.72705,0.4508,0.2045,0.3716,0.393


[34m[1mwandb[0m: Adding directory to artifact (./results/TinyLlama/TinyLlama-1.1B-Chat-v1.0/checkpoint-100)... Done. 1.4s


[34m[1mwandb[0m: Adding directory to artifact (./results/TinyLlama/TinyLlama-1.1B-Chat-v1.0/checkpoint-200)... Done. 0.0s


[34m[1mwandb[0m: Adding directory to artifact (./results/TinyLlama/TinyLlama-1.1B-Chat-v1.0/checkpoint-300)... Done. 0.0s


[34m[1mwandb[0m: Adding directory to artifact (./results/TinyLlama/TinyLlama-1.1B-Chat-v1.0/checkpoint-400)... Done. 0.1s


[34m[1mwandb[0m: Adding directory to artifact (./results/TinyLlama/TinyLlama-1.1B-Chat-v1.0/checkpoint-500)... Done. 0.0s


[34m[1mwandb[0m: Adding directory to artifact (./results/TinyLlama/TinyLlama-1.1B-Chat-v1.0/checkpoint-600)... Done. 0.1s


[34m[1mwandb[0m: Adding directory to artifact (./results/TinyLlama/TinyLlama-1.1B-Chat-v1.0/checkpoint-700)... Done. 0.0s


[34m[1mwandb[0m: Adding directory to artifact (./results/TinyLlama/TinyLlama-1.1B-Chat-v1.0/checkpoint-800)... Done. 0.1s


[34m[1mwandb[0m: Adding directory to artifact (./results/TinyLlama/TinyLlama-1.1B-Chat-v1.0/checkpoint-900)... Done. 0.0s


[34m[1mwandb[0m: Adding directory to artifact (./results/TinyLlama/TinyLlama-1.1B-Chat-v1.0/checkpoint-1000)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./results/TinyLlama/TinyLlama-1.1B-Chat-v1.0/checkpoint-1011)... Done. 0.0s


In [None]:
# Number of CUDA devices visible to PyTorch in this runtime.
torch.cuda.device_count()

1

In [None]:
# Number of CPUs reported by the OS; useful when choosing dataloader_num_workers.
os.cpu_count()

12

In [None]:
# Tiny slices of each split for a quick smoke test.
SUBSET_SIZE = 2

train_set_subset = train_set.select(range(SUBSET_SIZE))
eval_set_subset = eval_set.select(range(SUBSET_SIZE))
# Fixed: the test subset was previously sliced from train_set by mistake.
test_set_subset = test_set.select(range(SUBSET_SIZE))

In [None]:
# Point the fine-tuning driver at the small subsets.
# Fixed: the eval/test subsets were assigned to new `*_subset` attributes, so
# only the training split was actually replaced.
# NOTE(review): assumes the driver reads `eval_set` / `test_set` the same way
# it reads `train_set` — confirm against FinetuneCausalLM.
causallm_llm.train_set = train_set_subset
causallm_llm.eval_set = eval_set_subset
causallm_llm.test_set = test_set_subset


In [None]:
def formatting_prompts_func(example):
    """Render a batch of instruction/output pairs as Alpaca-style prompt strings.

    Prompt layout follows the Stanford Alpaca data release
    (https://github.com/tatsu-lab/stanford_alpaca#data-release), with the
    answer section labelled "### Output:".

    Args:
        example: Mapping with "instruction" and "output" entries, each an
            indexable sequence; one prompt is produced per instruction and
            paired with the output at the same index.

    Returns:
        A list of prompt strings, one per entry in ``example["instruction"]``.
    """
    preamble = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    return [
        f"{preamble}\n### Instruction:\n{instruction}\n### Output: {example['output'][idx]}"
        for idx, instruction in enumerate(example["instruction"])
    ]