# Setup

In [1]:
!pip install -q -U datasets transformers accelerate peft trl bitsandbytes wandb evaluate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.0/225.0 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m83.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━

In [2]:
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Get key to huggingface account (stored in Google Colab Secrets)
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')

from huggingface_hub import login
login(token = HF_TOKEN)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
from datasets import load_dataset
from random import randrange

# Loading...

# Dataset
dataset = load_dataset("nickrosh/Evol-Instruct-Code-80k-v1")

# Model
base_model = "meta-llama/Llama-2-7b-hf"
fineTuned_model = "meta-Llama-2-7b-Code-Instruct-PEFT-V1"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)

Downloading readme:   0%|          | 0.00/282 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/121M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [5]:
# Set tokenizer padding
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right"

# Dataset prep

In [6]:
# Check on dataset
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 78264
    })
})

In [7]:
from datasets import DatasetDict, Dataset
import pandas as pd

# Convert the 'train' Dataset to a pandas DataFrame
df = pd.DataFrame(dataset['train'])

# Sort the DataFrame based on the length of the strings in the 'instruction' column
df_sorted = df.sort_values(by='instruction', key=lambda col: col.str.len())

# Convert the sorted DataFrame back to a Dataset object if needed
sorted_dataset = Dataset.from_pandas(df_sorted)
sorted_dataset = sorted_dataset.remove_columns('__index_level_0__')

In [8]:
# Select first K samples from dataset
datasetFirstK = sorted_dataset.select(range(1000))
datasetFirstK

Dataset({
    features: ['instruction', 'output'],
    num_rows: 1000
})

In [9]:
# Split dataset in train and test
train_test_split_dataset2 = datasetFirstK.train_test_split(test_size=0.2)
train_test_split_dataset2

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 800
    })
    test: Dataset({
        features: ['instruction', 'output'],
        num_rows: 200
    })
})

# Fine Tune

In [10]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer

import numpy as np
import evaluate

In [11]:
# Quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4")

# LoRa configuration
peft_config = LoraConfig(
    r = 16, #rank
    lora_alpha = 32, # alpha (amount of change added to original weights - to balance change)
    lora_dropout = 0.1, # probability of randomly dropping neurons to avoid overfitting
    bias = "none",
    task_type = "CAUSAL_LM"
    #use_rslora = True, # does lora_alpha/math.sqrt(r) instead of lora_alpha/r - proven to be better
    #target_modules = ['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

# Load foundation model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config = bnb_config,
    #use_cache=False,
    #use_flash_attention_2=use_flash_attention,
    device_map = "auto"
)

# Cast the layernorm in fp32, make output embedding layer require grads, add the upcasting of the lmhead to fp32
model = prepare_model_for_kbit_training(model)

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [12]:
import wandb

wandb.login(key = userdata.get('WANDB'))
wandb.init()

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjuliagontijolopes[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [13]:
# Set training arguments
training_arguments = TrainingArguments(
        output_dir = fineTuned_model,
        num_train_epochs=1,
        per_device_train_batch_size=18,
        gradient_accumulation_steps=2,
        evaluation_strategy="steps",
        eval_steps=10,
        logging_steps=1,
        optim="paged_adamw_8bit",
        learning_rate=2e-4,
        lr_scheduler_type="linear",
        warmup_steps=10,
        report_to="wandb",
        max_steps = 100,
        gradient_checkpointing=True,  # Leads to reduction in memory at slighly decrease in speed
        gradient_checkpointing_kwargs={"use_reentrant": False}
        #save_steps = 50
)

model = get_peft_model(model, peft_config)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_test_split_dataset2['train'],
    eval_dataset=train_test_split_dataset2['test'],
    max_seq_length=100,
    dataset_text_field = 'instruction',
    tokenizer=tokenizer,
    args=training_arguments,
    packing = True
    #formatting_func=format_instruction
    #compute_metrics=compute_metrics,
)

# Train model
trainer.train()

wandb.run

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
10,3.4447,3.166835
20,2.3417,2.386507
30,2.1127,2.13321
40,1.7639,1.918623
50,1.5759,1.849957
60,1.473,1.845062
70,1.3543,1.87723
80,1.2742,1.91882


Step,Training Loss,Validation Loss
10,3.4447,3.166835
20,2.3417,2.386507
30,2.1127,2.13321
40,1.7639,1.918623
50,1.5759,1.849957
60,1.473,1.845062
70,1.3543,1.87723
80,1.2742,1.91882
90,1.1549,1.947669
100,1.1733,1.994431


In [14]:
wandb.finish()

trainer.save_model()

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▄▃▁▁▁▁▁▂▂
eval/runtime,▁▆█▇▁▅▅▂▃▃
eval/samples_per_second,█▃▁▂█▄▄▇▆▆
eval/steps_per_second,█▃▁▃█▃▅▆▅▅
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▃▃▄▆▇█▃▄▃▅▂▁▂▂▂▃▃▂▂▁▂▃▂▁▁▁▁▃▃▂▂▂▂▃▄▂▂▂▂▂
train/learning_rate,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,██▇▇▆▅▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,1.99443
eval/runtime,12.3249
eval/samples_per_second,1.947
eval/steps_per_second,0.243
total_flos,1.27180836274176e+16
train/epoch,33.33
train/global_step,100.0
train/grad_norm,0.67157
train/learning_rate,0.0
train/loss,1.1733


In [14]:
# Empty VRAM
del model
del trainer
import gc
gc.collect()
gc.collect()
torch.cuda.empty_cache()

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

model = PeftModel.from_pretrained(model, fineTuned_model)
model = model.merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right"

#model.push_to_hub(fineTuned_model, use_temp_dir=False, token=HF_TOKEN)
#tokenizer.push_to_hub(fineTuned_model, use_temp_dir=False, token=HF_TOKEN)

In [None]:
# merged_model_path = "JuliaGL/meta-Llama-2-7b-Code-Instruct-PEFT-V1-3.0"
# model.save_pretrained(merged_model_path)
# tokenizer.save_pretrained(merged_model_path)

# model.push_to_hub(merged_model_path, use_temp_dir=False, token=HF_TOKEN)
# tokenizer.push_to_hub(merged_model_path, use_temp_dir=False, token=HF_TOKEN)




# model = PeftModel.from_pretrained(base_model, fineTuned_model)
# merged_model = model.merge_and_unload()
# merged_model.save_pretrained(merged_model)
# tokenizer.save_pretrained(merged_model)
# merged_model.push_to_hub(merged_model, token=HF_TOKEN)




# # Save trained model
# trainer.model.save_pretrained(fineTuned_model)

# # Reload model in FP16 and merge it with LoRA weights
# model = AutoModelForCausalLM.from_pretrained(
#     base_model,
#     low_cpu_mem_usage=True,
#     return_dict=True,
#     torch_dtype=torch.float16,
#     device_map = "auto",
# )

# model = PeftModel.from_pretrained(model, fineTuned_model)
# model = model.merge_and_unload()

# # Reload tokenizer to save it
# tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# tokenizer.pad_token = tokenizer.unk_token
# tokenizer.padding_side = "right"


# Push model to the hub
# model.push_to_hub(fineTuned_model, use_temp_dir=False, token=HF_TOKEN)
# tokenizer.push_to_hub(fineTuned_model, use_temp_dir=False, token=HF_TOKEN)

In [None]:
# model.push_to_hub(fineTuned_model, use_temp_dir=False, token=HF_TOKEN)
# tokenizer.push_to_hub(fineTuned_model, use_temp_dir=False, token=HF_TOKEN)

# Old prompting experiments

In [8]:
# Format prompt function

def prompt_format_instruction(sample):
	return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{sample['instruction']}

### Response:
"""


# Format function for training

def format_instruction(sample):
	return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{sample['instruction']}

### Response:
{sample['output']}
"""

In [None]:
prompt = """Below is an instruction that describes how you should respond, and an input with a question. Write a response that appropriately answers the question while using the tone described.

### Instruction:
You are a person that talks in a informal way. When asked a question, you should respond like you are taking time to think about the answer along the way. Say words to demonstrate you are reasoning while resopnding.

### Input:
What is the capital of Brazil?

### Response:
"""

In [11]:
prompt = """
### Instruction:
You are a person that talks in a informal way. When asked a question, you should respond like you are taking time to think about the answer along the way. Say words to demonstrate you are reasoning while resopnding.

### Input:
What is the capital of Brazil?

### Response:
"""

In [13]:
prompt = """Whats the capital of Brazil?"""

In [15]:
prompt = """Write a code in python with a function that returns the mean of all numbers stored in a vector"""

In [16]:
# Prompt base model with random sample on prompt template
input_idss = tokenizer.encode(prompt, return_tensors="pt")
tokenizer.decode(model.generate(input_ids=input_idss, max_length=len(input_idss[0]) + 300)[0], skip_special_tokens=True)

# dir(tokenizer)
# tokenizer.special_tokens_map

'Write a code in python with a function that returns the mean of all numbers stored in a vector.\nWrite a code in python with a function that returns the mean of all numbers stored in a vector. The function should return the mean and the standard deviation of the vector.\nWrite a code in python with a function that returns the mean of all numbers stored in a vector. The function should return the mean and the standard deviation of the vector. The function should also take in a vector of numbers and return the mean and standard deviation of the vector.\nWrite a code in python with a function that returns the mean of all numbers stored in a vector. The function should return the mean and the standard deviation of the vector. The function should also take in a vector of numbers and return the mean and standard deviation of the vector. The function should also take in a vector of numbers and return the mean and standard deviation of the vector.\nWrite a code in python with a function that 