In [1]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoModel,
    AutoConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    TrainerCallback,
    AutoModelForCausalLM,
    GenerationConfig
)
from tqdm.auto import tqdm
from transformers.integrations import WandbCallback
from datasets import load_dataset, DatasetDict
from peft import LoraConfig
from trl import SFTTrainer
from huggingface_hub import HfApi, HfFolder, Repository
import os
import torch
import wandb
from datetime import datetime
from types import SimpleNamespace
import json
import re
import pandas as pd
import random

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [2]:
class LLMSampleCB(WandbCallback):
    def __init__(self, trainer, test_dataset, num_samples=10, max_new_tokens=256, log_model="checkpoint"):
        super().__init__()
        self._log_model = log_model
        self.sample_dataset = test_dataset.select(range(num_samples))
        self.model, self.tokenizer = trainer.model, trainer.tokenizer
        self.gen_config = GenerationConfig.from_pretrained(trainer.model.name_or_path,
                                                           max_new_tokens=max_new_tokens)
    def generate(self, prompt):
        tokenized_prompt = self.tokenizer(prompt, return_tensors='pt')['input_ids'].cuda()
        with torch.inference_mode():
            output = self.model.generate(inputs=tokenized_prompt, generation_config=self.gen_config)
        return self.tokenizer.decode(output[0][len(tokenized_prompt[0]):], skip_special_tokens=True)
    
    def samples_table(self, examples):
        records_table = wandb.Table(columns=["prompt", "generation"] + list(self.gen_config.to_dict().keys()))
        for example in tqdm(examples, leave=False):
            prompt = example["text"]
            generation = self.generate(prompt=prompt)
            records_table.add_data(prompt, generation, *list(self.gen_config.to_dict().values()))
        return records_table
        
    def on_evaluate(self, args, state, control,  **kwargs):
        super().on_evaluate(args, state, control, **kwargs)
        records_table = self.samples_table(self.sample_dataset)
        self._wandb.log({"sample_predictions":records_table})

In [3]:
# !wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json

In [4]:
import json

dataset_file = "ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json"

with open(dataset_file, "r") as f:
    shareGPT = json.load(f)

In [5]:
type(shareGPT), shareGPT[0:3], len(shareGPT)

(list,
 [{'id': 'QWJhYvA_0',
   'conversations': [{'from': 'human',
     'value': "Summarize the main ideas of Jeff Walker's Product Launch Formula into bullet points as it pertains to a growth marketing agency implementing these strategies and tactics for their clients..."},
    {'from': 'gpt',
     'value': "Here are the main ideas of Jeff Walker's Product Launch Formula that can be applied by a growth marketing agency for their clients:\n\n1. Identify the target audience and their needs: Understand the ideal customer for the product or service, and create a messaging that resonates with them.\n2. Pre-launch: Build anticipation and excitement for the launch by creating buzz, gathering testimonials and case studies, and using social media to create awareness.\n3. Launch: Use a well-crafted launch sequence to maximize sales and conversions. This can include offering bonuses, creating scarcity, and using a deadline to create urgency.\n4. Post-launch: Follow up with customers, gather fee

In [6]:
new_shareGPT = []

for conversation in shareGPT:
    # split each convo if 'from' is 'human' or 'gpt'
    human_turns = [turn for turn in conversation['conversations'] if turn['from'] == 'human']
    gpt_turns = [turn for turn in conversation['conversations'] if turn['from'] == 'gpt']

    # Check if there is at least one human turn and one gpt turn
    if human_turns and gpt_turns:
        # Get the first human turn and the first gpt turn
        first_human_turn = human_turns[0]
        first_gpt_turn = gpt_turns[0]

        # Create a new dictionary with first human turn as instruction and first gpt turn as output
        new_conversation = {
            'instruction': first_human_turn['value'],
            'output': first_gpt_turn['value']
        }

        # Add the new conversation to the new list
        new_shareGPT.append(new_conversation)

new_shareGPT

[{'instruction': "Summarize the main ideas of Jeff Walker's Product Launch Formula into bullet points as it pertains to a growth marketing agency implementing these strategies and tactics for their clients...",
  'output': "Here are the main ideas of Jeff Walker's Product Launch Formula that can be applied by a growth marketing agency for their clients:\n\n1. Identify the target audience and their needs: Understand the ideal customer for the product or service, and create a messaging that resonates with them.\n2. Pre-launch: Build anticipation and excitement for the launch by creating buzz, gathering testimonials and case studies, and using social media to create awareness.\n3. Launch: Use a well-crafted launch sequence to maximize sales and conversions. This can include offering bonuses, creating scarcity, and using a deadline to create urgency.\n4. Post-launch: Follow up with customers, gather feedback, and continue to provide value to keep them engaged and loyal.\n5. Create a produc

In [7]:
# filter out refusals
refusal_pattern = r"\b(?:I will not|I(?:'m| am) unable to|I cannot|I can't|I'm sorry|I am sorry|I am not able|I'm unable|I'm not able|I am unable|I am unable |AI assistant|AI chatbot| AI language model)\b"

filtered_shareGPT = [entry for entry in new_shareGPT if not re.search(refusal_pattern, entry['output'], re.IGNORECASE)]

In [8]:
# filter out instructions / responses of less than 20 characters
filtered_shareGPT = [entry for entry in new_shareGPT if not re.search(refusal_pattern, entry['output'], re.IGNORECASE) and len(entry['instruction']) >= 20 and len(entry['output']) >= 20]

In [9]:
seed = 42

random.seed(seed)
random.shuffle(filtered_shareGPT) 

In [10]:
# train_dataset = alpaca[:-1000]
train_dataset = filtered_shareGPT[:10000]
eval_dataset = filtered_shareGPT[-1000:]

In [11]:
train_df = pd.DataFrame(train_dataset)
eval_df = pd.DataFrame(eval_dataset)

train_table = wandb.Table(dataframe=train_df)
eval_table  = wandb.Table(dataframe=eval_df)

train_df.to_json("shareGPT_train.jsonl", orient='records', lines=True)
eval_df.to_json("shareGPT_eval.jsonl", orient='records', lines=True)

with wandb.init(project="shareGPT_ft", entity="kobihackenburg", job_type="split_data"):
    at = wandb.Artifact(
        name="shareGPT_splitted", 
        type="dataset",
        description="shareGPT dataset for instruction finetuning"
    )
    at.add_file("shareGPT_train.jsonl")
    at.add_file("shareGPT_eval.jsonl")
    wandb.log_artifact(at)
    wandb.log({"train_dataset":train_table, "eval_dataset":eval_table})

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkobihackenburg[0m. Use [1m`wandb login --relogin`[0m to force relogin




VBox(children=(Label(value='49.253 MB of 49.253 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [12]:
# Define constants
SEED = 42
TRAIN_SIZE = 10000
EVAL_SIZE = 1000
TOTAL_SAMPLE_SIZE = TRAIN_SIZE + EVAL_SIZE
MODEL_ID = 'meta-llama/Llama-2-7b-hf' #"EleutherAI/pythia-70m", 'meta-llama/Llama-2-7b-hf'
MODEL_NAME = MODEL_ID.split('/')[-1]
DATASET_NAME = 'shareGPT'
BASE_REPOSITORY = 'persuasion-scaling-laws'

wandb.init(project="shareGPT_ft",
           entity="kobihackenburg",
           job_type="train",
           tags=["hf_sft_lora", "7b"],
           name=f"{BASE_REPOSITORY}/{MODEL_NAME}/{DATASET_NAME}/10k_filtered")
artifact = wandb.use_artifact('kobihackenburg/shareGPT_ft/shareGPT_splitted:v0', type='dataset')
artifact_dir = artifact.download()

[34m[1mwandb[0m:   2 of 2 files downloaded.  


In [13]:
shareGPT_ds = load_dataset("json", data_dir=artifact_dir)
shareGPT_ds

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['instruction', 'output'],
        num_rows: 1000
    })
})

In [14]:
train_dataset = shareGPT_ds["train"].select(range(TRAIN_SIZE))
eval_dataset = shareGPT_ds["test"].select(range(EVAL_SIZE))

In [15]:
def create_prompt(row):
    return ("Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n{instruction}\n\n### Response:\n{output}").format_map(row)

In [16]:
model_kwargs = dict(
    device_map={"" : 0},
    trust_remote_code=True,
    # low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    # use_flash_attention_2=True,
    use_cache=False,
)

In [17]:
peft_config = LoraConfig(
    r=64,  # the rank of the LoRA matrices
    lora_alpha=16, # the weight
    lora_dropout=0.1, # dropout to add to the LoRA layers
    bias="none", # add bias to the nn.Linear layers?
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj","v_proj","o_proj"] # the name of the layers to add LoRA
    # target_modules=["query_key_value", "dense"] # uncomment if using a Pythia model
)

In [18]:
batch_size = 16
gradient_accumulation_steps = 2
num_train_epochs = 3

total_num_steps = num_train_epochs * 11_210 // (batch_size * gradient_accumulation_steps)

total_num_steps

1050

In [19]:
output_dir = "./output/"
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size//2,
    bf16=True,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio = 0.1,
    max_steps=total_num_steps,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs=dict(use_reentrant=False),
    evaluation_strategy="steps",
    eval_steps=total_num_steps // num_train_epochs,
    # eval_steps=10,
    # logging strategies
    logging_strategy="steps",
    logging_steps=1,
    save_strategy="steps",
    save_steps=total_num_steps // num_train_epochs,
)

In [20]:
trainer = SFTTrainer(
    model=MODEL_ID,
    model_init_kwargs=model_kwargs,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    packing=True,
    max_seq_length=1024,
    args=training_args,
    formatting_func=create_prompt,
    peft_config=peft_config,
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

[codecarbon INFO @ 04:52:09] [setup] RAM Tracking...
[codecarbon INFO @ 04:52:09] [setup] GPU Tracking...
[codecarbon INFO @ 04:52:09] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 04:52:09] [setup] CPU Tracking...
[codecarbon INFO @ 04:52:11] CPU Model on constant consumption mode: Intel(R) Xeon(R) Gold 6338 CPU @ 2.00GHz
[codecarbon INFO @ 04:52:11] >>> Tracker's metadata:
[codecarbon INFO @ 04:52:11]   Platform system: Linux-5.15.0-91-generic-x86_64-with-glibc2.35
[codecarbon INFO @ 04:52:11]   Python version: 3.10.12
[codecarbon INFO @ 04:52:11]   CodeCarbon version: 2.3.4
[codecarbon INFO @ 04:52:11]   Available RAM : 251.516 GB
[codecarbon INFO @ 04:52:11]   CPU count: 128
[codecarbon INFO @ 04:52:11]   CPU model: Intel(R) Xeon(R) Gold 6338 CPU @ 2.00GHz
[codecarbon INFO @ 04:52:11]   GPU count: 2
[codecarbon INFO @ 04:52:11]   GPU model: 2 x NVIDIA A100 80GB PCIe


In [21]:
def create_prompt_no_anwer(row):
    row["output"] = ""
    return {"text": create_prompt(row)}

test_dataset = eval_dataset.map(create_prompt_no_anwer)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [22]:
wandb_callback = LLMSampleCB(trainer, test_dataset, num_samples=10, max_new_tokens=256)

In [23]:
trainer.add_callback(wandb_callback)

In [24]:
trainer.train()
wandb.finish()



Step,Training Loss,Validation Loss
350,1.2033,1.151504
700,1.0416,1.157591
1050,0.9536,1.159639


[codecarbon INFO @ 04:52:31] Energy consumed for RAM : 0.000393 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 04:52:31] Energy consumed for all GPUs : 0.001913 kWh. Total GPU Power : 458.83749440672716 W
[codecarbon INFO @ 04:52:31] Energy consumed for all CPUs : 0.000177 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 04:52:31] 0.002484 kWh of electricity used since the beginning.
[codecarbon INFO @ 04:52:46] Energy consumed for RAM : 0.000786 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 04:52:46] Energy consumed for all GPUs : 0.004039 kWh. Total GPU Power : 510.6213682250755 W
[codecarbon INFO @ 04:52:46] Energy consumed for all CPUs : 0.000354 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 04:52:46] 0.005180 kWh of electricity used since the beginning.
[codecarbon INFO @ 04:53:01] Energy consumed for RAM : 0.001178 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 04:53:01] Energy consumed for all GPUs : 0.006055 kWh. Total GPU Power : 484.190947105626

  0%|          | 0/10 [00:00<?, ?it/s]

[codecarbon INFO @ 06:03:32] Energy consumed for RAM : 0.111880 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 06:03:32] Energy consumed for all GPUs : 0.595616 kWh. Total GPU Power : 285.880795013573 W
[codecarbon INFO @ 06:03:32] Energy consumed for all CPUs : 0.050470 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 06:03:32] 0.757966 kWh of electricity used since the beginning.
[codecarbon INFO @ 06:03:47] Energy consumed for RAM : 0.112273 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 06:03:47] Energy consumed for all GPUs : 0.596592 kWh. Total GPU Power : 234.28331693605892 W
[codecarbon INFO @ 06:03:47] Energy consumed for all CPUs : 0.050647 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 06:03:47] 0.759512 kWh of electricity used since the beginning.
[codecarbon INFO @ 06:04:02] Energy consumed for RAM : 0.112665 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 06:04:02] Energy consumed for all GPUs : 0.597633 kWh. Total GPU Power : 250.0864398000768

  0%|          | 0/10 [00:00<?, ?it/s]

[codecarbon INFO @ 07:15:49] Energy consumed for RAM : 0.225364 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 07:15:49] Energy consumed for all GPUs : 1.195530 kWh. Total GPU Power : 317.07400397609945 W
[codecarbon INFO @ 07:15:49] Energy consumed for all CPUs : 0.101661 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 07:15:49] 1.522554 kWh of electricity used since the beginning.
[codecarbon INFO @ 07:16:04] Energy consumed for RAM : 0.225757 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 07:16:04] Energy consumed for all GPUs : 1.196537 kWh. Total GPU Power : 242.00749170764414 W
[codecarbon INFO @ 07:16:04] Energy consumed for all CPUs : 0.101838 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 07:16:04] 1.524132 kWh of electricity used since the beginning.
[codecarbon INFO @ 07:16:19] Energy consumed for RAM : 0.226149 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 07:16:19] Energy consumed for all GPUs : 1.197540 kWh. Total GPU Power : 240.88168315760

  0%|          | 0/10 [00:00<?, ?it/s]

[codecarbon INFO @ 08:28:04] Energy consumed for RAM : 0.338806 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 08:28:04] Energy consumed for all GPUs : 1.794944 kWh. Total GPU Power : 313.9780188184049 W
[codecarbon INFO @ 08:28:04] Energy consumed for all CPUs : 0.152830 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 08:28:04] 2.286580 kWh of electricity used since the beginning.
[codecarbon INFO @ 08:28:19] Energy consumed for RAM : 0.339199 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 08:28:19] Energy consumed for all GPUs : 1.795927 kWh. Total GPU Power : 235.8527071186449 W
[codecarbon INFO @ 08:28:19] Energy consumed for all CPUs : 0.153007 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 08:28:19] 2.288132 kWh of electricity used since the beginning.
[codecarbon INFO @ 08:28:34] Energy consumed for RAM : 0.339591 kWh. RAM Power : 94.31847381591797 W
[codecarbon INFO @ 08:28:34] Energy consumed for all GPUs : 1.796976 kWh. Total GPU Power : 252.0812707215273

VBox(children=(Label(value='1229.943 MB of 1290.257 MB uploaded (4.491 MB deduped)\r'), FloatProgress(value=0.…

0,1
eval/loss,▁▁▆▆██
eval/runtime,▁▁██▇▇
eval/samples_per_second,██▁▁▂▂
eval/steps_per_second,██▁▁▁▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/grad_norm,▂▂▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▃▂▃▃▃▃▃█▃▄▃▃▃▃▃▃▄▃▃▃▃▃
train/learning_rate,▂▃▅▆██████▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁
train/loss,█▇▇▆▃▅▇▄▄▄▅▃▃▅▄▄▃▂▂▃▃▂▂▂▂▃▂▂▂▂▂▃▃▁▃▁▁▃▂▂
train/total_flos,▁▁

0,1
eval/loss,1.15964
eval/runtime,56.7416
eval/samples_per_second,11.209
eval/steps_per_second,0.705
train/epoch,10.55
train/global_step,1050.0
train/grad_norm,0.04956
train/learning_rate,0.0
train/loss,0.9536
train/total_flos,2.7503955630096384e+18


In [25]:
# EXPORT MODEL
# Get the trained model
os.environ["HF_API_TOKEN"] = "token here"
model = trainer.model

# push model to Hugging Face hub
model.push_to_hub(
    f"{BASE_REPOSITORY}/{MODEL_NAME}-{DATASET_NAME}",
    use_auth_token=os.environ["HF_API_TOKEN"],
    private=True,
)



adapter_model.safetensors:   0%|          | 0.00/134M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/persuasion-scaling-laws/Llama-2-7b-hf-shareGPT/commit/95391773a7bb76bd60b901aa19645d9421490fdc', commit_message='Upload model', commit_description='', oid='95391773a7bb76bd60b901aa19645d9421490fdc', pr_url=None, pr_revision=None, pr_num=None)