In [1]:
# Install and import the necessary libraries
!pip install torch
!pip install -q -U accelerate peft bitsandbytes transformers trl einops

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
### Load dataset
from datasets import load_dataset

def get_dataset():
  dataset_loaded = load_dataset("OpenAssistant/oasst1")
  train_dataset = dataset_loaded["train"].to_pandas()
  val_dataset = dataset_loaded["validation"].to_pandas()
  return dataset_loaded, train_dataset, val_dataset

In [3]:
def prep_data(df):
    df_assistant = df[(df.role == "assistant") & (df["rank"] == 0.0)].copy()
    df_prompter = df[(df.role == "prompter")].copy()
    df_prompter = df_prompter.set_index("message_id")
    df_assistant["output"] = df_assistant["text"].values

    inputs = []
    parent_ids = []
    for _, row in df_assistant.iterrows():
        input = df_prompter.loc[row.parent_id]
        inputs.append(input.text)
        parent_ids.append(input.parent_id)

    df_assistant["instruction"] = inputs
    df_assistant["parent_id"] = parent_ids

    df_assistant = df_assistant[df_assistant.lang == "en"]

    df_assistant = df_assistant[
        ["instruction", "output", "message_id", "parent_id"]
    ].rename(columns={"message_id": "id"})

    return df_assistant

In [4]:
entire, train_ds, val_ds = get_dataset()

train_df = prep_data(train_ds)
eval_df = prep_data(val_ds)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

(…)-00000-of-00001-b42a775f407cee45.parquet:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

(…)-00000-of-00001-134b8fd0c89408b6.parquet:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/84437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4401 [00:00<?, ? examples/s]

In [5]:
train_df.head()

Unnamed: 0,instruction,output,id,parent_id
1,Can you write a short introduction about the r...,"""Monopsony"" refers to a market structure where...",c8e83833-ecbc-44fe-b6db-735228c25a1c,
7,What can be done at a regulatory level to ensu...,Here are some potential regulatory options to ...,73d6f715-3787-409c-81e4-fde0e5ef60cd,636dd191-50df-4894-ba9a-cd7f00767258
25,Can you explain contrastive learning in machin...,Sure! Let's say you want to build a model whic...,e8ca4e06-a584-4001-8594-5f633e06fa91,
27,I didn't understand how pulling and pushing wo...,It has to do with a process called differentia...,4c7578c0-f45b-4dd0-a1a1-24a189658a41,e8ca4e06-a584-4001-8594-5f633e06fa91
38,I want to start doing astrophotography as a ho...,Getting started in astrophotography can seem d...,19cb94ab-d6ab-4dbd-b408-70ffbd9b595c,


In [6]:
eval_df.head()

Unnamed: 0,instruction,output,id,parent_id
21,What do you think about ChatGPT?,"As an open source alternative to ChatGPT, I do...",7d05acb7-9360-458c-8a1d-c0b6492b8f8a,
23,What are your thoughts on the censorship of Ch...,As a large language model trained on text from...,c8dc7c16-e493-4078-bdc7-368b24476ca9,7d05acb7-9360-458c-8a1d-c0b6492b8f8a
28,"Yeah, I hear you, brother! Power to the people...",Here are some differences between me and ChatG...,48ac2156-f823-4e97-81ab-a66354549f59,779035e6-9872-4d52-9be7-872b5f0b7fe5
33,Can you please provide me the names of the two...,Yes. Given that you're requesting information ...,99433b0b-566a-48c6-a470-8c4c1dc5957f,
42,How would the Future of AI in 10 Years look?,Predicting the future is always a challenging ...,b5de9e83-d570-42b3-a6cd-ca731fb2e4de,


In [12]:
# Save the dataframes to .jsonl files

train_df.to_json('train.jsonl', orient='records', lines=True)
eval_df.to_json('test.jsonl', orient='records', lines=True)

In [15]:
# Load datasets
train_dataset = load_dataset('json', data_files='/content/train.jsonl', split="train")
valid_dataset = load_dataset('json', data_files='/content/test.jsonl', split="train")

# Preprocess datasets
train_dataset_mapped = train_dataset.map(lambda examples: {'text': [f'[INST] <>\n{system_message.strip()}\n<>\n\n' + prompt + ' [/INST] ' + response for prompt, response in zip(examples['instruction'], examples['output'])]}, batched=True)
valid_dataset_mapped = valid_dataset.map(lambda examples: {'text': [f'[INST] <>\n{system_message.strip()}\n<>\n\n' + prompt + ' [/INST] ' + response for prompt, response in zip(examples['instruction'], examples['output'])]}, batched=True)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/7856 [00:00<?, ? examples/s]

Map:   0%|          | 0/418 [00:00<?, ? examples/s]

In [8]:
#### Load Microsoft-phi2 Model

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16)

torch.set_default_device("cuda")

model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype="auto", quantization_config=bnb_config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
model.config.use_cache = False

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [9]:
tokenizer.pad_token = tokenizer.eos_token

In [14]:
from transformers import pipeline

# Create a text-generation pipeline with GPT-2
generator = pipeline("text-generation", model="gpt2")

def generate_system_message(prompt, max_length=150, temperature=0.7):
    system_instruction = (
        "You will be given a high-level description of the model we are training, "
        "and from that, you will generate a simple system prompt for that model to use. "
        "Remember, you are not generating the system message for data generation -- "
        "you are generating the system message to use for inference. "
        "A good format to follow is `Given WHAT_THE_MODEL_SHOULD_DO.`\n\n"
        "Make it as concise as possible. Include nothing but the system prompt in your response."
    )
    # Combine the system instruction and the user prompt
    full_prompt = system_instruction + "\n\n" + prompt.strip()

    # Generate a response using the free model
    output = generator(full_prompt, max_length=max_length, temperature=temperature, num_return_sequences=1)
    generated_text = output[0]['generated_text']

    # Optionally, you can post-process the text to extract only the system prompt part
    # For simplicity, here we return the entire generated text.
    return generated_text

# Example usage:
prompt = "Describe a model that translates English to French."
system_message = generate_system_message(prompt)
print("Generated system message:")
print(system_message)

# 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
# print(f'The system message is: `{system_message}`. Feel free to re-run this cell if you want a better result.')

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated system message:
You will be given a high-level description of the model we are training, and from that, you will generate a simple system prompt for that model to use. Remember, you are not generating the system message for data generation -- you are generating the system message to use for inference. A good format to follow is `Given WHAT_THE_MODEL_SHOULD_DO.`

Make it as concise as possible. Include nothing but the system prompt in your response.

Describe a model that translates English to French.

You will find that many of the best practices for this are found in the book:

Solving for Missing Data (Wiley, 2002).

This book is well-suited


In [10]:
from peft import LoraConfig

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ]
)

In [11]:
from transformers import TrainingArguments

output_dir = "./results"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 10
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 100
warmup_ratio = 0.03
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    dataloader_pin_memory=False,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    gradient_checkpointing=True,
)


In [16]:
from trl import SFTTrainer
from transformers import set_seed, TrainingArguments, Trainer
from datasets import Dataset


max_seq_length = 256

train_df_dataset = Dataset.from_pandas(train_df)
eval_df_dataset = Dataset.from_pandas(eval_df)

trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    train_dataset=train_dataset_mapped,
    eval_dataset=valid_dataset_mapped,
    tokenizer=tokenizer,
    args=training_arguments,
)



  trainer = SFTTrainer(


Converting train dataset to ChatML:   0%|          | 0/7856 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/7856 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/7856 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3010 > 2048). Running this sequence through the model will result in indexing errors


Truncating train dataset:   0%|          | 0/7856 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/418 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/418 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/418 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/418 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [17]:
save_model_name = "phi2-finetune"
trainer.train()
trainer.model.save_pretrained(save_model_name) # 6012534a43916343c566dc8df4c228f4ffd0992b

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkiranchw000[0m ([33mimnskc[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,1.7741
20,1.7475
30,1.5967
40,1.2746
50,0.748
60,1.0095
70,0.9102
80,0.8932
90,0.9044
100,0.578


In [18]:
!mkdir /content/session18

In [19]:
!mv phi2-finetune/ /content/session18/
!mv results/ /content/session18/
!mv wandb/ /content/session18/
!mv train.jsonl /content/session18/
!mv test.jsonl /content/session18/

In [21]:
!zip -r archive.zip /content/session18

  adding: content/session18/ (stored 0%)
  adding: content/session18/phi2-finetune/ (stored 0%)
  adding: content/session18/phi2-finetune/adapter_model.safetensors (deflated 8%)
  adding: content/session18/phi2-finetune/README.md (deflated 66%)
  adding: content/session18/phi2-finetune/adapter_config.json (deflated 54%)
  adding: content/session18/train.jsonl (deflated 65%)
  adding: content/session18/test.jsonl (deflated 64%)
  adding: content/session18/wandb/ (stored 0%)
  adding: content/session18/wandb/latest-run/ (stored 0%)
  adding: content/session18/wandb/latest-run/run-yareawrw.wandb (deflated 79%)
  adding: content/session18/wandb/latest-run/logs/ (stored 0%)
  adding: content/session18/wandb/latest-run/logs/debug.log (deflated 69%)
  adding: content/session18/wandb/latest-run/logs/debug-core.log (deflated 57%)
  adding: content/session18/wandb/latest-run/logs/debug-internal.log (deflated 73%)
  adding: content/session18/wandb/latest-run/files/ (stored 0%)
  adding: content/s