In [1]:
# Install and import the necessary libraries
!pip install torch
!pip install -q -U accelerate peft bitsandbytes transformers trl einops

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
### Load dataset (OpenAssistant/oasst1)
from datasets import load_dataset

def get_dataset():
  dataset_loaded = load_dataset("OpenAssistant/oasst1")
  train_dataset = dataset_loaded["train"].to_pandas()
  val_dataset = dataset_loaded["validation"].to_pandas()
  return dataset_loaded, train_dataset, val_dataset

In [None]:
# # paper: https://paperswithcode.com/dataset/oasst1

# def prep_data(df):
#     df_assistant = df[(df.role == "assistant") & (df["rank"] == 0.0)].copy()
#     df_prompter = df[(df.role == "prompter")].copy()
#     df_prompter = df_prompter.set_index("message_id")
#     df_assistant["output"] = df_assistant["text"].values

#     inputs = []
#     parent_ids = []
#     for _, row in df_assistant.iterrows():
#         input = df_prompter.loc[row.parent_id]
#         inputs.append(input.text)
#         parent_ids.append(input.parent_id)

#     df_assistant["instruction"] = inputs
#     df_assistant["parent_id"] = parent_ids

#     df_assistant = df_assistant[df_assistant.lang == "en"]

#     df_assistant = df_assistant[
#         ["instruction", "output", "message_id", "parent_id"]
#     ].rename(columns={"message_id": "id"})

#     return df_assistant



In [3]:
#### Load Microsoft-phi2 Model

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16)

torch.set_default_device("cuda")

model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype="auto", quantization_config=bnb_config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", max_length=512, trust_remote_code=True)
model.config.use_cache = False
tokenizer.pad_token = tokenizer.eos_token


# Enable gradient checkpointing for memory efficiency
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [4]:
def prep_data(df):
    """
    Prepares data from a DataFrame by matching assistant messages to corresponding prompter instructions,
    filtering for English examples, and merging them into a single prompt.
    """
    # Filter assistant and prompter messages
    df_assistant = df[(df.role == "assistant") & (df["rank"] == 0.0)].copy()
    df_prompter = df[df.role == "prompter"].copy()
    df_prompter = df_prompter.set_index("message_id")

    # Assistant's output text
    df_assistant["output"] = df_assistant["text"].values

    instructions = []
    parent_ids = []
    # For each assistant message, get the corresponding prompter message (instruction)
    for _, row in df_assistant.iterrows():
        input_row = df_prompter.loc[row.parent_id]
        instructions.append(input_row.text)
        parent_ids.append(input_row.parent_id)

    df_assistant["instruction"] = instructions
    df_assistant["parent_id"] = parent_ids

    # Filter to include only English examples
    df_assistant = df_assistant[df_assistant.lang == "en"]

    # Create a combined prompt with sections, properly escaping quotes
    def create_prompt(row):
        instruction = row["instruction"].replace('"', "'")
        output = row["output"].replace('"', "'")
        return (
            f"""###System:
Read the instruction and provide an answer.
###Instruction:
{instruction}
###Answer:
{output}"""
        )

    df_assistant["prompt"] = df_assistant.apply(create_prompt, axis=1)

    # Select and rename columns for clarity
    df_assistant = df_assistant[
        ["prompt", "instruction", "output", "message_id", "parent_id"]
    ].rename(columns={"message_id": "id"})

    return df_assistant


def collate_and_tokenize(batch):
    # Print batch structure
    # print(f"Batch keys: {batch.keys()}")
    # print(f"First item in prompt column: {batch['prompt'][:2]}")

    # Extract prompts safely
    prompts = batch.get("prompt", [])

    # Ensure all prompts are valid strings
    prompts = [str(p) for p in prompts if p is not None]

    # Tokenize using your tokenizer
    encoded = tokenizer(
        prompts,
        return_tensors="np",
        padding="max_length",
        truncation=True,
        max_length=512,
    )

    return encoded




In [5]:

entire, train_ds, val_ds = get_dataset()

train_df = prep_data(train_ds)
eval_df = prep_data(val_ds)

README.md:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

(…)-00000-of-00001-b42a775f407cee45.parquet:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

(…)-00000-of-00001-134b8fd0c89408b6.parquet:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/84437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4401 [00:00<?, ? examples/s]

In [6]:
train_df.head(3)

Unnamed: 0,prompt,instruction,output,id,parent_id
1,###System:\nRead the instruction and provide a...,Can you write a short introduction about the r...,"""Monopsony"" refers to a market structure where...",c8e83833-ecbc-44fe-b6db-735228c25a1c,
7,###System:\nRead the instruction and provide a...,What can be done at a regulatory level to ensu...,Here are some potential regulatory options to ...,73d6f715-3787-409c-81e4-fde0e5ef60cd,636dd191-50df-4894-ba9a-cd7f00767258
25,###System:\nRead the instruction and provide a...,Can you explain contrastive learning in machin...,Sure! Let's say you want to build a model whic...,e8ca4e06-a584-4001-8594-5f633e06fa91,


In [7]:
eval_df.head(3)

Unnamed: 0,prompt,instruction,output,id,parent_id
21,###System:\nRead the instruction and provide a...,What do you think about ChatGPT?,"As an open source alternative to ChatGPT, I do...",7d05acb7-9360-458c-8a1d-c0b6492b8f8a,
23,###System:\nRead the instruction and provide a...,What are your thoughts on the censorship of Ch...,As a large language model trained on text from...,c8dc7c16-e493-4078-bdc7-368b24476ca9,7d05acb7-9360-458c-8a1d-c0b6492b8f8a
28,###System:\nRead the instruction and provide a...,"Yeah, I hear you, brother! Power to the people...",Here are some differences between me and ChatG...,48ac2156-f823-4e97-81ab-a66354549f59,779035e6-9872-4d52-9be7-872b5f0b7fe5


In [8]:
# Convert pandas DataFrames to Hugging Face Datasets
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

# Define columns to remove after tokenization (you can adjust as necessary)
columns_to_remove = ["instruction", "output", "id", "parent_id"]

# Tokenize the datasets using the collate function.
tokenized_train_dataset = train_dataset.map(
    collate_and_tokenize,   # No need for lambda or list wrapping
    batched=True,           # Process in batches
    batch_size=8,           # Optimize batch size for P100 GPU
    remove_columns=columns_to_remove
)

tokenized_eval_dataset = eval_dataset.map(
    collate_and_tokenize,
    batched=True,
    batch_size=8,
    remove_columns=columns_to_remove
)

# For demonstration, print one tokenized example from the training dataset
print(tokenized_train_dataset[0])

Map:   0%|          | 0/7856 [00:00<?, ? examples/s]

Map:   0%|          | 0/418 [00:00<?, ? examples/s]

{'prompt': "###System:\nRead the instruction and provide an answer.\n###Instruction:\nCan you write a short introduction about the relevance of the term 'monopsony' in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.\n###Answer:\n'Monopsony' refers to a market structure where there is only one buyer for a particular good or service. In economics, this term is particularly relevant in the labor market, where a monopsony employer has significant power over the wages and working conditions of their employees. The presence of a monopsony can result in lower wages and reduced employment opportunities for workers, as the employer has little incentive to increase wages or provide better working conditions.\n\nRecent research has identified potential monopsonies in industries such as retail and fast food, where a few large companies control a significant portion of the market (Bivens & Mishel, 2013). In these industries, workers o

In [9]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)



print("Preparing model for k-bit training...")
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

Preparing model for k-bit training...


In [10]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PhiForCausalLM(
      (model): PhiModel(
        (embed_tokens): Embedding(51200, 2560)
        (layers): ModuleList(
          (0-31): 32 x PhiDecoderLayer(
            (self_attn): PhiAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2560, out_features=2560, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2560, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4b

In [12]:
from transformers import TrainingArguments

output_dir = "./results"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 10
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 500
warmup_ratio = 0.03
lr_scheduler_type = "cosine"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    dataloader_pin_memory=False,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    gradient_checkpointing=True,
)


In [13]:
from trl import SFTTrainer
from transformers import set_seed, TrainingArguments, Trainer
from datasets import Dataset


trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    args=training_arguments
)



  trainer = SFTTrainer(


Converting train dataset to ChatML:   0%|          | 0/7856 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/7856 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/7856 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/418 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/418 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/418 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [14]:
save_model_name = "phi2-finetune"
trainer.train()
trainer.model.save_pretrained(save_model_name) # 6012534a43916343c566dc8df4c228f4ffd0992b

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkiranchw000[0m ([33mimnskc[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,1.7006
20,1.5214
30,1.5662
40,1.4452
50,1.3939
60,1.3307
70,1.3089
80,1.3482
90,1.4167
100,1.3608


In [None]:
from transformers import pipeline

prompt = f"[INST] <>\n{system_message}\n<>\n\nwho is roman reigns. [/INST]" # replace the command here with something relevant to your task
num_new_tokens = 100  # change to the number of new tokens you want to generate

# Count the number of tokens in the prompt
num_prompt_tokens = len(tokenizer(prompt)['input_ids'])

# Calculate the maximum length for the generation
max_length = num_prompt_tokens + num_new_tokens

gen = pipeline('text-generation', model=model, truncation=True, tokenizer=tokenizer, max_length=max_length)
result = gen(prompt)
print(result[0]['generated_text'].replace(prompt, ''))

In [18]:
# Merge and save the fine-tuned model
from peft import LoraConfig, PeftModel

# from google.colab import drive
# drive.mount('/content/drive')


model_name = "microsoft/phi-2"
saved_model_name = "/content/session18/results/checkpoint-500"
model_path = "/content/session18/results/checkpoint-500/phi2-qlora"  # change to your preferred path

# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map='auto',
)
peft_model_finetuned = PeftModel.from_pretrained(base_model, saved_model_name)
peft_model_finetuned = peft_model_finetuned.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Save the merged model
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

('/content/session18/results/checkpoint-500/phi2-qlora/tokenizer_config.json',
 '/content/session18/results/checkpoint-500/phi2-qlora/special_tokens_map.json',
 '/content/session18/results/checkpoint-500/phi2-qlora/vocab.json',
 '/content/session18/results/checkpoint-500/phi2-qlora/merges.txt',
 '/content/session18/results/checkpoint-500/phi2-qlora/added_tokens.json',
 '/content/session18/results/checkpoint-500/phi2-qlora/tokenizer.json')

In [19]:
# Load a fine-tuned model from Drive and run inference
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline

# model_path = "/content/phi2-qlora"  # change to the path where your model is saved
inference_model = AutoModelForCausalLM.from_pretrained(model_path)
inference_tokenizer = AutoTokenizer.from_pretrained(model_path)



prompt = "What is 2 + 2?"  # change to your desired prompt
gen = pipeline('text-generation', model=inference_model, tokenizer=inference_tokenizer)
result = gen(prompt)
print(result[0]['generated_text'])


OutOfMemoryError: CUDA out of memory. Tried to allocate 26.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 26.12 MiB is free. Process 21886 has 14.71 GiB memory in use. Of the allocated memory 14.36 GiB is allocated by PyTorch, and 223.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [15]:
!mkdir /content/session18

In [16]:
!mv phi2-finetune/ /content/session18/
!mv results/ /content/session18/
!mv wandb/ /content/session18/
!mv train.jsonl /content/session18/
!mv test.jsonl /content/session18/

mv: cannot stat 'train.jsonl': No such file or directory
mv: cannot stat 'test.jsonl': No such file or directory


In [17]:
!zip -r archive.zip /content/session18

  adding: content/session18/ (stored 0%)
  adding: content/session18/phi2-finetune/ (stored 0%)
  adding: content/session18/phi2-finetune/README.md (deflated 66%)
  adding: content/session18/phi2-finetune/adapter_model.safetensors (deflated 8%)
  adding: content/session18/phi2-finetune/adapter_config.json (deflated 56%)
  adding: content/session18/results/ (stored 0%)
  adding: content/session18/results/checkpoint-200/ (stored 0%)
  adding: content/session18/results/checkpoint-200/rng_state.pth (deflated 26%)
  adding: content/session18/results/checkpoint-200/README.md (deflated 66%)
  adding: content/session18/results/checkpoint-200/vocab.json (deflated 59%)
  adding: content/session18/results/checkpoint-200/tokenizer.json (deflated 82%)
  adding: content/session18/results/checkpoint-200/added_tokens.json (deflated 84%)
  adding: content/session18/results/checkpoint-200/scheduler.pt (deflated 56%)
  adding: content/session18/results/checkpoint-200/scaler.pt (deflated 60%)
  adding: co