In [1]:
!pip install -q -U pyarrow==14.0.1
!pip install -q -U fsspec==2023.10.0

!pip install -q -U bitsandbytes==0.42.0
!pip install -q -U peft==0.8.2
!pip install -q -U trl==0.7.10
!pip install -q -U accelerate==0.27.1
!pip install -q -U datasets==2.17.0
!pip install -q -U transformers==4.38.0
!pip install python-dotenv

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.0/38.0 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.4/166.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.6.1 requires fsspec==2024.6.1, but you have fsspec 2023.10.0 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.9/150.9 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.2/109.2 kB[0m [31m9.9 MB/s[0m eta [36m0:0

In [2]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# # If HF token is stored in .env:
# import os
# os.environ["HF_TOKEN"] = os.getenv('HF_TOKEN')

### Tech Stack Used:
* Transformers
* Peft
* BitsAndBytes
* Accelerate
* TRL

### Basic steps Involved in fine-tuning:
1. Load the base model
2. Train the base model
3. Save the LoRA adapter
4. Reload the base model at half/full precision
5. Merge the LoRA weights with the base model

## Load Model

In [3]:
import accelerate
import bitsandbytes as bnb

print(f"Accelerate version: {accelerate.__version__}")
print(f"BitsAndBytes version: {bnb.__version__}")

Accelerate version: 0.27.1
BitsAndBytes version: 0.42.0


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Set the qunatization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

#Load the model and Tokenizer
model_id = "google/gemma-2b-it"

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

## Dataset Preparation

### Loading the Dataset

In [5]:
from datasets import load_dataset

dataset = load_dataset("SkunkworksAI/reasoning-0.01")
dataset

Downloading readme:   0%|          | 0.00/847 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/56.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/29857 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'reasoning', 'output', 'reasoning_chains'],
        num_rows: 29857
    })
})

In [6]:
# Convert HF dataset to pandas Dataframe

df = dataset["train"].to_pandas()
df.sample(5)

Unnamed: 0,instruction,reasoning,output,reasoning_chains
1741,It is highly likely that Claudette is a classi...,1. The argument states that Claudette is likel...,D,"[{'step': 1, 'thought': 'The argument states t..."
12876,"For a complex number $z,$ find the minimum val...",1. The problem is asking for the minimum value...,I recognize this expression as the sum of the ...,"[{'step': 1, 'thought': 'The problem is asking..."
3586,Please provide detailed instructions on how to...,1. Understand the goal: Simulate keypresses us...,>>> pyautogui.keyDown('shift'); pyautogui.pres...,"[{'step': 1, 'thought': 'Understand the goal: ..."
24697,The quadratic $x^2-3x+9=x+41$ has two solution...,1. The problem is asking for the positive diff...,To find the solutions of the quadratic equatio...,"[{'step': 1, 'thought': 'The problem is asking..."
9629,"Points $A$, $B$, $C$, and $D$ are located on $...",1. Start by understanding the problem statemen...,Since $D$ and $C$ are located on segment $\ove...,"[{'step': 1, 'thought': 'Start by understandin..."


In [7]:
def generate_prompt(data_point):
    """

    Args:
      data_point:

    Returns:

    """
    # Generate prompt
    prefix_text = 'Below is an instruction that describes a task. Write a response that ' \
                  'appropriately completes the request.\n\n'

    # Samples with additional context info
    if data_point['reasoning']:
        text = f"""<start_of_turn>user {prefix_text} {data_point["instruction"]} here are the inputs {data_point["reasoning"]} <end_of_turn>\n<start_of_turn>model{data_point["output"]} <end_of_turn>"""
    # Without additional context info
    else:
        text = f"""<start_of_turn>user {prefix_text} {data_point["instruction"]} <end_of_turn>\n<start_of_turn>model{data_point["output"]} <end_of_turn>"""
    return text

# Add the "prompt" column in the dataset
text_column = [generate_prompt(data_point) for data_point in dataset["train"]]
dataset = dataset["train"].add_column("prompt", text_column)
dataset

Dataset({
    features: ['instruction', 'reasoning', 'output', 'reasoning_chains', 'prompt'],
    num_rows: 29857
})

In [8]:
print(dataset[0]['prompt'])

<start_of_turn>user Below is an instruction that describes a task. Write a response that appropriately completes the request.

 If a die is rolled three times, what is the probability of getting a sum of 11? None here are the inputs 1. Understand the problem: We need to find the probability of getting a sum of 11 when rolling a die three times.
2. Calculate total possible outcomes: A die has 6 faces, so for each roll, there are 6 possibilities. For three rolls, the total possible outcomes are 6^3 = 216.
3. Identify favorable outcomes: List all combinations of rolls that result in a sum of 11. There are 18 such combinations.
4. Calculate probability: Divide the number of favorable outcomes by the total possible outcomes: 18 / 216 = 1/12.
5. Conclusion: The probability of getting a sum of 11 when rolling a die three times is 1/12. <end_of_turn>
<start_of_turn>modelTo solve this problem, we need to find the number of favorable outcomes (getting a sum of 11) and divide it by the total poss

In [9]:
dataset = dataset.shuffle(seed=1234)  # Shuffle dataset
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

Map:   0%|          | 0/29857 [00:00<?, ? examples/s]

Train-Test Split

In [10]:
dataset = dataset.train_test_split(test_size=0.1)
train_data = dataset["train"]
test_data = dataset["test"]

print(train_data)
print(test_data)

Dataset({
    features: ['instruction', 'reasoning', 'output', 'reasoning_chains', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 26871
})
Dataset({
    features: ['instruction', 'reasoning', 'output', 'reasoning_chains', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 2986
})


In [11]:
# Automated selection of target modules
import bitsandbytes as bnb

def find_all_linear_names(model):
  cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
  lora_module_names = set()
  for name, module in model.named_modules():
    if isinstance(module, cls):
      names = name.split('.')
      lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names: # needed for 16-bit
      lora_module_names.remove('lm_head')
  return list(lora_module_names)


modules = find_all_linear_names(model)
print(modules)

['v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj', 'q_proj']


In [12]:
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

print(model)

lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # Causal Language Modeling (e.g., autoregressive models like GPT)
)

model = get_peft_model(model, lora_config)

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
     

In [13]:
# Number of trainable parameters
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 78446592 | total: 2584619008 | Percentage: 3.0351%


## Train Model

### Push the trained model to HF

In [None]:
import transformers
from trl import SFTTrainer

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side='right'
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    dataset_text_field="prompt",
    peft_config=lora_config,
    max_seq_length=2500,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=2,
        warmup_steps=0.03,
        max_steps=100,
        learning_rate=2e-4,
        logging_steps=20,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        save_strategy="epoch",
        report_to="none"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False
trainer.train()



Map:   0%|          | 0/26871 [00:00<?, ? examples/s]

Map:   0%|          | 0/2986 [00:00<?, ? examples/s]

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
20,1.3201
40,0.985
60,1.0437
80,0.9575
100,0.9502


TrainOutput(global_step=100, training_loss=1.0512952995300293, metrics={'train_runtime': 1495.1728, 'train_samples_per_second': 0.268, 'train_steps_per_second': 0.067, 'total_flos': 3416890029625344.0, 'train_loss': 1.0512952995300293, 'epoch': 0.01})

# **The model was already "pushed". Please ignore this error. **

In [14]:
new_model = "gemma-2b-instruct-reasoning"

trainer.model.save_pretrained(new_model)

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
merged_model= PeftModel.from_pretrained(base_model, new_model)
merged_model= merged_model.merge_and_unload()

# Save the merged model locally
# save_adapter=True, save_config=True
merged_model.save_pretrained("merged_model",safe_serialization=True)
tokenizer.save_pretrained("merged_model")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Push the model and tokenizer to the Hugging Face Model Hub
merged_model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

NameError: name 'trainer' is not defined

## Evaluation Metrics

In [15]:
# Load/define base (non-finetuned) vs finetuned models for comparison

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from math import exp
import bitsandbytes as bnb  # Ensure bitsandbytes is installed if using quantization
from peft import PeftModel  # If using PEFT models

# Define your Hugging Face username and model IDs
username = "jaydiaz2023"
finetuned_model_id = f"{username}/gemma-2b-instruct-reasoning"
base_model_id = "google/gemma-2b-it"

# Set the qunatization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = "google/gemma-2b-it"

base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
base_tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

# # Load the Finetuned Model and Tokenizer from local directory
# finetuned_model_path = "merged_model"  # Path where you saved the merged model
# finetuned_model = AutoModelForCausalLM.from_pretrained(
#     finetuned_model_path,
#     torch_dtype=torch.float16,
#     device_map="auto"
# )
# finetuned_tokenizer = AutoTokenizer.from_pretrained(finetuned_model_path)

# Load the Finetuned Model and Tokenizer from HF
finetuned_tokenizer = AutoTokenizer.from_pretrained(model_id)
finetuned_model = AutoModelForCausalLM.from_pretrained(
                  model_id,
                  torch_dtype=torch.float16,
                  device_map="auto"
)

# Ensure the model is in evaluation mode
base_model.eval()
finetuned_model.eval()



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): GemmaRM

### Semantic Similarity
Semantic Similarity evaluates how close the model's generated responses are to the expected answers in terms of meaning.

Implementation Steps:
* Choose a Semantic Similarity Metric: Common choices include Cosine Similarity, BERTScore, or Sentence Transformers embeddings.
* Compute Similarity Scores: Compare the generated responses with the ground truth.

In [None]:
import torch
from torch.nn.functional import cosine_similarity

def calculate_semantic_similarity(model, tokenizer, dataset):
    similarities = []
    model.eval()

    with torch.no_grad():
        for example in dataset:
            # Encode the input prompt
            input_ids = tokenizer.encode(example["prompt"], return_tensors="pt").to("cuda")
            # Generate output from the model
            output_ids = model.generate(input_ids, max_length=2500)
            generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

            # Tokenize generated and reference texts
            gen_inputs = tokenizer(generated_text, return_tensors='pt').to("cuda")
            ref_inputs = tokenizer(example["output"], return_tensors='pt').to("cuda")

            # Get embeddings from the last hidden state
            gen_outputs = model(**gen_inputs, output_hidden_states=True, return_dict=True)
            ref_outputs = model(**ref_inputs, output_hidden_states=True, return_dict=True)

            # Average pooling of the embeddings
            gen_embedding = gen_outputs.hidden_states[-1].mean(dim=1).squeeze()
            ref_embedding = ref_outputs.hidden_states[-1].mean(dim=1).squeeze()

            # Compute cosine similarity
            cosine_score = cosine_similarity(gen_embedding, ref_embedding, dim=0).item()
            similarities.append(cosine_score)

    average_similarity = sum(similarities) / len(similarities)
    return average_similarity

base_semantic_similarity = calculate_semantic_similarity(base_model, base_tokenizer, test_data_subset)
finetuned_semantic_similarity = calculate_semantic_similarity(finetuned_model, finetuned_tokenizer, test_data_subset)
print(f"Non-finetuned Semantic Similarity: {base_semantic_similarity:.4f}")
print(f"Finetuned Semantic Similarity: {finetuned_semantic_similarity:.4f}\n")

Non-finetuned Semantic Similarity: 0.8635
Finetuned Semantic Similarity: 0.8330

