In [None]:
!pip install -q -U pyarrow==14.0.1
!pip install -q -U fsspec==2023.10.0

!pip install -q -U bitsandbytes==0.42.0
!pip install -q -U peft==0.8.2
!pip install -q -U trl==0.7.10
!pip install -q -U accelerate==0.27.1
!pip install -q -U datasets==2.17.0
!pip install -q -U transformers==4.38.0
!pip install python-dotenv



In [None]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# # If HF token is stored in .env:
# import os
# os.environ["HF_TOKEN"] = os.getenv('HF_TOKEN')

### Tech Stack Used:
* Transformers
* Peft
* BitsAndBytes
* Accelerate
* TRL

### Basic steps Involved in fine-tuning:
1. Load the base model
2. Train the base model
3. Save the LoRA adapter
4. Reload the base model at half/full precision
5. Merge the LoRA weights with the base model

## Load Model

In [None]:
import accelerate
import bitsandbytes as bnb

print(f"Accelerate version: {accelerate.__version__}")
print(f"BitsAndBytes version: {bnb.__version__}")

Accelerate version: 0.27.1
BitsAndBytes version: 0.42.0


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Set the qunatization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

#Load the model and Tokenizer
model_id = "google/gemma-2b-it"

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Dataset Preparation

### Loading the Dataset

In [None]:
from datasets import load_dataset

dataset = load_dataset("SkunkworksAI/reasoning-0.01")
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'reasoning', 'output', 'reasoning_chains'],
        num_rows: 29857
    })
})

In [None]:
# Convert HF dataset to pandas Dataframe

df = dataset["train"].to_pandas()
df.sample(5)

Unnamed: 0,instruction,reasoning,output,reasoning_chains
13228,A home buyer can afford to spend no more than ...,1. The problem is asking for the maximum amoun...,89034.79,"[{'step': 1, 'thought': 'The problem is asking..."
19760,What is the least positive integer $n$ such th...,1. The problem is asking for the smallest posi...,To find the least positive integer $n$ such th...,"[{'step': 1, 'thought': 'The problem is asking..."
17776,The roots of the equation $x^2+kx+5 = 0$ diffe...,1. Recognize that the given equation is a quad...,"I notice that this is a quadratic equation, so...","[{'step': 1, 'thought': 'Recognize that the gi..."
27385,"For integers $n$, let \[f(n) = \left\{\n\begin...",1. The problem involves repeated applications ...,This problem involves repeated applications of...,"[{'step': 1, 'thought': 'The problem involves ..."
27,"BEGININPUT\nBEGINCONTEXT\ndate: October 12, 20...",1. Read the instruction and identify the quest...,The two groundbreaking inventions that emerged...,"[{'step': 1, 'thought': 'Read the instruction ..."


In [None]:
def generate_prompt(data_point):
    """

    Args:
      data_point:

    Returns:

    """
    # Generate prompt
    prefix_text = 'Below is an instruction that describes a task. Write a response that ' \
                  'appropriately completes the request.\n\n'

    # Samples with additional context info
    if data_point['reasoning']:
        text = f"""<start_of_turn>user {prefix_text} {data_point["instruction"]} here are the inputs {data_point["reasoning"]} <end_of_turn>\n<start_of_turn>model{data_point["output"]} <end_of_turn>"""
    # Without additional context info
    else:
        text = f"""<start_of_turn>user {prefix_text} {data_point["instruction"]} <end_of_turn>\n<start_of_turn>model{data_point["output"]} <end_of_turn>"""
    return text

# Add the "prompt" column in the dataset
text_column = [generate_prompt(data_point) for data_point in dataset["train"]]
dataset = dataset["train"].add_column("prompt", text_column)
dataset

Dataset({
    features: ['instruction', 'reasoning', 'output', 'reasoning_chains', 'prompt'],
    num_rows: 29857
})

In [None]:
print(dataset[0]['prompt'])

<start_of_turn>user Below is an instruction that describes a task. Write a response that appropriately completes the request.

 If a die is rolled three times, what is the probability of getting a sum of 11? None here are the inputs 1. Understand the problem: We need to find the probability of getting a sum of 11 when rolling a die three times.
2. Calculate total possible outcomes: A die has 6 faces, so for each roll, there are 6 possibilities. For three rolls, the total possible outcomes are 6^3 = 216.
3. Identify favorable outcomes: List all combinations of rolls that result in a sum of 11. There are 18 such combinations.
4. Calculate probability: Divide the number of favorable outcomes by the total possible outcomes: 18 / 216 = 1/12.
5. Conclusion: The probability of getting a sum of 11 when rolling a die three times is 1/12. <end_of_turn>
<start_of_turn>modelTo solve this problem, we need to find the number of favorable outcomes (getting a sum of 11) and divide it by the total poss

In [None]:
dataset = dataset.shuffle(seed=1234)  # Shuffle dataset
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

Map:   0%|          | 0/29857 [00:00<?, ? examples/s]

Train-Test Split

In [None]:
dataset = dataset.train_test_split(test_size=0.1)
train_data = dataset["train"]
test_data = dataset["test"]

print(train_data)
print(test_data)

Dataset({
    features: ['instruction', 'reasoning', 'output', 'reasoning_chains', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 26871
})
Dataset({
    features: ['instruction', 'reasoning', 'output', 'reasoning_chains', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 2986
})


In [None]:
# Automated selection of target modules
import bitsandbytes as bnb

def find_all_linear_names(model):
  cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
  lora_module_names = set()
  for name, module in model.named_modules():
    if isinstance(module, cls):
      names = name.split('.')
      lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names: # needed for 16-bit
      lora_module_names.remove('lm_head')
  return list(lora_module_names)


modules = find_all_linear_names(model)
print(modules)

['k_proj', 'gate_proj', 'q_proj', 'v_proj', 'o_proj', 'down_proj', 'up_proj']


In [None]:
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

print(model)

lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # Causal Language Modeling (e.g., autoregressive models like GPT)
)

model = get_peft_model(model, lora_config)

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
     

In [None]:
# Number of trainable parameters
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 78446592 | total: 2584619008 | Percentage: 3.0351%


## Train Model

### Push the trained model to HF

In [None]:
import transformers
from trl import SFTTrainer

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side='right'
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    dataset_text_field="prompt",
    peft_config=lora_config,
    max_seq_length=2500,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=2,
        warmup_steps=0.03,
        max_steps=100,
        learning_rate=2e-4,
        logging_steps=20,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        save_strategy="epoch",
        report_to="none"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False
trainer.train()



Map:   0%|          | 0/26871 [00:00<?, ? examples/s]

Map:   0%|          | 0/2986 [00:00<?, ? examples/s]

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
20,1.3201
40,0.985
60,1.0437
80,0.9575
100,0.9502


TrainOutput(global_step=100, training_loss=1.0512952995300293, metrics={'train_runtime': 1495.1728, 'train_samples_per_second': 0.268, 'train_steps_per_second': 0.067, 'total_flos': 3416890029625344.0, 'train_loss': 1.0512952995300293, 'epoch': 0.01})

In [None]:
new_model = "gemma-2b-instruct-reasoning"

trainer.model.save_pretrained(new_model)

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
merged_model= PeftModel.from_pretrained(base_model, new_model)
merged_model= merged_model.merge_and_unload()

# Save the merged model locally
# save_adapter=True, save_config=True
merged_model.save_pretrained("merged_model",safe_serialization=True)
tokenizer.save_pretrained("merged_model")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Push the model and tokenizer to the Hugging Face Model Hub
merged_model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jaydiaz2023/gemma-2b-instruct-reasoning/commit/4510b0e7c128dde1e145f7481f4d6c27fc85e17c', commit_message='Upload tokenizer', commit_description='', oid='4510b0e7c128dde1e145f7481f4d6c27fc85e17c', pr_url=None, pr_revision=None, pr_num=None)

## Evaluation Metrics

In [None]:
# Load/define base (non-finetuned) vs finetuned models for comparison

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from math import exp
import bitsandbytes as bnb  # Ensure bitsandbytes is installed if using quantization
from peft import PeftModel  # If using PEFT models

# Define your Hugging Face username and model IDs
username = "jaydiaz2023"
finetuned_model_id = f"{username}/gemma-2b-instruct-reasoning"
base_model_id = "google/gemma-2b-it"

# Set the qunatization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = "google/gemma-2b-it"

base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
base_tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

# # Load the Finetuned Model and Tokenizer from local directory
# finetuned_model_path = "merged_model"  # Path where you saved the merged model
# finetuned_model = AutoModelForCausalLM.from_pretrained(
#     finetuned_model_path,
#     torch_dtype=torch.float16,
#     device_map="auto"
# )
# finetuned_tokenizer = AutoTokenizer.from_pretrained(finetuned_model_path)

# Load the Finetuned Model and Tokenizer from HF
finetuned_tokenizer = AutoTokenizer.from_pretrained(model_id)
finetuned_model = AutoModelForCausalLM.from_pretrained(
                  model_id,
                  torch_dtype=torch.float16,
                  device_map="auto"
)

# Ensure the model is in evaluation mode
base_model.eval()
finetuned_model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): GemmaRM

### Perplexity

Perplexity is a measurement derived from the probability distribution that a language model assigns to a sequence of words (i.e., a sentence or a corpus). Specifically, it quantifies how well a probability model predicts a sample. In the realm of language modeling, perplexity serves as an indicator of how "confused" the model is when predicting the next word in a sequence.
* Lower Perplexity: Indicates that the model is better at predicting the sample. The model is less "perplexed" by the data.
* Higher Perplexity: Suggests that the model is less certain about its predictions, implying poorer performance.

In [None]:
test_data

NameError: name 'test_data' is not defined

In [None]:
test_data_subset = test_data.shuffle(seed=42).select(range(10))  # Get 10 random rows

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from math import exp

def calculate_perplexity(model, tokenizer, dataset):
    total_loss = 0
    total_tokens = 0

    with torch.no_grad():
        for example in dataset:
            inputs = tokenizer(example["prompt"], return_tensors="pt").to("cuda")
            labels = inputs.input_ids
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            total_loss += loss.item() * inputs.input_ids.size(1)
            total_tokens += inputs.input_ids.size(1)

    perplexity = exp(total_loss / total_tokens)
    return perplexity

base_perplexity = calculate_perplexity(base_model, base_tokenizer, test_data_subset)
finetuned_perplexity = calculate_perplexity(finetuned_model, finetuned_tokenizer, test_data_subset)
print(f"Non-finetuned Perplexity: {base_perplexity}")
print(f"Finetuned Perplexity: {finetuned_perplexity}\n")

OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 211.06 MiB is free. Process 82541 has 14.41 GiB memory in use. Of the allocated memory 13.97 GiB is allocated by PyTorch, and 319.26 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

### Semantic Similarity
Semantic Similarity evaluates how close the model's generated responses are to the expected answers in terms of meaning.

Implementation Steps:
* Choose a Semantic Similarity Metric: Common choices include Cosine Similarity, BERTScore, or Sentence Transformers embeddings.
* Compute Similarity Scores: Compare the generated responses with the ground truth.

In [None]:
import torch
from torch.nn.functional import cosine_similarity

def calculate_semantic_similarity(model, tokenizer, dataset):
    similarities = []
    model.eval()

    with torch.no_grad():
        for example in dataset:
            # Encode the input prompt
            input_ids = tokenizer.encode(example["prompt"], return_tensors="pt").to("cuda")
            # Generate output from the model
            output_ids = model.generate(input_ids, max_length=2500)
            generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

            # Tokenize generated and reference texts
            gen_inputs = tokenizer(generated_text, return_tensors='pt').to("cuda")
            ref_inputs = tokenizer(example["output"], return_tensors='pt').to("cuda")

            # Get embeddings from the last hidden state
            gen_outputs = model(**gen_inputs, output_hidden_states=True, return_dict=True)
            ref_outputs = model(**ref_inputs, output_hidden_states=True, return_dict=True)

            # Average pooling of the embeddings
            gen_embedding = gen_outputs.hidden_states[-1].mean(dim=1).squeeze()
            ref_embedding = ref_outputs.hidden_states[-1].mean(dim=1).squeeze()

            # Compute cosine similarity
            cosine_score = cosine_similarity(gen_embedding, ref_embedding, dim=0).item()
            similarities.append(cosine_score)

    average_similarity = sum(similarities) / len(similarities)
    return average_similarity

base_semantic_similarity = calculate_semantic_similarity(base_model, base_tokenizer, test_data_subset)
finetuned_semantic_similarity = calculate_semantic_similarity(finetuned_model, finetuned_tokenizer, test_data_subset)
print(f"Non-finetuned Semantic Similarity: {base_semantic_similarity:.4f}")
print(f"Finetuned Semantic Similarity: {finetuned_semantic_similarity:.4f}\n")

OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 211.06 MiB is free. Process 82541 has 14.41 GiB memory in use. Of the allocated memory 13.97 GiB is allocated by PyTorch, and 317.38 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

### BLEU (Bilingual Evaluation Understudy)

BLEU measures the n-gram overlap between the generated text and reference text. It's widely used in machine translation but can be applied to other text generation tasks.

In [None]:
from datasets import load_metric

bleu = load_metric("bleu")

def calculate_bleu(model, tokenizer, dataset):
    references = []
    predictions = []
    model.eval()

    with torch.no_grad():
        for example in dataset:
            input_ids = tokenizer.encode(example["prompt"], return_tensors="pt").to("cuda")
            output_ids = model.generate(input_ids, max_length=2500)
            generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

            references.append([example["output"].split()])
            predictions.append(generated_text.split())

    bleu_score = bleu.compute(predictions=predictions, references=references)
    return bleu_score

base_bleu = calculate_bleu(base_model, base_tokenizer, test_data_subset)
finetuned_bleu = calculate_bleu(finetuned_model, finetuned_tokenizer, test_data_subset)
print(f"Non-finetuned BLEU Score: {base_bleu['bleu']:.4f}")
print(f"Finetuned BLEU Score: {finetuned_bleu['bleu']:.4f}\n")

  bleu = load_metric("bleu")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Non-finetuned BLEU Score: 0.4129
Finetuned BLEU Score: 0.4138



### ROUGE (Recall-Oriented Understudy for Gisting Evaluation)

ROUGE focuses on the overlap of n-grams, word sequences, and word pairs between the generated and reference texts, emphasizing recall.

In [None]:
from datasets import load_metric

rouge = load_metric("rouge")

def compute_rouge(model, tokenizer, dataset):
    references = []
    predictions = []
    model.eval()

    with torch.no_grad():
        for example in dataset:
            input_ids = tokenizer.encode(example["prompt"], return_tensors="pt").to("cuda")
            output_ids = model.generate(input_ids, max_length=2500)
            generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

            references.append(example["output"])
            predictions.append(generated_text)

    rouge_score = rouge.compute(predictions=predictions, references=references)
    return rouge_score

base_rouge = compute_rouge(base_model, base_tokenizer, test_data)
finetuned_rouge = compute_rouge(finetuned_model, finetuned_tokenizer, test_data)
print(f"Non-finetuned ROUGE Scores: {base_rouge}")
print(f"Finetuned ROUGE Scores: {finetuned_rouge}")

### METEOR (Metric for Evaluation of Translation with Explicit ORdering)

METEOR evaluates translation by considering synonymy and stemming, making it more flexible than BLEU.

In [None]:
from datasets import load_metric

meteor = load_metric("meteor")

def compute_meteor(model, tokenizer, dataset):
    references = []
    predictions = []
    model.eval()

    with torch.no_grad():
        for example in dataset:
            input_ids = tokenizer.encode(example["prompt"], return_tensors="pt").to("cuda")
            output_ids = model.generate(input_ids, max_length=2500)
            generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

            references.append(example["output"])
            predictions.append(generated_text)

    meteor_score = meteor.compute(predictions=predictions, references=references)
    return meteor_score

base_meteor = compute_meteor(base_model, base_tokenizer, test_data)
finetuned_meteor = compute_meteor(finetuned_model, finetuned_tokenizer, test_data)
print(f"METEOR Score: {base_meteor}")
print(f"Finetuned METEOR Score: {finetuned_meteor}")