In [None]:
!pip install  peft -q
!pip install -U bitsandbytes -q

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import torch
from transformers import BitsAndBytesConfig

In [106]:
from datasets import load_dataset

# Load the SNLI dataset
ds = load_dataset("stanfordnlp/snli")

# Function to sample data based on step size and sample size
def sample_data(dataset, sample_size, step):
    sampled_indices = range(0, len(dataset), step)[:sample_size]
    return dataset.select(sampled_indices)

# Create sampled subsets for training, testing, and validation
train_data = sample_data(ds['train'], sample_size=550_000, step=550)
test_data = sample_data(ds['test'], sample_size=10_000, step=100)
validation_data = sample_data(ds['validation'], sample_size=1_000, step=100)

# Display a summary of the datasets
print(len(ds["train"]))
print(f"Training Data: {len(train_data)} samples")
print(f"Testing Data: {len(test_data)} samples")
print(f"Validation Data: {len(validation_data)} samples")


550152
Training Data: 1001 samples
Testing Data: 100 samples
Validation Data: 100 samples


In [None]:
ds["test"][:3]

In [70]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig
import torch
from peft import LoraConfig, get_peft_model
import torch

MODEL_NAME = "microsoft/phi-2"

# Define 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the pretrained model with 4-bit quantization
pretrained_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config
)

# LoRA configuration for QLoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none"
)
model = get_peft_model(pretrained_model, lora_config)

# Load and configure tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [72]:
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import torch
import re

def evaluate_model(model, tokenizer, dataset, max_length=70):
    print(f"Dataset length: {len(dataset)}")
    model.eval()
    tokenizer.pad_token_id = tokenizer.eos_token_id
    predictions = []
    true_labels = []
    label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}

    # Progress bar with fixed position
    for i in tqdm(range(len(dataset)), position=0, leave=True,desc="Predicting on test Samples"):
        premise = dataset['premise'][i]
        hypothesis = dataset['hypothesis'][i]
        label = dataset['label'][i]
        
        # Skip ambiguous label (-1) in SNLI dataset
        if label == -1:
            print(f"Skipped example {i} (ambiguous label)")
            continue
        true_labels.append(label_map[label])

        # Concatenate premise and hypothesis with a more specific prompt
        input_text = (
            f"Premise: {premise}\n"
            f"Hypothesis: {hypothesis}\n"
            f"Answer with one of the following: entailment, neutral, contradiction.\nAnswer:"
        )

        # Tokenize input text
        inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
        
        # Generate prediction
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=4)  # Strict limit on max_new_tokens to get concise answers
            prediction = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
            # Use regex to extract the label directly after "Answer:"
            match = re.search(r"Answer:\s*(entailment|neutral|contradiction)", prediction, re.IGNORECASE)
            if match:
                prediction = match.group(1).lower()
            else:
                prediction = "neutral"  # Default if no match is found
            predictions.append(prediction)

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predictions)
    return accuracy, predictions
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Evaluate model and calculate accuracy
accuracy, predictions = evaluate_model(model, tokenizer, test_data)

# Display accuracy and sample predictions
print(f"\nModel Accuracy: {accuracy * 100:.2f}%")
for i, prediction in enumerate(predictions[:5]):
    print(f"Example {i + 1}: {prediction}")


Dataset length: 100


Predicting on test Samples:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Predicting on test Samples:   1%|          | 1/100 [00:00<00:58,  1.69it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Predicting on test Samples:   2%|▏         | 2/100 [00:01<00:57,  1.69it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Predicting on test Samples:   3%|▎         | 3/100 [00:01<00:57,  1.69it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Predicting on test Samples:   4%|▍         | 4/100 [00:02<00:59,  1.61it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Predicting on test Samples:   5%|▌         | 5/100 [00:03<00:58,  1.64it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Predicting on test Samples:   6%|▌         | 6/100 [00:03<00:57,  1.64it/s]Setting `pad_token_id` to `eos_token_id`:None for open-en


Model Accuracy: 52.00%
Example 1: contradiction
Example 2: entailment
Example 3: entailment
Example 4: entailment
Example 5: entailment





In [5]:
train_data

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 1001
})

In [99]:
from datetime import datetime
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, PeftModel, get_peft_model
from datasets import load_dataset
import time
import psutil
import re
from tqdm import tqdm

# Define tokenize function - returning dict to avoid list concatenation issue
def tokenize(batch):
    return tokenizer(batch['premise'], batch['hypothesis'], truncation=True, max_length=512, padding="max_length")

# Tokenize datasets
train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Lora configuration
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "lm_head"],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

# Track trainable parameters
def print_trainable_parameters(model):
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable_params} || Total parameters: {total_params} || Percent trainable: {100 * trainable_params / total_params:.2f}%")

print_trainable_parameters(model)

# Custom trainer to handle unsupported 'label' argument
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("input_ids").clone()  # Shift labels for causal LM
        inputs["labels"] = labels
        inputs.pop("label", None)  # Remove unsupported 'label' argument if exists
        return super().compute_loss(model, inputs, return_outputs)

# Training arguments


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  # expand paths, if not os.makedirs("~/bar") will make directory


Trainable parameters: 4362240 || Total parameters: 1525754880 || Percent trainable: 0.29%
Calculating initial accuracy


100%|██████████| 100/100 [06:31<00:00,  3.92s/it]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Fine-tuning model


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [101]:
output_dir = "./snli_finetune_phi2"
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    # batch_size=16
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    # wandb=False"
    
    learning_rate=2.5e-5,
    logging_steps=50,
    save_steps=len(train_dataset) // 2,
    eval_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    fp16=True,
    #bits=4
    optim="paged_adamw_8bit",
    report_to='none'
)

# Mapping for SNLI label IDs to text labels
label_map = {0: "entailment", 1: "neutral", 2: "contradiction", -1: "neutral"}

# Function to calculate accuracy and log failure cases
def evaluate_model(model, dataset, tokenizer, device):
    model.eval()
    correct = 0
    total = len(dataset)
    results = []

    for idx, sample in tqdm(enumerate(dataset), total=total):
        input_text = (
            f"Premise: {sample['premise']}\n"
            f"Hypothesis: {sample['hypothesis']}\n"
            f"Answer with one of the following: entailment, neutral, contradiction.\nAnswer:"
        )
        inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(device)
        target_label = label_map[sample['label']]  # Convert label ID to text label

        with torch.no_grad():
            output_ids = model.generate(
                inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_new_tokens=50,
                pad_token_id=tokenizer.eos_token_id
            )
            output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()

        match = re.search(r"Answer:\s*(entailment|neutral|contradiction)", output_text, re.IGNORECASE)
        prediction = match.group(1).lower() if match else "neutral"  # Default if no match found

        # Store result details for later analysis
        results.append({
            "index": idx,
            "prediction": prediction,
            "target_label": target_label,
            "is_correct": prediction == target_label
        })
        if prediction == target_label:
            correct += 1

    accuracy = correct / total
    return accuracy, results

# Record initial accuracy and resources
start_time = time.time()
cpu_memory = psutil.virtual_memory().used / (1024 ** 3)
gpu_memory = torch.cuda.memory_allocated() / (1024 ** 3) if torch.cuda.is_available() else None


# Fine-tune and evaluate
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
trainer.train()

# Final evaluation after fine-tuning



  # expand paths, if not os.makedirs("~/bar") will make directory
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


OutOfMemoryError: CUDA out of memory. Tried to allocate 80.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 2.12 MiB is free. Process 3511 has 14.71 GiB memory in use. Of the allocated memory 13.82 GiB is allocated by PyTorch, and 690.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
end_time = time.time()
fine_tuned_accuracy, fine_tuned_results = evaluate_model(model, test_dataset, tokenizer, device)
training_duration = end_time - start_time

# Identify cases with improvement, still incorrect, and initially correct
improved_cases = []
still_incorrect_cases = []
initially_correct_cases = []

for init, final in zip(initial_results, fine_tuned_results):
    if not init["is_correct"] and final["is_correct"]:
        improved_cases.append(init["index"])
    elif not init["is_correct"] and not final["is_correct"]:
        still_incorrect_cases.append(init["index"])
    elif init["is_correct"] and final["is_correct"]:
        initially_correct_cases.append(init["index"])

# Save the final model
model.save_pretrained(f"{output_dir}/final_model")

# Print resources and final results
print(f"Training completed in {training_duration/60:.2f} minutes")
print(f"CPU Memory Used: {cpu_memory:.2f} GB")
if gpu_memory:
    print(f"GPU Memory Used: {gpu_memory:.2f} GB")
print(f"Initial Model Accuracy: {initial_accuracy * 100:.2f}%")
print(f"Fine-tuned Model Accuracy: {fine_tuned_accuracy * 100:.2f}%")

# Print example failure cases and category indexes
print("\nExample failure cases before fine-tuning:")
for i in range(3):
    sample = test_dataset[i]
    input_text = (
        f"Premise: {sample['premise']}\n"
        f"Hypothesis: {sample['hypothesis']}\n"
        f"Answer with one of the following: entailment, neutral, contradiction.\nAnswer:"
    )
    inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=50,
            pad_token_id=tokenizer.eos_token_id
        )
    print(f"Sample {i+1}:")
    print(f"Premise: {sample['premise']}")
    print(f"Hypothesis: {sample['hypothesis']}")
    print(f"Generated Label (Pre-trained): {tokenizer.decode(output[0], skip_special_tokens=True)}")

print("\nExample failure cases corrected by fine-tuned model:")
for i in range(3):
    sample = test_dataset[i]
    input_text = (
        f"Premise: {sample['premise']}\n"
        f"Hypothesis: {sample['hypothesis']}\n"
        f"Answer with one of the following: entailment, neutral, contradiction.\nAnswer:"
    )
    inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=50,
            pad_token_id=tokenizer.eos_token_id
        )
    print(f"Sample {i+1}:")
    print(f"Premise: {sample['premise']}")
    print(f"Hypothesis: {sample['hypothesis']}")
    print(f"Generated Label (Fine-tuned): {tokenizer.decode(output[0], skip_special_tokens=True)}")

# Print lists of case indexes

print(f"Indexes of cases still incorrect after fine-tuning: {still_incorrect_cases}")
print(f"Indexes of cases correct initially and stayed correct: {initially_correct_cases}")


100%|██████████| 100/100 [06:12<00:00,  3.72s/it]


Training completed in 47.64 minutes
CPU Memory Used: 11.73 GB
GPU Memory Used: 5.20 GB
Initial Model Accuracy: 52.00%
Fine-tuned Model Accuracy: 54.00%

Example failure cases before fine-tuning:
Sample 1:
Premise: This church choir sings to the masses as they sing joyous songs from the book at a church.
Hypothesis: The church has cracks in the ceiling.
Generated Label (Pre-trained): Premise: This church choir sings to the masses as they sing joyous songs from the book at a church.
Hypothesis: The church has cracks in the ceiling.
Answer with one of the following: entailment, neutral, contradiction.
Answer: Contradiction

Exercise 2:
Premise: This church choir sings to the masses as they sing joyous songs from the book at a church.
Hypothesis: The church choir is made up of only men.
Answer
Sample 2:
Premise: A woman within an orchestra is playing a violin.
Hypothesis: A woman is playing the violin.
Generated Label (Pre-trained): Premise: A woman within an orchestra is playing a violin.

In [107]:
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import torch
import re

def evaluate_model(model, tokenizer, dataset, max_length=70):
    print(f"Dataset length: {len(dataset)}")
    model.eval()
    tokenizer.pad_token_id = tokenizer.eos_token_id
    predictions = []
    true_labels = []
    label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}

    # Progress bar with fixed position
    for i in tqdm(range(len(dataset)), position=0, leave=True,desc="Predicting on test Samples"):
        premise = dataset['premise'][i]
        hypothesis = dataset['hypothesis'][i]
        label = dataset['label'][i]
        
        # Skip ambiguous label (-1) in SNLI dataset
        if label == -1:
            print(f"Skipped example {i} (ambiguous label)")
            continue
        true_labels.append(label_map[label])

        # Concatenate premise and hypothesis with a more specific prompt
        input_text = (
            f"Premise: {premise}\n"
            f"Hypothesis: {hypothesis}\n"
            f"Answer with one of the following: entailment, neutral, contradiction.\nAnswer:"
        )

        # Tokenize input text
        inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
        
        # Generate prediction
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=4)  # Strict limit on max_new_tokens to get concise answers
            prediction = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
            # Use regex to extract the label directly after "Answer:"
            match = re.search(r"Answer:\s*(entailment|neutral|contradiction)", prediction, re.IGNORECASE)
            if match:
                prediction = match.group(1).lower()
            else:
                prediction = "neutral"  # Default if no match is found
            predictions.append(prediction)

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predictions)
    return accuracy, predictions
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Evaluate model and calculate accuracy
accuracy, predictions = evaluate_model(model, tokenizer, test_data)

# Display accuracy and sample predictions
print(f"\nModel Accuracy: {accuracy * 100:.2f}%")
for i, prediction in enumerate(predictions[:5]):
    print(f"Example {i + 1}: {prediction}")


Dataset length: 100


Predicting on test Samples:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Predicting on test Samples:   1%|          | 1/100 [00:00<00:56,  1.76it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Predicting on test Samples:   2%|▏         | 2/100 [00:01<00:52,  1.87it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Predicting on test Samples:   3%|▎         | 3/100 [00:01<00:50,  1.93it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Predicting on test Samples:   4%|▍         | 4/100 [00:02<00:51,  1.86it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Predicting on test Samples:   5%|▌         | 5/100 [00:02<00:49,  1.91it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Predicting on test Samples:   6%|▌         | 6/100 [00:03<00:48,  1.94it/s]Setting `pad_token_id` to `eos_token_id`:None for open-en


Model Accuracy: 54.00%
Example 1: contradiction
Example 2: entailment
Example 3: entailment
Example 4: entailment
Example 5: entailment





In [6]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
print(print_trainable_parameters(model))

trainable params: 2621440 || all params: 1524014080 || trainable%: 0.1720089095239855
None


In [7]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Example usage
total_params = count_parameters(pretrained_model)
print(f"Total parameters in the model: {total_params}")


Total parameters in the model: 2621440


In [84]:
test_data["premise"]

['This church choir sings to the masses as they sing joyous songs from the book at a church.',
 'A woman within an orchestra is playing a violin.',
 'Two men climbing on a wooden scaffold.',
 'A man in a black shirt, in a commercial kitchen, holding up meat he took out of a bag.',
 'a woman in a black shirt looking at a bicycle.',
 'many children play in the water.',
 'A group of people stand near and on a large black square on the ground with some yellow writing on it.',
 'A female softball player wearing blue and red crouches in the infield, waiting for the next play.',
 'Workers standing on a lift.',
 'Two men in neon yellow shirts busily sawing a log in half.',
 'A Skier ski-jumping while two other skiers watch his act.',
 'Children bathe in water from large drums.',
 'A woman is standing near three stores, two have beautiful artwork and the other store has Largo written on it.',
 'People are all standing together in front of a statue of an animal, and they are all wearing cool-wea