# Lightweight Fine-Tuning Project

TODO: In this cell, describe your choices for each of the following

* PEFT technique: 
* Model: 
* Evaluation approach: 
* Fine-tuning dataset: 

In [1]:
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable


In [1]:
!pip install -U transformers



In [9]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
[0mSuccessfully installed evaluate-0.4.1 responses-0.18.0


In [1]:
from peft import LoraConfig, TaskType, get_peft_model, AutoPeftModelForCausalLM
from transformers import (AutoModelForCausalLM, AutoTokenizer, Trainer,
                          AutoModelForSequenceClassification, TrainingArguments,
                          DataCollatorWithPadding
                         )
from datasets import load_dataset, load_metric
import evaluate
import numpy as np
import pandas as pd
import torch



## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

[Distilbert for Sequence Classification](https://huggingface.co/transformers/v3.1.0/custom_datasets.html?highlight=forsequenceclassification#seq-imdb)

In [2]:
splits = ['train', 'test']
ds = {split: ds for split ,ds in zip(splits, load_dataset('imdb', split=splits))}

In [3]:
for split in splits:
    ds[split] = ds[split].shuffle(seed=42).select(range(500))
    
ds

{'train': Dataset({
     features: ['text', 'label'],
     num_rows: 500
 }),
 'test': Dataset({
     features: ['text', 'label'],
     num_rows: 500
 })}

In [4]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Let's use a lambda function to tokenize all the examples
tokenized_dataset = {}
for split in splits:
    tokenized_dataset[split] = ds[split].map(lambda x: tokenizer(x["text"], truncation=True), batched=True)

# Inspect the available columns in the dataset
tokenized_dataset["train"]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 500
})

In [5]:
# dictionaries for labels
id2label = {0:'NEGATIVE', 1:'POSITIVE'}
label2id = dict(zip(id2label.values(), id2label.keys()))

# foundation model to evaluate
model = AutoModelForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

# configuration for the tokenizer in the model
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# just test the dataset
tokenized_dataset["test"]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 500
})

In [7]:
# evaluation metric and data collation
metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# metric evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# training arguments
training_args = TrainingArguments(
    output_dir="./foundation-model-results",
    evaluation_strategy="epoch",
    learning_rate=2e-3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='logs',
    save_strategy='epoch',
    load_best_model_at_end=True,
    logging_steps=100,
    warmup_ratio=0.1
)

# perform training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.9004,0.712684,0.474,0.643148,0.482688,0.963415


TrainOutput(global_step=125, training_loss=0.8583055725097656, metrics={'train_runtime': 1077.6704, 'train_samples_per_second': 0.464, 'train_steps_per_second': 0.116, 'total_flos': 136488765947904.0, 'train_loss': 0.8583055725097656, 'epoch': 1.0})

In [8]:
# evaluate foundation model
foundation_model_evaluation = trainer.evaluate()
foundation_df = pd.DataFrame([foundation_model_evaluation], index=['GPT2'])
foundation_df

Unnamed: 0,eval_loss,eval_accuracy,eval_f1,eval_precision,eval_recall,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
GPT2,0.712684,0.474,0.643148,0.482688,0.963415,184.8182,2.705,0.676,1.0


### View the results

Let's look at a few examples

In [9]:
# Make a dataframe with the predictions and the text and the labels
items_for_manual_review = tokenized_dataset["test"].select([0, 1, 22, 31, 43, 292, 448, 487])
results = trainer.predict(items_for_manual_review)

In [10]:
df = pd.DataFrame(
    {
        "text": [item["text"] for item in items_for_manual_review],
        "predictions": results.predictions.argmax(axis=1),
        "labels": results.label_ids,
    }
)

df

Unnamed: 0,text,predictions,labels
0,<br /><br />When I unsuspectedly rented A Thou...,0,1
1,This is the latest entry in the long series of...,1,1
2,"In the early 00's, production companies had a ...",1,0
3,This is just one of those films which cannot j...,1,0
4,I really liked the movie 'The Emporer's New Gr...,1,0
5,I picked out this DVD out of the cheepo bin at...,1,0
6,What a waste of time to watch this movie. Poor...,1,0
7,When this show first aired I will admit to bei...,1,0


## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [16]:
# foundation model to do peft
model = AutoModelForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

# configuration for the tokenizer in the model
model.config.pad_token_id = tokenizer.pad_token_id

# peft configuration
lora_config = LoraConfig(
    fan_in_fan_out=False,
    init_lora_weights=True,
    layers_pattern=None,
    layers_to_transform=None,
    lora_alpha=32,
    lora_dropout=0.05,
    modules_to_save=None,
    peft_type='LORA',
    r=8,
    revision=None,
    task_type=TaskType.CAUSAL_LM
)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()

trainable params: 294,912 || all params: 124,736,256 || trainable%: 0.236428452686603


In [18]:
# evaluation metric and data collation
metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# training arguments
training_args = TrainingArguments(
    output_dir="./lora-model-results",
    evaluation_strategy="epoch",
    learning_rate=2e-3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='lora-logs',
    save_strategy='epoch',
    load_best_model_at_end=True,
    logging_steps=100,
    warmup_ratio=0.1
)

# perform training
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6068,0.341798,0.884,0.884921,0.864341,0.906504


TrainOutput(global_step=125, training_loss=0.5577400207519532, metrics={'train_runtime': 1045.6772, 'train_samples_per_second': 0.478, 'train_steps_per_second': 0.12, 'total_flos': 136962000617472.0, 'train_loss': 0.5577400207519532, 'epoch': 1.0})

In [19]:
# get lora evaluations
lora_evaluation = trainer.evaluate()

# view metrics
lora_df = pd.DataFrame([lora_evaluation], index=['GPT2-LoRA'])
lora_df

Unnamed: 0,eval_loss,eval_accuracy,eval_f1,eval_precision,eval_recall,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
GPT2-LoRA,0.341798,0.884,0.884921,0.864341,0.906504,224.0936,2.231,0.558,1.0


In [20]:
lora_model.save_pretrained("issaiass/gpt2-lora")

## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [21]:
from peft import PeftConfig, PeftModel
pretrained_lora_config = PeftConfig.from_pretrained("issaiass/gpt2-lora")
pretrained_model = AutoModelForSequenceClassification.from_pretrained(pretrained_lora_config.base_model_name_or_path)
pretrained_lora_model = PeftModel.from_pretrained(pretrained_model, "issaiass/gpt2-lora")

lora_tokenizer = AutoTokenizer.from_pretrained("gpt2")
lora_tokenizer.pad_token = lora_tokenizer.eos_token

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
import json
def lora_predict(sentence:str):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = lora_tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
    inputs.to(device)
    lora_model.to(device)
    with torch.no_grad():
        outputs = lora_model(input_ids=inputs['input_ids'])
        logits = outputs.logits
        logits.to(device)
    probabilities = torch.nn.functional.softmax(logits, dim=1).cpu()
    predicted_class_id = probabilities.argmax(dim=1).cpu().numpy().tolist()
    probabilities = probabilities.numpy().tolist()
    predicted_label = [id2label[cid] for cid in predicted_class_id]

    json_data = [{"label": lbl, "probabilities": prob} for lbl, prob in zip(predicted_label, probabilities) ]
    json_string = json.dumps(json_data, indent=4)
    
    return json_data

lora_predict(
    sentence=
    ["Nora Roberts is the most prolific romance writer the world has ever known.", 
     "That's no way to talk to a hero!",
     "Everybody loves Mary, she's addorable",
     "I do not like it so much",
     "Messi and Cristiano are the best football players in the world",
     "Is not feasible to put all your money in bitcoins",
     "Tomorrow never dies",
    ]
)

[{'label': 'POSITIVE',
  'probabilities': [0.09818808734416962, 0.901811957359314]},
 {'label': 'NEGATIVE',
  'probabilities': [0.8778833150863647, 0.12211669981479645]},
 {'label': 'POSITIVE',
  'probabilities': [0.22314785420894623, 0.7768521308898926]},
 {'label': 'NEGATIVE',
  'probabilities': [0.8513521552085876, 0.14864780008792877]},
 {'label': 'POSITIVE',
  'probabilities': [0.2490847408771515, 0.7509152293205261]},
 {'label': 'NEGATIVE',
  'probabilities': [0.8931923508644104, 0.1068076565861702]},
 {'label': 'NEGATIVE',
  'probabilities': [0.7280558943748474, 0.271944135427475]}]

### Model Comparison

In [26]:
pd.concat([foundation_df, lora_df])

Unnamed: 0,eval_loss,eval_accuracy,eval_f1,eval_precision,eval_recall,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
GPT2,0.712684,0.474,0.643148,0.482688,0.963415,184.8182,2.705,0.676,1.0
GPT2-LoRA,0.341798,0.884,0.884921,0.864341,0.906504,224.0936,2.231,0.558,1.0
