<a href="https://colab.research.google.com/github/jpcoleman1/Udacity-GenAI/blob/main/udacity_genai_project_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
import numpy as np
from datasets import load_metric
from transformers import Trainer
from peft import LoraModel, LoraConfig
from peft import get_peft_model, TaskType, AutoPeftModelForSequenceClassification

# Import dataset - we are using the ag_news dataset from Huggingface
dataset = load_dataset("ag_news", split={'train': 'train', 'test': 'test'})

splits = ["train", "test"]


# Prepare foundation model

## Tokenize dataset

In [32]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")



Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

## Split dataset

In [33]:
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(5000))
test_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

## Load Pre-trained model

In [34]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=4,  # AG News has 4 labels
    id2label={0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"},
    label2id={"World": 0, "Sports": 1, "Business": 2, "Sci/Tech": 3},
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Train foundational model

In [35]:
# Set up training arguments

training_args = TrainingArguments(
    output_dir="./data/ag_news",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
)

In [36]:
# define evaluation metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [37]:
# Create trainer instance

trainer = Trainer(
    model=model,  #
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [38]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.343542,0.888
2,0.392800,0.317593,0.9


Checkpoint destination directory ./data/ag_news/checkpoint-313 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/ag_news/checkpoint-626 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=626, training_loss=0.35986132819812516, metrics={'train_runtime': 159.7113, 'train_samples_per_second': 62.613, 'train_steps_per_second': 3.92, 'total_flos': 1324721233920000.0, 'train_loss': 0.35986132819812516, 'epoch': 2.0})

In [39]:
evaluation_results = trainer.evaluate()
print(evaluation_results)

{'eval_loss': 0.3175926208496094, 'eval_accuracy': 0.9, 'eval_runtime': 4.5021, 'eval_samples_per_second': 222.117, 'eval_steps_per_second': 3.554, 'epoch': 2.0}


Initial foundational model training yielded an accuracy approaching 90% as a benchmark for LoRA fine tuning.

In [40]:
## Save trained foundation model

In [41]:
model.save_pretrained("./results/ag_news_fine_tuned")
tokenizer.save_pretrained("./results/ag_news_fine_tuned")

('./results/ag_news_fine_tuned/tokenizer_config.json',
 './results/ag_news_fine_tuned/special_tokens_map.json',
 './results/ag_news_fine_tuned/vocab.txt',
 './results/ag_news_fine_tuned/added_tokens.json',
 './results/ag_news_fine_tuned/tokenizer.json')

## Apply LoRA fine tuning

In [42]:
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_lin", "k_lin","v_lin"],
    lora_dropout=0.01,
    task_type=TaskType.SEQ_CLS # Seqence to Classification Task
)

# Load saved model
model_path = "./results/ag_news_fine_tuned"

lora_model = AutoPeftModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=4,
    id2label={0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"},
    label2id={"World": 0, "Sports": 1, "Business": 2, "Sci/Tech": 3},
)
tokenizer = AutoTokenizer.from_pretrained(model_path)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
lora_model = get_peft_model(model, config)
lora_model.config.id2label = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"} # ensure custom lables carry through
lora_model.config.label2id = {"World": 0, "Sports": 1, "Business": 2, "Sci/Tech": 3}


lora_model.print_trainable_parameters()

trainable params: 814,852 || all params: 67,771,400 || trainable%: 1.202353795258767


# Perform lightweight tuning

In [44]:
# Set up training arguments

training_args = TrainingArguments(
    output_dir="./data/ag_news",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
)


In [45]:
# Create trainer instance

trainer = Trainer(
    model=lora_model,  #
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


## Train the model

In [46]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.329231,0.905
2,0.201400,0.330362,0.904


Checkpoint destination directory ./data/ag_news/checkpoint-313 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/ag_news/checkpoint-626 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=626, training_loss=0.2016148704309433, metrics={'train_runtime': 128.03, 'train_samples_per_second': 78.107, 'train_steps_per_second': 4.889, 'total_flos': 1349753487360000.0, 'train_loss': 0.2016148704309433, 'epoch': 2.0})

## Evaluate trained model

In [47]:
evaluation_results = trainer.evaluate()
print(evaluation_results)

{'eval_loss': 0.3292309641838074, 'eval_accuracy': 0.905, 'eval_runtime': 4.7682, 'eval_samples_per_second': 209.721, 'eval_steps_per_second': 3.356, 'epoch': 2.0}


Model is performing at ~91% accuracy on unseen data after fine tuning. This is a slight improvement in performance compared to the 90% achieved from training the base model. Fine tuning could be improved with hyper parameter tuning and more epochs.

## Save trained model

In [50]:
lora_model.save_pretrained("./results/ag_news_fine_tuned_lora")
tokenizer.save_pretrained("./results/ag_news_fine_tuned_lora")


('./results/ag_news_fine_tuned_lora/tokenizer_config.json',
 './results/ag_news_fine_tuned_lora/special_tokens_map.json',
 './results/ag_news_fine_tuned_lora/vocab.txt',
 './results/ag_news_fine_tuned_lora/added_tokens.json',
 './results/ag_news_fine_tuned_lora/tokenizer.json')

# Load fine-tuned model

In [51]:
# Load model
model_path = "./results/ag_news_fine_tuned_lora"

model_for_predict = AutoPeftModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=4,
    id2label={0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"},
    label2id={"World": 0, "Sports": 1, "Business": 2, "Sci/Tech": 3},
)
tokenizer = AutoTokenizer.from_pretrained(model_path)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Inference

In [52]:
def predict(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = logits.argmax(-1).tolist()
    return [model.config.id2label[prediction] for prediction in predictions]


sample_text = "The stock market closed lower today after a volatile trading session."
print(predict(sample_text, model_for_predict, tokenizer))


['Business']
