## Reference

[Ultimate Guide to Fine-Tuning in PyTorch : Part 1 — Pre-trained Model and Its Configuration](https://rumn.medium.com/part-1-ultimate-guide-to-fine-tuning-in-pytorch-pre-trained-model-and-its-configuration-8990194b71e)

[Training a PyTorch Model with DataLoader and Dataset](https://machinelearningmastery.com/training-a-pytorch-model-with-dataloader-and-dataset/)

[LoRA](https://huggingface.co/docs/peft/conceptual_guides/lora)

[PEFT](https://huggingface.co/docs/peft/index)

[Image classification using LoRA](https://huggingface.co/docs/peft/task_guides/image_classification_lora)

[AST classifiers](https://huggingface.co/models?pipeline_tag=audio-classification&sort=downloads&search=ast)

[PEFT Quicktour](https://huggingface.co/docs/peft/quicktour)

[Initialize Model with Adapters](https://docs.adapterhub.ml/quickstart.html)

[Fine-tuning for Audio Classification with 🤗 Transformers](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb)

## imports and functions/Classes

In [None]:
!pip install mlflow
!databricks configure --host https://community.cloud.databricks.com/
!pip install adapters
!pip install peft

Collecting mlflow
  Downloading mlflow-2.10.0-py3-none-any.whl (19.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.5/19.5 MB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
Collecting databricks-cli<1,>=0.8.7 (from mlflow)
  Downloading databricks_cli-0.18.0-py2.py3-none-any.whl (150 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.3/150.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython<4,>=2.1.0 (from mlflow)
  Downloading GitPython-3.1.41-py3-none-any.whl (196 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.4/196.4 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.0.0-py3-none-any.whl (147 kB)
[2K     [90m━━━━━

In [None]:
import torch
from google.colab import drive
from transformers import ASTFeatureExtractor, AutoProcessor, ASTModel, AutoModelForAudioClassification
from adapters import AdapterConfig

import torch.nn.functional as F
import torch.nn as nn

import torch.optim as optim
from tqdm import tqdm

from numba import cuda
from torch.utils.data import Dataset, DataLoader, random_split, default_collate
import mlflow
import peft
from peft import LoraConfig, get_peft_model


In [None]:
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/Users/tl_vasilev@yahoo.com/birds2023")

<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/392919980505162', creation_time=1705679888584, experiment_id='392919980505162', last_update_time=1706431115432, lifecycle_stage='active', name='/Users/tl_vasilev@yahoo.com/birds2023', tags={'mlflow.experiment.sourceName': '/Users/tl_vasilev@yahoo.com/birds2023',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'tl_vasilev@yahoo.com',
 'mlflow.ownerId': '4375994119615411'}>

In [None]:

class TomasAST_1(nn.Module):
    """AST backbone followed by a deep, gradually narrowing classifier head.

    The head alternates LayerNorm and Linear layers and narrows the features
    ``hidden_size -> 384 -> 120 -> 40 -> num_classes``. A ReLU follows every
    Linear layer except the last one, so ``forward`` returns raw logits.

    Args:
        model: Backbone module. Its output must be indexable, and the element
            at index 1 is what the head consumes (presumably the pooled output
            of a Hugging Face ``ASTModel`` -- confirm for other backbones).
        hidden_size: Width of the backbone features fed to the head.
        num_classes: Number of output classes.
    """

    def __init__(self, model, hidden_size=768, num_classes=11):
        super().__init__()
        # The attribute names part1..part9 are deliberately unchanged: they are
        # the state-dict keys and are baked into any checkpoint that was
        # pickled with torch.save(model).
        self.part1 = model
        self.part2 = nn.LayerNorm((hidden_size,), eps=1e-12, elementwise_affine=True)
        self.part3 = nn.Linear(in_features=hidden_size, out_features=384, bias=True)
        self.part4 = nn.LayerNorm((384,), eps=1e-12, elementwise_affine=True)
        self.part5 = nn.Linear(in_features=384, out_features=120, bias=True)
        self.part6 = nn.LayerNorm((120,), eps=1e-12, elementwise_affine=True)
        self.part7 = nn.Linear(in_features=120, out_features=40, bias=True)
        self.part8 = nn.LayerNorm((40,), eps=1e-12, elementwise_affine=True)
        self.part9 = nn.Linear(in_features=40, out_features=num_classes, bias=True)

    def forward(self, x):
        """Run the backbone and the head; return unnormalised class scores."""
        x = self.part1(x)
        # Index 1 of the backbone output holds the features used for classification.
        x = self.part2(x[1])
        x = F.relu(self.part3(x))
        x = self.part4(x)
        x = F.relu(self.part5(x))
        x = self.part6(x)
        x = F.relu(self.part7(x))
        x = self.part8(x)
        # No activation after the last layer: CrossEntropyLoss expects logits.
        return self.part9(x)

class TomasAST_2(nn.Module):
    """AST backbone followed by a two-stage classifier head.

    The head is ``LayerNorm -> Linear(hidden_size, 160) -> ReLU -> LayerNorm ->
    Linear(160, num_classes)`` and returns raw logits.

    Args:
        model: Backbone module. Its output must be indexable, and the element
            at index 1 is what the head consumes (presumably the pooled output
            of a Hugging Face ``ASTModel`` -- confirm for other backbones).
        hidden_size: Width of the backbone features fed to the head.
        num_classes: Number of output classes.
    """

    def __init__(self, model, hidden_size=768, num_classes=11):
        super().__init__()
        # Attribute names part1..part5 are kept unchanged because they are the
        # state-dict keys of existing checkpoints.
        self.part1 = model
        self.part2 = nn.LayerNorm((hidden_size,), eps=1e-12, elementwise_affine=True)
        self.part3 = nn.Linear(in_features=hidden_size, out_features=160, bias=True)
        self.part4 = nn.LayerNorm((160,), eps=1e-12, elementwise_affine=True)
        self.part5 = nn.Linear(in_features=160, out_features=num_classes, bias=True)

    def forward(self, x):
        """Run the backbone and the head; return unnormalised class scores."""
        x = self.part1(x)
        # Index 1 of the backbone output holds the features used for classification.
        x = self.part2(x[1])
        x = F.relu(self.part3(x))
        x = self.part4(x)
        # No activation after the last layer: CrossEntropyLoss expects logits.
        return self.part5(x)

In [None]:
class CustomAST(nn.Module):
    """AST backbone followed by a minimal ``LayerNorm -> Linear`` classifier head.

    Args:
        model: Backbone module. Its output must be indexable, and the element
            at index 1 is what the head consumes (presumably the pooled output
            of a Hugging Face ``ASTModel`` -- confirm for other backbones).
        hidden_size: Width of the backbone features fed to the head.
        num_classes: Number of output classes.
    """

    def __init__(self, model, hidden_size=768, num_classes=11):
        super().__init__()
        # Attribute names part1..part3 are kept unchanged: they are the
        # state-dict keys and are referenced by name elsewhere in this file
        # (the 'part1' freeze filter and the LoRA modules_to_save list).
        self.part1 = model
        self.part2 = nn.LayerNorm((hidden_size,), eps=1e-12, elementwise_affine=True)
        self.part3 = nn.Linear(in_features=hidden_size, out_features=num_classes, bias=True)

    def forward(self, x):
        """Run the backbone and the head; return unnormalised class scores."""
        x = self.part1(x)
        # Index 1 of the backbone output holds the features used for classification.
        x = self.part2(x[1])
        return self.part3(x)


class BirdDataset(Dataset):
    """Map-style dataset that pairs pre-computed features with their labels.

    ``X`` and ``y`` are stored by reference and indexed as they are; no
    conversion or copy is performed, so they must already support ``len()``
    and integer indexing (e.g. tensors or lists).

    Args:
        X: Indexable collection of input features.
        y: Indexable collection of targets, aligned with ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same length.
    """

    def __init__(self, X, y):
        # Fail early: a mismatch would otherwise surface as an IndexError (or
        # silently drop samples) in the middle of an epoch.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

def multi_acc(y_pred, y_test):
    """Return the fraction of samples whose arg-max class equals the label.

    Args:
        y_pred: Class scores; the predicted class is the arg-max over dim 1.
        y_test: Class labels, compared element-wise with the predicted classes
            (assumed to have one label per row of ``y_pred`` -- TODO confirm
            against the data preparation).

    Returns:
        A 0-dim float tensor with the accuracy in ``[0, 1]`` (``nan`` for an
        empty batch).
    """
    # softmax / log_softmax are monotonic, so the arg-max of the raw scores is
    # already the predicted class; normalising first is wasted work.
    y_pred_tags = torch.argmax(y_pred, dim=1)
    return (y_pred_tags == y_test).float().mean()

def train_model(model, train_loader, val_loader, lr=0.01, momentum=0.9, weight_decay=0.001,
                run_name='basic model', n_epochs=1, lr_decrese=False):
    """Train ``model`` with SGD + cross-entropy and log the run to MLflow.

    Relies on the module-level ``device`` and ``multi_acc`` defined elsewhere
    in this file. The model is expected to return logits directly.

    Args:
        model: Module to train; batches are moved to ``device`` before the call.
        train_loader: Iterable of ``(features, labels)`` training batches.
        val_loader: Iterable of ``(features, labels)`` validation batches.
        lr: SGD learning rate.
        momentum: SGD momentum.
        weight_decay: SGD weight decay (L2 penalty).
        run_name: Name of the MLflow run.
        n_epochs: Number of passes over ``train_loader``.
        lr_decrese: If true, the learning rate is divided by 10 for the last
            epoch so the optimizer can settle closer to the minimum.

    Side effects:
        Logs ``train_loss_ep{epoch}`` per batch, ``Accuracy_val_ep`` per epoch
        and ``lr_epoch{n}`` per epoch to MLflow, prints the validation accuracy
        of each epoch, and leaves ``model`` in eval mode. The MLflow run is
        always closed, also when training is interrupted or fails.
    """
    loss_fn = nn.CrossEntropyLoss()
    try:
        mlflow.start_run(run_name=run_name)

        # One optimizer for the whole run, so the momentum buffers survive from
        # epoch to epoch; only its learning rate is changed for the last epoch.
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum,
                              weight_decay=weight_decay)

        for epoch in range(n_epochs):
            if lr_decrese and epoch + 1 == n_epochs:
                lr = lr / 10
                for group in optimizer.param_groups:
                    group['lr'] = lr

            model.train()  # re-enable dropout (e.g. LoRA dropout) after validation
            for Xbatch, ybatch in tqdm(train_loader):
                # forward pass
                y_pred = model(Xbatch.to(device))
                loss = loss_fn(y_pred, ybatch.to(device))
                # .item() logs a plain float and does not keep the autograd graph alive
                mlflow.log_metric(f"train_loss_ep{epoch}", loss.item())
                # backward pass and weight update
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # validation: accuracies are collected per epoch (mean over batches)
            model.eval()
            batch_accuracies = []
            with torch.no_grad():
                for x_val_b, y_val_b in val_loader:
                    y_pred_b = model(x_val_b.to(device))
                    batch_accuracies.append(multi_acc(y_pred_b, y_val_b.to(device)).item())

            mlflow.log_param(f'lr_epoch{epoch+1}', lr)
            if batch_accuracies:  # nothing to report for an empty validation loader
                val_accuracy = sum(batch_accuracies) / len(batch_accuracies)
                mlflow.log_metric("Accuracy_val_ep", val_accuracy, step=epoch + 1)
                print(f'Epoch {epoch+1} Accuracy = {val_accuracy}')
    except KeyboardInterrupt:
        # Manual stop from the notebook: finish quietly. Real errors are not
        # caught here, so they propagate with their traceback.
        print('stopping mlflow run')
    finally:
        mlflow.end_run()

def predict_model(model,test_loader):
    """Evaluate ``model`` on ``test_loader`` and print the mean batch accuracy.

    The model is switched to eval mode, every batch is moved to the
    module-level ``device``, and the per-batch accuracies from ``multi_acc``
    are averaged (each batch counts equally). Nothing is returned.
    """
    model.eval()
    with torch.no_grad():
        batch_scores = [
            multi_acc(model(inputs.to(device)), labels.to(device))
            for inputs, labels in tqdm(test_loader)
        ]
        mean_score = sum(batch_scores) / len(batch_scores)
        print(f'Test Accuracy = {mean_score}')

def print_trainable_parameters(model):
    """Print the trainable and total parameter counts of ``model``.

    A parameter counts as trainable when its ``requires_grad`` flag is set.
    The share of trainable parameters is printed as a percentage with two
    decimals.
    """
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, is_trainable in sizes if is_trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

# Bird sound classifier using AST pre-trained model and fine-tuned with LoRA.

[The Whole project with the data can be found here](https://drive.google.com/drive/folders/1YGw6GGCBEjsg3dFgiEruD7szzUpMcSVW?usp=sharing)

We have already prepared the input data in "Bird Clssifier - Pytorch Data preparation.ipynb" using the AST feature extractor and saved the tensors with the input data. Here we will focus on the fine-tuning.

Audio Spectrogram Transformer (AST) is presented as the first convolution-free, purely attention-based model for Audio classification. The Audio Spectrogram Transformer is equivalent to ViT, but applied on audio. Audio is first turned into an image (as a spectrogram), after which a Vision Transformer is applied. The used Audio Spectrogram Transformer (AST) model is fine-tuned on AudioSet and was introduced in the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778)

<a id='tops'></a>
## Content

* [Loading training and test data, loading the pre-trained models](#ld)
* [Train and test pre-trained AST model with my custom classifier head](#my_model)
* [Adaptation using LoRA ( Low-Rank Adaptation) on my model](#lora)
* [Bigger head does not produce better results](#heads)


<a id='ld'></a>
[TOP](#tops)
## Loading training and test data, loading the pre-trained models

In [None]:
drive.mount('/content/gdrive')
%cd gdrive/MyDrive/Colab Notebooks/Birds2023

Mounted at /content/gdrive
/content/gdrive/MyDrive/Colab Notebooks/Birds2023


In [None]:
# All objects below come from the same AudioSet-fine-tuned AST checkpoint.
ast_model_classifier = AutoModelForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
# ^ AST model with a pretrained head classifier
feature_extractor = ASTFeatureExtractor()
processor = AutoProcessor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
# The headless model below is the backbone that the custom heads in this file are built on.
ast_model_pure = ASTModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593") # AST model without head classifier

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


<b>We will load our datasets into the CPU and then run the training on the GPU</b>

In [None]:
# Use GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')  # Assuming that we are on a CUDA machine, this should print a CUDA device:
print(device)

cuda:0


In [None]:
# Pre-computed features and labels saved as tensors by the data-preparation notebook.
x_tr = torch.load('X_train_tensor.pt')
y_tr = torch.load('y_train_tensor.pt')
# NOTE: the "test" files are loaded into x_val / y_val; they are the held-out
# pool that is later split into validation and test subsets.
x_val = torch.load('X_test_tensor.pt')
y_val = torch.load('y_test_tensor.pt')

<a id='my_model'></a>
[TOP](#tops)
## Train and test pre-trained AST model with my custom classifier head

<b>Split the test data from the previous chapter into validation (30%) and test (70%) sets</b>

In [None]:
# Use a dedicated, seeded generator for the split so the test/validation
# membership is reproducible and does not depend on the global RNG state.
split_generator = torch.Generator().manual_seed(0)
testset, valset = random_split(BirdDataset(x_val, y_val), [0.7, 0.3], generator=split_generator)
# Evaluation loaders are not shuffled: order does not matter for accuracy and a
# fixed order keeps the reported numbers repeatable.
val_loader = DataLoader(valset, shuffle=False, batch_size=20)
test_loader = DataLoader(testset, shuffle=False, batch_size=20)
train_loader = DataLoader(BirdDataset(x_tr, y_tr), shuffle=True, batch_size=40)

We will freeze the layers that come from the AST transformer,<br>
leaving only our custom head trainable.

In [None]:
# Seed the global RNG before the model is built so the randomly initialised
# head layers are reproducible.
torch.manual_seed(0)
my_model = CustomAST(model=ast_model_pure).to(device) # move the model to the selected device

# NOTE: CustomAST stores ast_model_pure by reference (no copy), so the loop
# below changes requires_grad on ast_model_pure's own parameters as well.
for name, param in my_model.named_parameters():
  if 'part1' in name :
    param.requires_grad = False # make the original AST part non trainable, so that only the classifier head gets trained
  print(name, param.requires_grad)

part1.embeddings.cls_token False
part1.embeddings.distillation_token False
part1.embeddings.position_embeddings False
part1.embeddings.patch_embeddings.projection.weight False
part1.embeddings.patch_embeddings.projection.bias False
part1.encoder.layer.0.attention.attention.query.weight False
part1.encoder.layer.0.attention.attention.query.bias False
part1.encoder.layer.0.attention.attention.key.weight False
part1.encoder.layer.0.attention.attention.key.bias False
part1.encoder.layer.0.attention.attention.value.weight False
part1.encoder.layer.0.attention.attention.value.bias False
part1.encoder.layer.0.attention.output.dense.weight False
part1.encoder.layer.0.attention.output.dense.bias False
part1.encoder.layer.0.intermediate.dense.weight False
part1.encoder.layer.0.intermediate.dense.bias False
part1.encoder.layer.0.output.dense.weight False
part1.encoder.layer.0.output.dense.bias False
part1.encoder.layer.0.layernorm_before.weight False
part1.encoder.layer.0.layernorm_before.bias Fa

I decided to train this model for 5 epochs based on the research done in [THE BIRDS NEED ATTENTION TOO](https://arxiv.org/pdf/2211.07722.pdf).

In [None]:
train_model(my_model, train_loader = train_loader, val_loader = val_loader, n_epochs = 5)

100%|██████████| 75/75 [05:16<00:00,  4.22s/it]


Epoch 1 Accuracy = 0.830833375453949


100%|██████████| 75/75 [05:15<00:00,  4.21s/it]


Epoch 2 Accuracy = 0.8462499976158142


100%|██████████| 75/75 [05:15<00:00,  4.20s/it]


Epoch 3 Accuracy = 0.8611111044883728


100%|██████████| 75/75 [05:14<00:00,  4.20s/it]


Epoch 4 Accuracy = 0.8656250834465027


100%|██████████| 75/75 [05:14<00:00,  4.20s/it]


Epoch 5 Accuracy = 0.8673334121704102


In [None]:
predict_model(my_model, test_loader)

Test Accuracy = 0.887619137763977


In [None]:
torch.save(my_model, 'ast_with_my_head.pth')
# Remember that you must call model.eval() to set dropout and batch normalization
# layers to evaluation mode before running inference. Failing to do this will yield inconsistent inference results.

My model uses transfer learning by only adding a trainable head, and it shows acceptable results after just 5 epochs. Below I show some metrics from MLflow (the Databricks Community cloud is set up and used for MLflow):<br>
<figure>
<img src="Accuracy_my_model.png" style="max-height: 200px" alt="accuracy_per_epoch" />
<figcaption align = "center"> We can see how the accuracy on the validation set (the "y" axis) increases slowly but steadily. The time on the "x" axis is relative and represents the time of execution. These graphs are not up to standard; a better understanding of how to use MLflow is required to improve them.  </figcaption>
</figure>
<br>
<figure>
<img src="Loss_my_model.png" style="max-height: 200px" alt="Loss_epoch_1" />
<figcaption align = "center"> We can see how the loss in the first epoch is decreasing (train_loss_ep0), while the loss in the last epoch is relatively stable (train_loss_ep4).  </figcaption>
</figure>


<a id='lora'></a>
[TOP](#tops)
## Adaptation using LoRA ( Low-Rank Adaptation) on my model

LoRA freezes the pretrained model weights and injects trainable rank decomposition matrices into each layer of the Transformer architecture, greatly reducing the number of trainable parameters for downstream tasks.

In [None]:
# LoRA adapters (rank 16) are injected into the attention query/value
# projections; the custom head layers (part2, part3) stay fully trainable.
config2 = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["part2","part3"],
)

# The checkpoint is a whole pickled nn.Module (written with torch.save(model)),
# not a state dict, so it has to be loaded with weights_only=False; recent
# PyTorch releases default to weights_only=True and refuse such files.
# NOTE: unpickling can execute arbitrary code -- only load checkpoints that
# were produced by this notebook.
base_model = torch.load('ast_with_my_head.pth', weights_only=False)
lora_model = get_peft_model(base_model, config2).to(device)
print_trainable_parameters(lora_model)

trainable params: 599819 || all params: 86797078 || trainable%: 0.69


As we will train more layers, we should adjust the batch size as shown below, so that we can process the larger number of trainable parameters (599819) with the available resources.

In [None]:
# Use a dedicated, seeded generator for the split so the test/validation
# membership is reproducible and does not depend on the global RNG state.
split_generator = torch.Generator().manual_seed(0)
testset, valset = random_split(BirdDataset(x_val, y_val), [0.7, 0.3], generator=split_generator)
# Smaller batches (5) than for head-only training; evaluation loaders are not
# shuffled so the reported numbers are repeatable.
val_loader_lora = DataLoader(valset, shuffle=False, batch_size=5)
test_loader_lora = DataLoader(testset, shuffle=False, batch_size=5)
train_loader_lora = DataLoader(BirdDataset(x_tr, y_tr), shuffle=True, batch_size=5)

We will train this for 3 epochs with a smaller learning rate to improve the model a bit.

In [None]:
train_model(lora_model, train_loader = train_loader_lora, val_loader = val_loader_lora, run_name = 'my_adapter_model2',  n_epochs = 3, lr = 0.0001 )

100%|██████████| 593/593 [13:07<00:00,  1.33s/it]


Epoch 1 Accuracy = 0.8899999260902405


100%|██████████| 593/593 [13:28<00:00,  1.36s/it]


Epoch 2 Accuracy = 0.8950003385543823


100%|██████████| 593/593 [13:28<00:00,  1.36s/it]


Epoch 3 Accuracy = 0.8977782726287842


In [None]:
predict_model(lora_model, test_loader = test_loader_lora)

100%|██████████| 139/139 [01:19<00:00,  1.75it/s]

Test Accuracy = 0.9093528985977173





After LoRA we can see that the test set accuracy increased by around 2 percentage points.

In [None]:
torch.save(lora_model, 'my_ast_with_lora.pth') # save this version of teh model

<a id='heads'></a>
[TOP](#tops)
## Bigger head does not produce better results

The model we used above is very simple: we just added a layer normalization and then a dense layer from 768 to 11. As this is a big difference in one step, it is worth asking: would we get better results if we reached the 11 classes gradually?
<br>Below I will train 2 models with different heads:<br>
option 1:<br>
```python
        self.part1 = model
        self.part2 = nn.LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        self.part3 = nn.Linear(in_features=768, out_features=384, bias=True)
        self.part4 = nn.LayerNorm((384,), eps=1e-12, elementwise_affine=True)
        self.part5 = nn.Linear(in_features=384, out_features=120, bias=True)
        self.part6 = nn.LayerNorm((120,), eps=1e-12, elementwise_affine=True)
        self.part7 = nn.Linear(in_features=120, out_features=40, bias=True)
        self.part8 = nn.LayerNorm((40,), eps=1e-12, elementwise_affine=True)
        self.part9 = nn.Linear(in_features=40, out_features=11, bias=True)
```
<br>option 2:<br>
```python
        self.part1 = model
        self.part2 = nn.LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        self.part3 = nn.Linear(in_features=768, out_features=160, bias=True)
        self.part4 = nn.LayerNorm((160,), eps=1e-12, elementwise_affine=True)
        self.part5 = nn.Linear(in_features=160, out_features=11, bias=True)
```

In [None]:
# Use a dedicated, seeded generator for the split so the test/validation
# membership is reproducible and does not depend on the global RNG state.
split_generator = torch.Generator().manual_seed(0)
testset, valset = random_split(BirdDataset(x_val, y_val), [0.7, 0.3], generator=split_generator)
# Evaluation loaders are not shuffled so the reported numbers are repeatable.
val_loader = DataLoader(valset, shuffle=False, batch_size=20)
test_loader = DataLoader(testset, shuffle=False, batch_size=20)
train_loader = DataLoader(BirdDataset(x_tr, y_tr), shuffle=True, batch_size=40)

In [None]:
# Seed the global RNG before the model is built so the randomly initialised
# head layers are reproducible.
torch.manual_seed(0)
tomas_model_1 = TomasAST_1(model=ast_model_pure).to(device)

# Freeze the backbone of the model that was just created. The loop previously
# iterated over the undefined name `tomas_model`, which raises a NameError (or
# freezes a stale model left over from an earlier session).
for name, param in tomas_model_1.named_parameters():
  if 'part1' in name :
    param.requires_grad = False # make the original AST part non trainable, so that only the classifier head gets trained
  # print(name, param.requires_grad)

In [None]:
train_model(tomas_model_1, train_loader = train_loader, val_loader = val_loader, n_epochs = 5, run_name = 'tomas_model')

100%|██████████| 75/75 [05:16<00:00,  4.22s/it]


Epoch 1 Accuracy = 0.6983333826065063


100%|██████████| 75/75 [05:18<00:00,  4.25s/it]


Epoch 2 Accuracy = 0.7195833921432495


100%|██████████| 75/75 [05:17<00:00,  4.24s/it]


Epoch 3 Accuracy = 0.7658333778381348


100%|██████████| 75/75 [05:19<00:00,  4.26s/it]


Epoch 4 Accuracy = 0.7860416173934937


100%|██████████| 75/75 [05:19<00:00,  4.26s/it]


Epoch 5 Accuracy = 0.7986665368080139


In [None]:
predict_model(tomas_model_1, test_loader)

100%|██████████| 35/35 [01:14<00:00,  2.13s/it]

Test Accuracy = 0.8661904335021973





In [None]:
# Seed the global RNG before the model is built so the randomly initialised
# head layers are reproducible.
torch.manual_seed(0)
tomas_model_2 = TomasAST_2(model=ast_model_pure).to(device)

# Freeze every parameter that belongs to the backbone (stored as `part1`), so
# that only the classifier head is trained; print each parameter's final flag.
for name, param in tomas_model_2.named_parameters():
  if 'part1' in name :
    param.requires_grad = False
  print(name, param.requires_grad)

In [None]:
train_model(tomas_model_2, train_loader = train_loader, val_loader = val_loader, n_epochs = 5, run_name = 'tomas_model_2')

100%|██████████| 75/75 [05:18<00:00,  4.24s/it]


Epoch 1 Accuracy = 0.8250001072883606


100%|██████████| 75/75 [05:17<00:00,  4.24s/it]


Epoch 2 Accuracy = 0.8308334350585938


100%|██████████| 75/75 [05:17<00:00,  4.24s/it]


Epoch 3 Accuracy = 0.8461112380027771


100%|██████████| 75/75 [05:18<00:00,  4.24s/it]


Epoch 4 Accuracy = 0.8518751859664917


100%|██████████| 75/75 [05:18<00:00,  4.25s/it]


Epoch 5 Accuracy = 0.8653334975242615


In [None]:
predict_model(tomas_model_2, test_loader)

100%|██████████| 35/35 [01:14<00:00,  2.12s/it]

Test Accuracy = 0.9047619104385376





We can notice that adding layers to the head does not improve the results significantly, and we should not be concerned about decreasing the nodes from 768 to 11 in just one dense layer.
