### Deep Learning Miniproject - Audio

AVS 8th Semester - Group 841

#### Import packages

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import IPython.display as ipd

import torch
import torchaudio
from torch import nn
import torch.optim

from datasets import load_dataset, DatasetDict, load_metric, concatenate_datasets
from transformers import ASTFeatureExtractor, ASTForAudioClassification, ASTConfig, TrainingArguments, Trainer

import wandb

  from .autonotebook import tqdm as notebook_tqdm


#### Parameters

In [2]:
# data loading
AUDIO_DIR = "./data/"
CSV_DIR = "./data/metadata_compiled.csv"
FILE_TYPE = ".mp3"

# model
SAMPLING_RATE = 16000
BATCH_SIZE = 4 # 4
LEARNING_RATE = 1e-3
CHECKPOINT = 'MIT/ast-finetuned-audioset-10-10-0.4593'

MAX_DURATION = 1
NUM_CLASSES = 3
HIDDEN_LAYER_SIZE = 384 # 768 
NUM_HIDDEN_LAYERS = 12
HIDDEN_DROPOUT_PROB = 0.1
ATTENTION_DROPOUT_PROB = 0.1

MAX_SEQ_LENGTH = MAX_DURATION * SAMPLING_RATE

In [3]:
#.csv file loading
df = pd.read_csv(CSV_DIR)

#### 1. Explore the dataset through code

a. How many samples does the dataset contain?

In [4]:
#Check no. samples
print(f'Number of samples : {df.shape[0]}')

Number of samples : 27550


b. How many classes? How many samples per class? Show a histogram of the number of intances per class

In [None]:
print(f'Number of classes: {len(df["status"].unique())}.\n\
    Classes: {df["status"].unique()}\n\
    {pd.value_counts(df["status"], dropna=False)}')

In [None]:
pd.value_counts(df['status'], dropna=False).plot.bar()
plt.show()

c. Play a random sample from each class

In [None]:
# playing healthy
healthy = df[df['status'] == 'healthy'].sample()['uuid'].item()
path = AUDIO_DIR + healthy + FILE_TYPE
print(path)
y, sr = torchaudio.load(path)
ipd.Audio(y, rate=sr)

In [None]:
# playing COVID-19
covid = df[df['status'] == 'COVID-19'].sample()['uuid'].item()
path = AUDIO_DIR + covid + FILE_TYPE
y, sr = torchaudio.load(path)
ipd.Audio(y, rate=sr)

In [None]:
# playing symptomatic
symptomatic = df[df['status'] == 'symptomatic'].sample()['uuid'].item()
path =  AUDIO_DIR + symptomatic + FILE_TYPE
y, sr = torchaudio.load(path)
ipd.Audio(y, rate=sr)

d. Describe if/how you think the data distribution will affect training of a classifier

Because `healthy` data is over represented among the ohter classes, the model after training can összetéveszteni confuse `COVID-19` and `symptomatic`  

e. Decide what part of the dataset to use; all, some classes, some samples. Motivate your choice

We will not use the data without labels as we cannot check if those classification would be correct.

#### 2. Use a neural network of your own chose to classify the dataset. Explain your choice and at least one alternative. Document your experiences.

We choose to train the dataset with `Audio Spectogram Transformer` model

#### Create Dataset

Load soundfiles into dataset

In [6]:
# must have metadata.csv with 'file_name' column to have also the features
# unsplitted dataset
dataset = load_dataset("audiofolder", data_dir=AUDIO_DIR, split="train")
dataset_augmentation=load_dataset("audiofolder", data_dir="./data_aug/", split="train")
full_dataset= concatenate_datasets([dataset,dataset_augmentation])
ds = full_dataset.class_encode_column("label")

Resolving data files: 100%|██████████| 16226/16226 [00:01<00:00, 10007.34it/s]
Found cached dataset audiofolder (/home/ubuntu/.cache/huggingface/datasets/audiofolder/default-5813fa48534e5405/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)
Resolving data files: 100%|██████████| 6056/6056 [00:00<00:00, 11702.57it/s]
Found cached dataset audiofolder (/home/ubuntu/.cache/huggingface/datasets/audiofolder/default-e1b877236e834087/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)
                                                                         

Split up the data : `training` - 70%, `validation` - 20%, `test` - 10%

In [18]:
train_testvalid = ds.train_test_split(test_size=1/3,stratify_by_column='label') #Split dataset into train 70% and test 30%
test_valid = train_testvalid['test'].train_test_split(test_size=1/3)

dataset = DatasetDict({
    'train' : train_testvalid['train'],
    'test' : test_valid['test'],
    'valid' : test_valid['train']
})

List the labels from DataFrame

In [14]:
# getting the labels for classification
labels = list(df.sort_values('status')["status"].unique()[:-1])

label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

num_classes = len(labels)

##### Model and Feature Extractor

Run this cell if you want to use `random weights`

In [None]:
config = ASTConfig(
    hidden_size=768, # default : 768
    #num_hidden_layers=12, # default : 12
    hidden_dropout_prob= 0.0, # def.: 0.0
    attention_probs_dropout_prob=0.0 # def.: 0.0
)

# input normalization: mean = 0, std = 0.5
feature_extractor = ASTFeatureExtractor(config, sampling_rate=SAMPLING_RATE, num_mel_bins=32, mean=0, std=0.5)

model = ASTForAudioClassification(config)
# weights must be the same for the model and the tokenizer/feature extractor 

Run this cell if you want to use `pre-trained weights`

In [15]:
config = ASTConfig(
    hidden_size=HIDDEN_LAYER_SIZE, # default : 768
    #num_hidden_layers=NUM_HIDDEN_LAYERS, # default : 12
    hidden_dropout_prob=HIDDEN_DROPOUT_PROB, # def.: 0.0
    attention_probs_dropout_prob=ATTENTION_DROPOUT_PROB, # def.: 0.0
    num_labels=len(labels),
    label2id=label2id,
    id2label=id2label,
)

feature_extractor = ASTFeatureExtractor(
    CHECKPOINT
)

model = ASTForAudioClassification.from_pretrained(
    CHECKPOINT, 
    config=config,
    ignore_mismatched_sizes=True
)

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- audio_spectrogram_transformer.embeddings.cls_token: found shape torch.Size([1, 1, 768]) in the checkpoint and torch.Size([1, 1, 384]) in the model instantiated
- audio_spectrogram_transformer.embeddings.distillation_token: found shape torch.Size([1, 1, 768]) in the checkpoint and torch.Size([1, 1, 384]) in the model instantiated
- audio_spectrogram_transformer.embeddings.position_embeddings: found shape torch.Size([1, 1214, 768]) in the checkpoint and torch.Size([1, 1214, 384]) in the model instantiated
- audio_spectrogram_transformer.embeddings.patch_embeddings.projection.weight: found shape torch.Size([768, 1, 16, 16]) in the checkpoint and torch.Size([384, 1, 16, 16]) in the model instantiated
- audio_spectrogram_transformer.embeddings.patch_embeddings.projection.bias: found shape torch.Size(

#### Tokenizing dataset

Function for `map`

In [21]:
def preprocess(examples):
    audio_arr = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arr,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=MAX_SEQ_LENGTH,
        truncation=True,
        padding=True,
        return_tensors="pt"
    )
    
    label = [x for x in examples["label"]]
    
    inputs["label"] = label
    return inputs

Tokenizing the dataset

In [22]:
dataset['train'] = dataset['train'].map(preprocess, remove_columns=["audio"], batched=True)
dataset['valid'] = dataset['valid'].map(preprocess, remove_columns=["audio"], batched=True)
dataset['test']= dataset['test'].map(preprocess, remove_columns=["audio"], batched=True)

                                                                 

### Training the model

Set `optimizer` and `scheduler`

In [23]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer,step_size=30, gamma=0.1)

Metric computing function : calculates `accuracy`, `precision`, `recall` and `f1 score`

In [24]:
def compute_metrics(eval_pred):
    metrics = dict()

    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    accuracy_metric = load_metric('accuracy')
    precision_metric = load_metric('precision')
    recall_metric = load_metric('recall')
    f1_metric = load_metric('f1')

    metrics.update(accuracy_metric.compute(predictions=predictions, references=labels))
    metrics.update(precision_metric.compute(predictions=predictions, references=labels, average='weighted'))
    metrics.update(recall_metric.compute(predictions=predictions, references=labels, average='weighted'))
    metrics.update(f1_metric.compute(predictions=predictions, references=labels, average='weighted'))

    return metrics

Setting up `wandb` to visualize training results

In [25]:
# set the wandb project where this run will be logged
os.environ["WANDB_PROJECT"]="cough-project"

# save your trained model checkpoint to wandb
os.environ["WANDB_LOG_MODEL"]="true"

# turn off watch to log faster
os.environ["WANDB_WATCH"]="false"

Defining `TrainingArguments` and `Trainer`

In [26]:
# the hyperparams for Trainer
training_arg = TrainingArguments(
    output_dir="output",
    report_to="wandb",
    learning_rate=LEARNING_RATE,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    #num_train_epochs= MAX_APOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    max_steps=1000, 
    logging_steps=50,
    eval_steps=200, 
    eval_accumulation_steps=1, 
    load_best_model_at_end=True,
    warmup_steps=50,
    save_total_limit=2,
    #metric_for_best_model='accuracy'
    )

# defining trainer
trainer = Trainer(
    model=model,
    args=training_arg,
    train_dataset=dataset['train'],
    eval_dataset=dataset['valid'],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

Do the training

In [27]:
trainer.train()
wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mvmatth[0m ([33mdeepl-coughs[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


#### Testing the training

In [None]:
# evaluation
predictions = trainer.predict(dataset['test'])

In [None]:
# saving model weigths into files
model.save_pretrained('./saved_model/')

a. Discuss at least four relevant hyper-parameters

Learning rate : \
Epoch Numer : \
Batch Size : \
Optimizer: \
Layer Number : [??]

In [None]:
# learning rate
# epoch number
# mini-batch size

b. Experiment with the effect of different batch sizes

c. Experiment with the effect of different learning rates

d. Experiment with different number of network layers

e. Implement at least two data agumentation techniques

f. Discuss what influences the memory use of a solution such as yours. What can be done to reduce this?