### Deep Learning Miniproject - Audio

AVS 8th Semester - Group 841

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import IPython.display as ipd

import torch
import torchaudio
from torch import nn
import torch.optim 
from torch.utils.data import random_split, DataLoader, Dataset
#import pytorch_lightning as pl

from datasets import load_dataset, DatasetDict
# https://huggingface.co/docs/transformers/main/en/model_doc/audio-spectrogram-transformer#transformers.ASTConfig
# https://huggingface.co/docs/transformers/v4.26.1/en/main_classes/trainer#transformers.TrainingArguments
from transformers import ASTFeatureExtractor, ASTForAudioClassification, ASTConfig, TrainingArguments, Trainer

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import wandb

# Hyperparameters

In [4]:
# Hyperparameters

AUDIO_DIR = "./data/"  # directory prefix used to build audio file paths
CSV_DIR = "./data/metadata_compiled.csv"  # path of the metadata CSV (a file, despite the _DIR suffix)
FILE_TYPE = ".mp3"  # extension appended to a uuid to form the audio file name
BATCH_SIZE = 16  # batch size handed to DataModuleClass

In [5]:
# Load the metadata CSV into a DataFrame.
df = pd.read_csv(CSV_DIR)

#### 1. Explore the dataset through code

a. How many samples does the dataset contain?

In [14]:
# Every row of the metadata table is one sample, so the row count is the answer.
print(f'Number of samples : {len(df)}')

Number of samples : 27550


b. How many classes? How many samples per class? Show a histogram of the number of instances per class

In [None]:
# Summarise the "status" column: number of distinct values, the values
# themselves, and the number of samples per value. NaN (unlabelled) rows are
# kept in the counts (dropna=False) and NaN is included in unique().
# Series.value_counts replaces the deprecated top-level pd.value_counts.
print(f'Number of classes: {len(df["status"].unique())}.\n\
    Classes: {df["status"].unique()}\n\
    {df["status"].value_counts(dropna=False)}')

In [None]:
# Bar chart of the number of samples per status value, NaN rows included.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
df['status'].value_counts(dropna=False).plot.bar()
plt.show()

c. Play a random sample from each class

In [15]:
# Pick one random recording labelled "healthy", show its path and play it.
healthy = df.loc[df['status'] == 'healthy', 'uuid'].sample().item()
path = f"{AUDIO_DIR}{healthy}{FILE_TYPE}"
print(path)
y, sr = torchaudio.load(path)
ipd.Audio(y, rate=sr)

./data/122a658a-f169-47cf-ae90-b609e469fc7a.mp3


In [None]:
# Pick one random recording labelled "COVID-19" and play it.
covid = df.loc[df['status'] == 'COVID-19', 'uuid'].sample().item()
path = f"{AUDIO_DIR}{covid}{FILE_TYPE}"
y, sr = torchaudio.load(path)
ipd.Audio(y, rate=sr)

In [None]:
# Pick one random recording labelled "symptomatic" and play it.
symptomatic = df.loc[df['status'] == 'symptomatic', 'uuid'].sample().item()
path = f"{AUDIO_DIR}{symptomatic}{FILE_TYPE}"
y, sr = torchaudio.load(path)
ipd.Audio(y, rate=sr)

d. Describe if/how you think the data distribution will affect training of a classifier

e. Decide what part of the dataset to use; all, some classes, some samples. Motivate your choice

#### 2. Use a neural network of your own choice to classify the dataset. Explain your choice and at least one alternative. Document your experiences:

#### Audio Spectrogram Transformer Implementation

##### Creating a Dataset class

We create a custom Dataset class to load our cough files. 

In PyTorch, a custom Dataset class has to override the functions **\__len__** and **\__getitem__**, where **\__len__** returns the number of files in the dataset, and **\__getitem__** returns the data and the label for a given file index.


In [None]:
class AudioDatatset(Dataset):
    """Dataset of audio recordings paired with an integer status label.

    Each item is a ``(mel_specgram, label)`` tuple: ``mel_specgram`` is the
    output of ``torchaudio.transforms.MelSpectrogram`` (32 mel bins) for one
    file, and ``label`` is 0 (healthy), 1 (symptomatic) or 2 (COVID-19).

    Only files that end with ``file_type`` and whose name (without the
    extension) appears in the CSV with one of the three known statuses are
    part of the dataset. Other files in the directory and recordings without
    a usable status are left out, so every item has a valid label.

    Args:
        audio_dir: Directory that contains the audio files.
        class_csv: Path to a CSV with a ``uuid`` column (file name without
            extension) and a ``status`` column (class name).
        file_type: Extension of the audio files. When omitted, the
            notebook-level ``FILE_TYPE`` constant is used.
    """

    # Healthy = 0, Symptomatic = 1, Covid = 2
    LABEL_TO_ID = {"healthy": 0, "symptomatic": 1, "COVID-19": 2}

    def __init__(self, audio_dir, class_csv, file_type=None):
        self.audio_dir = audio_dir
        self.df = pd.read_csv(class_csv)
        self.file_type = FILE_TYPE if file_type is None else file_type

        # uuid -> integer label, built once so __getitem__ does not have to
        # search the DataFrame for every sample. Rows with a missing (NaN)
        # or unknown status are skipped.
        self.labels = {
            uuid: self.LABEL_TO_ID[status]
            for uuid, status in zip(self.df["uuid"], self.df["status"])
            if status in self.LABEL_TO_ID
        }

        # Sorted so that an index always refers to the same file
        # (os.listdir returns entries in arbitrary order).
        self.audio_dir_list = sorted(
            name
            for name in os.listdir(self.audio_dir)
            if name.endswith(self.file_type) and self._uuid_of(name) in self.labels
        )

    def _uuid_of(self, file_name):
        """Return ``file_name`` without the trailing ``file_type`` extension."""
        return file_name[: len(file_name) - len(self.file_type)]

    def __len__(self):
        """Return the number of labelled audio files in the dataset."""
        return len(self.audio_dir_list)

    def __getitem__(self, idx):
        """Return ``(mel_specgram, label)`` for the file at position ``idx``."""
        file_name = self.audio_dir_list[idx]

        # Loading the audio file
        audio_file_path = os.path.join(self.audio_dir, file_name)
        waveform, sample_rate = torchaudio.load(audio_file_path, normalize=True)

        # Transforming to mel spectrogram; the transform is built per item
        # because it is parameterised by this file's own sample rate.
        transform = torchaudio.transforms.MelSpectrogram(sample_rate, n_mels=32)
        mel_specgram = transform(waveform)

        label = self.labels[self._uuid_of(file_name)]
        return mel_specgram, label


##### Testing the Dataset class

We can quickly create an instance of the `AudioDatatset` class and print out the values for one item in the dataset.

In [None]:
# Smoke test: build the dataset, then show its first item and its size.
test = AudioDatatset(AUDIO_DIR, CSV_DIR)
print(test[0])
print(len(test))

##### Creating a DataModule class

PyTorch Lightning provides a DataModule class (`LightningDataModule`) that loads the data from the Dataset class we just made. In this class we split the dataset into training, validation and testing with a **70/20/10** split. The DataModule class also allows us to set the batch size, number of workers and more for the training. Since we are using PyTorch Lightning, we need to implement the following functions: **prepare_data, setup, train_dataloader, val_dataloader and test_dataloader.**

In [None]:
class DataModuleClass(pl.LightningDataModule):
    """Lightning data module serving ``AudioDatatset`` in a 70/20/10 split.

    NOTE: this class needs ``pytorch_lightning`` imported as ``pl``; that
    import is commented out in the notebook's first cell.

    Args:
        batch_size: Number of samples per batch for all three dataloaders.
    """

    def __init__(self, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.audio_files = []

    def prepare_data(self):
        """Do nothing; one-time steps such as fetching data would go here."""
        pass

    def setup(self, stage=None):
        """Build the dataset and split it into train/validation/test parts."""
        self.audio_files = AudioDatatset(AUDIO_DIR, CSV_DIR)

        # Splitting manually: the test part takes whatever is left after
        # rounding, so the three sizes always add up to the dataset length.
        audio_len = len(self.audio_files)
        train_size = round(audio_len * 0.7)
        val_size = round(audio_len * 0.2)
        test_size = audio_len - train_size - val_size

        self.train_data, self.val_data, self.test_data = random_split(
            self.audio_files, [train_size, val_size, test_size]
        )

    def _make_loader(self, data, shuffle=False):
        """Return a DataLoader over ``data`` with the shared loader settings."""
        return DataLoader(
            data,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=2,
            pin_memory=True,
            persistent_workers=True,
        )

    def train_dataloader(self):
        """Return the training loader, reshuffled every epoch."""
        # Without shuffling every epoch would present the batches in the
        # same order.
        return self._make_loader(self.train_data, shuffle=True)

    def val_dataloader(self):
        """Return the validation loader (fixed order)."""
        return self._make_loader(self.val_data)

    def test_dataloader(self):
        """Return the test loader (fixed order)."""
        return self._make_loader(self.test_data)


Vini trying model stuff

In [None]:
# Experimental cell: train a randomly initialised AST model through a
# PyTorch Lightning Trainer, fed by DataModuleClass.
# https://huggingface.co/docs/transformers/main/en/model_doc/audio-spectrogram-transformer#transformers.ASTConfig
from transformers import ASTFeatureExtractor, ASTForAudioClassification, ASTConfig

# The model and the tokenizer/feature extractor must use the same settings;
# right now both are left at their default values.

config = ASTConfig()

# The feature extractor plays the role of a tokenizer.
# input normalization: mean = 0, std = 0.5

#feature_extractor = ASTFeatureExtractor(config, sampling_rate=sr, num_mel_bins=32, mean=0, std=0.5)

model = ASTForAudioClassification(config)

# Despite its name, train_loader is a data module, not a DataLoader.
train_loader = DataModuleClass(BATCH_SIZE)

# NOTE(review): `pl` is only available if the commented-out
# `import pytorch_lightning as pl` in the first cell is restored.
trainer = pl.Trainer(max_epochs=1, accelerator='gpu', devices=1, log_every_n_steps=25)

# NOTE(review): Trainer.fit presumably expects a LightningModule, while `model`
# is a plain transformers model — confirm whether this cell can run as written.
trainer.fit(model, train_loader)



##### Loading and pre-processing data

In [6]:
# Load every recording under AUDIO_DIR as one unsplit dataset.
# The folder must have a metadata.csv with a 'file_name' column so that the
# extra columns (features) are attached to each audio file.
from datasets import Audio

dataset = load_dataset("audiofolder", data_dir=AUDIO_DIR, split="train")

# Decode every clip at 16 kHz. The preprocessing step tells the feature
# extractor that the audio is already at the extractor's sampling rate and
# never resamples, so clips recorded at another rate would be misread.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

Resolving data files: 100%|██████████| 16226/16226 [00:00<00:00, 143877.77it/s]
Found cached dataset audiofolder (/home/ubuntu/.cache/huggingface/datasets/audiofolder/default-ca7edb5d3275be7e/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)


In [7]:
# Splitting up the data: training 70%, validation 20%, test 10%.
# train_test_split shuffles by default; the fixed seed makes the split
# reproducible, so the held-out samples stay the same after a kernel restart.
SPLIT_SEED = 42
train_testvalid = dataset.train_test_split(test_size=0.3, seed=SPLIT_SEED)

# The held-out 30% is divided 2:1 into validation (20%) and test (10%).
test_valid = train_testvalid['test'].train_test_split(test_size=1/3, seed=SPLIT_SEED)

dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train'],
})

In [8]:
# Class names for classification, in order of first appearance.
# dropna() removes the unlabelled (NaN) entry explicitly instead of assuming
# that NaN is the first value returned by unique().
labels = list(df["status"].dropna().unique())

# Two-way mapping between class name and class id. Ids are stored as strings
# here; preprocess() converts them back with int().
label2id = {label: str(i) for i, label in enumerate(labels)}
id2label = {str(i): label for i, label in enumerate(labels)}

In [9]:
# Training hyperparameters
# TODO : move up
max_duration = 1  # presumably seconds, since it is multiplied by the sampling rate below

sampling_rate = 16000
# Alias for the original misspelled name, kept so existing references still work.
samplin_rate = sampling_rate
batch_size = 4
num_classes = len(labels)
learning_rate = 1e-4
hidden_dim = 768

max_seq_length = max_duration * sampling_rate
max_frames = 49
max_epochs = 2


is_cuda = torch.cuda.is_available()
checkpoint = 'MIT/ast-finetuned-audioset-10-10-0.4593'

##### Training Model

In [None]:
# Use this for [RANDOM WEIGHTS] - no pretraining.
# The classifier head is sized for our classes, and the number of mel bins
# is set once on the config so the model and the feature extractor agree.
config = ASTConfig(
    num_mel_bins=32,
    num_labels=len(labels),
    label2id=label2id,
    id2label=id2label,
)

# The feature extractor plays the role of a tokenizer. It takes no config
# object (its first positional parameter is `feature_size`), so the matching
# values are passed by keyword.
# `samplin_rate` (sic) is the name defined in the hyperparameter cell.
# NOTE(review): mean/std are the statistics the extractor normalises the
# features with, not the target distribution — confirm mean=0, std=0.5.
feature_extractor = ASTFeatureExtractor(
    sampling_rate=samplin_rate,
    num_mel_bins=config.num_mel_bins,
    max_length=config.max_length,
    mean=0,
    std=0.5,
)

# Settings must be the same for the model and the tokenizer/feature extractor.
model = ASTForAudioClassification(config)

##### Pretrained session

In [10]:
# Feature extractor and model, both loaded from the same checkpoint.
# from_pretrained reads the checkpoint's preprocessing settings; calling the
# constructor directly would pass the checkpoint name as `feature_size`.
feature_extractor = ASTFeatureExtractor.from_pretrained(checkpoint)

# The checkpoint's classification head has a different number of outputs, so
# it is re-initialised for our labels (ignore_mismatched_sizes=True).
model = ASTForAudioClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# for feature extraction - using map()
def preprocess(examples):
    """Turn a batch of raw examples into model inputs (for use with map()).

    Reads the decoded waveform of every entry in ``examples["audio"]``, runs
    the module-level ``feature_extractor`` on them, and adds the integer
    class id of every entry in ``examples["label"]`` under ``"label"``.

    Returns:
        The feature extractor output with the added ``"label"`` entry.
    """
    waveforms = [clip["array"] for clip in examples["audio"]]

    features = feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=max_seq_length,
        truncation=True,
        padding=True,
    )

    # label2id stores ids as strings, hence the int() conversion.
    features["label"] = [int(label2id[name]) for name in examples["label"]]
    return features

In [12]:
# mapping datasets -> feature extraction
def _extract_features(split):
    """Apply preprocess() in batches to one split, dropping the raw audio."""
    return dataset[split].map(
        preprocess,
        remove_columns=["audio"],
        batched=True,
        batch_size=batch_size,
    )


# Feature extraction for the three splits.
ds_train = _extract_features('train')

ds_valid = _extract_features('valid')

ds_test = _extract_features('test')

                                                                 

In [None]:
# TODO : choose the optimizer and scheduler.
# They can be handed to the Trainer as: optimizers=(optimizer, scheduler)
# NOTE(review): `warmup_cosine` and `total_steps` are not defined anywhere in
# the visible notebook, so this cell presumably fails with a NameError until
# they are provided — confirm.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer, warmup_cosine(100, 
        max_lr=learning_rate,
        total_steps=total_steps,
        optimizer_lr=learning_rate,
        min_lr=1e-6))

In [16]:
# TODO : add Kaggle link
# computing accuracy, f1_score, recall and precision -> Trainer : compute_metrics=compute_metrics

def compute_metrics(p):
    """Compute accuracy and weighted F1, recall and precision.

    Args:
        p: A ``(predictions, labels)`` pair; the predicted class of each row
            of ``predictions`` is taken as its argmax along axis 1.

    Returns:
        A dict with the keys ``accuracy``, ``f1-score``, ``recall-score``
        and ``precision-score``.
    """
    scores, references = p
    predicted = np.argmax(scores, axis=1)
    weighted = {"average": "weighted"}

    return {
        "accuracy": accuracy_score(references, predicted),
        "f1-score": f1_score(references, predicted, **weighted),
        "recall-score": recall_score(references, predicted, **weighted),
        "precision-score": precision_score(references, predicted, **weighted),
    }

In [17]:
# set the wandb project where this run will be logged
# Configure Weights & Biases through environment variables.
os.environ.update({
    # the wandb project where this run will be logged
    "WANDB_PROJECT": "cough-project",
    # save the trained model checkpoint to wandb
    "WANDB_LOG_MODEL": "true",
    # turn off watch to log faster
    "WANDB_WATCH": "false",
})

In [18]:
# the hyperparams for Trainer
# Hyperparameters for the Trainer, grouped by purpose.
training_arg = TrainingArguments(
    output_dir="output",
    report_to="wandb",
    # optimisation
    learning_rate=learning_rate,
    num_train_epochs=max_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate, log and checkpoint once per epoch
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # keep a single checkpoint and reload the best one (by loss) at the end
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
)

# Trainer: trains on the training split and evaluates on the validation split.
trainer = Trainer(
    model=model,
    args=training_arg,
    tokenizer=feature_extractor,
    train_dataset=ds_train,
    eval_dataset=ds_valid,
    compute_metrics=compute_metrics,
)

In [19]:
# Training session: runs the Trainer configured above on ds_train,
# evaluating on ds_valid.
trainer.train()



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [20]:
# Mark the current Weights & Biases run as finished.
wandb.finish()

In [None]:
# Evaluation on the held-out test split. Trainer.predict requires the
# dataset to run on; calling it without one raises a TypeError.
predictions = trainer.predict(ds_test)

In [None]:
# Save the model weights and configuration to ./saved_model/
model.save_pretrained('./saved_model/')

In [None]:
# Evaluate on the Trainer's eval_dataset (ds_valid) and return the metrics.
trainer.evaluate()

a. Discuss at least four relevant hyper-parameters

Learning rate : \
Epoch Number : \
Batch Size : \
Optimizer: \
Layer Number : [??]

In [None]:
# learning rate
# epoch number
# mini-batch size

# I don't know what is the 4th one

b. Experiment with the effect of different batch sizes

c. Experiment with the effect of different learning rates

d. Experiment with different number of network layers

e. Implement at least two data augmentation techniques

f. Discuss what influences the memory use of a solution such as yours. What can be done to reduce this?