In [1]:
! pip install accelerate -U
! pip install datasets transformers[sentencepiece]



#### Imports

In [2]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

In [3]:
import datasets
from datasets import load_dataset, DatasetDict,  Audio
from datasets import load_dataset

In [4]:
import librosa
import os
import glob
import io

In [5]:
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, accuracy_score
from transformers import WhisperModel, WhisperFeatureExtractor, AdamW
import torch
import torch.nn as nn
import torch.utils.data
from torch.utils.data import Dataset, DataLoader

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# !unzip /content/drive/MyDrive/archive.zip -d /content/drive/MyDrive

#### Loading CSV Metadata about the audio in a Dataframe and Randomly Sampling a Subset

In [8]:
df = pd.read_csv(r"/content/drive/MyDrive/UrbanSound8K.csv")
#Randomly Sampling a Subset
df = df.sample(n=1000, random_state=42)
df.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
6770,54898-8-0-2.wav,54898,47.992301,51.992301,2,3,8,siren
3534,172338-9-0-7.wav,172338,91.76048,95.76048,1,4,9,street_music
8556,95562-4-3-0.wav,95562,8.795241,12.795241,1,3,4,drilling
7870,75490-8-0-2.wav,75490,1.0,5.0,1,6,8,siren
1226,128891-3-0-4.wav,128891,2.0,6.0,1,6,3,dog_bark


In [9]:
folder = '/content/drive/MyDrive'
paths = []
for f_name in range(1, 11):
    folder_path = os.path.join(folder, 'fold{}'.format(f_name))
    for filename in os.listdir(folder_path):
        if filename.endswith('.wav'):
            file_path = os.path.join(folder_path, filename)
            paths.append(file_path)
all_file_paths = {}
for path in paths:
  all_file_paths[os.path.basename(path)] = path

#### Adding Audio paths

In [10]:
df['full_path'] = df['slice_file_name'].apply(lambda x: all_file_paths.get(x))
df.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class,full_path
6770,54898-8-0-2.wav,54898,47.992301,51.992301,2,3,8,siren,/content/drive/MyDrive/fold3/54898-8-0-2.wav
3534,172338-9-0-7.wav,172338,91.76048,95.76048,1,4,9,street_music,/content/drive/MyDrive/fold4/172338-9-0-7.wav
8556,95562-4-3-0.wav,95562,8.795241,12.795241,1,3,4,drilling,/content/drive/MyDrive/fold3/95562-4-3-0.wav
7870,75490-8-0-2.wav,75490,1.0,5.0,1,6,8,siren,/content/drive/MyDrive/fold6/75490-8-0-2.wav
1226,128891-3-0-4.wav,128891,2.0,6.0,1,6,3,dog_bark,/content/drive/MyDrive/fold6/128891-3-0-4.wav


#### Splitting the data

In [11]:
train_data, temp_data = train_test_split(df, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(df, test_size=0.4, random_state=42)

In [12]:
def create_audio_dataset(df):
    return datasets.Dataset.from_dict({
        "audio": df["full_path"].tolist(),
        "labels": df["classID"].tolist()
    }).cast_column("audio", Audio(sampling_rate=16_000)) #the default sampling rate for the Hugging Face Whisper model

train_final = create_audio_dataset(train_data)
test_final = create_audio_dataset(test_data)
val_final = create_audio_dataset(val_data)

In [13]:
model_checkpoint = "openai/whisper-base"
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_checkpoint)
encoder = WhisperModel.from_pretrained(model_checkpoint)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [14]:
class SpeechClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, audio_data,  text_processor):
        self.audio_data = audio_data
        self.text_processor = text_processor

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, index):

      inputs = self.text_processor(self.audio_data[index]["audio"]["array"],return_tensors="pt", sampling_rate=self.audio_data[index]["audio"]["sampling_rate"])
      input_features = inputs.input_features
      decoder_input_ids = torch.tensor([[1, 1]]) * encoder.config.decoder_start_token_id

      labels = np.array(self.audio_data[index]['labels'])

      return input_features, decoder_input_ids, torch.tensor(labels)


In [15]:

class SpeechClassifier(nn.Module):
    def __init__(self, num_labels, encoder):
        super(SpeechClassifier, self).__init__()
        self.encoder = encoder
        self.classifier = nn.Sequential(
            nn.Linear(self.encoder.config.hidden_size, 2048),
            nn.ReLU(),
            # nn.Linear(2048, 2048),
            # nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, num_labels)
        )

    def forward(self, input_features, decoder_input_ids):
        outputs = self.encoder(input_features, decoder_input_ids=decoder_input_ids)
        pooled_output = outputs['last_hidden_state'][:, 0, :]
        logits = self.classifier(pooled_output)
        return logits

In [16]:
batch_size = 10 # 20 # 30
train_ = SpeechClassificationDataset(train_final,  feature_extractor)
test_ = SpeechClassificationDataset(test_final,  feature_extractor)
val_ = SpeechClassificationDataset(val_final,  feature_extractor)
train_loader = DataLoader(train_, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_, batch_size=batch_size, shuffle=False)

In [17]:
num_labels = 10
model = SpeechClassifier(num_labels, encoder).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, betas=(0.9, 0.999), eps=1e-08) #AdamW(model.parameters(), lr=3e-5, betas=(0.8, 0.999), eps=1e-08) #AdamW(model.parameters(), lr=2e-4, betas=(0.9, 0.999), eps=1e-08) #AdamW(model.parameters(), lr=2e-5, betas=(0.9, 0.999), eps=1e-07)
criterion = nn.CrossEntropyLoss()



In [18]:

def train(model, train_loader, val_loader, optimizer,  criterion, device, num_epochs):

    best_accuracy = 0.0

    for epoch in range(num_epochs):

        model.train()

        for i, batch in enumerate(train_loader):

            input_features, decoder_input_ids, labels = batch

            input_features = input_features.squeeze()
            input_features = input_features.to(device)

            decoder_input_ids = decoder_input_ids.squeeze()
            decoder_input_ids = decoder_input_ids.to(device)

            labels = labels.view(-1)
            labels = labels.to(device)

            optimizer.zero_grad()

            logits = model(input_features, decoder_input_ids)

            loss = criterion(logits, labels)
            loss.backward()

            optimizer.step()

            if (i+1) % 8 == 0:
                print(f'Epoch {epoch+1}/{num_epochs}, Batch {i+1}/{len(train_loader)}, Train Loss: {loss.item() :.4f}')
                train_loss = 0.0

        val_loss, val_accuracy, val_f1, _ , _ = evaluate(model, val_loader, device)

        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), 'best_model.pt')

        print("========================================================================================")
        print(f'Epoch {epoch+1}/{num_epochs}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}, Best Accuracy: {best_accuracy:.4f}')
        print("========================================================================================")



In [19]:
def evaluate(model, data_loader,  device):

    all_labels = []
    all_preds = []
    total_loss = 0.0

    with torch.no_grad():

        for i, batch in enumerate(data_loader):

          input_features, decoder_input_ids, labels = batch

          input_features = input_features.squeeze()
          input_features = input_features.to(device)

          decoder_input_ids = decoder_input_ids.squeeze()
          decoder_input_ids = decoder_input_ids.to(device)

          labels = labels.view(-1)
          labels = labels.to(device)

          optimizer.zero_grad()

          logits = model(input_features, decoder_input_ids)

          loss = criterion(logits, labels)
          total_loss += loss.item()

          _, preds = torch.max(logits, 1)
          all_labels.append(labels.cpu().numpy())
          all_preds.append(preds.cpu().numpy())

    all_labels = np.concatenate(all_labels, axis=0)
    all_preds = np.concatenate(all_preds, axis=0)

    loss = total_loss / len(data_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return loss, accuracy, f1, all_labels, all_preds


In [20]:
num_epochs = 5#10
train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs)

Epoch 1/5, Batch 8/80, Train Loss: 2.2866
Epoch 1/5, Batch 16/80, Train Loss: 2.1743
Epoch 1/5, Batch 24/80, Train Loss: 1.5520
Epoch 1/5, Batch 32/80, Train Loss: 1.2035
Epoch 1/5, Batch 40/80, Train Loss: 1.1577
Epoch 1/5, Batch 48/80, Train Loss: 0.6521
Epoch 1/5, Batch 56/80, Train Loss: 0.7866
Epoch 1/5, Batch 64/80, Train Loss: 1.7200
Epoch 1/5, Batch 72/80, Train Loss: 1.3407
Epoch 1/5, Batch 80/80, Train Loss: 0.5640
Epoch 1/5, Val Loss: 0.7138, Val Accuracy: 0.7617, Val F1: 0.7401, Best Accuracy: 0.7617
Epoch 2/5, Batch 8/80, Train Loss: 0.9320
Epoch 2/5, Batch 16/80, Train Loss: 0.2648
Epoch 2/5, Batch 24/80, Train Loss: 0.3849
Epoch 2/5, Batch 32/80, Train Loss: 0.9248
Epoch 2/5, Batch 40/80, Train Loss: 0.8389
Epoch 2/5, Batch 48/80, Train Loss: 0.7996
Epoch 2/5, Batch 56/80, Train Loss: 0.5522
Epoch 2/5, Batch 64/80, Train Loss: 0.3657
Epoch 2/5, Batch 72/80, Train Loss: 0.8174
Epoch 2/5, Batch 80/80, Train Loss: 0.4231
Epoch 2/5, Val Loss: 0.4637, Val Accuracy: 0.8433, Va

In [22]:
from sklearn.metrics import confusion_matrix


In [24]:
state_dict = torch.load('best_model.pt')


num_labels = 10
model = SpeechClassifier(num_labels, encoder).to(device)
model.load_state_dict(state_dict)

_, _, _, all_labels, all_preds = evaluate(model, test_loader, device)


print(classification_report(all_labels, all_preds))
print(accuracy_score(all_labels, all_preds))
print(confusion_matrix(all_labels, all_preds))

              precision    recall  f1-score   support

           0       0.77      0.82      0.79        44
           1       0.92      0.92      0.92        25
           2       0.77      1.00      0.87        49
           3       1.00      0.93      0.96        41
           4       0.97      0.80      0.88        46
           5       0.82      0.92      0.87        49
           6       1.00      0.93      0.96        14
           7       0.86      0.90      0.88        42
           8       0.96      0.76      0.85        33
           9       0.94      0.82      0.88        57

    accuracy                           0.88       400
   macro avg       0.90      0.88      0.89       400
weighted avg       0.89      0.88      0.88       400

0.8775
[[36  0  4  0  0  3  0  1  0  0]
 [ 0 23  1  0  0  0  0  0  0  1]
 [ 0  0 49  0  0  0  0  0  0  0]
 [ 0  0  3 38  0  0  0  0  0  0]
 [ 2  0  0  0 37  2  0  5  0  0]
 [ 4  0  0  0  0 45  0  0  0  0]
 [ 0  0  0  0  1  0 13  0  0  0]
 [ 