In [1]:
import os
import librosa
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoFeatureExtractor, HubertForSequenceClassification

In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(device)

cuda


In [3]:
# Set the path to your audio files and corresponding labels
data_dir = "./data/recordings"
split_names = ("train", "validate", "test")

# File names actually present on disk in each split directory.
files_on_disk = {
    name: set(os.listdir(os.path.join(data_dir, name))) for name in split_names
}


def _split_of(file_name):
    """Return the split whose directory contains `file_name`, or None if none does."""
    # Checked in train -> validate -> test order, so a file present in several
    # directories is assigned to the first one that lists it.
    for name in split_names:
        if file_name in files_on_disk[name]:
            return name
    return None


record_df = pd.read_csv("./data/overview-of-recordings.csv")
record_df["split"] = record_df["file_name"].map(_split_of)

# Rows whose audio file is in none of the split directories must not be
# treated as test data: their paths would not exist when the dataset loads them.
missing_mask = record_df["split"].isna()
if missing_mask.any():
    print(f"Warning: {int(missing_mask.sum())} recording(s) listed in the CSV were not found under {data_dir} and will be skipped")

train_df = record_df[record_df["split"] == "train"]
valid_df = record_df[record_df["split"] == "validate"]
test_df = record_df[record_df["split"] == "test"]

# append data_dir to file names
train_files = [os.path.join(data_dir, "train", f) for f in train_df["file_name"]]
valid_files = [os.path.join(data_dir, "validate", f) for f in valid_df["file_name"]]
test_files = [os.path.join(data_dir, "test", f) for f in test_df["file_name"]]

# Class ids follow the order in which prompts first appear in the CSV.
prompt_to_id = {prompt: i for i, prompt in enumerate(record_df.prompt.unique())}

train_labels = train_df.prompt.apply(lambda x: prompt_to_id[x]).values
valid_labels = valid_df.prompt.apply(lambda x: prompt_to_id[x]).values
test_labels = test_df.prompt.apply(lambda x: prompt_to_id[x]).values

In [4]:
# Function to train the model
def train_model(train_loader, val_loader, model, criterion, optimizer, num_epochs=10):
    """Fine-tune `model` and report validation metrics after every epoch.

    Args:
        train_loader: iterable of `(inputs, mask, labels)` batches used for the
            optimisation steps; must support `len()`.
        val_loader: iterable of `(inputs, mask, labels)` batches used for
            validation; must support `len()` and expose `.dataset`.
        model: module called as `model(inputs, attention_mask=mask, labels=labels)`
            whose result exposes `.loss` and `.logits`.
        criterion: accepted for interface compatibility but not used; the loss
            is read from `outputs.loss`.
        optimizer: optimizer stepped once per training batch.
        num_epochs: number of passes over `train_loader`.

    Returns:
        A list with one dict per epoch holding `epoch`, `train_loss`,
        `val_loss` and `val_accuracy`. An average over an empty loader or
        dataset is reported as NaN instead of raising ZeroDivisionError.
    """
    # Batches are moved to the device that holds the model weights, so the
    # loop works whether the dataset yields CPU or already-placed tensors.
    first_param = next(iter(model.parameters()), None)
    target_device = first_param.device if first_param is not None else None

    def _on_model_device(batch):
        if target_device is None:
            return batch
        return tuple(tensor.to(target_device) for tensor in batch)

    nan = float("nan")
    history = []

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0

        for batch in train_loader:
            inputs, mask, labels = _on_model_device(batch)
            optimizer.zero_grad()

            outputs = model(inputs, attention_mask=mask, labels=labels)

            loss = outputs.loss
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Validate the model
        model.eval()
        val_loss = 0.0
        correct_predictions = 0

        with torch.no_grad():
            for batch in val_loader:
                inputs, mask, labels = _on_model_device(batch)

                outputs = model(inputs, attention_mask=mask, labels=labels)

                val_loss += outputs.loss.item()
                predicted = torch.argmax(outputs.logits, dim=1)
                correct_predictions += (predicted == labels).sum().item()

        num_train_batches = len(train_loader)
        num_val_batches = len(val_loader)
        num_val_samples = len(val_loader.dataset)

        train_loss = running_loss / num_train_batches if num_train_batches else nan
        avg_val_loss = val_loss / num_val_batches if num_val_batches else nan
        val_accuracy = correct_predictions / num_val_samples if num_val_samples else nan

        print(f'Epoch {epoch + 1}/{num_epochs} Training Loss: {train_loss} Validation Loss: {avg_val_loss} Validation Accuracy: {val_accuracy * 100:.2f}%')

        history.append({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "val_loss": avg_val_loss,
            "val_accuracy": val_accuracy,
        })

    return history

# Function to test the model
def test_model(test_loader, model):
    model.eval()
    running_loss = 0.0
    correct_predictions = 0

    with torch.no_grad():
        for inputs, mask, labels in test_loader:

            outputs = model(inputs, attention_mask=mask, labels=labels)

            loss = outputs.loss

            running_loss += loss.item()
            predicted = torch.argmax(outputs.logits, dim=1)
            correct_predictions += (predicted == labels).sum().item()
            
    accuracy = correct_predictions / len(test_loader.dataset)
    print(f'Test Loss: {running_loss / len(test_loader)} Accuracy: {accuracy * 100:.2f}%')

In [5]:
class AudioDataset(Dataset):
    """Dataset yielding fixed-length `(input_values, attention_mask, label)` triples.

    Each item is produced by loading one audio file with librosa, running it
    through `feature_extractor`, and right-padding the result with zeros up to
    `max_seq_length`. All three returned tensors are placed on the
    module-level `device`.

    Args:
        file_paths: indexable collection of audio file paths.
        labels: indexable collection of integer class ids, aligned with
            `file_paths` by position.
        feature_extractor: callable invoked with the waveform plus
            `sampling_rate`, `padding`, `return_tensors`, `max_length` and
            `truncation` keywords; its result must provide `"input_values"`
            and `"attention_mask"` tensors with a leading batch axis.
        max_seq_length: target length (in samples) of every returned sequence.
    """

    def __init__(self, file_paths, labels, feature_extractor, max_seq_length):
        self.file_paths = file_paths
        self.labels = labels
        self.feature_extractor = feature_extractor
        self.max_seq_length = max_seq_length

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Load, featurize and pad the recording at position `idx`."""
        audio_path = self.file_paths[idx]
        label = self.labels[idx]

        # extract audio features
        audio_input, sr = librosa.load(audio_path, sr=16000)
        audio_features = self.feature_extractor(audio_input, sampling_rate=sr, padding=True, return_tensors="pt", max_length=self.max_seq_length, truncation=True)
        # Drop only the leading batch axis. A bare squeeze() would also remove
        # the time axis of a one-sample clip and leave a 0-dim tensor, which
        # breaks the size(0) lookup below.
        input_values = audio_features["input_values"].squeeze(0).to(device)
        attention_mask = audio_features["attention_mask"].squeeze(0).to(device)

        # Pad features to the maximum sequence length
        pad_size = self.max_seq_length - input_values.size(0)
        input_values = torch.nn.functional.pad(input_values, (0, pad_size))
        attention_mask = torch.nn.functional.pad(attention_mask, (0, pad_size))

        label = torch.tensor(label, dtype=torch.long).to(device)

        return input_values, attention_mask, label

In [6]:
# Load pre-trained Hubert model and feature extractor
checkpoint = "superb/hubert-base-superb-ks"
feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
model = HubertForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(prompt_to_id),
    # The classification head is re-created with one output per prompt, so
    # its shape will not match the checkpoint's head.
    ignore_mismatched_sizes=True,
)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

# Length, in samples, that every dataset item is truncated/padded to.
max_seq_length = 295730
batch_size = 8

# One dataset per split, all sharing the same extractor and target length.
train_dataset, valid_dataset, test_dataset = (
    AudioDataset(paths, targets, feature_extractor, max_seq_length)
    for paths, targets in (
        (train_files, train_labels),
        (valid_files, valid_labels),
        (test_files, test_labels),
    )
)

# Only the training batches are shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at superb/hubert-base-superb-ks and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([12, 256]) in the checkpoint and torch.Size([25, 256]) in the model instantiated
- classifier.bias: found shape torch.Size([12]) in the checkpoint and torch.Size([25]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Train the model
# Fit on the training split and monitor on the validation split; the test
# split must stay unseen until the final evaluation below.
train_model(train_loader, valid_loader, model, criterion, optimizer, num_epochs=10)

# Test the model
# Final metrics are reported on the held-out test split, not on training data.
test_model(test_loader, model)

OutOfMemoryError: CUDA out of memory. Tried to allocate 312.00 MiB. GPU 0 has a total capacty of 4.00 GiB of which 0 bytes is free. Of the allocated memory 10.37 GiB is allocated by PyTorch, and 136.76 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF