In [1]:
import torchaudio
import torch
import os
import glob
from torch.utils.data import Dataset, DataLoader
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift
import numpy as np
from transformers import AutoProcessor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dir_path = r'C:../dataset/audioonly/labeled/original_dataset'
os.path.exists(dir_path)

True

In [3]:
class_name = next(os.walk(dir_path))[1]
class_name

['belly_pain', 'discomfort', 'hungry', 'tired']

In [4]:
os.listdir(dir_path)

['belly_pain', 'discomfort', 'hungry', 'tired']

In [5]:
audio_dir = [os.path.join(dir_path, name) for name in class_name]
audio_dir

['C:../dataset/audioonly/labeled/original_dataset\\belly_pain',
 'C:../dataset/audioonly/labeled/original_dataset\\discomfort',
 'C:../dataset/audioonly/labeled/original_dataset\\hungry',
 'C:../dataset/audioonly/labeled/original_dataset\\tired']

In [6]:
bpain_audio = glob.glob(os.path.join(audio_dir[0], '*.wav'))
discomf_audio = glob.glob(os.path.join(audio_dir[1], '*.wav'))
hungry_audio = glob.glob(os.path.join(audio_dir[2], '*.wav'))
tired_audio = glob.glob(os.path.join(audio_dir[3], '*.wav'))

In [7]:
audio_path_class = {
    'bpain': bpain_audio,
    'discomf': discomf_audio,
    'hungry': hungry_audio,
    'tired': tired_audio
}

In [8]:
class AudioDatasetWithSubDirs(Dataset):
    def __init__(self, root_dir, sr=16000, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.sr = sr
        self.audio_labels = []
        self.audio_paths = []
        self.class_name = next(os.walk(self.root_dir))[1]
        self.processor = AutoProcessor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

        for name in self.class_name:
            if name == '.ipynb_checkpoints':
                self.class_name.remove(name)

        self.audio_subdir = [os.path.join(dir_path, name) for name in class_name]

        for dir, (index, name) in zip(self.audio_subdir, enumerate(self.class_name)):
            temp_paths = glob.glob(os.path.join(dir, '*.wav'))
            for file in temp_paths:
                self.audio_paths.append(file)
                self.audio_labels.append(index)
                
    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):
        audio_path = self.audio_paths[idx]
        label = self.audio_labels[idx]
        waveform, _ = librosa.load(audio_path, sr=self.sr, duration=4.9)

        if self.transform:
            waveform = self.transform(waveform, sample_rate=self.sr)
            waveform = self.processor(waveform, sampling_rate=self.sr, return_tensor='pt')

        return waveform, label

In [9]:
augmentations = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5)
])

audio_dataset = AudioDatasetWithSubDirs(root_dir=dir_path, sr=16000, transform=augmentations)
audio_dataloader = DataLoader(audio_dataset, batch_size=8, shuffle=True)



In [11]:
import librosa
temp = []

for waveform, label in audio_dataloader:
    print(waveform['input_values'][0].shape)
    print(label)
    # temp.append(waveform)

torch.Size([8, 1024, 128])
tensor([1, 2, 2, 2, 2, 2, 1, 2])
torch.Size([8, 1024, 128])
tensor([2, 2, 0, 2, 2, 2, 3, 2])
torch.Size([8, 1024, 128])
tensor([2, 2, 3, 2, 2, 2, 2, 2])
torch.Size([8, 1024, 128])
tensor([2, 2, 2, 2, 0, 2, 2, 2])
torch.Size([8, 1024, 128])
tensor([0, 2, 2, 2, 2, 2, 1, 2])
torch.Size([8, 1024, 128])
tensor([0, 2, 3, 0, 2, 2, 2, 2])
torch.Size([8, 1024, 128])
tensor([1, 2, 2, 2, 2, 2, 2, 2])
torch.Size([8, 1024, 128])
tensor([1, 2, 2, 2, 1, 2, 2, 0])
torch.Size([8, 1024, 128])
tensor([2, 2, 2, 2, 1, 2, 2, 2])
torch.Size([8, 1024, 128])
tensor([2, 2, 2, 2, 2, 2, 2, 0])
torch.Size([8, 1024, 128])
tensor([2, 2, 2, 3, 1, 2, 2, 2])
torch.Size([8, 1024, 128])
tensor([1, 2, 1, 1, 2, 2, 2, 2])
torch.Size([8, 1024, 128])
tensor([2, 2, 2, 3, 3, 2, 2, 1])
torch.Size([8, 1024, 128])
tensor([2, 2, 2, 0, 1, 2, 2, 0])
torch.Size([8, 1024, 128])
tensor([2, 2, 2, 2, 2, 2, 2, 0])
torch.Size([8, 1024, 128])
tensor([2, 2, 2, 2, 2, 2, 2, 2])
torch.Size([8, 1024, 128])
tensor([2, 2,

In [None]:
# from IPython.display import Audio

# for audio in temp[0][:5]:
#     np_audio = audio.numpy()
#     sound = Audio(np_audio, rate=16000)
#     display(sound)