In [1]:
import librosa
import os
import torch
import pandas as pd
from tqdm import tqdm
import numpy as np

As we said in the data analysis part, we will use only a portion of the dataset:
the violin 🎻, saxophone 🎷, hi-hat 🎼, acoustic guitar 🎸 and laughter 🤣 classes.
We will encode each class name with 2 to 3 letters for simplicity.

In [2]:
# Dataset labels kept for this experiment; clips with any other label are
# skipped when the features are extracted.
selected_classes = [
    'Acoustic_guitar',
    'Hi-hat',
    'Laughter',
    'Saxophone',
    'Violin_or_fiddle',
]

def label_to_name(label):
    """Return the short code used in feature file names for a class label.

    Args:
        label: Full dataset label, e.g. 'Saxophone'.

    Returns:
        "gui", "hi", "lau", "sax" or "vio" for the five supported labels,
        and None for any other value.
    """
    short_codes = (
        ('Acoustic_guitar', "gui"),
        ('Hi-hat', "hi"),
        ('Laughter', "lau"),
        ('Saxophone', "sax"),
        ('Violin_or_fiddle', "vio"),
    )
    for full_label, code in short_codes:
        if label == full_label:
            return code
    # Unknown labels fall through without a code.
    return None

Now we will load our metadata to find the samples that belong to the classes we will use.

In [3]:
# Read the training metadata, keeping only the file-name and label columns.
# Here train_audio_dir holds the metadata folder, not an audio folder.
train_audio_dir = "./data/FSDKaggle2018.meta"
train_csv_path = os.path.join(train_audio_dir, "train_post_competition.csv")
file = pd.read_csv(train_csv_path, usecols=['fname', 'label'])

In [4]:
# Build a lookup from file name to label, e.g. example_samp.wav -> Saxophone.
label_by_fname = file.set_index('fname')['label']
audio_dict = label_by_fname.to_dict()

Now we will extract features from our audio data. 
This will help us get meaningful inputs and reduce the dimensionality. 
We will use the mel spectrogram as our feature. 

In [5]:
# Extract a fixed-length log-mel spectrogram for every training clip that
# belongs to one of the selected classes and save it to disk.
train_audio_dir = "./data/FSDKaggle2018.audio_train"
file_list = sorted(os.listdir(train_audio_dir))

# We create a directory to save each of our feature files
save_dir = "./data/melspectrogram"
train_save_dir = os.path.join(save_dir, "train")
os.makedirs(train_save_dir, exist_ok=True)

target_sr = 16000   # every clip is resampled to this rate on load
clip_seconds = 6    # every clip is cropped / extended to exactly this length
n_mels = 60         # number of mel bands in the saved feature

# The loop variable is deliberately not called `file`, so the metadata
# DataFrame of that name survives and the earlier cells can be re-run.
for fname in tqdm(file_list):
    # Strict lookup: a file missing from the metadata raises KeyError rather
    # than being silently skipped.
    label = audio_dict[fname]
    if label not in selected_classes:
        continue

    data, sr = librosa.load(os.path.join(train_audio_dir, fname), sr=target_sr)  # load the audio sample
    hop = sr // 25  # hop length is sampling_rate//25 (25 frames per second), you can try other values

    short_label = label_to_name(label)  # encode the label

    n_samples = sr * clip_seconds
    data = data[:n_samples]  # if the sample is longer than 6 seconds we crop it
    if data.shape[0] != n_samples:
        # np.resize fills the missing tail with repeated copies of the clip
        # (it loops the audio), it does not zero-pad.
        data = np.resize(data, n_samples)

    # Pass the audio and sampling rate by keyword: librosa >= 0.10 rejects
    # positional audio, and without sr the mel filterbank would be built for
    # librosa's 22050 Hz default instead of the 16 kHz the clip was loaded at.
    melspec = librosa.feature.melspectrogram(
        y=data, sr=sr, n_mels=n_mels, hop_length=hop, n_fft=hop * 2
    )
    # melspectrogram returns a power spectrogram by default, so the matching
    # dB conversion is power_to_db (amplitude_to_db would double the dB values).
    features = librosa.power_to_db(melspec)

    # splitext drops the extension whatever its length.
    stem = os.path.splitext(fname)[0]
    # `features` is a NumPy array; torch.save pickles it as-is.
    # NOTE(review): loaders get an ndarray back, not a tensor - confirm
    # that is what the training code expects.
    torch.save(features, os.path.join(train_save_dir, stem + "_" + short_label + ".pt"))
       

100%|██████████| 9473/9473 [05:19<00:00, 29.60it/s] 


In [6]:
# Extract a fixed-length log-mel spectrogram for every test clip that belongs
# to one of the selected classes; these are saved as the validation set.
val_meta_dir = "./data/FSDKaggle2018.meta"
file = pd.read_csv(
    os.path.join(val_meta_dir, "test_post_competition_scoring_clips.csv"),
    usecols=['fname', 'label'],
)
# NOTE: this replaces the training lookup built earlier in the notebook.
audio_dict = file.set_index('fname')['label'].to_dict()

val_audio_dir = "./data/FSDKaggle2018.audio_test"
file_list = sorted(os.listdir(val_audio_dir))

save_dir = "./data/melspectrogram"
val_save_dir = os.path.join(save_dir, "validation")
os.makedirs(val_save_dir, exist_ok=True)

target_sr = 16000   # every clip is resampled to this rate on load
clip_seconds = 6    # every clip is cropped / extended to exactly this length
n_mels = 60         # number of mel bands in the saved feature

# The loop variable is deliberately not called `file`, so it does not
# overwrite the metadata DataFrame read above.
for fname in tqdm(file_list):
    # Strict lookup: a file missing from the metadata raises KeyError rather
    # than being silently skipped.
    label = audio_dict[fname]
    if label not in selected_classes:
        continue

    data, sr = librosa.load(os.path.join(val_audio_dir, fname), sr=target_sr)
    hop = sr // 25  # 25 frames per second
    short_label = label_to_name(label)

    n_samples = sr * clip_seconds
    data = data[:n_samples]  # crop clips longer than 6 seconds
    if data.shape[0] != n_samples:
        # np.resize fills the missing tail with repeated copies of the clip
        # (it loops the audio), it does not zero-pad.
        data = np.resize(data, n_samples)

    # Pass the audio and sampling rate by keyword: librosa >= 0.10 rejects
    # positional audio, and without sr the mel filterbank would be built for
    # librosa's 22050 Hz default instead of the 16 kHz the clip was loaded at.
    melspec = librosa.feature.melspectrogram(
        y=data, sr=sr, n_mels=n_mels, hop_length=hop, n_fft=hop * 2
    )
    # melspectrogram returns a power spectrogram by default, so the matching
    # dB conversion is power_to_db (amplitude_to_db would double the dB values).
    features = librosa.power_to_db(melspec)

    # splitext drops the extension whatever its length.
    stem = os.path.splitext(fname)[0]
    # `features` is a NumPy array; torch.save pickles it as-is.
    # NOTE(review): loaders get an ndarray back, not a tensor - confirm
    # that is what the training code expects.
    torch.save(features, os.path.join(val_save_dir, stem + "_" + short_label + ".pt"))


100%|██████████| 1600/1600 [00:56<00:00, 28.28it/s]
