In [15]:
import json
import os
from pathlib import Path
import librosa
import numpy as np
from typing import Dict, List

In [16]:
# Audio framing parameters.
SAMPLE_RATE = 10000        # samples per second used when loading audio
SEGMENT_DURATION = 0.0256  # length of one segment, in seconds
OVERLAP_DURATION = 50      # frame overlap as a percentage (0 means no overlap)
NUM_MFCC = 16              # number of MFCC coefficients requested per frame

# Classes to export and the base name of the JSON file that will hold them.
CLASSES = ["RMT", "HRK"]
FILE_NAME = "data_25ms_50%_16mfcc_RMT-HRK"

# FFT size: the smallest power of two that covers one segment's worth of samples.
N_FFT = 2 ** int(np.ceil(np.log2(SAMPLE_RATE * SEGMENT_DURATION)))
# Hop between frames: the non-overlapping fraction of the FFT window.
HOP_LENGTH = int(N_FFT * (1 - OVERLAP_DURATION / 100))

In [17]:
# Input: root folder expected to contain one sub-folder per class.
DATASET_PATH = "./data/original_dataset"
# Output: JSON file named after the extraction configuration.
JSON_PATH = f"./data/{FILE_NAME}.json"

In [18]:
# Sanity check: show the derived FFT window and hop sizes before running the extraction.
print(f"n_fft: {N_FFT}, hop_length: {HOP_LENGTH}")

n_fft: 256, hop_length: 128


In [19]:
def extract_mfcc(file_path: str, num_mfcc: int, n_fft: int, hop_length: int) -> List[List[List[float]]]:
    """Split an audio file into fixed-length segments and compute MFCCs for each one.

    The file is loaded at the module-level ``SAMPLE_RATE`` and cut into
    consecutive, non-overlapping segments of ``SEGMENT_DURATION`` seconds.
    Trailing samples that do not fill a whole segment are discarded.

    Args:
        file_path: Path of the audio file to load.
        num_mfcc: Number of MFCC coefficients to compute per frame.
        n_fft: FFT window length, in samples.
        hop_length: Hop between successive frames, in samples.

    Returns:
        One entry per segment. Each entry is a list of frames, and each frame
        is a list of MFCC coefficients (floats).

    Raises:
        ValueError: If ``SAMPLE_RATE * SEGMENT_DURATION`` is shorter than one sample.
    """
    signal, _ = librosa.load(file_path, sr=SAMPLE_RATE)

    samples_per_segment = int(SAMPLE_RATE * SEGMENT_DURATION)
    if samples_per_segment <= 0:
        raise ValueError(
            f"Segment length must be at least one sample, got {samples_per_segment} "
            f"(SAMPLE_RATE={SAMPLE_RATE}, SEGMENT_DURATION={SEGMENT_DURATION})"
        )

    # Count whole segments with integer arithmetic. Going through a float
    # duration (len -> seconds -> samples) and truncating may lose the final
    # segment to rounding when the signal length is an exact multiple.
    num_segments = len(signal) // samples_per_segment

    # NOTE(review): with center=False and a segment of exactly n_fft samples,
    # each segment appears to yield a single frame, so hop_length would have no
    # effect on the output - confirm this is intended.
    mfccs = []
    for index in range(num_segments):
        start = index * samples_per_segment
        segment = signal[start:start + samples_per_segment]
        mfcc = librosa.feature.mfcc(
            y=segment,
            sr=SAMPLE_RATE,
            n_mfcc=num_mfcc,
            n_fft=n_fft,
            hop_length=hop_length,
            center=False,
        )
        # Transpose so the outer list iterates over frames, not coefficients.
        mfccs.append(mfcc.T.tolist())
    return mfccs

In [20]:
def save_mfcc(dataset_path: str, json_path: str, selected_classes: List[str], voc_only: bool = True, num_mfcc: int = 13, n_fft: int = 256, hop_length: int = 128) -> None:
    """Extract MFCCs from an audio dataset and save them, with class labels, to a JSON file.

    Every directory below ``dataset_path`` whose name is one of
    ``selected_classes`` is treated as a class folder. The dataset root itself
    is never treated as a class folder.

    Args:
        dataset_path: Root directory of the dataset.
        json_path: Destination JSON file (overwritten if it exists).
        selected_classes: Class (directory) names to export. A class's label is
            its index in this list.
        voc_only: When True, only files whose name contains ``'voc'`` are
            processed; when False, only files whose name does not contain it.
        num_mfcc: Number of MFCC coefficients per frame.
        n_fft: FFT window length, in samples.
        hop_length: Hop between successive frames, in samples.

    The written JSON object has four keys:
        mapping: ``selected_classes`` in order, so ``mapping[label]`` is the
            class name for that label (including classes with no folder).
        labels: One integer label per segment.
        mfcc: One MFCC matrix (list of frames) per segment.
        files: Source file path of each segment.
    """
    # The mapping follows selected_classes rather than directory traversal
    # order, so that mapping[label] always names the class the label stands for.
    data: Dict[str, List] = {"mapping": list(selected_classes), "labels": [], "mfcc": [], "files": []}
    root = Path(dataset_path)

    # Map each selected class to its integer label.
    class_to_label = {c: i for i, c in enumerate(selected_classes)}

    for dirpath, dirnames, filenames in os.walk(root):
        # Sorting in place makes os.walk descend in a reproducible order.
        dirnames.sort()

        current_dir = Path(dirpath)
        # os.walk yields dirpath as a string, so compare as Path objects.
        if current_dir == root:
            continue

        semantic_label = current_dir.name
        if semantic_label not in class_to_label:
            continue

        label = class_to_label[semantic_label]
        print(f"\nProcessing: {semantic_label}")

        for f in sorted(filenames):
            if ('voc' in f) != voc_only:
                continue
            file_path = str(current_dir / f)
            mfccs = extract_mfcc(file_path, num_mfcc, n_fft, hop_length)
            data["mfcc"].extend(mfccs)
            data["labels"].extend([label] * len(mfccs))
            data["files"].extend([file_path] * len(mfccs))
            print(f"{file_path}, segments: {len(mfccs)}")

    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)

    print("\nDONE")

In [21]:
# Run the extraction for the configured classes and write the JSON dataset.
save_mfcc(
    DATASET_PATH,
    JSON_PATH,
    selected_classes=CLASSES,
    voc_only=True,
    num_mfcc=NUM_MFCC,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
)


Processing: HRK


data/original_dataset/HRK/020806nt1_intvoc.wav, segments: 7080
data/original_dataset/HRK/083004nt1_intvoc.wav, segments: 24551
data/original_dataset/HRK/080304nt2_intvoc.wav, segments: 14137
data/original_dataset/HRK/030905nt2_intvoc.wav, segments: 5261
data/original_dataset/HRK/103105nt2_intvoc.wav, segments: 12208
data/original_dataset/HRK/091704nt1_intvoc.wav, segments: 662
data/original_dataset/HRK/032405nt2_intvoc.wav, segments: 6490
data/original_dataset/HRK/051105nt3_intvoc.wav, segments: 16321
data/original_dataset/HRK/081204nt1_intvoc.wav, segments: 9349
data/original_dataset/HRK/072604nt2_intvoc.wav, segments: 15671

Processing: RMT
data/original_dataset/RMT/091305nt1_intvoc.wav, segments: 6771
data/original_dataset/RMT/092005nt1_intvoc.wav, segments: 7567
data/original_dataset/RMT/070805nt1_intvoc.wav, segments: 14282
data/original_dataset/RMT/100803hnf_intvoc.wav, segments: 284
data/original_dataset/RMT/121803hnf_intvoc.wav, segments: 22725
data/original_dataset/RMT/092403r