In [15]:
import json
import os
from pathlib import Path
import librosa
import numpy as np
from typing import Dict, List
from tqdm import tqdm

In [16]:
FILE_NAME = "data_25ms_h=0%_16mfcc_DPR-HRK"
CLASSES = ["DPR", "HRK"]

SAMPLE_RATE = 10000
SEGMENT_DURATION = 0.0256  # duration of each segment in seconds
OVERLAP_DURATION = 0   # overlap duration in % (0 means no overlap)
NUM_MFCC = 16
N_FFT = 2 ** int(np.ceil(np.log2(SEGMENT_DURATION * SAMPLE_RATE)))
SEGMENT_HOP_LENGTH = int(SAMPLE_RATE * SEGMENT_DURATION * (1 - OVERLAP_DURATION / 100))
MFCC_HOP_LENGTH = SEGMENT_HOP_LENGTH

In [17]:
DATASET_PATH = "./data/original_dataset"
JSON_PATH = "./data/" + FILE_NAME + ".json"

In [18]:
print(f"n_fft: {N_FFT}, hop_length: {SEGMENT_HOP_LENGTH}")

n_fft: 256, hop_length: 256


In [19]:
def extract_mfcc(file_path: str, num_mfcc: int, n_fft: int, hop_length: int) -> List[List[float]]:
    """Extract MFCCs from an audio file."""
    signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)
    track_duration = librosa.get_duration(y=signal, sr=SAMPLE_RATE)
    samples_per_track = SAMPLE_RATE * track_duration
    samples_per_segment = int(SAMPLE_RATE * SEGMENT_DURATION)
    num_segments = int((samples_per_track - samples_per_segment) / hop_length) + 1

    mfccs = []
    for d in range(num_segments):
        start = hop_length * d
        finish = start + samples_per_segment
        if finish > len(signal):
            finish = len(signal)
        mfcc = librosa.feature.mfcc(y=signal[start:finish], sr=SAMPLE_RATE, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length, center=False)
        mfccs.append(mfcc.T.tolist())
    return mfccs

In [20]:
def save_mfcc(dataset_path: str, json_path: str, selected_classes: List[str], voc_only: bool = True, num_mfcc: int = 16, n_fft: int = 256, hop_length: int = 256) -> None:
    """Extracts MFCCs from audio dataset and saves them into a json file along with class labels."""
    data: Dict[str, List] = {"mapping": [], "labels": [], "mfcc": [], "files": []}
    dataset_path = Path(dataset_path)

    # Create a dictionary to map the selected classes to their respective labels
    class_to_label = {c: i for i, c in enumerate(selected_classes)}

    # Walk through dataset path
    for dirpath, _, filenames in os.walk(dataset_path):
        if dirpath != str(dataset_path):
            semantic_label = Path(dirpath).name
            if semantic_label in ["DPR", "HRK"]:
                if "DPR+HRK" in selected_classes:
                    semantic_label = "DPR+HRK"
                elif semantic_label not in selected_classes:
                    continue
            
            if semantic_label in selected_classes:
                if semantic_label not in data["mapping"]:
                    data["mapping"].append(semantic_label)
                print(f"\nProcessing: {semantic_label}")

                for f in tqdm(filenames, desc=f"Processing files in {semantic_label}", leave=False):
                    if ('voc' in f) == voc_only:
                        file_path = str(Path(dirpath) / f)
                        mfccs = extract_mfcc(file_path, num_mfcc, n_fft, hop_length)
                        data["mfcc"].extend(mfccs)
                        data["labels"].extend([class_to_label[semantic_label]] * len(mfccs))
                        data["files"].extend([file_path] * len(mfccs))
                        print(f"{file_path}, segments: {len(mfccs)}")

    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)
    
    print("\nDONE")

In [21]:
save_mfcc(DATASET_PATH, JSON_PATH, selected_classes=CLASSES, voc_only=True, num_mfcc=NUM_MFCC, n_fft=N_FFT, hop_length=SEGMENT_HOP_LENGTH)


Processing: DPR


Processing files in DPR:   0%|          | 0/26 [00:00<?, ?it/s]

Processing files in DPR:   4%|▍         | 1/26 [01:26<36:14, 86.99s/it]

data/original_dataset/DPR/090105nt1_intvoc.wav, segments: 8532


Processing files in DPR:  15%|█▌        | 4/26 [03:06<15:56, 43.47s/it]

data/original_dataset/DPR/061405nt1_intvoc.wav, segments: 10082


Processing files in DPR:  19%|█▉        | 5/26 [03:14<11:41, 33.41s/it]

data/original_dataset/DPR/092503hnf_intvoc.wav, segments: 761


Processing files in DPR:  23%|██▎       | 6/26 [04:34<15:33, 46.67s/it]

data/original_dataset/DPR/020106nt1_intvoc.wav, segments: 8234


Processing files in DPR:  27%|██▋       | 7/26 [06:28<20:59, 66.29s/it]

data/original_dataset/DPR/013106ab3_intvoc.wav, segments: 10737


Processing files in DPR:  31%|███       | 8/26 [06:43<15:19, 51.06s/it]

data/original_dataset/DPR/072604nt1_intvoc.wav, segments: 1281


Processing files in DPR:  35%|███▍      | 9/26 [08:32<19:18, 68.18s/it]

data/original_dataset/DPR/022505nt1_intvoc.wav, segments: 10189


Processing files in DPR:  38%|███▊      | 10/26 [10:05<20:10, 75.64s/it]

data/original_dataset/DPR/011106nt1_intvoc.wav, segments: 8527


Processing files in DPR:  50%|█████     | 13/26 [10:30<08:13, 37.96s/it]

data/original_dataset/DPR/040505nt2_intvoc.wav, segments: 2316


Processing files in DPR:  58%|█████▊    | 15/26 [11:14<05:56, 32.45s/it]

data/original_dataset/DPR/051105nt1_intvoc.wav, segments: 4301


Processing files in DPR:  69%|██████▉   | 18/26 [12:15<03:37, 27.20s/it]

data/original_dataset/DPR/110403hnf_intvoc.wav, segments: 5721


Processing files in DPR:  88%|████████▊ | 23/26 [13:31<01:03, 21.18s/it]

data/original_dataset/DPR/021605nt3_intvoc.wav, segments: 7229


                                                                        

data/original_dataset/DPR/101805ab1_intvoc.wav, segments: 9196

Processing: HRK


Processing files in HRK:  15%|█▌        | 3/20 [01:15<07:08, 25.22s/it]

data/original_dataset/HRK/020806nt1_intvoc.wav, segments: 7080


Processing files in HRK:  25%|██▌       | 5/20 [05:36<19:07, 76.51s/it]

data/original_dataset/HRK/083004nt1_intvoc.wav, segments: 24551


Processing files in HRK:  35%|███▌      | 7/20 [07:59<16:08, 74.48s/it]

data/original_dataset/HRK/080304nt2_intvoc.wav, segments: 14137


Processing files in HRK:  45%|████▌     | 9/20 [09:00<10:41, 58.29s/it]

data/original_dataset/HRK/030905nt2_intvoc.wav, segments: 5261


Processing files in HRK:  50%|█████     | 10/20 [11:22<12:37, 75.76s/it]

data/original_dataset/HRK/103105nt2_intvoc.wav, segments: 12208


Processing files in HRK:  65%|██████▌   | 13/20 [11:31<04:49, 41.31s/it]

data/original_dataset/HRK/091704nt1_intvoc.wav, segments: 662


Processing files in HRK:  80%|████████  | 16/20 [12:37<02:14, 33.53s/it]

data/original_dataset/HRK/032405nt2_intvoc.wav, segments: 6490


Processing files in HRK:  85%|████████▌ | 17/20 [14:52<02:29, 49.87s/it]

data/original_dataset/HRK/051105nt3_intvoc.wav, segments: 16321


Processing files in HRK:  95%|█████████▌| 19/20 [16:27<00:49, 49.09s/it]

data/original_dataset/HRK/081204nt1_intvoc.wav, segments: 9349


                                                                        

data/original_dataset/HRK/072604nt2_intvoc.wav, segments: 15671





DONE
