In [1]:
import json
import os
from pathlib import Path
import librosa
import numpy as np
from typing import Dict, List
from tqdm import tqdm

In [2]:
FILE_NAME = "data_TEST"
CLASSES = ["RMT", "HRK"]

SAMPLE_RATE = 10000
SEGMENT_DURATION = 0.0256  # duration of each segment in seconds
OVERLAP_DURATION = 0   # overlap duration in % (0 means no overlap)
NUM_MFCC = 16
N_FFT = 2 ** int(np.ceil(np.log2(SEGMENT_DURATION * SAMPLE_RATE)))
SEGMENT_HOP_LENGTH = int(SAMPLE_RATE * SEGMENT_DURATION * (1 - OVERLAP_DURATION / 100))
MFCC_HOP_LENGTH = SEGMENT_HOP_LENGTH

In [3]:
DATASET_PATH = "./data/original_dataset"
JSON_PATH = "./data/" + FILE_NAME + ".json"

In [4]:
print(f"n_fft: {N_FFT}, hop_length: {SEGMENT_HOP_LENGTH}")

n_fft: 256, hop_length: 256


In [5]:
def extract_mfcc(file_path: str, num_mfcc: int, n_fft: int, hop_length: int) -> List[List[float]]:
    """Extract MFCCs from an audio file."""
    signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)
    track_duration = librosa.get_duration(y=signal, sr=SAMPLE_RATE)
    samples_per_track = SAMPLE_RATE * track_duration
    samples_per_segment = int(SAMPLE_RATE * SEGMENT_DURATION)
    num_segments = int((samples_per_track - samples_per_segment) / hop_length) + 1

    mfccs = []
    for d in range(num_segments):
        start = hop_length * d
        finish = start + samples_per_segment
        if finish > len(signal):
            finish = len(signal)
        mfcc = librosa.feature.mfcc(y=signal[start:finish], sr=SAMPLE_RATE, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length, center=False)
        mfccs.append(mfcc.T.tolist())
    return mfccs

In [6]:
def save_mfcc(dataset_path: str, json_path: str, selected_classes: List[str], voc_only: bool = True, num_mfcc: int = 16, n_fft: int = 256, hop_length: int = 256) -> None:
    """Extracts MFCCs from audio dataset and saves them into a json file along with class labels."""
    data: Dict[str, List] = {"mapping": [], "patients": {}}
    dataset_path = Path(dataset_path)

    # Create a dictionary to map the selected classes to their respective labels
    class_to_label = {c: i for i, c in enumerate(selected_classes)}

    # Walk through dataset path
    for dirpath, _, filenames in os.walk(dataset_path):
        if dirpath != str(dataset_path):
            semantic_label = Path(dirpath).name
            if semantic_label in ["DPR", "HRK"]:
                if "DPR+HRK" in selected_classes:
                    semantic_label = "DPR+HRK"
                elif semantic_label not in selected_classes:
                    continue
            
            if semantic_label in selected_classes:
                if semantic_label not in data["mapping"]:
                    data["mapping"].append(semantic_label)
                print(f"\nProcessing: {semantic_label}")

                for f in tqdm(filenames, desc=f"Processing files in {semantic_label}", leave=False):
                    if ('voc' in f) == voc_only:
                        file_path = str(Path(dirpath) / f)
                        mfccs = extract_mfcc(file_path, num_mfcc, n_fft, hop_length)
                        patient_data = {
                            "mfcc": mfccs,
                            "label": class_to_label[semantic_label],
                            "file": file_path
                        }
                        data["patients"][file_path] = patient_data
                        print(f"{file_path}, segments: {len(mfccs)}")

    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)
    
    print("\nDONE")

In [7]:
save_mfcc(DATASET_PATH, JSON_PATH, selected_classes=CLASSES, voc_only=True, num_mfcc=NUM_MFCC, n_fft=N_FFT, hop_length=SEGMENT_HOP_LENGTH)


Processing: HRK


Processing files in HRK:   0%|          | 0/20 [00:00<?, ?it/s]

Processing files in HRK:  15%|█▌        | 3/20 [00:53<05:02, 17.82s/it]

data/original_dataset/HRK/020806nt1_intvoc.wav, segments: 7080


Processing files in HRK:  25%|██▌       | 5/20 [04:06<14:01, 56.11s/it]

data/original_dataset/HRK/083004nt1_intvoc.wav, segments: 24551


Processing files in HRK:  35%|███▌      | 7/20 [05:59<12:11, 56.30s/it]

data/original_dataset/HRK/080304nt2_intvoc.wav, segments: 14137


Processing files in HRK:  45%|████▌     | 9/20 [06:42<07:58, 43.47s/it]

data/original_dataset/HRK/030905nt2_intvoc.wav, segments: 5261


Processing files in HRK:  50%|█████     | 10/20 [08:19<09:06, 54.60s/it]

data/original_dataset/HRK/103105nt2_intvoc.wav, segments: 12208


Processing files in HRK:  65%|██████▌   | 13/20 [08:24<03:27, 29.68s/it]

data/original_dataset/HRK/091704nt1_intvoc.wav, segments: 662


Processing files in HRK:  80%|████████  | 16/20 [09:16<01:38, 24.62s/it]

data/original_dataset/HRK/032405nt2_intvoc.wav, segments: 6490


Processing files in HRK:  85%|████████▌ | 17/20 [11:24<02:04, 41.40s/it]

data/original_dataset/HRK/051105nt3_intvoc.wav, segments: 16321


Processing files in HRK:  95%|█████████▌| 19/20 [12:36<00:39, 39.61s/it]

data/original_dataset/HRK/081204nt1_intvoc.wav, segments: 9349


                                                                        

data/original_dataset/HRK/072604nt2_intvoc.wav, segments: 15671

Processing: RMT


Processing files in RMT:  20%|██        | 4/20 [00:51<03:27, 12.98s/it]

data/original_dataset/RMT/091305nt1_intvoc.wav, segments: 6771


Processing files in RMT:  25%|██▌       | 5/20 [01:49<06:10, 24.70s/it]

data/original_dataset/RMT/092005nt1_intvoc.wav, segments: 7567


Processing files in RMT:  40%|████      | 8/20 [03:40<06:14, 31.20s/it]

data/original_dataset/RMT/070805nt1_intvoc.wav, segments: 14282


Processing files in RMT:  50%|█████     | 10/20 [03:42<03:31, 21.14s/it]

data/original_dataset/RMT/100803hnf_intvoc.wav, segments: 284


Processing files in RMT:  55%|█████▌    | 11/20 [06:36<07:35, 50.66s/it]

data/original_dataset/RMT/121803hnf_intvoc.wav, segments: 22725


Processing files in RMT:  60%|██████    | 12/20 [06:50<05:42, 42.79s/it]

data/original_dataset/RMT/092403rms_intvoc.wav, segments: 1855


Processing files in RMT:  65%|██████▌   | 13/20 [07:39<05:09, 44.19s/it]

data/original_dataset/RMT/010506nt1_intvoc.wav, segments: 6416


Processing files in RMT:  75%|███████▌  | 15/20 [07:52<02:24, 28.99s/it]

data/original_dataset/RMT/100903hnf_intvoc.wav, segments: 1716


Processing files in RMT:  85%|████████▌ | 17/20 [09:16<01:41, 33.80s/it]

data/original_dataset/RMT/021406nt2_intvoc.wav, segments: 10901


                                                                        

data/original_dataset/RMT/011706nt1_intvoc.wav, segments: 8717





DONE
