# Extract Audio

## Imports

In [None]:
import os
import sys
from pathlib import Path

import numpy as np
import torch
import torchaudio

sys.path.append(os.path.abspath('../src'))
from dataset_gen import get_samples_for_subject
from helpers import FS_AUDIO

## Notebook Config

In [None]:
# Root folder of the public dataset; each sub-directory in it is treated as one subject.
public_datasets_folder = "../datasets/public_dataset/"
# Destination root; extract_audio() creates one sub-folder per subject underneath it.
output_folder = Path("../datasets/extracted_audio/")
# Forwarded to get_samples_for_subject as window_len (presumably seconds -- TODO confirm).
window_length = 0.4
# Forwarded to get_samples_for_subject as aug_factor; exact semantics are defined there.
aug_factor = 2

In [None]:
# Seed NumPy's global random number generator for augmentation purposes.
# The same value is also recorded in each subject's metadata.txt by extract_audio(),
# so this cell has to run before any extraction.
seed = 1
np.random.seed(seed)

## Helpers

In [None]:
def extract_audio(subject_id, public_datasets_folder, output_folder, window_length, aug_factor):
    """Write one subject's audio windows and labels to disk.

    Fetches the samples for ``subject_id`` via ``get_samples_for_subject`` and
    creates ``output_folder / subject_id`` containing:

    * ``metadata.txt`` -- the extraction settings, one ``name=value`` per line;
    * one sub-folder per sample, named by the sample index, holding
      ``audio.wav`` (saved at ``FS_AUDIO``) and ``label.txt`` (the label as an int).

    A subject with zero samples still gets its folder and ``metadata.txt``;
    no sample folders are written.

    Args:
        subject_id: Subject name; also used as the output sub-folder name.
        public_datasets_folder: Dataset location forwarded to
            ``get_samples_for_subject``.
        output_folder: ``pathlib.Path`` under which the subject folder is created.
        window_length: Forwarded to ``get_samples_for_subject`` as ``window_len``.
        aug_factor: Forwarded to ``get_samples_for_subject`` as ``aug_factor``.

    Note:
        ``metadata.txt`` records the notebook-level ``seed`` variable, so that
        variable must be defined before this function is called.
    """
    # The IMU data is returned alongside the audio but is not exported here.
    audio_data, _imu_data, labels, total_coughs = get_samples_for_subject(
        public_datasets_folder,
        subj_id=subject_id,
        window_len=window_length,
        aug_factor=aug_factor,
    )
    # Audio is expected as (samples, timesteps, channels); the unpack fails loudly otherwise.
    num_samples, _, _ = audio_data.shape

    subject_output = output_folder / subject_id
    subject_output.mkdir(parents=True, exist_ok=True)

    print(f"Extracting {num_samples} samples for subject {subject_id} to {subject_output}")

    metadata_file_name = subject_output / "metadata.txt"
    with open(metadata_file_name, "w", encoding="utf-8") as f:
        f.write(f"{seed=}\n")
        f.write(f"{subject_id=}\n")
        f.write(f"{num_samples=}\n")
        f.write(f"{total_coughs=}\n")
        f.write(f"{window_length=}\n")
        f.write(f"{aug_factor=}\n")
        f.write(f"{FS_AUDIO=}\n")

    # Iterating the array walks the sample axis directly. Unlike np.split this
    # accepts zero samples, and unlike a bare squeeze() it removes only the
    # sample axis, so single-channel audio stays 2-D (timesteps, channels) as
    # torchaudio.save requires with channels_first=False.
    for i, audio_sample in enumerate(audio_data):
        segment_folder = subject_output / f"{i}"
        segment_folder.mkdir(parents=True, exist_ok=True)
        audio_file_name = segment_folder / "audio.wav"
        label_file_name = segment_folder / "label.txt"
        sample = torch.from_numpy(audio_sample.astype(np.float32))
        torchaudio.save(audio_file_name, sample, FS_AUDIO, channels_first=False)
        with open(label_file_name, "w", encoding="utf-8") as f:
            f.write(str(int(labels[i])))

In [None]:
# Every sub-directory of the public dataset folder is treated as one subject.
# Path.iterdir() yields entries in arbitrary order, so the names are sorted to
# make the processing order (and therefore any seeded randomness consumed while
# processing subjects one after another) the same on every run and machine.
subject_ids = sorted(f.name for f in Path(public_datasets_folder).iterdir() if f.is_dir())
print(f"There are {len(subject_ids)} subjects in the public dataset.")
print(subject_ids)

## Notebook Body

In [None]:
# Export every discovered subject in turn, using the notebook-level configuration.
for subject_id in subject_ids:
    extract_audio(
        subject_id=subject_id,
        public_datasets_folder=public_datasets_folder,
        output_folder=output_folder,
        window_length=window_length,
        aug_factor=aug_factor,
    )