In [1]:
from pathlib import Path

In [2]:
import datasets
from datasets.tasks import AutomaticSpeechRecognition

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os

In [None]:
# Human-readable summary of the corpus; passed to `datasets.DatasetInfo` as
# the dataset description in `EATDDataset._info`.
_DESCRIPTION = """
An Emotional Audio-Textual Corpus

The EATD-Corpus is a dataset that consists of audio and text files of 162 volunteers who received counseling.

Training set contains data from 83 volunteers (19 depressed and 64 non-depressed).

Validation set contains data from 79 volunteers (11 depressed and 68 non-depressed).
"""


# Project page of the corpus; passed to `datasets.DatasetInfo` as the homepage.
_URL = "https://github.com/speechandlanguageprocessing/ICASSP2022-Depression"


# BibTeX entry for the ICASSP 2022 paper describing the corpus.
_CITE = """
@INPROCEEDINGS{9746569,
  author={Shen, Ying and Yang, Huiyu and Lin, Lin},
  booktitle={ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 
  title={Automatic Depression Detection: an Emotional Audio-Textual Corpus and A Gru/Bilstm-Based Model}, 
  year={2022},
  pages={6247-6251},
  doi={10.1109/ICASSP43922.2022.9746569}
}
"""

class EATDDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder pairing two-channel PCM recordings with their transcripts.

    Each example exposes one channel of the recording as ``audio``, the other
    as ``pitch_tracker``, the transcript line as ``text`` and the file stem as
    ``file_stem``.

    The builder relies on the module-level names ``_AUDIO_URL``,
    ``IGNORE_SENT``, ``IGNORE_ID``, ``is_pcm`` and ``read_with_soundfile``,
    which are expected to be defined elsewhere in this module.

    NOTE(review): ``_DESCRIPTION`` mentions a training and a validation set,
    but only a single TRAIN split is generated here -- confirm this is intended.
    """

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="speech", version=VERSION, description="Data for speech recognition"),
    ]

    def _info(self):
        """Return the dataset metadata: feature schema, homepage, citation and task."""
        features = datasets.Features(
            {
                "audio": datasets.Audio(sampling_rate=44_100),
                "pitch_tracker": datasets.Audio(sampling_rate=44_100),
                "text": datasets.Value("string"),
                "file_stem": datasets.Value("string"),
            }
        )

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_URL,
            # The BibTeX entry was defined at module level but never attached
            # to the dataset info; expose it so users can cite the corpus.
            citation=_CITE,
            task_templates=[
                AutomaticSpeechRecognition(audio_column="audio", transcription_column="text")
            ],
        )

    def _split_generators(self, dl_manager):
        """Locate (or download) and extract the audio archive.

        When ``dl_manager.manual_dir`` is set, the archive is taken from that
        directory under the file name given by the last path component of
        ``_AUDIO_URL``; otherwise ``_AUDIO_URL`` is downloaded and extracted.

        Returns:
            A one-element list containing the TRAIN split generator.
        """
        manual_dir = getattr(dl_manager, "manual_dir", None)
        if manual_dir is not None:
            data_dir = os.path.abspath(os.path.expanduser(manual_dir))
            archive_path = os.path.join(data_dir, _AUDIO_URL.split("/")[-1])
            audio_dir = dl_manager.extract(archive_path)
        else:
            audio_dir = dl_manager.download_and_extract(_AUDIO_URL)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "split": "train",
                    "audio_dir": audio_dir,
                },
            ),
        ]

    def _generate_examples(
        self, split, audio_dir
    ):
        """Yield ``(key, example)`` pairs for every usable ``*.pcm`` recording.

        Args:
            split: Name of the split being generated (currently unused).
            audio_dir: Directory the audio archive was extracted into.

        Yields:
            Tuples of the file stem and a dict with the ``audio``,
            ``pitch_tracker``, ``text`` and ``file_stem`` fields.

        Raises:
            KeyError: If a recording has no transcript under its file stem.
        """
        corpus_root = Path(audio_dir) / "sw_pcms"
        pcm_dir = corpus_root / "mf"
        transcript_path = corpus_root / "scripts" / "mf" / "sw_all"

        # Map utterance ids ("sw_all_mf_01_NNNN") to transcript lines. The
        # counter only advances past an ignored sentence when its number is
        # also listed in IGNORE_ID, mirroring how the recordings are numbered.
        transcripts = {}
        counter = 1
        with open(str(transcript_path), encoding="latin1") as handle:
            for raw_line in handle:
                line = raw_line.strip()
                if line in IGNORE_SENT:
                    if f"{counter:04d}" in IGNORE_ID:
                        counter += 1
                    continue
                transcripts[f"sw_all_mf_01_{counter:04d}"] = line
                counter += 1

        # Sort so that example order does not depend on filesystem ordering.
        for pcm_file in sorted(pcm_dir.glob("*.pcm")):
            stem = pcm_file.stem
            utterance_id = stem.split("_")[-1]
            if not is_pcm(str(pcm_file)) or utterance_id in IGNORE_ID:
                continue

            data, _ = read_with_soundfile(str(pcm_file))
            # Column 1 is exposed as the speech audio and column 0 as the
            # pitch-tracker signal; both share the same source file.
            yield stem, {
                "audio": {
                    "array": data[:, 1],
                    "sampling_rate": 44_100,
                    "path": str(pcm_file),
                    "id": stem,
                },
                "pitch_tracker": {
                    "array": data[:, 0],
                    "sampling_rate": 44_100,
                    "path": str(pcm_file),
                    "id": stem,
                },
                "text": transcripts[stem],
                "file_stem": stem,
            }
                