In [None]:
from nbdev import *

In [None]:
%nbdev_default_export data

Cells will be exported to text2speech.data,
unless a different module is specified after an export flag: `%nbdev_export special.module`


In [None]:
from pathlib import Path

In [None]:
#hide
from fastcore.test import *
from nbdev.showdoc import *

# Data Processing
>Functions used for data processing

In [None]:
from fastcore.all import *
from fastai2_audio.core.all import *

In [None]:
#hide
def _get_files(p, fs, extensions=None):
    "Construct a list of `Path`s from a list of files `fs` in directory`p`"
    p = Path(p)
    res = [p/f for f in fs 
           if not f.startswith('.') # not hidden file
              and ((not extensions) or f'.{f.split(".")[-1].lower()}' in extensions)]
    return res

In [None]:
%nbdev_export
def get_files(path, extensions=None, recurse=True, folders=None, followlinks=True):
    """Get all the files in `path` with optional `extensions`, optionally with `recurse`, only in `folders`, if specified.

    `extensions` are compared case-insensitively (they are lower-cased here and the
    file's extension is lower-cased by `_get_files`). With `recurse=True` the tree is
    walked with `os.walk` (following symlinks when `followlinks` is set) and, if
    `folders` is given, a directory contributes files only when *every* name in
    `folders` appears among the components of that directory's path; the components
    of `path` itself take part in this check. With `recurse=False` only the files
    directly inside `path` are considered and `folders` is not used.
    Returns an `L` of `Path`s.
    """
    path = Path(path)
    folders = setify(folders)
    extensions = {e.lower() for e in setify(extensions)}
    if recurse:
        res = []
        for p,_,f in os.walk(path, followlinks=followlinks): # yields (dirpath, dirnames, filenames)
            # Compare against the path's components instead of splitting on '/',
            # so the filter does not depend on the platform's path separator.
            if folders and not folders.issubset(Path(p).parts): continue
            res += _get_files(p, f, extensions)
    else:
        f = [o.name for o in os.scandir(path) if o.is_file()]
        res = _get_files(path, f, extensions)
    return L(res)

In [None]:
%nbdev_export
def get_audio_files(path, recurse=True, folders=None):
    "Return the files under `path` whose extension is in `audio_extensions`, recursing by default and restricted to `folders` when given."
    audio_opts = dict(extensions=audio_extensions, recurse=recurse, folders=folders)
    return get_files(path, **audio_opts)

In [None]:
%nbdev_export
def get_txt_files(path, recurse=True, folders=None):
    "Get `.txt` files in `path` recursively, only in `folders`, if specified."
    # Passed as a one-element list so it is unambiguously a collection of extensions.
    return get_files(path, extensions=['.txt'], recurse=recurse, folders=folders)

## Dataset directory structure
```
Dicovery/
└── doc1
    └── AmericaFactsVsFiction
        ├── EHD_120764D
        │   └── Voituk
        │       ├── Direct_speech
        │       │   ├── txt
        │       │   └── wav
        │       └── Narrative
        │           ├── txt
        │           └── wav
        └── EHD_120765D
            └── Voituk
                ├── Direct_speech
                │   ├── txt
                │   └── wav
                └── Narrative
                    ├── txt
                    └── wav
```

In [None]:
# Root of the local dataset copy, laid out as in the directory tree above.
# NOTE: this is a machine-specific absolute path; the example and test cells below read from it.
path = Path('/home/condor/datasets/Dicovery/')

In [None]:
# List every audio file found under the dataset root (recursion is on by default).
get_audio_files(path)

(#544) [Path('/home/condor/datasets/Dicovery/doc1/AmericaFactsVsFiction/EHD_120764D/Voituk/Direct_speech/wav/68.wav'),Path('/home/condor/datasets/Dicovery/doc1/AmericaFactsVsFiction/EHD_120764D/Voituk/Direct_speech/wav/45.wav'),Path('/home/condor/datasets/Dicovery/doc1/AmericaFactsVsFiction/EHD_120764D/Voituk/Direct_speech/wav/165.wav'),Path('/home/condor/datasets/Dicovery/doc1/AmericaFactsVsFiction/EHD_120764D/Voituk/Direct_speech/wav/46.wav'),Path('/home/condor/datasets/Dicovery/doc1/AmericaFactsVsFiction/EHD_120764D/Voituk/Direct_speech/wav/218.wav'),Path('/home/condor/datasets/Dicovery/doc1/AmericaFactsVsFiction/EHD_120764D/Voituk/Direct_speech/wav/31.wav'),Path('/home/condor/datasets/Dicovery/doc1/AmericaFactsVsFiction/EHD_120764D/Voituk/Direct_speech/wav/125.wav'),Path('/home/condor/datasets/Dicovery/doc1/AmericaFactsVsFiction/EHD_120764D/Voituk/Direct_speech/wav/146.wav'),Path('/home/condor/datasets/Dicovery/doc1/AmericaFactsVsFiction/EHD_120764D/Voituk/Direct_speech/wav/265.wav

In [None]:
# Keep only files whose directory path contains both a `Voituk` and a `Narrative` component.
get_audio_files(path, folders=['Voituk','Narrative'])

(#501) [Path('/home/condor/datasets/Dicovery/doc1/AmericaFactsVsFiction/EHD_120764D/Voituk/Narrative/wav/102.wav'),Path('/home/condor/datasets/Dicovery/doc1/AmericaFactsVsFiction/EHD_120764D/Voituk/Narrative/wav/199.wav'),Path('/home/condor/datasets/Dicovery/doc1/AmericaFactsVsFiction/EHD_120764D/Voituk/Narrative/wav/82.wav'),Path('/home/condor/datasets/Dicovery/doc1/AmericaFactsVsFiction/EHD_120764D/Voituk/Narrative/wav/182.wav'),Path('/home/condor/datasets/Dicovery/doc1/AmericaFactsVsFiction/EHD_120764D/Voituk/Narrative/wav/258.wav'),Path('/home/condor/datasets/Dicovery/doc1/AmericaFactsVsFiction/EHD_120764D/Voituk/Narrative/wav/212.wav'),Path('/home/condor/datasets/Dicovery/doc1/AmericaFactsVsFiction/EHD_120764D/Voituk/Narrative/wav/26.wav'),Path('/home/condor/datasets/Dicovery/doc1/AmericaFactsVsFiction/EHD_120764D/Voituk/Narrative/wav/115.wav'),Path('/home/condor/datasets/Dicovery/doc1/AmericaFactsVsFiction/EHD_120764D/Voituk/Narrative/wav/120.wav'),Path('/home/condor/datasets/Dic

In [None]:
# The `Narrative` and `Direct_speech` counts must add up to the count of all audio files under `Voituk`.
test_eq(len(get_audio_files(path, folders=['Voituk','Narrative'])) +
        len(get_audio_files(path, folders=['Voituk','Direct_speech'])),
        len(get_audio_files(path, folders=['Voituk']))) 

In [None]:
# Adding a third folder name narrows the filter further: the `Direct_speech` counts for
# `EHD_120764D` and `EHD_120765D` must add up to the `Direct_speech` count as a whole.
test_eq(len(get_audio_files(path, folders=['Voituk','Direct_speech','EHD_120764D'])) +
        len(get_audio_files(path, folders=['Voituk','Direct_speech','EHD_120765D'])),
        len(get_audio_files(path, folders=['Voituk','Direct_speech']))) 

In [None]:
# Under `Voituk`/`Narrative` the number of audio files must equal the number of `.txt` files.
test_eq(len(get_audio_files(path, folders=['Voituk','Narrative'])),
        len(get_txt_files  (path, folders=['Voituk','Narrative'])))

In [None]:
#hide
# Run nbdev's `notebook2script`; the recorded output below shows it converting the project's notebooks.
from nbdev.export import notebook2script
notebook2script()

Converted 01_text_norm.ipynb.
Converted 02_data.ipynb.
Converted index.ipynb.
