This notebook takes an audio file `data/project/raw/audio.wav` and its alignment `data/project/raw/align.json` (output from [Gentle](https://lowerquality.com/gentle/)) and outputs one `data/project/samples/*.wav` file per phoneme and/or a single `data/project/samples.npy` containing all phonemes. Note that the `samples.npy` written by this notebook is not rectangular if `width = None`, because each phoneme chunk then keeps its own length.

In [None]:
# Notebook configuration. Inputs are read from `raw/` beneath `data_root`;
# outputs are written to `samples/`, `samples.npy` and `filenames.txt` beneath it.
data_root = 'data/speech/'
save_wav = False # output data_root/samples/*.wav
save_samples = True # output data_root/samples.npy
width = None # 0.080 # window size in seconds (0.080 = 80 ms, passed to librosa.time_to_samples); a fixed width creates a rectangular sample matrix
use_center = False # when using a fixed width, center the window on the chunk
limit = None # only analyze the first `limit` words of the alignment (None = all words)

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
from os.path import join
from tqdm import tqdm
from utils import *
import numpy as np
import json
import librosa

In [None]:
# Load the Gentle alignment. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open for the kernel's lifetime.
with open(join(data_root, 'raw/align.json')) as align_file:
    align = json.load(align_file)
# Per-word alignment entries; the chunking cell reads 'start' and 'phones' from each.
words = align['words']

In [None]:
# Load the recording as a mono signal `y` with its sample rate `sr`
# (ffmpeg_load_audio comes from the project's utils module).
# The path is built with join() like every other path in this notebook, so it
# does not depend on data_root ending in a slash.
y, sr = ffmpeg_load_audio(join(data_root, 'raw/audio.wav'), mono=True)
# Parenthesised form prints the same thing on Python 2 and Python 3.
print(y.shape)

In [None]:
# Cut the audio `y` into one chunk per aligned phoneme.
#
# Each word's phonemes are laid out back to back starting at the word's
# 'start' time, each lasting phone['duration'] seconds. With a fixed `width`
# every chunk is exactly `width_sample` samples long; otherwise each chunk
# spans its phoneme's own duration.
#
# Results: `samples` (list of chunks, if save_samples), `filenames` (list of
# written wav paths, if save_wav) and `i` (number of chunks kept).
#
# this could benefit from being rewritten for multiple threads
i = 0
if width is not None:
    # Pass a list so that indexing [0] works whether librosa returns an
    # array or a bare scalar for scalar input.
    width_sample = librosa.time_to_samples([width], sr=sr)[0]
# Centering is only meaningful for a fixed-width window; without a width
# there is nothing to center, so the flag is ignored rather than crashing
# on `width / 2` with width=None.
center_window = use_center and width is not None
filenames = []
samples = []
mkdir_p(join(data_root, 'samples'))
for word in tqdm(words[:limit], leave=True):
    # Entries without a 'start' time (presumably words Gentle could not
    # align) carry no usable timing, so they are skipped.
    if 'start' not in word:
        continue
    start = word['start']
    for phone in word['phones']:
        end = start + phone['duration']
        start_sample, end_sample = librosa.time_to_samples([start, end], sr=sr)
        if center_window:
            center = (start + end) / 2.
            start_sample = librosa.time_to_samples([center - width / 2.], sr=sr)[0]
        if width is not None:
            end_sample = start_sample + width_sample
        # Advance to the next phoneme now, so skipping this chunk below
        # never desynchronises the running start time.
        start = end
        # Keep only non-empty chunks that lie fully inside the signal. The
        # bounds are inclusive: a chunk may begin at sample 0 and may end
        # exactly at len(y), since y[start:end] excludes `end`.
        if start_sample < 0 or end_sample > len(y) or end_sample <= start_sample:
            continue
        cur = y[start_sample:end_sample]
        if save_wav:
            fn = join(data_root, 'samples/{}_{}.wav'.format(i, phone['phone']))
            ffmpeg_save_audio(fn, cur, sr=sr)
            filenames.append(fn)
        if save_samples:
            samples.append(cur)
        i = i + 1
# Convert the collected chunks into one array for saving.
# Equal-length chunks (fixed `width`, or no chunks at all) stack into a
# regular array exactly as before. Chunks of differing length (width=None)
# cannot form a rectangular array, and recent NumPy raises ValueError when
# asked to build one implicitly, so they are stored explicitly as a 1-D
# object array holding one chunk per element.
chunk_lengths = set(len(chunk) for chunk in samples)
if len(chunk_lengths) <= 1:
    samples = np.asarray(samples)
else:
    ragged = np.empty(len(samples), dtype=object)
    for chunk_index, chunk in enumerate(samples):
        ragged[chunk_index] = chunk
    samples = ragged

if save_samples:
    # NOTE: an object array is pickled inside the .npy file; reading it back
    # requires np.load(..., allow_pickle=True).
    np.save(join(data_root, 'samples.npy'), samples)
if save_wav:
    # One wav path per line, in the same order the chunks were written.
    np.savetxt(join(data_root, 'filenames.txt'), filenames, fmt='%s')