In [1]:
import os
import numpy as np
import tgt
from scipy.stats import mode

In [2]:
# Phoneme inventory: ARPAbet symbols (vowels carry a 0/1/2 stress digit)
# followed by the labels 'sil', 'sp' and 'spn'
# (presumably silence / short pause / spoken noise — confirm against the aligner).
# The order matters: a phoneme's position in this list is its integer id.
phoneme_list = [
    'AA0', 'AA1', 'AA2',
    'AE0', 'AE1', 'AE2',
    'AH0', 'AH1', 'AH2',
    'AO0', 'AO1', 'AO2',
    'AW0', 'AW1', 'AW2',
    'AY0', 'AY1', 'AY2',
    'B', 'CH', 'D', 'DH',
    'EH0', 'EH1', 'EH2',
    'ER0', 'ER1', 'ER2',
    'EY0', 'EY1', 'EY2',
    'F', 'G', 'HH',
    'IH0', 'IH1', 'IH2',
    'IY0', 'IY1', 'IY2',
    'JH', 'K', 'L', 'M', 'N', 'NG',
    'OW0', 'OW1', 'OW2',
    'OY0', 'OY1', 'OY2',
    'P', 'R', 'S', 'SH', 'T', 'TH',
    'UH0', 'UH1', 'UH2',
    'UW0', 'UW1', 'UW2',
    'V', 'W', 'Y', 'Z', 'ZH',
    'sil', 'sp', 'spn',
]

# Reverse lookup: phoneme symbol -> index in `phoneme_list`.
phoneme_dict = {symbol: index for index, symbol in enumerate(phoneme_list)}

In [3]:
# Collect per-phoneme statistics over every aligned utterance under `data_dir`:
#   mels_mode[p] - for each mel bin, the most frequent value of the per-occurrence
#                  median frame (medians are rounded to 1 decimal so that a mode
#                  is meaningful on float data)
#   lens[p]      - mean occurrence length, in mel frames
data_dir = 'LibriTTS'

# Framing used to turn alignment times (seconds) into mel frame indices.
# NOTE(review): assumes the mels were extracted at 22050 Hz with a hop of
# 256 samples — confirm against the feature-extraction code.
SAMPLE_RATE = 22050.0
HOP_LENGTH = 256

# Per-phoneme accumulators, filled while scanning the corpus.
mels_mode_dict = {p: [] for p in phoneme_list}
lens_dict = {p: [] for p in phoneme_list}

speakers = os.listdir(os.path.join(data_dir, 'mels'))
for s, speaker in enumerate(speakers):
    print('Speaker %d: %s' % (s + 1, speaker))
    textgrids = os.listdir(os.path.join(data_dir, 'textgrids', speaker))
    for textgrid in textgrids:
        grid = tgt.io.read_textgrid(os.path.join(data_dir, 'textgrids', speaker, textgrid))
        # The mel file shares the TextGrid's base name.
        m = np.load(os.path.join(data_dir, 'mels', speaker, textgrid.replace('.TextGrid', '_mel.npy')))
        tier = grid.get_tier_by_name('phones')
        for i in range(len(tier)):
            interval = tier[i]
            phoneme = interval.text
            start_frame = int(interval.start_time * SAMPLE_RATE) // HOP_LENGTH
            # +1 so the frame containing the end time is included.
            end_frame = int(interval.end_time * SAMPLE_RATE) // HOP_LENGTH + 1
            # Median over time of this occurrence, one value per mel bin.
            median_frame = np.median(m[:, start_frame:end_frame], 1)
            mels_mode_dict[phoneme].append(np.round(median_frame, 1))
            lens_dict[phoneme].append(end_frame - start_frame)

mels_mode = dict()
lens = dict()
for p in phoneme_list:
    if not mels_mode_dict[p]:
        # Nothing to summarise; mode/mean of an empty set would crash or give NaN.
        print('No occurrences of phoneme %s, skipping' % p)
        continue
    stacked = np.asarray(mels_mode_dict[p])  # (occurrences, mel bins)
    # `.mode` is (1, bins) on older SciPy and (bins,) on SciPy >= 1.11, where
    # indexing it with [0] would return a single scalar. ravel() yields the
    # per-bin vector in both cases.
    mels_mode[p] = np.ravel(mode(stacked, axis=0).mode)
    lens[p] = np.mean(np.asarray(lens_dict[p]))

# The raw accumulators can be large; free them once summarised.
del mels_mode_dict
del lens_dict

Speaker 1: 8057
Speaker 2: 4014
Speaker 3: 6415
Speaker 4: 5126
Speaker 5: 3723
Speaker 6: 587
Speaker 7: 8534
Speaker 8: 5322
Speaker 9: 2238
Speaker 10: 1401
Speaker 11: 4427
Speaker 12: 1705
Speaker 13: 561
Speaker 14: 2992
Speaker 15: 8776
Speaker 16: 54
Speaker 17: 806
Speaker 18: 1970
Speaker 19: 302
Speaker 20: 6272
Speaker 21: 1289
Speaker 22: 3807
Speaker 23: 6075
Speaker 24: 329
Speaker 25: 3483
Speaker 26: 1914
Speaker 27: 6499
Speaker 28: 7117
Speaker 29: 5703
Speaker 30: 3032
Speaker 31: 3001
Speaker 32: 5304
Speaker 33: 5012
Speaker 34: 8786
Speaker 35: 3187
Speaker 36: 5935
Speaker 37: 1088
Speaker 38: 783
Speaker 39: 5186
Speaker 40: 7994
Speaker 41: 6078
Speaker 42: 3168
Speaker 43: 6550
Speaker 44: 6701
Speaker 45: 4926
Speaker 46: 1355
Speaker 47: 1337
Speaker 48: 2582
Speaker 49: 8119
Speaker 50: 5767
Speaker 51: 1112
Speaker 52: 6054
Speaker 53: 5583
Speaker 54: 6120
Speaker 55: 4290
Speaker 56: 3440
Speaker 57: 2230
Speaker 58: 5802
Speaker 59: 3448
Speaker 60: 73

In [9]:
# For every utterance, write a copy of its mel-spectrogram in which each aligned
# phoneme segment is replaced by that phoneme's vector from `mels_mode`.
# Results go to <data_dir>/mels_mode/<speaker>/<utterance>_avgmel.npy.

# Framing used to turn alignment times (seconds) into mel frame indices.
# NOTE(review): assumes the mels were extracted at 22050 Hz with a hop of
# 256 samples — confirm against the feature-extraction code.
SAMPLE_RATE = 22050.0
HOP_LENGTH = 256

for s, speaker in enumerate(speakers):
    print('Speaker %d: %s' % (s + 1, speaker))
    out_dir = os.path.join(data_dir, 'mels_mode', speaker)
    # makedirs + exist_ok: also creates the parent 'mels_mode' directory and
    # lets the cell be re-run without failing on existing directories.
    os.makedirs(out_dir, exist_ok=True)
    textgrids = os.listdir(os.path.join(data_dir, 'textgrids', speaker))
    for textgrid in textgrids:
        grid = tgt.io.read_textgrid(os.path.join(data_dir, 'textgrids', speaker, textgrid))
        m = np.load(os.path.join(data_dir, 'mels', speaker, textgrid.replace('.TextGrid', '_mel.npy')))
        # Start from a copy so frames not covered by any interval keep their
        # original values.
        m_mode = np.copy(m)
        tier = grid.get_tier_by_name('phones')
        for i in range(len(tier)):
            interval = tier[i]
            phoneme = interval.text
            start_frame = int(interval.start_time * SAMPLE_RATE) // HOP_LENGTH
            # +1 so the frame containing the end time is included.
            end_frame = int(interval.end_time * SAMPLE_RATE) // HOP_LENGTH + 1
            # Assign the (bins, 1) column and let NumPy broadcast it over the
            # slice. The slice may be narrower than end_frame - start_frame when
            # end_frame runs past the last frame; an explicitly repeated block
            # of the nominal width would then fail with a shape mismatch.
            m_mode[:, start_frame:end_frame] = np.expand_dims(mels_mode[phoneme], 1)
        np.save(os.path.join(out_dir, textgrid.replace('.TextGrid', '_avgmel.npy')), m_mode)

Speaker 1: 8057
Speaker 2: 4014
Speaker 3: 6415
Speaker 4: 5126
Speaker 5: 3723
Speaker 6: 587
Speaker 7: 8534
Speaker 8: 5322
Speaker 9: 2238
Speaker 10: 1401
Speaker 11: 4427
Speaker 12: 1705
Speaker 13: 561
Speaker 14: 2992
Speaker 15: 8776
Speaker 16: 54
Speaker 17: 806
Speaker 18: 1970
Speaker 19: 302
Speaker 20: 6272
Speaker 21: 1289
Speaker 22: 3807
Speaker 23: 6075
Speaker 24: 329
Speaker 25: 3483
Speaker 26: 1914
Speaker 27: 6499
Speaker 28: 7117
Speaker 29: 5703
Speaker 30: 3032
Speaker 31: 3001
Speaker 32: 5304
Speaker 33: 5012
Speaker 34: 8786
Speaker 35: 3187
Speaker 36: 5935
Speaker 37: 1088
Speaker 38: 783
Speaker 39: 5186
Speaker 40: 7994
Speaker 41: 6078
Speaker 42: 3168
Speaker 43: 6550
Speaker 44: 6701
Speaker 45: 4926
Speaker 46: 1355
Speaker 47: 1337
Speaker 48: 2582
Speaker 49: 8119
Speaker 50: 5767
Speaker 51: 1112
Speaker 52: 6054
Speaker 53: 5583
Speaker 54: 6120
Speaker 55: 4290
Speaker 56: 3440
Speaker 57: 2230
Speaker 58: 5802
Speaker 59: 3448
Speaker 60: 73