In [11]:
import numpy as np
import os
from os.path import basename, splitext
from glob import glob
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [12]:
def getATRforset(speaker, list, dbhome="/var/tmp/hiroki/ATRPB", feature_dim=26):
    """Load features and phone labels for the given sets of one speaker.

    For every set id in ``list`` the files matching
    ``{dbhome}/{speaker}/MFCC/{speaker}SD{set}*.mfcc`` are read in sorted
    order, together with the label file of the same base name under
    ``{dbhome}/{speaker}/lab/``.

    Relies on two notebook-level names that must exist when this is called:
    the fitted label encoder ``le`` and the function ``align_phonelabel``.

    Parameters
    ----------
    speaker : str
        Speaker directory name; also the prefix of the feature file names.
    list : iterable of str
        Set identifiers (e.g. ``["A", "B"]``). The name shadows the builtin
        but is kept so existing keyword callers continue to work.
    dbhome : str, optional
        Root directory of the corpus. Defaults to the previously hard-coded
        location.
    feature_dim : int, optional
        Number of coefficients per frame used to reshape the flat feature
        vector. Defaults to the previously hard-coded 26.

    Returns
    -------
    list of tuple
        One ``(X, t, labs)`` tuple per feature file, where ``X`` is a float32
        array of shape ``(frames, feature_dim)``, ``t`` is the result of
        ``align_phonelabel(len(X), lab_df)`` and ``labs`` is the encoded phone
        sequence of the label file. Empty if no file matches.

    Raises
    ------
    ValueError
        If a feature file does not hold a multiple of ``feature_dim`` values.
    """
    dataset = []

    for set_id in list:
        pattern = "{}/{}/MFCC/{}SD{}*.mfcc".format(dbhome, speaker, speaker, set_id)
        for mfcc_path in sorted(glob(pattern)):
            with open(mfcc_path, "rb") as f:
                # Skip the 12-byte file header (presumably an HTK-style
                # header -- TODO confirm) and read the remaining payload.
                f.seek(12, os.SEEK_SET)
                # The payload is stored byte-swapped relative to a
                # little-endian host. Reading it as big-endian float32 and
                # converting to native order gives the same values as the
                # former native read + in-place byteswap, but is also correct
                # on big-endian machines.
                X = np.fromfile(f, dtype=">f4").astype(np.float32)
            X = X.reshape(-1, feature_dim)

            utt_id = splitext(basename(mfcc_path))[0]
            labfn = "{}/{}/lab/{}.lab".format(dbhome, speaker, utt_id)
            # Label file: space-separated, no header row, three columns.
            lab_df = pd.read_table(labfn, header=None, sep=' ',
                                   names=("starttime", "endtime", "phone"))
            labs = le.transform(lab_df["phone"].values)
            t = align_phonelabel(len(X), lab_df)

            dataset.append((X, t, labs))

    return dataset

In [13]:
def align_phonelabel(numframes, lab_df, encoder=None, frame_shift_in_100nsec=100000):
    """Expand a segment-level label table into one encoded label per frame.

    Each row of ``lab_df`` labels the frames from the end of the previous
    segment up to ``int(endtime / frame_shift_in_100nsec)`` (exclusive).
    Frames beyond the last segment end are filled with the last label.

    Parameters
    ----------
    numframes : int
        Length of the returned array.
    lab_df : pandas.DataFrame
        Must provide the columns ``"endtime"`` and ``"phone"``. Rows are
        consumed in order; the DataFrame index is not used.
    encoder : object with ``transform``, optional
        Maps the phone column to integer ids. Defaults to the notebook-level
        ``le``, which must then be defined before the call.
    frame_shift_in_100nsec : int, optional
        Divisor converting ``endtime`` values to frame numbers. Defaults to
        the previously hard-coded 100000.

    Returns
    -------
    numpy.ndarray
        int16 array of shape ``(numframes,)``.

    Raises
    ------
    ValueError
        If ``lab_df`` has no rows while ``numframes`` is positive (there is
        no label to assign).
    """
    if encoder is None:
        encoder = le  # notebook-level fitted encoder
    phone_ids = encoder.transform(lab_df["phone"].values)

    if len(phone_ids) == 0:
        # Previously this case failed with an UnboundLocalError after the
        # loop; report it explicitly instead.
        if numframes == 0:
            return np.empty(0, dtype=np.int16)
        raise ValueError(
            "label table is empty; cannot label {} frames".format(numframes))

    frame_labels = np.empty(numframes, dtype=np.int16)
    start = 0
    # Pair ids and end times positionally so a non-default DataFrame index
    # (e.g. after filtering) cannot mis-index the encoded labels.
    for phone_id, endtime in zip(phone_ids, lab_df["endtime"].values):
        end = int(endtime / frame_shift_in_100nsec)
        frame_labels[start:end] = phone_id
        start = end
    # Final padding: frames past the last segment end get the last label.
    frame_labels[start:] = phone_ids[-1]

    return frame_labels

In [14]:
# Read the phone inventory from the 'phones' file, one entry per line.
with open('phones') as f:
    phones = f.read().splitlines()

# Fit the notebook-level label encoder used by getATRforset and
# align_phonelabel (fit() returns the encoder itself).
le = LabelEncoder().fit(phones)

# Sets A-I of speaker MHT form the training split, set J the test split.
train = getATRforset("MHT", "A B C D E F G H I".split())
test = getATRforset("MHT", ["J"])

In [15]:
def _pack_dataset(dataset):
    """Pack a list of (X, t, labs) tuples into an (n, 3) object array.

    np.save converts its argument to an array first. The tuples hold arrays
    of differing shapes, and NumPy >= 1.24 raises ValueError when asked to
    build an array from such a ragged sequence implicitly, so the object
    array is created explicitly and filled cell by cell.
    """
    packed = np.empty((len(dataset), 3), dtype=object)
    for row, item in enumerate(dataset):
        for col, part in enumerate(item):
            packed[row, col] = part
    return packed


# Object arrays are stored via pickle; reading them back requires
# np.load(..., allow_pickle=True). Only load such files from trusted sources.
np.save("MHT-train.npy", _pack_dataset(train), allow_pickle=True)
np.save("MHT-test.npy", _pack_dataset(test), allow_pickle=True)

In [16]:
# Display the first entry of the test split: the (X, t, labs) tuple that
# getATRforset built for the first matching feature file.
test[0]

(array([[-1.36502876e+01,  4.45860434e+00,  1.63271999e+00, ...,
         -3.18116575e-01,  8.79843354e-01, -6.73822388e-02],
        [-1.39195824e+01,  3.60357690e+00,  5.48293495e+00, ...,
          7.10672081e-01,  1.33325374e+00,  3.99609581e-02],
        [-1.33244543e+01,  2.84960127e+00,  8.78625572e-01, ...,
          1.45840299e+00,  1.13568723e+00,  7.73014054e-02],
        ...,
        [-1.28710375e+01,  3.11375189e+00, -2.00585082e-01, ...,
         -8.11767590e-04, -5.21343052e-01,  6.20312691e-02],
        [-1.32679739e+01,  3.07930446e+00,  1.80565178e+00, ...,
         -1.89365840e+00, -7.34460950e-01, -1.62603185e-01],
        [-1.27652187e+01,  3.27643180e+00,  1.88629937e+00, ...,
         -1.90092981e+00, -4.69906718e-01, -1.60786912e-01]], dtype=float32),
 array([34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
        34,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 32, 32, 32