In [None]:
import numpy as np
np.random.seed(666)
from keras.models import Sequential
from keras.layers import Dense, LSTM, TimeDistributed, Bidirectional, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
import itertools

In [None]:
# --- Network architecture (consumed by build_model) ---
NUM_LSTM_LAYERS = 2        # stacked Bidirectional(LSTM) layers
NUM_UNITS = 100            # `units` of each LSTM
RECURRENT_DROPOUT = 0.1    # `recurrent_dropout` of each LSTM
DROPOUT = 0.1              # rate of the Dropout layer added after each LSTM layer
NUM_CLASSES = 48           # size of the per-frame Dense output
ACTIVATION = 'softmax'     # activation of the per-frame Dense output
OPTIMIZER = 'rmsprop'      # optimizer handed to model.compile

# --- Input geometry: the first layer's input_shape is (MAX_SENTENCE_LEN, FEATURE_LEN) ---
MAX_SENTENCE_LEN = 777
FEATURE_LEN = 108

# --- Training-loop settings ---
# NOTE(review): not referenced in the visible cells; presumably passed to
# model.fit further down -- confirm.
NUM_EPOCHS = 40
BATCH_SIZE = 64
VERBOSE = 2

# --- Filesystem locations (relative to the notebook) ---
DATA_PATH = '../data/'                # phone maps are read from here
MODEL_PATH = '../models/'             # presumably where models are saved -- confirm
PREDICTION_PATH = '../predictions/'   # presumably where predictions are written -- confirm

In [None]:
# Lookup tables filled from the map files below.
phone48_phone39 = {}   # 48-phone symbol -> 39-phone symbol
phone48_char48 = {}    # phone symbol -> output character
phone48_int48 = {}     # 48-phone symbol -> integer id (order of first appearance)
int48_phone48 = {}     # integer id -> 48-phone symbol

# No integer index is built for the 39-phone set; the 48 -> 39 reduction is
# available through phone48_phone39.
with open(DATA_PATH + 'phones/48_39.map') as phone48_phone39_f:
    for line in phone48_phone39_f:
        phone48, phone39 = line.strip().split('\t')
        phone48_phone39[phone48] = phone39
        if phone48 in phone48_int48:
            # Already numbered on an earlier line.
            continue
        int48 = len(phone48_int48)
        phone48_int48[phone48] = int48
        int48_phone48[int48] = phone48

with open(DATA_PATH + '48phone_char.map') as phone48_char48_f:
    for line in phone48_char48_f:
        # Three tab-separated columns; the middle one is unpacked into int48
        # but not used.
        phone48, int48, char48 = line.strip().split('\t')
        phone48_char48[phone48] = char48

In [None]:
def _store_frames(sentence2frames, sentence, frames):
    """Record the frames of one sentence in ``sentence2frames``.

    A sentence that is already present (for instance from an earlier
    ``read_ark`` call) is widened: the new features are appended as extra
    columns, so both sides must hold the same number of frames.
    """
    if sentence not in sentence2frames:
        sentence2frames[sentence] = frames
    else:
        sentence2frames[sentence] = np.concatenate(
            (sentence2frames[sentence], frames),
            axis=1
        )


def read_ark(filepath, sentence2frames):
    """Read a whitespace-separated feature file and group frames by sentence.

    Each non-blank line is ``<speaker>_<sentence>_<frame> f1 f2 ...``.
    Consecutive lines sharing ``<speaker>_<sentence>`` form one sentence whose
    frames are collected as a list of float lists.

    Args:
        filepath: path of the feature file.
        sentence2frames: dict updated in place, keyed by ``<speaker>_<sentence>``.
            A key seen for the first time gets a list of per-frame feature
            lists; a key that already exists is replaced by the column-wise
            (``axis=1``) concatenation of the old and new features.

    Returns:
        None. Blank lines are skipped and an empty file leaves
        ``sentence2frames`` untouched.

    Raises:
        ValueError: if an instance id does not have exactly three
            ``_``-separated parts, a feature is not a float, or a widened
            sentence has a mismatching frame count.
    """
    with open(filepath) as ark_f:
        last_sentence = None  # None until the first data line has been read
        frames = []
        for line in ark_f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            speaker_id, sentence_id, _frame_id = fields[0].split('_')
            sentence = '_'.join((speaker_id, sentence_id))
            features = list(map(float, fields[1:]))

            if last_sentence is not None and sentence != last_sentence:
                # Sentence boundary: flush what has been accumulated so far.
                _store_frames(sentence2frames, last_sentence, frames)
                frames = []

            last_sentence = sentence
            frames.append(features)

    # Flush the final sentence; skipped when the file held no data lines so
    # that no spurious '' key is created.
    if last_sentence is not None:
        _store_frames(sentence2frames, last_sentence, frames)

In [None]:
def read_label(filepath, sentence2labels):
    """Read a frame-level label file and group label ids by sentence.

    Each non-blank line is ``<speaker>_<sentence>_<frame>,<phone>``. The phone
    is converted to its integer id through ``phone48_int48`` and consecutive
    lines sharing ``<speaker>_<sentence>`` form one sentence.

    Args:
        filepath: path of the label file.
        sentence2labels: dict updated in place; each ``<speaker>_<sentence>``
            key is set to the list of integer label ids of that sentence
            (an existing entry for the same key is overwritten).

    Returns:
        None. Blank lines are skipped and an empty file leaves
        ``sentence2labels`` untouched.

    Raises:
        KeyError: if a phone is missing from ``phone48_int48``.
        ValueError: if a line or instance id does not split into the expected
            number of parts.
    """
    with open(filepath) as label_f:
        last_sentence = None  # None until the first data line has been read
        labels = []

        for line in label_f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            instance_id, label = line.split(',')
            speaker_id, sentence_id, _frame_id = instance_id.split('_')
            sentence = '_'.join((speaker_id, sentence_id))
            index = phone48_int48[label]

            if last_sentence is not None and sentence != last_sentence:
                # Sentence boundary: store what has been accumulated so far.
                sentence2labels[last_sentence] = labels
                labels = []

            last_sentence = sentence
            labels.append(index)

    # Store the final sentence; skipped when the file held no data lines so
    # that no spurious '' key is created.
    if last_sentence is not None:
        sentence2labels[last_sentence] = labels

In [None]:
def one_hot(n, i):
    """Return a float vector of length ``n`` that is 1 at position ``i`` and 0 elsewhere."""
    encoded = np.zeros(n)
    encoded[i] = 1
    return encoded

In [None]:
def decode(seq):
    """Convert a frame-level sequence of 48-phone ids into a phone string.

    The sequence is cut to the span between the leading silence run and the
    last silence run, every remaining frame is mapped
    id -> 48-phone -> 39-phone -> character, and runs of identical characters
    are collapsed to a single character.

    Args:
        seq: indexable, sliceable sequence of integer ids (keys of
            ``int48_phone48``).

    Returns:
        The decoded string; ``''`` when ``seq`` is empty.
    """
    L = len(seq)
    if L == 0:
        # Nothing to decode; the scans below would leave ``i`` unbound.
        return ''
    sil = phone48_int48['sil']

    # trim leading sil: index of the first non-silence frame
    # (stays at the last index when every frame is silence)
    for i in range(L):
        if seq[i] != sil:
            break
    start = i

    # trim trailing sil: walk backwards to the last silence run, then stop at
    # the first non-silence frame before it. Frames located after that run
    # are discarded together with it.
    # NOTE(review): if the scan reaches index 0 without breaking (no silence
    # at all, or the only silence run touches index 0) ``end`` becomes 1 --
    # confirm this is intended.
    is_sil = False
    for i in range(L - 1, -1, -1):
        if is_sil:
            if seq[i] != sil:
                break
        else:
            if seq[i] == sil:
                is_sil = True
    end = i + 1

    trimmed = [phone48_char48[phone48_phone39[int48_phone48[i]]] for i in seq[start:end]]
    # groupby merges consecutive identical characters.
    return ''.join([k for k, v in itertools.groupby(trimmed)])

def mode(seq):
    """Return the most frequent element of ``seq``.

    When several elements share the highest count, the one that appears first
    in ``seq`` wins. An empty ``seq`` raises ``ValueError``.
    """
    items = list(seq)
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    highest = max(tally.values())
    # The first element (in input order) reaching the top count is the answer.
    for item in items:
        if tally[item] == highest:
            return item

def decode_by_mode(seq, context=1):
    """Decode a frame-level id sequence with majority smoothing.

    The sequence is trimmed and mapped to characters as in a plain decode
    (span between the leading silence run and the last silence run,
    id -> 48-phone -> 39-phone -> character). Each character is then replaced
    by the most frequent character in the window of ``context`` frames on
    either side (clipped at the edges), and runs of identical characters are
    collapsed.

    Args:
        seq: indexable, sliceable sequence of integer ids (keys of
            ``int48_phone48``).
        context: number of neighbouring frames taken on each side of a frame
            when voting.

    Returns:
        The decoded string; ``''`` when ``seq`` is empty.
    """
    L = len(seq)
    if L == 0:
        # Nothing to decode; the scans below would leave ``i`` unbound.
        return ''
    sil = phone48_int48['sil']

    # trim leading sil: index of the first non-silence frame
    # (stays at the last index when every frame is silence)
    for i in range(L):
        if seq[i] != sil:
            break
    start = i

    # trim trailing sil: walk backwards to the last silence run, then stop at
    # the first non-silence frame before it. Frames located after that run
    # are discarded together with it.
    # NOTE(review): if the scan reaches index 0 without breaking, ``end``
    # becomes 1 -- confirm this is intended.
    is_sil = False
    for i in range(L - 1, -1, -1):
        if is_sil:
            if seq[i] != sil:
                break
        else:
            if seq[i] == sil:
                is_sil = True
    end = i + 1

    trimmed = [phone48_char48[phone48_phone39[int48_phone48[i]]] for i in seq[start:end]]

    T = len(trimmed)
    smoothed = []
    for i in range(T):
        l, r = max(0, i - context), min(T - 1, i + context)
        # [0] keeps only the first character of the winning symbol.
        smoothed.append(mode(trimmed[l:r+1])[0])
    # groupby merges consecutive identical characters.
    return ''.join([k for k, v in itertools.groupby(smoothed)])

In [None]:
def build_model(summary=True):
    """Build and compile the frame-level sequence classifier.

    The network is a ``Sequential`` stack of ``NUM_LSTM_LAYERS`` bidirectional
    LSTM layers (each returning full sequences and followed by ``Dropout``),
    topped by a ``TimeDistributed`` ``Dense`` layer with ``NUM_CLASSES``
    outputs and ``ACTIVATION`` activation. It is compiled with ``OPTIMIZER``,
    categorical cross-entropy loss and the accuracy metric.

    Args:
        summary: when true, print ``model.summary()`` before returning.

    Returns:
        The compiled Keras model.
    """
    model = Sequential()
    for layer_idx in range(NUM_LSTM_LAYERS):
        # Only the first layer sees the raw features, so only it declares the
        # input shape; deeper layers receive the previous layer's output.
        if layer_idx == 0:
            shape_kwargs = {'input_shape': (MAX_SENTENCE_LEN, FEATURE_LEN)}
        else:
            shape_kwargs = {}
        model.add(
            Bidirectional(
                LSTM(units=NUM_UNITS, recurrent_dropout=RECURRENT_DROPOUT, return_sequences=True),
                **shape_kwargs
            )
        )
        model.add(Dropout(DROPOUT))
    # The same Dense classifier is applied independently at every time step.
    model.add(TimeDistributed(Dense(NUM_CLASSES, activation=ACTIVATION)))
    model.compile(optimizer=OPTIMIZER, loss='categorical_crossentropy', metrics=['accuracy'])
    if summary:
        model.summary()
    return model