# Урок 5

В этом уроке мы будем распознавать файлы, содержащие не одну цифру, а уже последовательности цифр. Также мы усложним акустическую модель – будем использовать предварительно обученный нейросетевой классификатор. Для удобства возьмем уже полученные предсказания для кадров всех используемых записей.

Для того чтобы иметь возможность декодировать последовательности слов, нужно добавить в финальные узлы графа переходы в нулевой узел, из которого токены заново могут идти по ветвям.

<img src="graph_5.jpg">

<b>Задание 1:</b> Добавить возможность перехода в нулевой узел для последних узлов в каждой ветви графа. Словарь с эталонами хранится в файле lexicon.

In [1]:
import GraphMaker
import AcoModels
import FtrFile
import wer
import numpy as np
import os


class State:
    """One node of the decoding graph.

    A freshly created state only knows its acoustic model and its index;
    it has no word label, is not final, has no outgoing transitions and
    no best token assigned yet.
    """

    def __init__(self, model, idx):
        # Identity of the node: index and the acoustic model attached to it.
        self.idx = idx
        self.model = model
        # Word bookkeeping; both stay at their defaults until set by the
        # code that builds the graph.
        self.isFinal = False
        self.word = None
        # Outgoing transitions (a separate list object per instance).
        self.nextStates = []
        # Slot used during pruning to remember the best token in this state.
        self.bestToken = None
        
#---------------------------------TODO--------------------------------------        
def DictReader(dictName):
    """Lazily read a lexicon file and yield ``(word, phonemes)`` pairs.

    Every non-blank line is split on whitespace: the first field is the
    word, the remaining fields (possibly none) form the phoneme list.
    Blank and whitespace-only lines are skipped.
    """
    with open(dictName, 'rt') as lexicon_file:
        for raw_line in lexicon_file:
            # split() without arguments also discards surrounding whitespace,
            # so an empty result means the line carried no content.
            fields = raw_line.split()
            if not fields:
                continue
            word, *phonemes = fields
            yield word, phonemes
#---------------------------------------------------------------------------

def load_graph(rxfilename, AMs):
    """Build the decoding graph from a pronunciation lexicon.

    The graph is a list of ``State`` objects in which a state's position
    equals its ``idx``. Index 0 is a model-less start state. Each lexicon
    entry contributes one branch leaving the start state, with one state
    per phoneme; every phoneme state has a self-loop. The last state of a
    branch is marked final, labelled with the word, and linked back to the
    start state so that sequences of words can be decoded.

    Args:
        rxfilename: path of the lexicon file, read with ``DictReader``.
        AMs: acoustic model set; ``AMs.findModelbyName(phone)`` is called
            for every phoneme.

    Returns:
        The list of states, start state first.
    """
    startState = State(None, 0)
    graph = [startState]
    stateIdx = 1
    for word, phones in DictReader(rxfilename):
        prevState = startState
        # Reset for every word: an entry without phonemes must not reuse the
        # last state of the previous word (or hit an unbound name when it is
        # the very first entry).
        state = None
        for phone in phones:
            state = State(AMs.findModelbyName(phone), stateIdx)
            state.currentWord = word
            state.nextStates.append(state)  # self-loop
            prevState.nextStates.append(state)
            prevState = state
            graph.append(state)
            stateIdx += 1
        if state is not None:
            # Last phoneme of the word: emit the word and allow a new word
            # to start by returning to the start state.
            state.word = word
            state.isFinal = True
            state.nextStates.append(startState)
    return graph


Но необходимо сразу же ввести штраф за выход из слова – wd_add. Дело в том, что длинное слово вероятнее собрать из нескольких коротких, если таковые есть, поскольку у этих коротких слов может оказаться лучшая дистанция относительно полного слова. Например, слово «надо» удобно составить из «на до», или «на да», или «но да», или «над э», или «над а» – что лучше подходит к звуку, то и выбирай.

Чтобы это обуздать, заводят штраф wd_add. Этот штраф мы прибавляем к «дистанции» токена каждый раз, когда встречаем идентификатор слова. Его подбирают на какой-то базе так, чтобы качество было максимальным.

Для этого надо реализовать функцию virtualNodePass, которая будет штрафовать наши токены за выход из слова и повторное прохождение через нулевой узел.

Чтобы оценить результаты нашего декодирования, мы будем использовать уже готовую утилиту wer.py для вычисления WER в функции computeWer.

In [2]:
class Token:
    """A decoding hypothesis sitting in one graph state.

    Holds the state it occupies, the accumulated distance, the sentence
    recognised so far, and an ``alive`` flag that pruning switches off.
    """

    def __init__(self, state, dist=0.0, sentence=""):
        # Every new token starts alive; pruning may clear the flag later.
        self.alive = True
        self.sentence = sentence
        self.dist = dist
        self.state = state
        

def findBest(Tokens):
    """Return the alive token with the smallest ``dist``.

    When several alive tokens share the smallest distance, the one that
    appears last in ``Tokens`` wins. If no token is alive (or the list is
    empty) the integer ``0`` is returned.
    """
    winner = 0      # sentinel handed back when nothing is alive
    lowest = None   # smallest distance seen so far
    for candidate in Tokens:
        if not candidate.alive:
            continue
        # "<=" (not "<") so that a later token replaces an equal earlier one.
        if lowest is None or candidate.dist <= lowest:
            lowest = candidate.dist
            winner = candidate
    return winner


def beamPrunning(nextTokens, thr_common):
    """Kill every alive token that falls outside the beam.

    The beam is anchored at the best alive token found by ``findBest``;
    a token is switched off when its distance is greater than or equal to
    the best distance plus ``thr_common``.

    Returns:
        A ``(nextTokens, bestDist)`` pair: the same list object (tokens are
        modified in place) and the distance of the best token.
    """
    bestDist = findBest(nextTokens).dist
    cutoff = bestDist + thr_common
    for tok in nextTokens:
        if tok.alive and tok.dist >= cutoff:
            tok.alive = False
    return nextTokens, bestDist


def statePrunning(nextTokens):
    """Leave at most one alive token per graph state: the cheapest one.

    Tokens are visited in order. Each state's ``bestToken`` slot records
    the best token seen so far for that state; a token that is not better
    than the recorded one is switched off, otherwise the recorded one is
    switched off and replaced. On equal distance the later token wins.

    The state is taken from ``token.state`` directly rather than looked up
    in a module-level ``graph`` by index, so the function no longer depends
    on a global being defined. This is the same object as long as
    ``graph[state.idx] is state``, which is how ``load_graph`` builds it.

    Args:
        nextTokens: list of tokens; their ``alive`` flags and the
            ``bestToken`` slots of their states are modified in place.

    Returns:
        The same ``nextTokens`` list.
    """
    for token in nextTokens:
        state = token.state
        incumbent = state.bestToken
        if incumbent is None:
            state.bestToken = token
        elif token.dist <= incumbent.dist:
            incumbent.alive = False
            state.bestToken = token
        else:
            token.alive = False
    return nextTokens


def virtualNodePass(token, newToken, penalty=None):
    """Apply the word-exit step to a token that returns to the start state.

    Adds the word penalty to ``newToken.dist`` and appends the word of the
    state being left (``token.state.word``) to ``newToken.sentence``.
    Words are joined with ``'-'``; the word ``'<sil>'`` is never written
    to the sentence but is still penalised.

    Args:
        token: the token leaving a word-final state.
        newToken: the token created for the start state; modified in place.
        penalty: value added to the distance. When omitted, the module-level
            ``wd_add`` is used, which keeps two-argument calls working.

    Returns:
        ``newToken``.
    """
    if penalty is None:
        penalty = wd_add  # backward-compatible fallback to the global setting
    newToken.dist += penalty
    word = token.state.word
    if word != '<sil>':
        # separate words in the utterance with '-':
        if newToken.sentence:
            newToken.sentence += '-'
        # append the new word to the utterance:
        newToken.sentence += word
    return newToken


def decoding(filename, num_file, AMs, graph, path2predict, thr_common, wd_add):
    """Decode one recording with token passing and write the result line.

    Loads the predictions for ``filename`` through ``AMs.loadPrediction``,
    propagates tokens through ``graph`` once per frame, prunes them per
    state and by beam, and finally writes ``"<filename> <words>"`` for the
    best alive token that sits in a final state.

    Args:
        filename: recording name; predictions are read from
            ``'{path2predict}/{filename}.npy'``.
        num_file: running number of the file, used only in the progress line.
        AMs: acoustic model set providing ``loadPrediction``, ``predicts``
            and ``getDist``.
        graph: list of states; ``graph[0]`` is the start state.
        path2predict: directory that holds the prediction files.
        thr_common: beam width handed to ``beamPrunning``.
        wd_add: penalty added each time a token leaves a word; it is passed
            on to ``virtualNodePass`` instead of being ignored.

    Returns:
        The index of the last processed frame.

    Raises:
        AssertionError: if no alive token ends in a final state.
    """
    print("num_file: {}, file_name: {}".format(num_file, filename))
    startState = graph[0]
    activeTokens = [Token(startState)]
    AMs.loadPrediction('{}/{}.npy'.format(path2predict, filename))

    for frame in range(len(AMs.predicts)):
        nextTokens = []
        # activeTokens deliberately grows while it is being iterated: a token
        # that returns to the start state is appended here and expanded later
        # in this same loop, so the jump through state 0 consumes no frame.
        for token in activeTokens:
            if not token.alive:
                continue
            for transitState in token.state.nextStates:
                newToken = Token(transitState, token.dist, token.sentence)
                # virtual node pass and penalty for word:
                if transitState.idx == 0:
                    activeTokens.append(
                        virtualNodePass(token, newToken, wd_add))
                    continue
                newToken.dist += AMs.getDist(transitState.model, frame)
                nextTokens.append(newToken)

        # make pruning:
        nextTokens = statePrunning(nextTokens)
        nextTokens, _ = beamPrunning(nextTokens, thr_common)

        activeTokens = nextTokens
        # Clear the per-state bookkeeping so the next frame starts clean.
        for token in nextTokens:
            graph[token.state.idx].bestToken = None

    # finding final Tokens
    finalTokens = [token for token in activeTokens
                   if token.state.isFinal and token.alive]

    # Raised explicitly (same exception type as the former assert) so the
    # check is not stripped when Python runs with -O.
    if not finalTokens:
        raise AssertionError(
            "no alive token in a final state for {}".format(filename))
    winToken = findBest(finalTokens)
    # NOTE(review): words are added to the sentence only on the transition
    # back to state 0, so the word of the winning token's own final state is
    # not appended here - presumably utterances end in '<sil>'; confirm.
    # NOTE(review): `fn` is not a parameter; it is resolved as a module-level
    # name, so the caller must have the results file open under that name.
    fn.write(str(filename) + ' ' + winToken.sentence.replace('-', ' ') + '\n')

    return frame


def computeWer(testName, resName):
    """Compute the WER with ``wer.computeWER`` and print it under a banner.

    Args:
        testName: first argument forwarded to ``wer.computeWER``.
        resName: second argument forwarded to ``wer.computeWER``.
    """
    banner = '\n' + '-'*10 + 'RESULT OF RECOGNITION:' + '--'*10 + '\n'
    score = wer.computeWER(testName, resName)
    print(banner)
    print('%WER is {}'.format(score))

Запустим нашу программу.

In [4]:
import time

# Driver cell: builds the models and the graph, decodes the test set,
# then reports WER, wall-clock time and the real-time factor.
#
# The names `graph`, `wd_add` and `fn` are kept exactly as they are because
# the decoding functions may look them up as module-level names.

lexicon = 'lexicon'                   # digit lexicon with phonemic transcriptions
phones_dict = 'phones'                # set of phonemes in use
test_mfcc = 'ark,t:test_mfcc.txtftr'  # features of the recordings
path2predict = 'binaryPredict'        # directory with the predictions
resName = 'decode_results'            # file receiving the decoding results
testName = 'test_ref.txt'             # file with the reference text

thr_common = 60    # beam width passed to decoding
wd_add = 55        # penalty for leaving a word
max_files = 1000   # upper bound on the number of files to decode
s_time = time.time()

# 1. Make acoustic model:
AMs = AcoModels.AcoModelSet(phones_dict)
AMs.loadPosteriors('posteriors.npy')

# 2. Load and check graph from lexicon:
# NOTE(review): this calls GraphMaker.load_graph, not the load_graph defined
# earlier in this file - confirm that this is intended.
graph = GraphMaker.load_graph(lexicon, AMs)
GraphMaker.check_graph(graph)

# 3. Load filenames from the test reference (first field of each line):
test_list = []
with open(testName, 'r') as fn:
    for line in fn:
        fields = line.split()
        if fields:  # tolerate blank lines instead of raising IndexError
            test_list.append(fields[0])

# 4. Decoding:
print("\nDECODING IS STARTED...")
num_file = 0
numbFrame = 0
with open(resName, 'w') as fn:
    for filename in test_list[:max_files]:
        decoding(filename, num_file, AMs, graph, path2predict, thr_common, wd_add)
        num_file += 1
        numbFrame += len(AMs.predicts)

# 5. Compute WER:
computeWer(testName, resName)

# 6. Print time and RTF:
# Kept in `elapsed` rather than `time` so the `time` module is not shadowed.
elapsed = time.time() - s_time
minut = int(elapsed / 60)
second = elapsed - minut * 60
print("\nTOTAL TIME: {} min {} sec".format(minut, round(second, 2)))
audio_seconds = numbFrame * 0.01  # assumes 10 ms per frame - TODO confirm
if audio_seconds:
    rtf = elapsed / audio_seconds
    print("RTF: {}".format(round(rtf, 3)))
else:
    # Nothing was decoded; avoid dividing by zero.
    print("RTF: n/a (no frames were decoded)")


DECODING IS STARTED...
num_file: 0, file_name: M0089_02_0273618495_Android_nexus
num_file: 1, file_name: M0072_02_05469_Android_nexus
num_file: 2, file_name: M0045_01_8453679021_iPhone_Iphone5
num_file: 3, file_name: F0197_03_2738015469_iPhone_iphone6
num_file: 4, file_name: F0031_02_04279_iPhone_iphone6
num_file: 5, file_name: F0199_03_6482310759_Android_nexus
num_file: 6, file_name: F0031_03_17806_iPhone_6s
num_file: 7, file_name: M0036_03_09687_Android_SM
num_file: 8, file_name: F0010_02_1392408567_iPhone_iphone6
num_file: 9, file_name: M0070_03_26738_Android_htc
num_file: 10, file_name: F0010_02_89071_iPhone_iphone6
num_file: 11, file_name: F0042_03_1832965470_Android_SM_Gal_J3
num_file: 12, file_name: M0047_03_72690_Android_nexus
num_file: 13, file_name: F0034_01_70531_iPhone_Iphone5
num_file: 14, file_name: M0062_01_70531_iPhone_6s
num_file: 15, file_name: M0088_01_97842_Android_SM
num_file: 16, file_name: M0064_03_6382074195_iPhone_iphone6
num_file: 17, file_name: M0038_02_5037

num_file: 149, file_name: F0090_02_21409_Android_nexus
num_file: 150, file_name: M0071_02_49586_Android_nexus
num_file: 151, file_name: M0001_03_4697803125_Android_SM_Gal_J3
num_file: 152, file_name: F0029_02_74681_Android_htc
num_file: 153, file_name: M0055_02_4029763518_iPhone_iphone6
num_file: 154, file_name: F0191_02_82673_Android_SM_Gal_J3
num_file: 155, file_name: M0033_03_14769_Android_SM_Gal_J3
num_file: 156, file_name: M0037_02_6780524931_Android_htc
num_file: 157, file_name: F0073_03_79428_iPhone_iphone6
num_file: 158, file_name: F0020_03_85310_Android_SM
num_file: 159, file_name: F0191_03_7516290438_Android_htc
num_file: 160, file_name: F0200_03_58914_Android_htc
num_file: 161, file_name: M0006_01_9761823504_iPhone_6s
num_file: 162, file_name: F0034_02_89071_iPhone_iphone6
num_file: 163, file_name: M0001_01_70216_iPhone_6s
num_file: 164, file_name: M0037_01_0594267381_Android_htc
num_file: 165, file_name: M0045_01_4682107593_iPhone_Iphone5
num_file: 166, file_name: M0058_03_

num_file: 296, file_name: F0015_03_5873419602_Android_nexus
num_file: 297, file_name: M0081_03_78053_Android_htc
num_file: 298, file_name: F0069_03_62045_iPhone_iphone6
num_file: 299, file_name: M0053_03_5607924138_Android_SM
num_file: 300, file_name: M0074_03_1247093658_Android_SM
num_file: 301, file_name: F0201_01_6931875420_Android_SM
num_file: 302, file_name: M0064_03_89265_Android_htc
num_file: 303, file_name: F0042_02_4910725836_Android_htc
num_file: 304, file_name: F0061_02_7963108245_Android_nexus
num_file: 305, file_name: M0030_03_1429067358_iPhone_6s
num_file: 306, file_name: F0068_01_70531_iPhone_6s
num_file: 307, file_name: M0058_03_2687041593_Android_SM_Gal_J3
num_file: 308, file_name: M0044_01_4802765319_Android_htc
num_file: 309, file_name: M0054_03_9203745681_Android_nexus
num_file: 310, file_name: M0081_02_74096_Android_SM_Gal_J3
num_file: 311, file_name: F0201_02_51386_iPhone_iphone6
num_file: 312, file_name: F0119_02_43765_Android_nexus
num_file: 313, file_name: M007

num_file: 444, file_name: M0078_02_4850126937_Android_SM_Gal_J3
num_file: 445, file_name: F0010_03_30245_iPhone_6s
num_file: 446, file_name: M0006_01_54398_iPhone_6s
num_file: 447, file_name: F0010_02_1894372056_iPhone_iphone6
num_file: 448, file_name: M0077_02_2760493815_Android_SM_Gal_J3
num_file: 449, file_name: M0089_01_97842_Android_SM
num_file: 450, file_name: F0029_02_4029763518_iPhone_iphone6
num_file: 451, file_name: F0061_03_41693_Android_htc
num_file: 452, file_name: M0067_02_74096_Android_SM_Gal_J3
num_file: 453, file_name: M0056_02_1392408567_iPhone_iphone6
num_file: 454, file_name: F0069_03_97642_Android_htc
num_file: 455, file_name: M0039_03_7259843016_Android_nexus
num_file: 456, file_name: F0050_01_3710859426_Android_SM
num_file: 457, file_name: M0083_01_01294_iPhone_6s
num_file: 458, file_name: M0046_01_9108765423_iPhone_Iphone5
num_file: 459, file_name: M0002_01_9761823504_iPhone_6s
num_file: 460, file_name: M0070_02_4602935718_Android_nexus
num_file: 461, file_name:

num_file: 591, file_name: M0078_02_4057961832_Android_nexus
num_file: 592, file_name: M0041_03_2463591087_iPhone_6s
num_file: 593, file_name: M0082_03_6382074195_iPhone_iphone6
num_file: 594, file_name: F0181_01_6187249305_Android_SM
num_file: 595, file_name: M0033_02_1894372056_iPhone_iphone6
num_file: 596, file_name: M0079_01_4789306251_iPhone_6s
num_file: 597, file_name: F0052_02_40689_Android_htc
num_file: 598, file_name: F0035_03_53970_Android_SM_Gal_J3
num_file: 599, file_name: F0182_02_5926870431_Android_nexus
num_file: 600, file_name: M0089_03_9745031826_Android_htc
num_file: 601, file_name: F0050_02_0582397146_Android_htc
num_file: 602, file_name: F0201_01_7096245183_iPhone_6s
num_file: 603, file_name: M0055_02_4910725836_Android_htc
num_file: 604, file_name: F0011_02_4029763518_iPhone_iphone6
num_file: 605, file_name: M0058_01_87901_iPhone_6s
num_file: 606, file_name: M0088_01_45213_Android_SM
num_file: 607, file_name: F0117_03_42051_Android_nexus
num_file: 608, file_name: F0

num_file: 739, file_name: M0053_01_2637015489_iPhone_6s
num_file: 740, file_name: F0197_01_7546301298_Android_nexus
num_file: 741, file_name: M0037_01_75963_Android_htc
num_file: 742, file_name: M0038_01_08142_Android_htc
num_file: 743, file_name: F0073_02_24801_Android_nexus
num_file: 744, file_name: M0055_03_59012_Android_SM_Gal_J3
num_file: 745, file_name: F0061_02_15238_Android_SM_Gal_J3
num_file: 746, file_name: M0081_01_65387_iPhone_6s
num_file: 747, file_name: F0022_03_6574019823_Android_SM
num_file: 748, file_name: F0068_03_9156438702_Android_htc
num_file: 749, file_name: M0016_03_12948_iPhone_6s
num_file: 750, file_name: M0075_01_2765304981_iPhone_6s
num_file: 751, file_name: M0044_01_4682107593_iPhone_Iphone5
num_file: 752, file_name: M0041_02_01748_iPhone_6s
num_file: 753, file_name: M0013_01_1324670895_iPhone_6s
num_file: 754, file_name: F0173_01_87046_iPhone_6s
num_file: 755, file_name: M0060_01_43652_iPhone_6s
num_file: 756, file_name: F0042_01_4762598103_Android_htc
num_

num_file: 885, file_name: F0010_02_90864_Android_htc
num_file: 886, file_name: F0051_03_73068_Android_SM_Gal_J3
num_file: 887, file_name: F0061_03_2738015469_iPhone_iphone6
num_file: 888, file_name: M0056_02_42365_iPhone_iphone6
num_file: 889, file_name: M0071_01_7108563429_iPhone_6s
num_file: 890, file_name: M0067_03_02764_Android_htc
num_file: 891, file_name: F0172_02_4263908751_Android_htc
num_file: 892, file_name: M0088_01_65718_iPhone_6s
num_file: 893, file_name: M0030_02_46783_Android_htc
num_file: 894, file_name: F0021_03_73284_iPhone_6s
num_file: 895, file_name: F0090_03_3978520416_iPhone_6s
num_file: 896, file_name: F0173_02_51386_iPhone_iphone6
num_file: 897, file_name: F0051_02_4029763518_iPhone_iphone6
num_file: 898, file_name: M0083_01_03716_iPhone_6s
num_file: 899, file_name: M0079_01_1427859063_iPhone_6s
num_file: 900, file_name: M0047_03_3729158604_Android_SM_Gal_J3
num_file: 901, file_name: F0090_02_62831_Android_SM_Gal_J3
num_file: 902, file_name: M0067_02_8719463205_