В данном занятии мы построим многослойную нейронную сеть (Deep Neural Network, DNN) при помощи TensorFlow, которая будет осуществлять классификацию каждого входного фрейма (с некоторым контекстом) по фонемам (в том числе паузе).


In [1]:
import os

# Explicitly select the CUDA-compatible devices that will be used,
# so that TensorFlow has no access to the remaining ones.
# These variables are set before `import tensorflow` below.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import tensorflow as tf
import numpy as np

import fnmatch

import matplotlib.pyplot as plt 
import random

%matplotlib inline

В качестве входных данных для нейронной сети будем использовать в произвольной комбинации следующие признаки (<code>features_types</code>):
- mfcc
- pca_W (проекция $PCA_W$ изображений губ 50x30 из предыдущей работы)
- landmarks (пара расстояний между верхней и нижней губой и между уголками рта)
- hog_pca_W (проекция $PCA_W$ HOG-признаков)

Входным вектором на $i$-м кадре для DNN будет текущий $i$-й вектор + дополнительный контекст, который задается двумя параметрами: <code>[context_size, context_shift]</code> для каждого типа признаков отдельно. context_size - размер контекста (количество фреймов слева и справа, которые добавляем), context_shift - шаг между добавляемыми векторами. К примеру, контекст [3, 2] означает, что мы для i-го кадра будем формировать вектор размером (1 + 2\*3)\*dim путем конкатенации следующих кадров: (i-6), (i-4), (i-2), i, (i+2), (i+4), (i+6)

In [3]:
# Root data directory and the feature sub-directory inside it.
# Paths are joined by plain string concatenation, so both must end with "/".
work_dir = "../../data/lip_reading/"
features_dir = "2_features/"

# Feature types used to train the DNN (any combination may be listed)
features_types = ["pca_W", "mfcc"]

# Context [context_size, context_shift], one pair per entry of features_types
features_context = [ [2, 4], [3, 2] ]
assert(len(features_context) == len(features_types)), "Length of features_types and features_context must be equal"

# Model and prediction directories are named after the feature combination,
# e.g. "pca_W_mfcc/".
output_dir = "_".join(features_types) + "/"
dnn_path = "./models/dnn/" + output_dir
predictions_path = "./3_predictions/" + output_dir

# Compute the DNN input vector size: every feature type contributes
# (2 * context_size + 1) stacked frames of its own per-frame dimensionality.
x_dim = 0
for features_type, (context_size, context_shift) in zip(features_types, features_context):
    test_dir = work_dir + features_dir + features_type + "/test/"
    files = fnmatch.filter(os.listdir(test_dir), "*.npy")
    if not files:
        # Fail with an explicit message instead of an IndexError on files[0].
        raise FileNotFoundError("No *.npy feature files found in " + test_dir)
    # Only the second axis (per-frame dimensionality) is read, so any file will do.
    features = np.load(test_dir + files[0])
    x_dim += (context_size * 2 + 1) * features.shape[1]

    
# Phone inventory: every line of the "phones" file (stripped) is one label,
# and its position in the file becomes its class index.
with open(work_dir + features_dir + "phones") as phones_file:
    phones = [l.strip() for l in phones_file]
phones_indx_mapping = {p: i for i, p in enumerate(phones)}
phones_num = len(phones_indx_mapping)

Функция подготовки признаков: 
- загрузка файла(ов) признаков
- нормировка
- добавление контекста
- объединение в один массив
- добавление меток классов

Входные данные: 
<code>file</code> - имя файла; 
<code>features_dirs</code> - список путей к признакам;
<code>context</code> - контекст;
<code>phones_num</code> - количество фонем в разметке;
<code>align</code> - выравнивание по фонемам (хранит метки фонем для каждого фрейма), метка 0 используется для фонемы "пауза"

Необходимо, чтобы все признаки были предварительно построены с одинаковым frame-rate (в данной работе - 100 кадров в сек.) для обеспечения синхронности

In [4]:
def prepare_train_data(file, features_dirs, features_context, phones_num, align = np.array([])):
    """Build the DNN input (and, optionally, target) matrix for one utterance.

    Args:
        file: feature file name; it is appended to every entry of
            ``features_dirs`` by plain string concatenation.
        features_dirs: directories (one per feature type) holding ``file``.
        features_context: one ``[context_size, context_shift]`` pair per
            entry of ``features_dirs``.
        phones_num: number of label columns appended after the features.
        align: per-frame phone indices; non-zero entries are treated as
            speech. When empty, no labels are written and no frames are
            trimmed.

    Returns:
        A 2-D float array with one row per kept frame. The first columns hold
        the mean-normalised features with context, the last ``phones_num``
        columns hold the one-hot labels (all zeros when ``align`` is empty).

    Notes:
        * Context is built with ``np.roll``, which wraps around: frames near
          the beginning/end of the file take their context from the opposite
          end of the file.
        * Within one feature type the stacked blocks go from frame
          ``i + context_size * context_shift`` (first block) down to frame
          ``i - context_size * context_shift`` (last block).
        * With an alignment, at most ``max_pause_frames`` frames are kept
          before the first and after the last speech frame.
    """
    max_pause_frames = 30
    assert (len(features_dirs) == len(features_context))

    # Load every feature stream and subtract its per-dimension mean.
    centered = []
    for directory in features_dirs:
        raw = np.load(directory + file)
        centered.append(raw - raw.mean(axis=0))

    # The streams may differ in length; only the common prefix is usable.
    frames_num = centered[0].shape[0]
    for stream in centered[1:]:
        frames_num = min(frames_num, stream.shape[0])

    has_align = len(align) > 0
    if has_align:
        frames_num = min(frames_num, align.shape[0])
        speech_frames = np.nonzero(align)[0]
        speech_start = speech_frames.min()
        speech_end = speech_frames.max() + 1
    else:
        speech_start = 0
        speech_end = frames_num

    frames_start = max(0, speech_start - max_pause_frames)
    frames_end = min(frames_num, speech_end + max_pause_frames)
    assert (frames_end > frames_start), "Prepare data error: (frames_end <= frames_start)"

    # Total width of the feature part of the output.
    result_x_dim = 0
    for stream, (ctx_size, _) in zip(centered, features_context):
        result_x_dim += stream.shape[1] * (ctx_size * 2 + 1)

    x_y = np.zeros([frames_end - frames_start, result_x_dim + phones_num])

    # Lay the shifted copies of every stream side by side, left to right.
    column = 0
    for stream, (ctx_size, ctx_shift) in zip(centered, features_context):
        dim = stream.shape[1]
        for offset in range(-ctx_size, ctx_size + 1):
            shifted = np.roll(stream, offset * ctx_shift, axis=0)
            x_y[:, column:column + dim] = shifted[frames_start:frames_end, :]
            column += dim

    # One-hot class labels go into the trailing phones_num columns.
    if has_align:
        labels = align[frames_start:frames_end]
        for ph in range(phones_num):
            x_y[np.flatnonzero(labels == ph), result_x_dim + ph] = 1

    return x_y

Вспомогательная функция - загрузка всех данных для списка пользователей <code>persons</code>

In [5]:
def prepare_data_for_persons(persons, features_dirs, features_context, phones_num, align_train = None):
    """Load and stack the training matrices of all utterances of given speakers.

    A file is used when
      * its name (up to the first "_") is one of ``persons``,
      * it exists in every directory of ``features_dirs``, and
      * ``align_train`` holds a non-empty alignment for it (key: file name
        without extension).

    Args:
        persons: collection of speaker ids.
        features_dirs: feature directories, one per feature type; file names
            are appended by plain string concatenation.
        features_context: one ``[context_size, context_shift]`` pair per
            entry of ``features_dirs``.
        phones_num: number of phone classes.
        align_train: mapping "utterance name -> per-frame phone indices".
            Defaults to an empty mapping.

    Returns:
        The row-wise concatenation of ``prepare_train_data`` results for all
        selected files.

    Raises:
        ValueError: if no selected file has an alignment, so there is
            nothing to concatenate.
    """
    assert (len(features_dirs) > 0), "Error: len(features_dirs) > 0"
    assert (len(features_dirs) == len(features_context)), "Error: len(features_dirs) == len(features_context)"

    if align_train is None:
        align_train = {}

    # The first directory defines the candidate list; the others must contain
    # a file with the same name for the utterance to be usable.
    candidates = fnmatch.filter(os.listdir(features_dirs[0]), "*.npy")
    files_list = [
        f for f in candidates
        if f.split('_')[0] in persons
        and all(os.path.isfile(d + f) for d in features_dirs)
    ]

    print("Files num: {}".format(len(files_list)))

    data_x_y_list = []
    for i, file in enumerate(files_list):
        if i % 1000 == 0:
            print("Processed {} / {}".format(i, len(files_list)))
        filename = os.path.splitext(file)[0]
        align = align_train.get(filename)
        # Utterances without an alignment carry no targets and are skipped.
        if align is None or len(align) == 0:
            continue
        data_x_y_list.append(
            prepare_train_data(file, features_dirs, features_context, phones_num, align))

    if not data_x_y_list:
        # np.concatenate([]) would fail with an unrelated-looking message.
        raise ValueError("No aligned feature files found for the given persons in " + features_dirs[0])
    return np.concatenate(data_x_y_list)

Для обучения DNN будем использовать TensorFlow. Для начала будет использована простейшая архитектура: входной слой + несколько внутренних полносвязных слоев + выходной softmax слой.

<b> Задание 1.</b> Сформировать нейронную сеть со следующими параметрами:
- количество внутренних слоев - 4
- количество нейронов в каждом внутреннем слое - 256
- активационная функция для внутренних слоев - <code>tf.nn.sigmoid</code>
- первоначальная инициализация весов - при помощи <code>tf.random_normal</code>

In [6]:
# Number of units in each of the four hidden layers.
n_nodes_hl1 = 256
n_nodes_hl2 = 256
n_nodes_hl3 = 256
n_nodes_hl4 = 256

n_classes = phones_num
# Network input: one row per frame, x_dim features (frame + context).
x = tf.placeholder(tf.float32, [None, x_dim])

# Target labels, one column per class; needed later for the loss and accuracy
# (do not use inside the TODO block below).
y = tf.placeholder(tf.float32, [None, n_classes])

#--------------------------- TODO -----------------------------------------
# building graph:
# Each layer is a dict with a weight matrix [inputs, outputs] drawn from
# N(0, 0.05^2) and a zero-initialised bias vector. The variable names
# ('w1', 'b1', ...) end up in the checkpoints written by tf.train.Saver.
hidden_layer_1 = {'weight': tf.Variable(tf.random_normal([x_dim, n_nodes_hl1], mean=0.0, stddev=0.05), name='w1'),
                  'biases': tf.Variable(tf.zeros([n_nodes_hl1]), name='b1')}

hidden_layer_2 = {'weight': tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2], mean=0.0, stddev=0.05), name='w2'),
                  'biases': tf.Variable(tf.zeros([n_nodes_hl2]), name='b2')}

hidden_layer_3 = {'weight': tf.Variable(tf.random_normal([n_nodes_hl2, n_nodes_hl3], mean=0.0, stddev=0.05), name='w3'),
                  'biases': tf.Variable(tf.zeros([n_nodes_hl3]), name='b3')}

hidden_layer_4 = {'weight': tf.Variable(tf.random_normal([n_nodes_hl3, n_nodes_hl4], mean=0.0, stddev=0.05), name='w4'),
                  'biases': tf.Variable(tf.zeros([n_nodes_hl4]), name='b4')}

# The output layer consumes the activations of hidden layer 4, so its input
# size must be n_nodes_hl4 (not n_nodes_hl3; the two only happen to be equal).
output_layer = {'weight': tf.Variable(tf.random_normal([n_nodes_hl4, n_classes], mean=0.0, stddev=0.05), name='w5'),
                'biases': tf.Variable(tf.zeros([n_classes]), name='b5')}

l1 = tf.add(tf.matmul(x, hidden_layer_1['weight']), hidden_layer_1['biases'])
l1 = tf.nn.sigmoid(l1)

l2 = tf.add(tf.matmul(l1, hidden_layer_2['weight']), hidden_layer_2['biases'])
l2 = tf.nn.sigmoid(l2)

l3 = tf.add(tf.matmul(l2, hidden_layer_3['weight']), hidden_layer_3['biases'])
l3 = tf.nn.sigmoid(l3)

l4 = tf.add(tf.matmul(l3, hidden_layer_4['weight']), hidden_layer_4['biases'])
l4 = tf.nn.sigmoid(l4)

# output holds the logits, i.e. the values right before softmax
output = tf.add(tf.matmul(l4, output_layer['weight']), output_layer['biases'])
#--------------------------------------------------------------------------

output_softmax = tf.nn.softmax(output) # used later to obtain predictions

# cost function: mean softmax cross-entropy over the batch
cost_fun = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=output,
                                                                  labels=y))
# training uses the Adam optimizer with its default settings
train_step = tf.train.AdamOptimizer().minimize(cost_fun)

# prediction accuracy: share of frames whose arg-max class matches the target
# (tf.argmax replaces the deprecated alias tf.arg_max)
cor_predict = tf.equal(tf.argmax(y, 1), tf.argmax(output, 1))
accuracy = tf.reduce_mean(tf.cast(cor_predict, tf.float32))
        

Подготовка данных для обучения DNN

<b>Внимание!</b> В данном примере осуществляется загрузка всех признаков в оперативную память. Если памяти не хватает, необходимо переделать процедуру обучения DNN таким образом, чтобы batch формировался динамически. Возможный вариант реализации:
- п.0 перед началом обучения формируется список файлов обучения
- п.1 перед каждой эпохой список файлов перемешивается
- п.2 в память загружается ограниченное количество файлов (к примеру, 100 штук) и batch-и берутся из них
- п.3 если данные закончились, загружаем очередные 100 файлов и переходим к п.2

In [9]:
print("Load alignment")
# Every non-empty line of the alignment file is
# "<utterance name> <phone> <phone> ..." with one phone label per frame;
# labels are converted to class indices through phones_indx_mapping.
align_train = {}

with open(work_dir + features_dir + "new_alig") as align_file:
    for line in align_file:
        line_split = line.split()
        if not line_split:
            continue  # tolerate blank lines
        align_train[line_split[0]] = np.array([phones_indx_mapping[phone] for phone in line_split[1:]])

features_train_dirs = []
for features_type in features_types:
    features_train_dirs.append(work_dir + features_dir + features_type + "/train/")

print("Load features from: " + str(features_train_dirs))
# Speaker lists: one speaker id per line.
with open(work_dir + features_dir + "train_persons.list") as persons_file:
    persons_train = [l.strip() for l in persons_file]
with open(work_dir + features_dir + "validate_persons.list") as persons_file:
    persons_validate = [l.strip() for l in persons_file]
# Both splits are read from the same "train" feature directories and differ
# only in the speaker list.
data_x_y_train = prepare_data_for_persons(persons_train, features_train_dirs,  features_context, phones_num, align_train)
data_x_y_validate = prepare_data_for_persons(persons_validate, features_train_dirs,  features_context, phones_num, align_train)

Load alignment
Load features from: ['../../data/lip_reading/2_features/pca_W/train/', '../../data/lip_reading/2_features/mfcc/train/']
Files num: 8113
Processed 0 / 8113
Processed 1000 / 8113
Processed 2000 / 8113
Processed 3000 / 8113
Processed 4000 / 8113
Processed 5000 / 8113
Processed 6000 / 8113
Processed 7000 / 8113
Processed 8000 / 8113
Files num: 1322
Processed 0 / 1322
Processed 1000 / 1322


Обучение DNN

In [11]:
# Create the model directory without spawning a shell.
os.makedirs(dnn_path, exist_ok=True)
# Store the feature configuration next to the checkpoints.
with open(dnn_path + "features_context.txt", "w") as f:
    f.write(str(features_context))

with open(dnn_path + "features_types.txt", "w") as f:
    f.write(str(features_types))

saver = tf.train.Saver()

batch_size = 256
# Only full batches are used; a trailing partial batch is dropped each epoch.
batches_num = data_x_y_train.shape[0] // batch_size

config = tf.ConfigProto()
config.gpu_options.allow_growth = True # do not grab all available GPU memory up front

# The validation data does not change between epochs, so slice it once.
data_validate_x = data_x_y_validate[:, 0:x_dim]
data_validate_y = data_x_y_validate[:, x_dim:]
# Rows whose label column 0 is unset, i.e. every frame not labelled as class 0.
frames_test_speech = np.where(data_validate_y[:, 0] == 0)[0]

# The log file is a context manager so it is closed even if training fails.
with open(dnn_path + "log.txt", "w") as file_log, tf.Session(config=config) as sess:
    tf.global_variables_initializer().run()
    print('TRAINING IS STARTED...')
    for epoch in range(10):
        # Sum of the per-batch mean losses (not an average over batches).
        epoch_loss = 0
        # shuffle the training rows in place before every epoch
        np.random.shuffle(data_x_y_train)
        for batch in range(batches_num):
            batch_rows = slice(batch * batch_size, (batch + 1) * batch_size)
            batch_xs = data_x_y_train[batch_rows, 0:x_dim]
            batch_ys = data_x_y_train[batch_rows, x_dim:]
            _, c = sess.run([train_step, cost_fun], {x: batch_xs, y: batch_ys})
            epoch_loss += c
        print('Epoch: {}, loss: {}'.format(epoch, epoch_loss))
        file_log.write('Epoch: {}, loss: {}\n'.format(epoch, epoch_loss))

        # accuracy on the whole validation set
        accuracy_full = sess.run(accuracy, {x: data_validate_x, y: data_validate_y})
        # accuracy over all phones except the "pause" phone (class 0)
        accuracy_speech_only = sess.run(accuracy, {x: data_validate_x[frames_test_speech,:], y: data_validate_y[frames_test_speech,:]})
        print('Accuracy: {}, accuracy (speech only): {}'.format(accuracy_full, accuracy_speech_only))
        file_log.write('Accuracy: {}, accuracy (speech only): {}\n'.format(accuracy_full, accuracy_speech_only))
        # one checkpoint per epoch
        saver.save(sess, dnn_path + "epoch_{}".format(epoch))

TRAINING IS STARTED...
Epoch: 0, loss: 11730.236666321754
Accuracy: 0.7223834991455078, accuracy (speech only): 0.6965517401695251
Epoch: 1, loss: 8995.637139260769
Accuracy: 0.7413422465324402, accuracy (speech only): 0.7187196016311646
Epoch: 2, loss: 8327.828028827906
Accuracy: 0.750708281993866, accuracy (speech only): 0.7277534604072571
Epoch: 3, loss: 7974.503780275583
Accuracy: 0.7567722201347351, accuracy (speech only): 0.7373457551002502
Epoch: 4, loss: 7731.341266602278
Accuracy: 0.7590792775154114, accuracy (speech only): 0.74509197473526
Epoch: 5, loss: 7557.149601370096
Accuracy: 0.7593922019004822, accuracy (speech only): 0.7395803332328796
Epoch: 6, loss: 7423.085683584213
Accuracy: 0.7615479230880737, accuracy (speech only): 0.7458705902099609
Epoch: 7, loss: 7309.96988388896
Accuracy: 0.7631679177284241, accuracy (speech only): 0.7493460178375244
Epoch: 8, loss: 7214.288057923317
Accuracy: 0.7647907733917236, accuracy (speech only): 0.7478048801422119
Epoch: 9, loss: 7

Подсчет постериоров тестовой базы

In [12]:
# Test feature directories, one per feature type.
features_test_dirs = []
for features_type in features_types:
    features_test_dirs.append(work_dir + features_dir + features_type + "/test/")

# The first directory defines the list of test files.
test_files = fnmatch.filter(os.listdir(features_test_dirs[0]), "*.npy")

# Create the output directory without spawning a shell.
os.makedirs(predictions_path, exist_ok=True)

config = tf.ConfigProto()
config.gpu_options.allow_growth = True

saver = tf.train.Saver()

# latest_checkpoint returns None when the directory holds no checkpoint;
# fail with a clear message instead of a TypeError on string concatenation.
checkpoint_path = tf.train.latest_checkpoint(dnn_path)
if checkpoint_path is None:
    raise FileNotFoundError("No TensorFlow checkpoint found in " + dnn_path)

with tf.Session(config=config) as sess:
    print ("Restoring session: " + checkpoint_path)
    saver.restore(sess, checkpoint_path)

    for file in test_files:
        # No alignment is passed, so every frame is kept and the label
        # columns stay zero; only the feature columns are fed to the network.
        x_y_test = prepare_train_data(file, features_test_dirs, features_context, phones_num)

        predict = sess.run(output_softmax, {x: x_y_test[:,0:x_dim]})
        # One file of per-frame softmax outputs per utterance, same file name.
        np.save(predictions_path + file, predict.astype(np.float32))
        print("processed: " + file + "(frames num: {})".format(predict.shape[0]))

Restoring session: ./models/dnn/pca_W_mfcc/epoch_9
INFO:tensorflow:Restoring parameters from ./models/dnn/pca_W_mfcc/epoch_9
processed: M0071_01_28304_iPhone_6s.npy(frames num: 252)
processed: F0020_01_87901_iPhone_6s.npy(frames num: 376)
processed: F0031_01_98264_iPhone_Iphone5.npy(frames num: 461)
processed: M0041_02_1340768592_Android_SM.npy(frames num: 626)
processed: M0012_03_84312_Android_SM_Gal_J3.npy(frames num: 215)
processed: M0041_01_39605_Android_htc.npy(frames num: 442)
processed: M0004_02_1894372056_iPhone_iphone6.npy(frames num: 660)
processed: M0089_01_34789_iPhone_6s.npy(frames num: 512)
processed: F0052_03_32198_iPhone_6s.npy(frames num: 210)
processed: M0060_01_8102574936_iPhone_6s.npy(frames num: 489)
processed: M0054_03_54329_Android_nexus.npy(frames num: 339)
processed: M0067_02_49586_Android_nexus.npy(frames num: 432)
processed: M0046_03_6574019823_Android_SM.npy(frames num: 640)
processed: M0070_02_37528_Android_SM_Gal_J3.npy(frames num: 369)
processed: F0095_02

processed: F0117_02_0582397146_Android_htc.npy(frames num: 577)
processed: M0030_03_6574019823_Android_SM.npy(frames num: 602)
processed: M0038_01_4762598103_Android_htc.npy(frames num: 695)
processed: M0047_01_5031862974_iPhone_Iphone5.npy(frames num: 541)
processed: M0045_03_09687_Android_SM.npy(frames num: 599)
processed: M0059_03_18970_Android_SM_Gal_J3.npy(frames num: 384)
processed: M0012_02_2174905683_Android_htc.npy(frames num: 385)
processed: F0051_03_3792846105_Android_nexus.npy(frames num: 602)
processed: M0071_03_38964_Android_htc.npy(frames num: 301)
processed: F0168_03_6382074195_iPhone_iphone6.npy(frames num: 398)
processed: M0057_03_02864_iPhone_6s.npy(frames num: 374)
processed: M0076_01_28304_iPhone_6s.npy(frames num: 248)
processed: M0041_02_97085_Android_SM.npy(frames num: 300)
processed: F0009_01_4351796082_iPhone_6s.npy(frames num: 546)
processed: F0042_02_1392408567_iPhone_iphone6.npy(frames num: 487)
processed: F0029_02_42365_iPhone_iphone6.npy(frames num: 303)


processed: M0062_01_2593068741_iPhone_6s.npy(frames num: 596)
processed: M0072_01_23985_iPhone_6s.npy(frames num: 327)
processed: F0035_01_41853_iPhone_Iphone5.npy(frames num: 359)
processed: M0082_02_7963108245_Android_nexus.npy(frames num: 527)
processed: M0013_03_69570_Android_SM_Gal_J3.npy(frames num: 376)
processed: F0090_01_45213_Android_SM.npy(frames num: 541)
processed: M0013_02_5064821793_Android_htc.npy(frames num: 837)
processed: F0200_01_3621475890_Android_nexus.npy(frames num: 662)
processed: M0041_01_62795_iPhone_Iphone5.npy(frames num: 310)
processed: F0130_01_8521607493_Android_SM.npy(frames num: 544)
processed: M0045_01_4802765319_Android_htc.npy(frames num: 878)
processed: F0175_01_7419068325_iPhone_6s.npy(frames num: 465)
processed: F0035_03_6987543021_Android_SM_Gal_J3.npy(frames num: 526)
processed: F0181_01_9856243017_Android_SM.npy(frames num: 563)
processed: M0083_02_10694_Android_SM_Gal_J3.npy(frames num: 179)
processed: F0172_01_4290583176_Android_SM.npy(frame

processed: M0077_03_5823604197_iPhone_6s.npy(frames num: 846)
processed: M0002_02_4029763518_iPhone_iphone6.npy(frames num: 573)
processed: F0154_01_28376_iPhone_6s.npy(frames num: 324)
processed: F0005_03_34786_Android_SM_Gal_J3.npy(frames num: 280)
processed: F0095_01_3702149568_Android_SM.npy(frames num: 795)
processed: M0059_01_87901_iPhone_6s.npy(frames num: 539)
processed: F0069_02_21845_Android_nexus.npy(frames num: 227)
processed: F0181_01_51783_Android_SM.npy(frames num: 298)
processed: F0168_03_07516_Android_htc.npy(frames num: 274)
processed: M0053_03_6412975803_iPhone_6s.npy(frames num: 462)
processed: F0154_03_9263507184_Android_nexus.npy(frames num: 727)
processed: F0010_03_39046_Android_SM.npy(frames num: 305)
processed: F0119_03_30275_Android_SM.npy(frames num: 286)
processed: M0067_02_8643201957_Android_nexus.npy(frames num: 803)
processed: F0051_01_87901_iPhone_6s.npy(frames num: 339)
processed: F0172_01_85346_Android_SM.npy(frames num: 274)
processed: M0058_02_402976

processed: F0042_03_84312_Android_SM_Gal_J3.npy(frames num: 273)
processed: M0002_02_0578342619_Android_htc.npy(frames num: 582)
processed: M0072_01_2765304981_iPhone_6s.npy(frames num: 651)
processed: M0037_03_9015826743_Android_SM_Gal_J3.npy(frames num: 568)
processed: M0053_03_80426_Android_SM.npy(frames num: 227)
processed: M0053_02_37658_iPhone_6s.npy(frames num: 233)
processed: M0072_02_8719463205_Android_SM_Gal_J3.npy(frames num: 707)
processed: M0070_03_53061_iPhone_iphone6.npy(frames num: 200)
processed: F0069_02_3017568429_Android_nexus.npy(frames num: 371)
processed: M0038_01_38041_iPhone_Iphone5.npy(frames num: 346)
processed: M0057_02_57490_Android_SM_Gal_J3.npy(frames num: 297)
processed: M0036_03_43160_iPhone_6s.npy(frames num: 224)
processed: M0037_01_43876_iPhone_Iphone5.npy(frames num: 325)
processed: M0028_03_9385741620_Android_SM_Gal_J3.npy(frames num: 414)
processed: M0072_02_49586_Android_nexus.npy(frames num: 350)
processed: F0073_01_2765304981_iPhone_6s.npy(fram

processed: M0026_02_80316_Android_SM.npy(frames num: 340)
processed: M0016_02_21706_iPhone_Iphone5.npy(frames num: 371)
processed: M0049_01_04391_Android_htc.npy(frames num: 414)
processed: M0057_03_32198_iPhone_6s.npy(frames num: 412)
processed: M0002_01_9761823504_iPhone_6s.npy(frames num: 524)
processed: M0054_02_9730581624_Android_htc.npy(frames num: 830)
processed: F0181_01_8076135429_Android_SM.npy(frames num: 567)
processed: M0065_03_83971_iPhone_iphone6.npy(frames num: 346)
processed: F0119_03_38750_iPhone_6s.npy(frames num: 302)
processed: F0029_02_1253068479_Android_htc.npy(frames num: 674)
processed: F0169_02_41302_Android_htc.npy(frames num: 472)
processed: F0020_02_1253068479_Android_htc.npy(frames num: 625)
processed: M0048_01_82310_iPhone_Iphone5.npy(frames num: 267)
processed: F0068_02_7963108245_Android_nexus.npy(frames num: 880)
processed: M0064_02_7804526931_Android_nexus.npy(frames num: 548)
processed: F0154_02_1894372056_iPhone_iphone6.npy(frames num: 778)
processe

processed: F0119_03_67059_iPhone_6s.npy(frames num: 327)
processed: M0076_03_9745031826_Android_htc.npy(frames num: 478)
processed: F0005_03_6879204135_Android_nexus.npy(frames num: 494)
processed: F0175_03_02864_iPhone_6s.npy(frames num: 211)
processed: F0031_02_5213840967_Android_htc.npy(frames num: 671)
processed: F0095_01_7458109632_Android_SM.npy(frames num: 741)
processed: M0006_02_51386_iPhone_iphone6.npy(frames num: 248)
processed: F0027_01_43652_iPhone_6s.npy(frames num: 289)
processed: M0070_03_4532091867_iPhone_iphone6.npy(frames num: 612)
processed: M0043_02_1253068479_Android_htc.npy(frames num: 372)
processed: M0089_03_8726039514_Android_htc.npy(frames num: 830)
processed: M0012_01_25981_iPhone_6s.npy(frames num: 254)
processed: M0070_02_2104796583_Android_SM_Gal_J3.npy(frames num: 502)
processed: F0199_01_7546301298_Android_nexus.npy(frames num: 478)
processed: F0015_02_8175960234_iPhone_Iphone5.npy(frames num: 563)
processed: F0022_03_91605_iPhone_6s.npy(frames num: 309

processed: F0021_02_16073_Android_htc.npy(frames num: 334)
processed: M0013_02_90864_Android_htc.npy(frames num: 480)
processed: F0172_03_18970_Android_SM_Gal_J3.npy(frames num: 208)
processed: F0152_01_9176528340_iPhone_6s.npy(frames num: 523)
processed: F0181_02_20759_Android_nexus.npy(frames num: 260)
processed: M0002_02_17360_Android_htc.npy(frames num: 370)
processed: M0075_02_7639814520_Android_nexus.npy(frames num: 892)
processed: M0054_02_57049_Android_htc.npy(frames num: 447)
processed: M0083_03_59708_Android_htc.npy(frames num: 174)
processed: M0081_03_5976243810_Android_htc.npy(frames num: 509)
processed: F0090_03_8107293465_iPhone_6s.npy(frames num: 845)
processed: M0064_01_98264_iPhone_6s.npy(frames num: 298)
processed: F0182_01_40629_Android_SM.npy(frames num: 301)
processed: F0152_03_35891_iPhone_6s.npy(frames num: 228)
processed: F0021_03_1429067358_iPhone_6s.npy(frames num: 549)
processed: F0117_02_2691345807_Android_htc.npy(frames num: 477)
processed: M0071_02_7963108

processed: M0060_03_62045_iPhone_iphone6.npy(frames num: 388)
processed: M0043_01_50734_Android_htc.npy(frames num: 246)
processed: F0117_02_89071_iPhone_iphone6.npy(frames num: 248)
processed: M0082_01_65387_iPhone_6s.npy(frames num: 411)
processed: F0090_03_3980752614_Android_SM.npy(frames num: 835)
processed: F0201_01_91826_iPhone_6s.npy(frames num: 271)
processed: M0013_02_23157_Android_htc.npy(frames num: 438)
processed: M0013_01_70216_iPhone_6s.npy(frames num: 510)
processed: M0004_03_1576042983_Android_nexus.npy(frames num: 699)
processed: F0168_03_82493_Android_htc.npy(frames num: 318)
processed: M0045_01_0684721359_Android_htc.npy(frames num: 831)
processed: M0067_02_74096_Android_SM_Gal_J3.npy(frames num: 486)
processed: M0062_02_97063_Android_nexus.npy(frames num: 250)
processed: M0002_02_89254_Android_htc.npy(frames num: 363)
processed: F0051_02_57049_Android_htc.npy(frames num: 303)
processed: M0048_01_92135_iPhone_Iphone5.npy(frames num: 261)
processed: F0119_01_081763459

processed: M0036_02_04279_iPhone_iphone6.npy(frames num: 348)
processed: F0042_02_9730581624_Android_htc.npy(frames num: 494)
processed: M0046_01_92135_iPhone_Iphone5.npy(frames num: 365)
processed: F0009_03_5834716092_Android_SM.npy(frames num: 495)
processed: M0012_03_86071_Android_nexus.npy(frames num: 230)
processed: M0089_01_24309_iPhone_6s.npy(frames num: 490)
processed: M0058_02_2174905683_Android_htc.npy(frames num: 698)
processed: F0152_03_8107293465_iPhone_6s.npy(frames num: 413)
processed: F0027_01_48150_iPhone_6s.npy(frames num: 288)
processed: M0041_01_43876_iPhone_Iphone5.npy(frames num: 324)
processed: M0088_02_58201_Android_SM_Gal_J3.npy(frames num: 228)
processed: F0042_03_3729158604_Android_SM_Gal_J3.npy(frames num: 460)
processed: F0052_01_5326071948_iPhone_6s.npy(frames num: 784)
processed: F0201_03_73652_Android_nexus.npy(frames num: 283)
processed: M0062_02_21845_Android_nexus.npy(frames num: 284)
processed: F0029_02_04279_iPhone_iphone6.npy(frames num: 338)
proce

processed: M0075_02_67324_Android_SM_Gal_J3.npy(frames num: 518)
processed: F0052_02_4029763518_iPhone_iphone6.npy(frames num: 796)
processed: F0034_02_42365_iPhone_iphone6.npy(frames num: 322)
processed: M0075_02_3964071825_Android_SM_Gal_J3.npy(frames num: 904)
processed: F0095_02_24801_Android_nexus.npy(frames num: 266)
processed: F0015_01_43652_iPhone_6s.npy(frames num: 327)
processed: M0041_02_42163_Android_SM.npy(frames num: 277)
processed: M0064_03_2738015469_iPhone_iphone6.npy(frames num: 635)
processed: M0047_01_56947_iPhone_Iphone5.npy(frames num: 300)
processed: M0006_02_62138_Android_htc.npy(frames num: 256)
processed: M0030_03_85310_Android_SM.npy(frames num: 313)
processed: F0010_01_9761823504_iPhone_6s.npy(frames num: 785)
processed: M0039_01_48217_Android_htc.npy(frames num: 445)
processed: M0058_02_6780524931_Android_htc.npy(frames num: 655)
processed: M0067_03_08153_Android_htc.npy(frames num: 469)
processed: M0002_03_71638_Android_nexus.npy(frames num: 582)
processed

processed: F0117_03_0718926345_Android_SM_Gal_J3.npy(frames num: 627)
processed: F0050_03_2463591087_iPhone_6s.npy(frames num: 690)
processed: M0047_03_86071_Android_nexus.npy(frames num: 283)
processed: F0069_02_50782_Android_nexus.npy(frames num: 171)
processed: M0041_02_81026_iPhone_6s.npy(frames num: 377)
processed: M0057_02_3175684920_Android_nexus.npy(frames num: 587)
processed: M0075_02_4850126937_Android_SM_Gal_J3.npy(frames num: 880)
processed: M0038_01_9064728153_iPhone_Iphone5.npy(frames num: 646)
processed: M0038_01_39605_Android_htc.npy(frames num: 404)
processed: F0119_03_48721_Android_SM.npy(frames num: 273)
processed: F0090_03_35891_iPhone_6s.npy(frames num: 490)
processed: F0090_03_38467_iPhone_6s.npy(frames num: 470)
processed: F0021_03_91605_iPhone_6s.npy(frames num: 375)
processed: M0004_02_89071_iPhone_iphone6.npy(frames num: 426)
processed: M0070_02_10694_Android_SM_Gal_J3.npy(frames num: 421)
processed: F0119_02_9716524380_Android_nexus.npy(frames num: 550)
proce

processed: F0090_01_70968_Android_SM.npy(frames num: 591)
processed: M0033_02_4029763518_iPhone_iphone6.npy(frames num: 574)
processed: M0026_03_58601_Android_SM.npy(frames num: 246)
processed: M0057_03_31759_iPhone_6s.npy(frames num: 359)
processed: F0005_02_4910725836_Android_htc.npy(frames num: 582)
processed: M0001_02_36841_iPhone_6s.npy(frames num: 714)
processed: F0020_02_04279_iPhone_iphone6.npy(frames num: 373)
processed: F0011_02_9730581624_Android_htc.npy(frames num: 660)
processed: M0016_02_17053_Android_htc.npy(frames num: 430)
processed: M0057_02_3965420817_Android_SM_Gal_J3.npy(frames num: 488)
processed: F0027_03_91605_iPhone_6s.npy(frames num: 263)
processed: F0010_02_16073_Android_htc.npy(frames num: 289)
processed: M0082_03_6382975401_Android_htc.npy(frames num: 592)
processed: F0169_01_7902154638_Android_SM.npy(frames num: 775)
processed: F0181_02_3510748629_Android_nexus.npy(frames num: 493)
processed: F0090_01_50216_iPhone_6s.npy(frames num: 447)
processed: F0182_0

processed: M0038_02_95230_Android_htc.npy(frames num: 614)
processed: M0013_03_45921_Android_SM_Gal_J3.npy(frames num: 379)
processed: F0181_02_0938157624_Android_SM_Gal_J3.npy(frames num: 495)
processed: F0061_01_4523869701_iPhone_6s.npy(frames num: 771)
processed: F0090_02_57490_Android_SM_Gal_J3.npy(frames num: 441)
processed: M0074_02_5406837291_Android_SM.npy(frames num: 500)
processed: M0028_03_7952680314_Android_nexus.npy(frames num: 491)
processed: F0011_02_4029763518_iPhone_iphone6.npy(frames num: 637)
processed: F0175_01_49608_iPhone_6s.npy(frames num: 272)
processed: M0041_03_53067_Android_SM.npy(frames num: 240)
processed: F0130_01_1935482760_Android_SM.npy(frames num: 575)
processed: F0172_02_08931_Android_htc.npy(frames num: 250)
processed: M0004_02_2691345807_Android_htc.npy(frames num: 669)
processed: M0033_03_1532879406_Android_SM_Gal_J3.npy(frames num: 654)
processed: M0016_03_6574019823_Android_SM.npy(frames num: 846)
processed: M0060_01_43652_iPhone_6s.npy(frames nu

processed: M0057_01_48150_iPhone_6s.npy(frames num: 446)
processed: F0175_01_14389_Android_SM.npy(frames num: 275)
processed: M0038_02_74681_Android_htc.npy(frames num: 366)
processed: M0064_01_2593068741_iPhone_6s.npy(frames num: 552)
processed: M0001_02_98134_Android_SM.npy(frames num: 551)
processed: M0082_03_9308247615_Android_htc.npy(frames num: 735)
processed: M0075_01_47016_iPhone_6s.npy(frames num: 412)
processed: F0068_03_89265_Android_htc.npy(frames num: 401)
processed: F0200_01_1690483752_Android_SM_Gal_J3.npy(frames num: 596)
processed: F0200_02_2514860973_iPhone_6s.npy(frames num: 558)
processed: F0021_03_42561_Android_SM.npy(frames num: 274)
processed: F0182_02_68143_Android_nexus.npy(frames num: 344)
processed: F0199_01_26094_Android_nexus.npy(frames num: 263)
processed: F0157_03_6382074195_iPhone_iphone6.npy(frames num: 726)
processed: F0009_02_1894372056_iPhone_iphone6.npy(frames num: 512)
processed: F0154_01_7412980653_Android_SM.npy(frames num: 670)
processed: M0082_

processed: M0082_01_2360481975_iPhone_6s.npy(frames num: 538)
processed: F0175_02_5926870431_Android_nexus.npy(frames num: 427)
processed: F0073_02_5832104796_Android_SM_Gal_J3.npy(frames num: 633)
processed: M0070_03_6382074195_iPhone_iphone6.npy(frames num: 529)
processed: M0041_01_51902_iPhone_Iphone5.npy(frames num: 302)
processed: F0191_03_53061_iPhone_iphone6.npy(frames num: 275)
processed: F0095_03_32461_Android_htc.npy(frames num: 406)
processed: F0052_01_87901_iPhone_6s.npy(frames num: 371)
processed: M0058_01_87901_iPhone_6s.npy(frames num: 410)
processed: M0054_03_1576042983_Android_nexus.npy(frames num: 735)
processed: M0016_03_0931472685_Android_SM.npy(frames num: 817)
processed: M0058_03_2687041593_Android_SM_Gal_J3.npy(frames num: 694)
processed: F0173_03_1864903275_iPhone_6s.npy(frames num: 450)
processed: F0154_03_2543781960_Android_nexus.npy(frames num: 693)
processed: F0175_03_17625_Android_SM.npy(frames num: 314)
processed: F0061_02_10694_Android_SM_Gal_J3.npy(frame

processed: M0028_02_5037649281_Android_htc.npy(frames num: 517)
processed: F0191_02_20759_Android_nexus.npy(frames num: 312)
processed: F0029_03_7863451209_Android_SM.npy(frames num: 664)
processed: M0075_03_2594680137_iPhone_6s.npy(frames num: 747)
processed: F0191_03_38964_Android_htc.npy(frames num: 370)
processed: F0182_01_85213_iPhone_6s.npy(frames num: 312)
processed: M0030_03_73890_Android_SM.npy(frames num: 349)
processed: F0199_01_28701_Android_SM_Gal_J3.npy(frames num: 213)
processed: M0062_01_98264_iPhone_6s.npy(frames num: 298)
processed: F0027_03_2107835946_iPhone_6s.npy(frames num: 522)
processed: F0154_02_4980627135_Android_htc.npy(frames num: 746)
processed: F0169_02_75896_Android_htc.npy(frames num: 484)
processed: F0010_03_4839652701_Android_SM.npy(frames num: 575)
processed: M0075_02_4258637190_Android_SM_Gal_J3.npy(frames num: 858)
processed: F0005_03_5734802691_Android_nexus.npy(frames num: 515)
processed: F0042_03_86071_Android_nexus.npy(frames num: 245)
processed

processed: M0055_03_39685_Android_nexus.npy(frames num: 488)
processed: M0039_01_38041_iPhone_Iphone5.npy(frames num: 335)
processed: M0013_03_15843_Android_nexus.npy(frames num: 378)
processed: M0030_01_4523869701_iPhone_Iphone5.npy(frames num: 427)
processed: M0070_03_14590_Android_htc.npy(frames num: 360)
processed: F0173_03_4263107589_iPhone_6s.npy(frames num: 448)
processed: F0029_03_09687_Android_SM.npy(frames num: 321)
processed: M0012_03_69570_Android_SM_Gal_J3.npy(frames num: 213)
processed: F0169_01_14389_Android_SM.npy(frames num: 471)
processed: F0157_03_19807_Android_htc.npy(frames num: 393)
processed: F0042_02_57049_Android_htc.npy(frames num: 282)
processed: F0154_03_23654_Android_SM_Gal_J3.npy(frames num: 431)
processed: M0007_02_16094_Android_nexus.npy(frames num: 480)
processed: F0095_02_1530482697_Android_nexus.npy(frames num: 573)
processed: F0173_01_29351_iPhone_6s.npy(frames num: 261)
processed: F0029_02_5213840967_Android_htc.npy(frames num: 638)
processed: M0072

processed: M0002_01_70216_iPhone_6s.npy(frames num: 299)
processed: M0048_02_4067593812_Android_htc.npy(frames num: 542)
processed: F0042_03_4306759182_Android_SM_Gal_J3.npy(frames num: 458)
processed: F0032_02_1253068479_Android_htc.npy(frames num: 688)
processed: M0067_02_05469_Android_nexus.npy(frames num: 419)
processed: F0015_02_9367254810_Android_htc.npy(frames num: 558)
processed: M0089_03_2738015469_iPhone_iphone6.npy(frames num: 824)
processed: M0071_03_60148_Android_htc.npy(frames num: 294)
processed: M0004_03_45921_Android_SM_Gal_J3.npy(frames num: 514)
processed: F0154_03_2687041593_Android_SM_Gal_J3.npy(frames num: 768)
processed: F0095_02_1652897340_Android_nexus.npy(frames num: 560)
processed: F0042_01_51902_iPhone_Iphone5.npy(frames num: 344)
processed: F0173_03_18924_Android_SM.npy(frames num: 267)
processed: F0011_03_70315_Android_nexus.npy(frames num: 378)
processed: F0095_01_45213_Android_SM.npy(frames num: 480)
processed: F0154_03_14682_Android_SM_Gal_J3.npy(frames

processed: M0043_01_10854_Android_htc.npy(frames num: 274)
processed: M0081_03_41692_Android_htc.npy(frames num: 284)
processed: F0035_02_5213840967_Android_htc.npy(frames num: 580)
processed: M0030_03_0896425173_Android_SM.npy(frames num: 618)
processed: F0119_01_73059_iPhone_6s.npy(frames num: 338)
processed: F0169_01_0768123549_iPhone_6s.npy(frames num: 816)
processed: F0022_01_2637015489_iPhone_6s.npy(frames num: 519)
processed: F0010_03_71249_Android_SM.npy(frames num: 310)
processed: F0095_03_3098451762_Android_htc.npy(frames num: 673)
processed: M0007_01_70216_iPhone_6s.npy(frames num: 425)
processed: F0117_01_75362_Android_SM.npy(frames num: 339)
processed: F0199_01_65394_Android_SM_Gal_J3.npy(frames num: 269)
processed: M0067_02_4602935718_Android_nexus.npy(frames num: 832)
processed: M0065_03_9308247615_Android_htc.npy(frames num: 516)
processed: M0030_03_91605_iPhone_6s.npy(frames num: 349)
processed: M0012_01_54398_iPhone_6s.npy(frames num: 301)
processed: M0049_01_16980753

processed: F0157_01_93602_Android_SM.npy(frames num: 340)
processed: M0053_02_24183_Android_SM.npy(frames num: 258)
processed: F0169_02_71045_Android_htc.npy(frames num: 474)
processed: F0119_01_2571963048_Android_SM.npy(frames num: 600)
processed: F0154_03_17983_Android_nexus.npy(frames num: 397)
processed: M0065_01_41853_iPhone_6s.npy(frames num: 410)
processed: M0033_01_9872615403_iPhone_Iphone5.npy(frames num: 500)
processed: F0051_03_3729158604_Android_SM_Gal_J3.npy(frames num: 474)
processed: F0027_03_73890_Android_SM.npy(frames num: 310)
processed: F0035_03_2543781960_Android_nexus.npy(frames num: 609)
processed: F0052_03_02864_iPhone_6s.npy(frames num: 187)
processed: M0016_03_42561_Android_SM.npy(frames num: 484)
processed: M0056_03_31759_iPhone_6s.npy(frames num: 289)
processed: M0071_03_7629538140_Android_htc.npy(frames num: 522)
processed: F0200_02_91605_iPhone_6s.npy(frames num: 352)
processed: F0020_02_5213840967_Android_htc.npy(frames num: 598)
processed: M0028_03_71638_

processed: F0011_03_05849_Android_SM_Gal_J3.npy(frames num: 369)
processed: M0048_02_62138_Android_htc.npy(frames num: 313)
processed: F0172_03_34690_Android_nexus.npy(frames num: 225)
processed: F0117_03_5063784192_Android_SM_Gal_J3.npy(frames num: 623)
processed: M0082_01_01294_iPhone_6s.npy(frames num: 336)
processed: F0021_01_2637015489_iPhone_6s.npy(frames num: 510)
processed: F0020_01_8102574936_iPhone_6s.npy(frames num: 647)
processed: M0053_01_2637015489_iPhone_6s.npy(frames num: 427)
processed: M0056_02_9730581624_Android_htc.npy(frames num: 450)
processed: F0119_02_91082_Android_nexus.npy(frames num: 253)
processed: F0169_01_7419068325_iPhone_6s.npy(frames num: 825)
processed: M0001_03_5897106423_Android_SM_Gal_J3.npy(frames num: 1186)
processed: M0065_03_2738015469_iPhone_iphone6.npy(frames num: 636)
processed: M0060_03_2738015469_iPhone_iphone6.npy(frames num: 499)
processed: F0157_01_20914_Android_SM.npy(frames num: 376)
processed: F0009_02_1392408567_iPhone_iphone6.npy(fr

In [13]:
predictions_path

'./3_predictions/pca_W_mfcc/'

<b>Задание 2.</b> Проверить качество распознавания (WER, Word Error Rate) на полученных предсказаниях (predictions)

<b>Задание 3.</b> Попробовать улучшить результаты распознавания за счет:
- изменения архитектуры DNN
- изменения набора входных признаков
- изменения типа нормализации (не только по среднему, но и по дисперсии)