In [1]:
import librosa
import os
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
import random
%matplotlib inline
import IPython.display as ipd
import librosa, librosa.display
import pandas as pd

import numpy as np
import keras
import tensorflow as tf
from keras.models import Model, load_model
from keras.layers import GlobalAveragePooling2D, Input, Conv2D, Flatten, Dense, MaxPooling2D,AveragePooling2D, Dropout, UpSampling2D, concatenate, Cropping2D,LeakyReLU, Reshape, BatchNormalization
from keras import optimizers
from keras.applications.inception_v3 import InceptionV3
from keras.applications.vgg19 import VGG19
from keras.optimizers import SGD, Adam, Nadam
from keras import regularizers
from keras import backend as K
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping

import os
import pickle
import sklearn
import itertools
import math
from PIL import Image
from sklearn.model_selection import train_test_split
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import clear_output
import pandas as pd

from custom_layers import PoolHelper, LRN2D

PHONEME_LOCATION= r"PHN/"
SOUND_LOCATION= r"SOUND/"
SPECTROGRAM_LOCATION = r"SPECTROGRAMS/"

# TRAIN_FOLDER = r"data_train"


Using TensorFlow backend.


In [2]:
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("Please install GPU version of TF")
    
# config = tf.ConfigProto()
# config.gpu_options.allow_growth=True
# sess = tf.Session(config=config)


Default GPU Device: /device:GPU:0


## SpectrogramGenerator

In [3]:
class SpectrogramGenerator:
    def __init__(self,sample_rate = 16000): 
        self.step_frames= 160
        self.window_frames = 4096
        
        self.step_size_ms = int((self.step_frames/sample_rate) * 1000) 
        self.window_size_ms = int((self.window_frames/sample_rate) * 1000)
        
        self.hop_length = 512 
        self.n_fft = 2048
    
    def create_labels(self, starts, ends, phonemes):
        j=0
        labels = []
            
        for i in range(0, ends[-1], self.step_frames):
            if (i > ends[-1] - int(self.window_frames / 2)):
                continue
            if i + self.window_frames / 2 > ends[j]:
                j += 1
            labels.append(phonemes[j])
        
        return labels
    
    def read_phoneme_information(self,file):
        rows = [x.split(' ') for x in open(file).readlines()]

        starts = []
        ends = []
        phonemes = []

        for row in rows:
            starts.append(int(row[0]))
            ends.append(int(row[1]))
            phonemes.append(row[2])  
        return starts, ends, phonemes

    def get_spectrograms(self, wav, max_phones, verbose = 0 ):
        samples, sample_rate = librosa.load(wav,sr=None)
        
        windows = []
        for i in range(0, max_phones[-1], self.step_frames):
            if (i > max_phones[-1] - int(self.window_frames / 2)):
                continue
            windows.append(samples[i: i + self.window_frames])

        spectrograms = []
        n_fft = 160 # N/2 +1
        hop_length = int(n_fft/4)
    
        if(verbose):
            print("Hop length in seconds = ", float(hop_length)/sample_rate)
            print("frame size in seconds = ", float(n_fft)/sample_rate)

        for window in windows:
            X = librosa.stft(window, n_fft = n_fft, hop_length=hop_length)
            S = librosa.amplitude_to_db(abs(X))
            norm_S = (S-np.min(S))/(np.max(S)-np.min(S))
            spectrograms.append(norm_S)
        return spectrograms
        
    
    def label_spectrograms(self, file, dataset ):
        starts, ends, phonemes = self.read_phoneme_information(PHONEME_LOCATION + dataset + r"/" + file +".PHN")
        labels = self.create_labels(starts, ends, phonemes)
        spectrograms = self.get_spectrograms(SOUND_LOCATION + dataset + r"/" + file + ".WAV", ends)
        return spectrograms, labels
                   
    def create_patches(self, file, dataset = "TRAIN/" ):
        spectrograms, labels = self.label_spectrograms(file, dataset)
                       
        i = 0 
        for s, l in zip(spectrograms, labels):
            out_folder = './SPECTROGRAMS/' + dataset + l.rstrip()
            if not os.path.exists(out_folder):
                os.makedirs(out_folder)
            img = Image.fromarray(s*255)
            img = img.convert('L')
            img = img.resize((256,256))
            img.save(out_folder + '/' + file + "_" + str(i) + '.png')
            i+=1
            
                   

## Generating Spectrogram Images
First Training Set and then Test Set

In [None]:
spectrogram_generator = SpectrogramGenerator()

In [None]:
# dataset = "TRAIN/"

# listp = os.listdir(PHONEME_LOCATION+dataset)
# listp[0].split('.')
# # print(listp)

# for idx, file in enumerate(os.listdir(PHONEME_LOCATION+dataset)):
#     if idx % 100 == 0:
#         print("Running at, ", idx, "of total, ", len(os.listdir(PHONEME_LOCATION+dataset)))
#     file_name = file.split('.')[0]
#     spectrogram_generator.create_patches(file_name, dataset)

## Generating Spectrogram Images For Testing

In [None]:
# dataset = "TEST/"
# listp = os.listdir(PHONEME_LOCATION+dataset)
# listp[0].split('.')
# # print(listp)

# for idx, file in enumerate(os.listdir(PHONEME_LOCATION+dataset)):
#     if idx % 100 == 0:
#         print("Running at, ", idx, "of total, ", len(os.listdir(PHONEME_LOCATION+dataset)))
#     file_name = file.split('.')[0]
#     spectrogram_generator.create_patches(file_name, dataset)

## Load Dataset

## Loading Data Location
We load the data such that we can easily retrieve it in the batch creator we will define later

In [70]:
PHONEME_DIRS = [x[0] for x in os.walk(SPECTROGRAM_LOCATION + "TRAIN/") if x[0] != SPECTROGRAM_LOCATION + "TRAIN/"]
PHONEME_DIRS.sort()
PHONEMES = [x.split('/')[-1] for x in PHONEME_DIRS]

number_files = 0
    
N_CLASSES = len(PHONEME_DIRS)

print("Data Location per Phoneme = " ,PHONEME_DIRS)
print("Number of classes = ", N_CLASSES)

Data Location per Phoneme =  ['SPECTROGRAMS/TRAIN/aa', 'SPECTROGRAMS/TRAIN/ae', 'SPECTROGRAMS/TRAIN/ah', 'SPECTROGRAMS/TRAIN/ao', 'SPECTROGRAMS/TRAIN/aw', 'SPECTROGRAMS/TRAIN/ax', 'SPECTROGRAMS/TRAIN/ax-h', 'SPECTROGRAMS/TRAIN/axr', 'SPECTROGRAMS/TRAIN/ay', 'SPECTROGRAMS/TRAIN/b', 'SPECTROGRAMS/TRAIN/bcl', 'SPECTROGRAMS/TRAIN/ch', 'SPECTROGRAMS/TRAIN/d', 'SPECTROGRAMS/TRAIN/dcl', 'SPECTROGRAMS/TRAIN/dh', 'SPECTROGRAMS/TRAIN/dx', 'SPECTROGRAMS/TRAIN/eh', 'SPECTROGRAMS/TRAIN/el', 'SPECTROGRAMS/TRAIN/em', 'SPECTROGRAMS/TRAIN/en', 'SPECTROGRAMS/TRAIN/eng', 'SPECTROGRAMS/TRAIN/epi', 'SPECTROGRAMS/TRAIN/er', 'SPECTROGRAMS/TRAIN/ey', 'SPECTROGRAMS/TRAIN/f', 'SPECTROGRAMS/TRAIN/g', 'SPECTROGRAMS/TRAIN/gcl', 'SPECTROGRAMS/TRAIN/h#', 'SPECTROGRAMS/TRAIN/hh', 'SPECTROGRAMS/TRAIN/hv', 'SPECTROGRAMS/TRAIN/ih', 'SPECTROGRAMS/TRAIN/ix', 'SPECTROGRAMS/TRAIN/iy', 'SPECTROGRAMS/TRAIN/jh', 'SPECTROGRAMS/TRAIN/k', 'SPECTROGRAMS/TRAIN/kcl', 'SPECTROGRAMS/TRAIN/l', 'SPECTROGRAMS/TRAIN/m', 'SPECTROGRAMS/TRAI

In [None]:
def create_validation_train_set(val_split,random_state = 42):
    x = []
    y = []
    for subdir, dirs, files in os.walk(SPECTROGRAM_LOCATION + "TRAIN/"):
        for file in files:
            #print os.path.join(subdir, file)
            filepath = subdir + os.sep + file

            if filepath.endswith(".png"):
                x.append(filepath)
                y.append(subdir.split('/')[-1])
#                 print(filepath)
#                 print(subdir)

    X_train,X_val,y_train,y_val = train_test_split(x,y,test_size = 0.1, shuffle = True,random_state = 42)
    
    uniques, counts = np.unique(y_train, return_counts = True)
    size = len(y_train)
    p = [x/size for x in counts]
    phoneme_dict_percentages = dict(zip(uniques, p))
    phoneme_dict_counts = dict(zip(uniques,counts))
    
    ps = []
    for phoneme in y_train:
        ps.append(phoneme_dict_percentages[phoneme]/phoneme_dict_counts[phoneme])
    
    d = {'instance':X_train,'p':ps}
    X_train= pd.DataFrame(d)
     
    
    uniques, counts = np.unique(y_val, return_counts = True)
    size = len(y_val)
    p = [x/size for x in counts]
    phoneme_dict_percentages = dict(zip(uniques, p))
    phoneme_dict_counts = dict(zip(uniques,counts))
    
    ps = []
    for phoneme in y_val:
         ps.append(phoneme_dict_percentages[phoneme]/phoneme_dict_counts[phoneme])
    
    d = {'instance':X_val,'p':ps}
    X_val= pd.DataFrame(d)
     
    
    return X_train,X_val,y_train,y_val

In [None]:
X_train,X_val,y_train,y_val= create_validation_train_set(0.1)
TRAIN_DATASET = X_train
VALIDATION_DATASET = X_val

##  Plot of Training Data

In [None]:
unique, counts = np.unique(np.asarray(y_train),return_counts =True)
plt.rcParams['figure.figsize'] = 20, 10
plt.bar(unique,counts)
plt.xticks(rotation=90)
plt.show()

## Plot Validation Data

In [None]:
unique, counts = np.unique(np.asarray(y_val),return_counts =True)
plt.bar(unique,counts)
plt.show()

## PatchExtractor

In [None]:
class PatchExtractor:

    def __init__(self, patch_size=(256,256), max_rotation=0, flipping=False):
        self.patch_size = patch_size 
        self.flipping = flipping
        self.max_rotation = max_rotation

        
    def get_patch(self, image, location=[0,0]):
        ''' 
        image: a numpy array representing the input image
        location: a tuple with an y and x coordinate
        
        This method normalizes the patch
        
        return a patch from the image at `location`, representing the top left corner of the patch
        if self.flipping = True, there is a 50% chance the patch is horizontally flipped   
        we will not rotate it or perform other augmentations for now to speed up the training process
        '''
        y, x = location      
        h, w = self.patch_size 
        
#         image  = np.stack((image,)*3, axis=-1)
#         image = np.squeeze(np.stack((image,) * 3, -1)) 
#         print("Image shape = ", image.shape)
        
        # - patch should be a numpy array of size <h, w>
        (image_h, image_w,channels) = image.shape
        
        if (h + y > image_h | w + x > image_x):
            print("Exceeding Bounds")
            return None
        
        # - the patch should be normalized (intensity values between 0-1)
        patch = image[y: y + h, x : x + w, :]
        
#         patch = (patch-np.min(patch))/(np.max(patch)-np.min(patch))
        patch = patch / 255.0    
        
        return patch

## BatchCreator
Now it's time to create a BatchCreator. The BatchCreator will allow us to generate batches to train on. These batches contain a set of (class-balanced) samples or patches, and their corresponding labels. The data returned by the BatchCreator can directly be fed into the neural network for training or classification.

In [None]:
class BatchCreator:
    
    def __init__(self, patch_extractor, dataset, balanced = True):
#         self.dataset = dataset
        self.patch_extractor = patch_extractor
        self.dataset = dataset
        self.balanced = balanced
        self.enc = OneHotEncoder(handle_unknown='ignore',sparse = False)
    
        self.enc.fit(np.asarray(PHONEMES).reshape(-1,1))
                    
    def load_img(self,path):
        img = np.array(Image.open(path))
        img = np.expand_dims(img, axis=2)
        return img
    
    def create_batch(self, batch_size,n_outputs=1, use_patches = False):
        '''
        returns a class-balanced array of patches (x) with corresponding labels (y) in one-hot structure
        patches are by default full image
        '''
        
        n_patches_per_class = int(math.ceil(batch_size/N_CLASSES))

        # We have to shuffle the list of dirs such that we loop differently through the list every time
#         phoneme_dirs = PHONEME_DIRS[:]
#         random.shuffle(phoneme_dirs)

#         x_data = np.zeros((batch_size, *(256,256), 1))
#         y_data = np.zeros((batch_size, N_CLASSES)) # one-hot encoding

#         index = 0
        if self.balanced:
            data = np.random.choice(self.dataset['instance'],batch_size,p = self.dataset['p'])
        else:
            data = np.random.choice(self.dataset['instance'],batch_size)
        x_data = np.asarray([self.patch_extractor.get_patch(self.load_img(x)) for x in data])
        y_data = np.asarray([self.enc.transform(np.asarray(x.split('/'))[-2].reshape(-1,1)) for x in data]).reshape(batch_size,61)
        
#         for i,phoneme_dir in enumerate(phoneme_dirs):
#             for j in range(0,n_patches_per_class): 
#                 if index>=(batch_size):
# #                     print("BREAKING FOR LOOP BECAUSE = ", index, "and",batch_size-1 )
#                     break
#                 img_file = random.choice(self.dataset[0])
#                 img = self.load_img(phoneme_dir + '/' + img_file)
#                 patch = self.patch_extractor.get_patch(img)
#                 x_data[index] = patch
#                 y_data[index,PHONEME_DIRS.index(phoneme_dir)] = 1
#                 index = index + 1
        if n_outputs == 1:
            return x_data,y_data
        else:
            return x_data,[y_data,y_data,y_data]
        
    def get_generator(self, batch_size,n_outputs=1):
        '''returns a generator that will yield batches infinitely'''
        while True:
            yield self.create_batch(batch_size,n_outputs)

## Visualize BatchCreator

In [None]:
PATCH_SIZE = (256,256)
PATCH_EXTRACTOR = PatchExtractor(PATCH_SIZE)
BATCH_CREATOR = BatchCreator(PATCH_EXTRACTOR,TRAIN_DATASET)


# create a batch
x, y = BATCH_CREATOR.create_batch(256)

# visualize it
matplotlib.rcParams['figure.figsize'] = (30, 30)
f, axes = plt.subplots(7, 4)

for ax_row in axes:
    for ax in ax_row:
        i = random.randint(0,255)
        ax.imshow(np.squeeze(x[i]))
        ax.set_title('class: {}'.format(PHONEME_DIRS[np.argmax(y[i])].split('/')[2]))
        ax.scatter(*[p/2 for p in PATCH_EXTRACTOR.patch_size], alpha=0.5)
#         i += 1
plt.show()

del PATCH_SIZE, PATCH_EXTRACTOR, BATCH_CREATOR

# Neural Network

## Inception Model
First we make a inception module function

![title](inception_module.png)

![title](googlenet.png)

https://arxiv.org/pdf/1409.4842v1.pdf

In [56]:
def inception_module_3a(inputs):
    padding = 'same'
    
    # Tower 0
    tower_0 = Conv2D(64,(1,1),padding=padding, activation='relu')(inputs)
    
    # Tower 1
    tower_1 = Conv2D(96, (1,1), padding=padding, activation='relu')(inputs)
    tower_1 = Conv2D(128, (3,3), padding=padding, activation='relu')(tower_1)
    
    # Tower 2
    tower_2 = Conv2D(16, (1,1), padding=padding, activation='relu')(inputs)
    tower_2 = Conv2D(32, (5,5), padding=padding, activation='relu')(tower_2)
    
    # Tower 3
    tower_3 = MaxPooling2D((3,3), strides=(1,1), padding=padding)(inputs)
    tower_3 = Conv2D(32, (1,1), padding=padding, activation='relu')(tower_3)
    
    # concatenation
    output = concatenate([tower_1, tower_2, tower_3], axis = 3)
    
    return output


In [57]:
def inception_module_3b(inputs):
    padding = 'same'
    
    # Tower 0
    tower_0 = Conv2D(128,(1,1),padding=padding, activation='relu')(inputs)
    
    # Tower 1
    tower_1 = Conv2D(128, (1,1), padding=padding, activation='relu')(inputs)
    tower_1 = Conv2D(192, (3,3), padding=padding, activation='relu')(tower_1)
    
    # Tower 2
    tower_2 = Conv2D(32, (1,1), padding=padding, activation='relu')(inputs)
    tower_2 = Conv2D(96, (5,5), padding=padding, activation='relu')(tower_2)
    
    # Tower 3
    tower_3 = MaxPooling2D((3,3), strides=(1,1), padding=padding)(inputs)
    tower_3 = Conv2D(64, (1,1), padding=padding, activation='relu')(tower_3)
    
    # concatenation
    output = concatenate([tower_1, tower_2, tower_3], axis = 3)
    
    return output

In [58]:
def inception_module_4a(inputs):
    padding = 'same'
    
    # Tower 0
    tower_0 = Conv2D(192,(1,1),padding=padding, activation='relu')(inputs)
    
    # Tower 1
    tower_1 = Conv2D(96, (1,1), padding=padding, activation='relu')(inputs)
    tower_1 = Conv2D(208, (3,3), padding=padding, activation='relu')(tower_1)
    
    # Tower 2
    tower_2 = Conv2D(16, (1,1), padding=padding, activation='relu')(inputs)
    tower_2 = Conv2D(48, (5,5), padding=padding, activation='relu')(tower_2)
    
    # Tower 3
    tower_3 = MaxPooling2D((3,3), strides=(1,1), padding=padding)(inputs)
    tower_3 = Conv2D(64, (1,1), padding=padding, activation='relu')(tower_3)
    
    # concatenation
    output = concatenate([tower_1, tower_2, tower_3], axis = 3)
    
    return output

In [59]:
def inception_module_4b(inputs):
    padding = 'same'
    
    # Tower 0
    tower_0 = Conv2D(160,(1,1),padding=padding, activation='relu')(inputs)
    
    # Tower 1
    tower_1 = Conv2D(112, (1,1), padding=padding, activation='relu')(inputs)
    tower_1 = Conv2D(224, (3,3), padding=padding, activation='relu')(tower_1)
    
    # Tower 2
    tower_2 = Conv2D(24, (1,1), padding=padding, activation='relu')(inputs)
    tower_2 = Conv2D(64, (5,5), padding=padding, activation='relu')(tower_2)
    
    # Tower 3
    tower_3 = MaxPooling2D((3,3), strides=(1,1), padding=padding)(inputs)
    tower_3 = Conv2D(64, (1,1), padding=padding, activation='relu')(tower_3)
    
    # concatenation
    output = concatenate([tower_1, tower_2, tower_3], axis = 3)
    
    return output

In [60]:
def inception_module_4c(inputs):
    padding = 'same'
    
    # Tower 0
    tower_0 = Conv2D(128,(1,1),padding=padding, activation='relu')(inputs)
    
    # Tower 1
    tower_1 = Conv2D(128, (1,1), padding=padding, activation='relu')(inputs)
    tower_1 = Conv2D(256, (3,3), padding=padding, activation='relu')(tower_1)
    
    # Tower 2
    tower_2 = Conv2D(24, (1,1), padding=padding, activation='relu')(inputs)
    tower_2 = Conv2D(64, (5,5), padding=padding, activation='relu')(tower_2)
    
    # Tower 3
    tower_3 = MaxPooling2D((3,3), strides=(1,1), padding=padding)(inputs)
    tower_3 = Conv2D(64, (1,1), padding=padding, activation='relu')(tower_3)
    
    # concatenation
    output = concatenate([tower_1, tower_2, tower_3], axis = 3)
    
    return output

In [61]:
def inception_module_4d(inputs):
    padding = 'same'
    
    # Tower 0
    tower_0 = Conv2D(112,(1,1),padding=padding, activation='relu')(inputs)
    
    # Tower 1
    tower_1 = Conv2D(144, (1,1), padding=padding, activation='relu')(inputs)
    tower_1 = Conv2D(288, (3,3), padding=padding, activation='relu')(tower_1)
    
    # Tower 2
    tower_2 = Conv2D(32, (1,1), padding=padding, activation='relu')(inputs)
    tower_2 = Conv2D(64, (5,5), padding=padding, activation='relu')(tower_2)
    
    # Tower 3
    tower_3 = MaxPooling2D((3,3), strides=(1,1), padding=padding)(inputs)
    tower_3 = Conv2D(64, (1,1), padding=padding, activation='relu')(tower_3)
    
    # concatenation
    output = concatenate([tower_1, tower_2, tower_3], axis = 3)
    
    return output

In [62]:
def inception_module_4e(inputs):
    padding = 'same'
    
    # Tower 0
    tower_0 = Conv2D(256,(1,1),padding=padding, activation='relu')(inputs)
    
    # Tower 1
    tower_1 = Conv2D(160, (1,1), padding=padding, activation='relu')(inputs)
    tower_1 = Conv2D(320, (3,3), padding=padding, activation='relu')(tower_1)
    
    # Tower 2
    tower_2 = Conv2D(32, (1,1), padding=padding, activation='relu')(inputs)
    tower_2 = Conv2D(128, (5,5), padding=padding, activation='relu')(tower_2)
    
    # Tower 3
    tower_3 = MaxPooling2D((3,3), strides=(1,1), padding=padding)(inputs)
    tower_3 = Conv2D(128, (1,1), padding=padding, activation='relu')(tower_3)
    
    # concatenation
    output = concatenate([tower_1, tower_2, tower_3], axis = 3)
    
    return output

In [63]:
def inception_module_5a(inputs):
    padding = 'same'
    
    # Tower 0
    tower_0 = Conv2D(256,(1,1),padding=padding, activation='relu')(inputs)
    
    # Tower 1
    tower_1 = Conv2D(160, (1,1), padding=padding, activation='relu')(inputs)
    tower_1 = Conv2D(320, (3,3), padding=padding, activation='relu')(tower_1)
    
    # Tower 2
    tower_2 = Conv2D(32, (1,1), padding=padding, activation='relu')(inputs)
    tower_2 = Conv2D(128, (5,5), padding=padding, activation='relu')(tower_2)
    
    # Tower 3
    tower_3 = MaxPooling2D((3,3), strides=(1,1), padding=padding)(inputs)
    tower_3 = Conv2D(128, (1,1), padding=padding, activation='relu')(tower_3)
    
    # concatenation
    output = concatenate([tower_1, tower_2, tower_3], axis = 3)
    
    return output

In [64]:
def inception_module_5b(inputs):
    padding = 'same'
    
    # Tower 0
    tower_0 = Conv2D(384,(1,1),padding=padding, activation='relu')(inputs)
    
    # Tower 1
    tower_1 = Conv2D(192, (1,1), padding=padding, activation='relu')(inputs)
    tower_1 = Conv2D(384, (3,3), padding=padding, activation='relu')(tower_1)
    
    # Tower 2
    tower_2 = Conv2D(48, (1,1), padding=padding, activation='relu')(inputs)
    tower_2 = Conv2D(128, (5,5), padding=padding, activation='relu')(tower_2)
    
    # Tower 3
    tower_3 = MaxPooling2D((3,3), strides=(1,1), padding=padding)(inputs)
    tower_3 = Conv2D(128, (1,1), padding=padding, activation='relu')(tower_3)
    
    # concatenation
    output = concatenate([tower_1, tower_2, tower_3], axis = 3)
    
    return output

In [65]:
def inception_stem(inputs):
    # 7x7 conv with stride = 2
    conv1 = Conv2D(64,kernel_size = (7,7),strides=(2,2),padding='same', activation ='relu')(inputs)
    
    # MaxPoolingLayer with 3x3 stride = 2
    maxPool1 = MaxPooling2D(pool_size=(3,3), strides=(2,2), padding='same')(conv1)
    
#     bn1 = BatchNormalization()(maxPool1)
    
    lrn1 = LRN2D()(maxPool1)

    
    conv2 = Conv2D(64,kernel_size = (1,1),strides=(1,1),padding='valid', activation ='relu')(lrn1)
    
    conv3 = Conv2D(192,kernel_size = (3,3),strides=(1,1),padding='same', activation ='relu')(conv2)
    
#     bn2 = BatchNormalization()(conv3)
    lrn2 = LRN2D()(conv3)
    
    maxPool2 = MaxPooling2D(pool_size=(3,3), strides=(2,2), padding='same')(lrn2)
    
    return maxPool2

    
    # Maybe we need batchnormalization or local response normalization
    

In [66]:
def build_classifier(inputs,n_classes):
    
    averagePool1 = AveragePooling2D(pool_size=(7,7),strides=(1,1),padding='valid')(inputs)
    flat = Flatten()(inputs)
    dropout1 = Dropout(0.4)(flat)
    
    dense1 = Dense(n_classes,activation='softmax')(dropout1)
    
    return dense1
    

In [67]:
def build_multi_task(inputs,n_classes):
    averagePool1 = AveragePooling2D(pool_size=(5,5),strides=(3,3),padding='valid')(inputs)
    
    conv1 = Conv2D(128,kernel_size = (1,1),strides=(1,1),padding='same', activation ='relu')(averagePool1)
    flat1 = Flatten()(conv1)
    dense1 = Dense(1024, activation = 'relu')(flat1)
    dropout1 = Dropout(0.7)(dense1)
    dense2 = Dense(n_classes,activation='softmax')(dropout1)
    
    return dense2

In [71]:
def build_inception_small(n_classes=N_CLASSES):
    inputs = Input(shape=(256, 256, 1))
    
    stem = inception_stem(inputs)
    
    module3a = inception_module_3a(stem)
    
    classifier = build_classifier(module3a,n_classes)
    
    model = Model(inputs, classifier)
    
    return model

In [72]:
def build_inception_v1(n_classes=N_CLASSES):
    inputs = Input(shape=(256, 256, 1))
    
    stem = inception_stem(inputs,)
    
    module3a = inception_module_3a(stem)
    module3b = inception_module_3b(module3a)
    
    maxPool1 = MaxPooling2D(pool_size=(3,3), strides=(2,2), padding='same')(module3b)
    
    module4a = inception_module_4a(maxPool1)
    
    multi_task1 = build_multi_task(module4a,n_classes)
    
    module4b = inception_module_4b(module4a)
    module4c = inception_module_4c(module4b)
    module4d = inception_module_4d(module4c)
    
    multi_task2 = build_multi_task(module4d,n_classes)
    
    module4e = inception_module_4e(module4d)
    
    maxPool2 = MaxPooling2D(pool_size=(3,3), strides=(2,2), padding='same')(module4e)
    
    module5a = inception_module_5a(maxPool2)
    module5b = inception_module_5b(module5a)
    
    classifier = build_classifier(module5b,n_classes)
    
    model = Model(inputs, [classifier,multi_task1, multi_task2])
    
    return model

In [None]:
def top3_accuracy(y_true, y_pred):
    return keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k=3)
def top5_accuracy(y_true,y_pred):
    return keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k=5)

## Train Models
We define our own function for training a model

In [None]:
# function to train a model
def train_model(model, training_params, use_class_weights=False, crop=False, crop_size=(0,0)):
    
    patch_size = training_params['patch_size']
    batch_size = training_params['batch_size']
    loss = training_params['loss']
    metrics = training_params['metrics']
    epochs = training_params['epochs']
    steps_per_epoch = training_params['steps_per_epoch']
    optimizer = training_params['optimizer']
#     training_dataset = training_params['training_dataset']
    validation_size = training_params['validation_size']
    validation_steps = training_params['validation_steps']
    stop = training_params['stop']
    n_outputs = training_params['n_outputs']
    
    str_name = training_params['name'] +'_'+ str(batch_size) + '_' + str(epochs) + '_' + str(steps_per_epoch) + '_' + str(validation_size * validation_steps)
    # batch generator 
    patch_generator = PatchExtractor(patch_size)
    
    train_batch_generator = BatchCreator(patch_generator,dataset = TRAIN_DATASET, balanced=True)
    train_image_generator = train_batch_generator.get_generator(batch_size,n_outputs)
    
    validation_batch_generator = BatchCreator(patch_generator, dataset = VALIDATION_DATASET,balanced=False)
    validation_image_generator = validation_batch_generator.get_generator(validation_size,n_outputs)

    # compile the model
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    
    filepath = "_saved-model-{epoch:02d}.hdf5"
    # train the model
    # callbacks
    checker = ModelCheckpoint('models/' + str_name + filepath, monitor='val_acc', verbose=1, save_best_only=False, mode='auto')
#     checker = ModelCheckpoint('models/phoneme.best.hdf5', monitor='loss',
#                                         verbose=1, save_best_only=True, save_weights_only=False, mode='auto', period=1)
    saver = ModelCheckpoint('models/phoneme_models_' + str_name + '.h5', monitor='loss', verbose=1,
                                      save_best_only=True, save_weights_only=True, mode='auto', period=1)
    tb =TensorBoard(log_dir='logs/' + str_name, histogram_freq=0,
                               write_graph=False, write_images=True)
    stopper = EarlyStopping(monitor='loss', min_delta=0.001, patience=stop, verbose=0, mode='auto')

    
    
    
    history = model.fit_generator(generator=train_image_generator, callbacks=[checker, tb, stopper, saver], steps_per_epoch=steps_per_epoch, 
                            epochs=epochs, validation_data=validation_image_generator,validation_steps = validation_steps)
    return history

In [None]:
model = build_inception_small()
model.summary()

In [None]:
model = build_inception_v1()
model.summary()

### Inception Small

In [None]:
inception_model_small = build_inception_small()

plt.rcParams['figure.figsize'] = 10, 5
training_params = {}
training_params['name'] ='inception_small'
training_params['learning_rate'] = 0.001
training_params['patch_size'] = (256, 256) # input size
training_params['batch_size'] = 61 * 3 # number of patches in a mini-batch
training_params['steps_per_epoch'] = 100
training_params['epochs'] = 200
training_params['optimizer'] = SGD(lr=training_params['learning_rate'], momentum=0.9, nesterov=True)
#training_params['optimizer'] = Adam()
training_params['loss'] = ['categorical_crossentropy']
training_params['n_outputs'] = 1

training_params['metrics'] = ['accuracy',top3_accuracy,top5_accuracy]


training_params['validation_size'] = 64 
training_params['validation_steps'] = 20
training_params['stop'] = 50 # N|umber of epochs with no increase for EarlyStopping

history_small = train_model(inception_model_small, training_params)

# Inception V1

In [None]:
inception_model_v1 = build_inception_v1()
# print(inception_model_v1.summary())

plt.rcParams['figure.figsize'] = 10, 5
training_params = {}
training_params['name'] ='inception_v1_correct'
training_params['learning_rate'] = 0.001
training_params['patch_size'] = (256, 256) # input size
training_params['batch_size'] = 61*3 # number of patches in a mini-batch
training_params['steps_per_epoch'] = 100
training_params['epochs'] = 200
training_params['optimizer'] = SGD(lr=training_params['learning_rate'], momentum=0.9, nesterov=True)
#training_params['optimizer'] = Adam()
training_params['loss'] = ['categorical_crossentropy','categorical_crossentropy','categorical_crossentropy']
training_params['n_outputs'] = 3
training_params['metrics'] = ['accuracy',top3_accuracy,top5_accuracy]

training_params['validation_size'] = 64 
training_params['validation_steps'] =20
training_params['stop'] = 50 # Number of epochs with no increase for EarlyStopping

history_v1 = train_model(inception_model_v1, training_params)

## Function for Mapping Phonemes

In [11]:
def map_phoneme(phoneme):
    if phoneme in ["ao" , "aa"]:
        return "aa"
    elif phoneme in ["ah" , "ax" , "ax-h"]:
        return "ah"
    elif phoneme in ["er" , "axr"]:
        return "er"
    elif phoneme in ["hh" , "hv"]:
        return "hh"
    elif(phoneme in ["ih" , "ix"]):
        return "ih"
    elif(phoneme in ["l" , "el"]): 
        return "l"
    elif(phoneme in ["m" , "em"]):
        return "m"
    elif(phoneme in ["n" , "en" , "nx"]):
        return "n"
    elif(phoneme in ["ng" , "eng"]):
        return "ng"
    elif(phoneme in ["sh" , "zh"]):
        return "sh"
    elif(phoneme in ["uw" , "ux"]):
        return "uw"
    elif(phoneme in ["pcl" , "tcl" , "kcl", "bcl" , "dcl" , "gcl" , "h#" , "pau" , "epi"]):
        return "sil"
    else:
        return phoneme
    
    

## Visualizing Predictions

In [None]:
def plot_confusion_matrix(conf_mat, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix
    """
    matplotlib.rcParams['figure.figsize'] = (N_CLASSES, N_CLASSES)
    plt.imshow(conf_mat, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    thresh = conf_mat.max() / 2.
    for i, j in itertools.product(range(conf_mat.shape[0]), range(conf_mat.shape[1])):
        plt.text(j, i, conf_mat[i, j], horizontalalignment="center",
                 color="white" if conf_mat[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

In [None]:
conf_mat = sklearn.metrics.confusion_matrix(actual_phonemes, predicted_phonemes)
plot_confusion_matrix(conf_mat, classes=[x.split('/')[1] for x in PHONEME_DIRS],
                      title='Confusion matrix: Classifier (True label vs. Predicted label)')

In [78]:
np.load("PREDICTIONS_V1/TEST/DR1FAKS0SA1.npy").shape

(384, 3, 1, 61)

## Preparing Output Arrays for LSTM

In [79]:
class Predicter:
    def __init__(self, model):
        self.model = model
        self.step_frames= 160
        self.window_frames = 4096
        self.sample_rate = 16000
        self.step_size_ms = int((self.step_frames/self.sample_rate) * 1000) 
        self.window_size_ms = int((self.window_frames/self.sample_rate) * 1000)
        
        self.hop_length = 512 
        self.n_fft = 2048
        
    def generate_prediction_for_wav_file(self,dataset):
        '''generates prediction for wav file per window of 10 ms similar to spectrograms'''
        
        for idx, file in enumerate(os.listdir(SOUND_LOCATION+dataset)):

            if idx % 100 == 0:
                print("Running at, ", idx, "of total, ", len(os.listdir(SOUND_LOCATION+dataset)))
            file_name = file.split('.')[0]
            if not len(file_name) == 0:
                self.create_patches(file_name, dataset)

    def label_spectrograms(self, file,dataset ):
        starts, ends, phonemes = self.read_phoneme_information(PHONEME_LOCATION + dataset + r"/" + file +".PHN")
        labels = self.create_labels(starts, ends, phonemes)
        spectrograms = self.get_spectrograms(SOUND_LOCATION + dataset + r"/" + file + ".WAV", ends)
        #spectrograms = []
        return spectrograms, labels

    def read_phoneme_information(self,file):
        rows = [x.split(' ') for x in open(file).readlines()]

        starts = []
        ends = []
        phonemes = []

        for row in rows:
            starts.append(int(row[0]))
            ends.append(int(row[1]))
            phonemes.append(row[2])  
        return starts, ends, phonemes
    
    def create_labels(self, starts, ends, phonemes):
        j=0
        labels = []
            
        for i in range(0, ends[-1], self.step_frames):
            if (i > ends[-1] - int(self.window_frames / 2)):
                continue
            if i + self.window_frames / 2 > ends[j]:
                j += 1
            labels.append(phonemes[j])
        
        return labels

    def create_patches(self, file, dataset):  
        spectrograms, labels = self.label_spectrograms(file, dataset)
        out_folder_phn_converted = './PHN_converted_2_V1/' + dataset
        if not os.path.exists(out_folder_phn_converted):
            os.makedirs(out_folder_phn_converted)
        
        #labels = [self.map_phoneme(l.strip('\n')) for l in labels]
        with open(out_folder_phn_converted + '/'  + file + '.txt','w' ) as f:
            for l in labels:
                f.write(l)
            
        out_folder_predictions = './PREDICTIONS_V1/' + dataset
        if not os.path.exists(out_folder_predictions):
            os.makedirs(out_folder_predictions)
        i = 0 
        predictions = []
        for s, l in zip(spectrograms, labels):
            img = Image.fromarray(s*255)
            img = img.convert('L')
            img = img.resize((256,256))
            np_img = np.array(img)
            np_img = np_img / 255.0  
            
            np_img = np.expand_dims(np_img, axis = 2)
            np_img = np.expand_dims(np_img, axis = 0)
            prediction = self.model.predict(np_img)
            predictions.append(prediction)
            #print(prediction, l)
            i+=1
#         print("Shape predictions = " , np.asarray(predictions).shape)
        np.save(out_folder_predictions + '/'  + file, predictions)
            
    def get_spectrograms(self, wav, max_phones, verbose = 0 ):
        samples, sample_rate = librosa.load(wav,sr=None)
        windows = []
        for i in range(0, max_phones[-1], self.step_frames):
            if (i > max_phones[-1] - int(self.window_frames / 2)):
                continue
            windows.append(samples[i: i + self.window_frames])

        spectrograms = []
        n_fft = 160 # N/2 +1
        hop_length = int(n_fft/4)

        if(verbose):
            print("Hop length in seconds = ", float(hop_length)/sample_rate)
            print("frame size in seconds = ", float(n_fft)/sample_rate)

        for window in windows:
            X = librosa.stft(window, n_fft = n_fft, hop_length=hop_length)
            S = librosa.amplitude_to_db(abs(X))
            norm_S = (S-np.min(S))/(np.max(S)-np.min(S))
            spectrograms.append(norm_S)  
        return spectrograms
    
    def map_phoneme(self, phoneme):
        if phoneme in ["ao" , "aa"]:
            return "aa"
        elif phoneme in ["ah" , "ax" , "ax-h"]:
            return "ah"
        elif phoneme in ["er" , "axr"]:
            return "er"
        elif phoneme in ["hh" , "hv"]:
            return "hh"
        elif(phoneme in ["ih" , "ix"]):
            return "ih"
        elif(phoneme in ["l" , "el"]): 
            return "l"
        elif(phoneme in ["m" , "em"]):
            return "m"
        elif(phoneme in ["n" , "en" , "nx"]):
            return "n"
        elif(phoneme in ["ng" , "eng"]):
            return "ng"
        elif(phoneme in ["sh" , "zh"]):
            return "sh"
        elif(phoneme in ["uw" , "ux"]):
            return "uw"
        elif(phoneme in ["pcl" , "tcl" , "kcl", "bcl" , "dcl" , "gcl" , "h#" , "pau" , "epi"]):
            return "sil"
        else:
            return phoneme


In [80]:
model = build_inception_v1()
model.load_weights('models/phoneme_models_inception_v1_correct_183_200_100_1280.h5')
print(model)

<keras.engine.training.Model object at 0x2b9da3f86b38>


In [81]:
predicter = Predicter(model)

In [82]:
#starts, ends, phonemes = predicter.read_phoneme_information(PHONEME_LOCATION + "TRAIN" + r"/" + "DR7FJSK0SA2" +".PHN")
#predicter.get_spectrograms(SOUND_LOCATION + "TRAIN/DR7FJSK0SA2.WAV", ends)

In [None]:
predicter.generate_prediction_for_wav_file("TEST/")

Running at,  0 of total,  1680
Running at,  100 of total,  1680
Running at,  200 of total,  1680
Running at,  300 of total,  1680
Running at,  400 of total,  1680
Running at,  500 of total,  1680
Running at,  600 of total,  1680
Running at,  700 of total,  1680
Running at,  800 of total,  1680
Running at,  900 of total,  1680
Running at,  1000 of total,  1680
Running at,  1100 of total,  1680
Running at,  1200 of total,  1680
Running at,  1300 of total,  1680
Running at,  1400 of total,  1680
Running at,  1500 of total,  1680


## Calculate predictions on the test set for smalle inception

In [4]:
import glob
PHONEMES = ['aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'ax-h', 'axr', 'ay', 'b', 'bcl', 'ch', 'd', 'dcl', 'dh', 'dx', 'eh', 'el', 
            'em', 'en', 'eng', 'epi', 'er', 'ey', 'f', 'g', 'gcl', 'h#', 'hh', 'hv', 'ih', 'ix', 'iy', 'jh', 'k', 'kcl', 
            'l', 'm', 'n', 'ng', 'nx', 'ow', 'oy', 'p', 'pau', 'pcl', 'q', 'r', 's', 'sh', 't', 'tcl', 'th', 'uh', 'uw', 
            'ux', 'v', 'w', 'y', 'z', 'zh']

In [13]:
y_pred = []
y_true = []

for prediction in os.listdir("PREDICTIONS_small/TEST"):
    file_lines = []
    if os.path.isdir("PREDICTIONS_small/TEST/" + prediction):
        continue
    predictions = prediction.split('.')[0]
    
    labels = glob.glob(r'PHN_converted_2_small/TEST/' + predictions + '.txt')
   
    predicted = np.load("PREDICTIONS_small/TEST/" + prediction)
#     print(predicted[0])
    predicted = np.argmax(predicted, axis=2)
#     print(predicted.shape)
#     print(predicted.reshape(1,-1).shape)
    file = open(labels[0], 'r')
    for line in file:
        phone = str(line).split('\n')[0]
        location = PHONEMES.index(phone)
        file_lines.append(location)
    y_pred.append(predicted)
    y_true.append(file_lines)
    

In [14]:
def flatten(x):
    result = []
    for el in x:
        if hasattr(el, "__iter__") and not isinstance(el, str):
            result.extend(flatten(el))
        else:
            result.append(el)
    return result

In [15]:
y_true = list(flatten(y_true))
y_pred = list(flatten(y_pred))

In [24]:
y_true_p = [PHONEMES[x] for x in y_true]
y_pred_p = [PHONEMES[x] for x in y_pred]

In [29]:
y_true_p_mapped = [map_phoneme(x) for x in y_true_p]
y_pred_p_mapped = [map_phoneme(x) for x in y_pred_p]

In [31]:
sklearn.metrics.accuracy_score(y_true_p, y_pred_p)

0.6576196131317807

In [32]:
sklearn.metrics.accuracy_score(y_true_p_mapped, y_pred_p_mapped)

0.7227632294035949

# Calculate predictions on test set for Inception V1

In [196]:
def most_frequent(List):
    # Returns last element if all are different
    counter = 1
    num = List[0]
#     print(List)
    for i in List: 
        curr_frequency = List.count(i) 
        if(curr_frequency> counter): 
            counter = curr_frequency 
            num = i 
  
    return num 

In [197]:
import glob
PHONEMES = ['aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'ax-h', 'axr', 'ay', 'b', 'bcl', 'ch', 'd', 'dcl', 'dh', 'dx', 'eh', 'el', 
            'em', 'en', 'eng', 'epi', 'er', 'ey', 'f', 'g', 'gcl', 'h#', 'hh', 'hv', 'ih', 'ix', 'iy', 'jh', 'k', 'kcl', 
            'l', 'm', 'n', 'ng', 'nx', 'ow', 'oy', 'p', 'pau', 'pcl', 'q', 'r', 's', 'sh', 't', 'tcl', 'th', 'uh', 'uw', 
            'ux', 'v', 'w', 'y', 'z', 'zh']

In [198]:
y_pred = []
y_true = []

for prediction in os.listdir("PREDICTIONS_V1/TEST"):
    file_lines = []
    if os.path.isdir("PREDICTIONS_V1/TEST/" + prediction):
        continue
    predictions = prediction.split('.')[0]
    
    labels = glob.glob(r'PHN_converted_2_V1/TEST/' + predictions + '.txt')
   
    predicted = np.load("PREDICTIONS_V1/TEST/" + prediction)
    print(predicted.shape)
    
    predicted = np.argmax(predicted, axis=3)
    print(predicted[20])
    predicted = [most_frequent(list(x)) for x in predicted]
#     predicted = predicted[:,0,:]
    print(predicted[20])
#     print(predicted.reshape(1,-1).shape)
    file = open(labels[0], 'r')
    for line in file:
        phone = str(line).split('\n')[0]
        location = PHONEMES.index(phone)
        file_lines.append(location)
    y_pred.append(predicted)
    y_true.append(file_lines)
    

(185, 3, 1, 61)
[[32]
 [32]
 [32]]
[32]
(373, 3, 1, 61)
[[32]
 [32]
 [32]]
[32]
(284, 3, 1, 61)
[[30]
 [30]
 [30]]
[30]
(242, 3, 1, 61)
[[41]
 [41]
 [41]]
[41]
(236, 3, 1, 61)
[[34]
 [34]
 [34]]
[34]
(360, 3, 1, 61)
[[27]
 [27]
 [27]]
27
(236, 3, 1, 61)
[[41]
 [47]
 [47]]
[41]
(336, 3, 1, 61)
[[48]
 [48]
 [48]]
[48]
(309, 3, 1, 61)
[[ 4]
 [41]
 [ 4]]
[4]
(235, 3, 1, 61)
[[12]
 [12]
 [12]]
[12]
(357, 3, 1, 61)
[[32]
 [32]
 [32]]
[32]
(260, 3, 1, 61)
[[1]
 [1]
 [1]]
[1]
(244, 3, 1, 61)
[[35]
 [35]
 [35]]
27
(360, 3, 1, 61)
[[45]
 [45]
 [45]]
27
(184, 3, 1, 61)
[[4]
 [0]
 [4]]
[4]
(424, 3, 1, 61)
[[ 2]
 [48]
 [48]]
[2]
(396, 3, 1, 61)
[[10]
 [10]
 [10]]
27
(259, 3, 1, 61)
[[30]
 [30]
 [30]]
[30]
(337, 3, 1, 61)
[[29]
 [29]
 [29]]
[29]
(288, 3, 1, 61)
[[1]
 [1]
 [1]]
[1]
(301, 3, 1, 61)
[[37]
 [37]
 [37]]
[37]
(293, 3, 1, 61)
[[ 4]
 [41]
 [41]]
[4]
(331, 3, 1, 61)
[[34]
 [34]
 [34]]
[34]
(257, 3, 1, 61)
[[2]
 [2]
 [2]]
[2]
(267, 3, 1, 61)
[[32]
 [32]
 [32]]
[32]
(446, 3, 1, 61)
[[32]
 [32]

(236, 3, 1, 61)
[[0]
 [8]
 [0]]
[0]
(211, 3, 1, 61)
[[38]
 [38]
 [38]]
[38]
(562, 3, 1, 61)
[[51]
 [51]
 [51]]
27
(233, 3, 1, 61)
[[41]
 [41]
 [41]]
[41]
(232, 3, 1, 61)
[[30]
 [30]
 [30]]
[30]
(239, 3, 1, 61)
[[59]
 [59]
 [59]]
[59]
(296, 3, 1, 61)
[[27]
 [27]
 [27]]
27
(275, 3, 1, 61)
[[29]
 [29]
 [29]]
[29]
(371, 3, 1, 61)
[[49]
 [49]
 [49]]
[49]
(279, 3, 1, 61)
[[ 4]
 [36]
 [ 4]]
[4]
(252, 3, 1, 61)
[[1]
 [1]
 [1]]
[1]
(283, 3, 1, 61)
[[32]
 [32]
 [32]]
[32]
(285, 3, 1, 61)
[[41]
 [41]
 [41]]
[41]
(745, 3, 1, 61)
[[27]
 [27]
 [27]]
27
(425, 3, 1, 61)
[[36]
 [36]
 [36]]
[36]
(287, 3, 1, 61)
[[34]
 [34]
 [34]]
[34]
(332, 3, 1, 61)
[[29]
 [29]
 [29]]
[29]
(190, 3, 1, 61)
[[36]
 [36]
 [32]]
[36]
(281, 3, 1, 61)
[[33]
 [59]
 [59]]
[33]
(213, 3, 1, 61)
[[35]
 [35]
 [35]]
27
(471, 3, 1, 61)
[[27]
 [27]
 [27]]
27
(237, 3, 1, 61)
[[1]
 [1]
 [1]]
[1]
(213, 3, 1, 61)
[[1]
 [1]
 [1]]
[1]
(258, 3, 1, 61)
[[16]
 [23]
 [16]]
[16]
(332, 3, 1, 61)
[[34]
 [34]
 [34]]
[34]
(565, 3, 1, 61)
[[27]
 [27]

(277, 3, 1, 61)
[[38]
 [26]
 [26]]
27
(429, 3, 1, 61)
[[48]
 [48]
 [48]]
[48]
(242, 3, 1, 61)
[[1]
 [1]
 [1]]
[1]
(279, 3, 1, 61)
[[29]
 [29]
 [29]]
[29]
(414, 3, 1, 61)
[[12]
 [12]
 [50]]
[12]
(294, 3, 1, 61)
[[1]
 [1]
 [1]]
[1]
(263, 3, 1, 61)
[[35]
 [46]
 [38]]
[35]
(180, 3, 1, 61)
[[47]
 [47]
 [47]]
[47]
(206, 3, 1, 61)
[[10]
 [10]
 [10]]
27
(322, 3, 1, 61)
[[31]
 [32]
 [32]]
[31]
(241, 3, 1, 61)
[[0]
 [0]
 [0]]
[0]
(175, 3, 1, 61)
[[51]
 [51]
 [51]]
27
(470, 3, 1, 61)
[[48]
 [48]
 [48]]
[48]
(258, 3, 1, 61)
[[22]
 [22]
 [22]]
[22]
(207, 3, 1, 61)
[[46]
 [29]
 [29]]
[46]
(260, 3, 1, 61)
[[37]
 [37]
 [37]]
[37]
(279, 3, 1, 61)
[[36]
 [36]
 [36]]
[36]
(251, 3, 1, 61)
[[13]
 [13]
 [13]]
27
(311, 3, 1, 61)
[[32]
 [32]
 [32]]
[32]
(339, 3, 1, 61)
[[27]
 [27]
 [27]]
27
(368, 3, 1, 61)
[[16]
 [16]
 [16]]
[16]
(297, 3, 1, 61)
[[29]
 [29]
 [29]]
[29]
(485, 3, 1, 61)
[[51]
 [35]
 [51]]
27
(572, 3, 1, 61)
[[23]
 [23]
 [23]]
[23]
(291, 3, 1, 61)
[[32]
 [32]
 [32]]
[32]
(266, 3, 1, 61)
[[34]
 [

(387, 3, 1, 61)
[[12]
 [12]
 [12]]
[12]
(223, 3, 1, 61)
[[8]
 [8]
 [8]]
[8]
(220, 3, 1, 61)
[[28]
 [28]
 [50]]
[28]
(320, 3, 1, 61)
[[41]
 [36]
 [36]]
[41]
(226, 3, 1, 61)
[[48]
 [50]
 [50]]
[48]
(281, 3, 1, 61)
[[41]
 [41]
 [41]]
[41]
(312, 3, 1, 61)
[[29]
 [29]
 [32]]
[29]
(149, 3, 1, 61)
[[31]
 [32]
 [31]]
[31]
(321, 3, 1, 61)
[[38]
 [24]
 [38]]
[38]
(288, 3, 1, 61)
[[32]
 [32]
 [32]]
[32]
(255, 3, 1, 61)
[[27]
 [27]
 [27]]
27
(341, 3, 1, 61)
[[0]
 [0]
 [0]]
[0]
(248, 3, 1, 61)
[[48]
 [48]
 [48]]
[48]
(254, 3, 1, 61)
[[24]
 [24]
 [24]]
[24]
(424, 3, 1, 61)
[[59]
 [59]
 [59]]
[59]
(277, 3, 1, 61)
[[23]
 [23]
 [23]]
[23]
(497, 3, 1, 61)
[[59]
 [59]
 [59]]
[59]
(243, 3, 1, 61)
[[47]
 [ 8]
 [ 0]]
[47]
(534, 3, 1, 61)
[[48]
 [48]
 [48]]
[48]
(203, 3, 1, 61)
[[47]
 [58]
 [36]]
[47]
(450, 3, 1, 61)
[[30]
 [16]
 [30]]
[30]
(242, 3, 1, 61)
[[4]
 [4]
 [4]]
[4]
(439, 3, 1, 61)
[[34]
 [34]
 [34]]
[34]
(329, 3, 1, 61)
[[51]
 [51]
 [51]]
27
(206, 3, 1, 61)
[[10]
 [45]
 [10]]
27
(285, 3, 1, 61)
[[

(165, 3, 1, 61)
[[16]
 [28]
 [50]]
[16]
(276, 3, 1, 61)
[[35]
 [35]
 [35]]
27
(295, 3, 1, 61)
[[48]
 [48]
 [48]]
[48]
(457, 3, 1, 61)
[[32]
 [32]
 [32]]
[32]
(281, 3, 1, 61)
[[25]
 [36]
 [25]]
[25]
(161, 3, 1, 61)
[[41]
 [41]
 [41]]
[41]
(184, 3, 1, 61)
[[47]
 [47]
 [47]]
[47]
(426, 3, 1, 61)
[[57]
 [57]
 [57]]
[57]
(243, 3, 1, 61)
[[1]
 [1]
 [1]]
[1]
(264, 3, 1, 61)
[[41]
 [41]
 [41]]
[41]
(237, 3, 1, 61)
[[41]
 [41]
 [41]]
[41]
(202, 3, 1, 61)
[[48]
 [48]
 [48]]
[48]
(337, 3, 1, 61)
[[10]
 [10]
 [10]]
27
(333, 3, 1, 61)
[[32]
 [32]
 [32]]
[32]
(388, 3, 1, 61)
[[48]
 [48]
 [48]]
[48]
(391, 3, 1, 61)
[[47]
 [47]
 [47]]
[47]
(363, 3, 1, 61)
[[27]
 [44]
 [27]]
27
(323, 3, 1, 61)
[[23]
 [31]
 [23]]
[23]
(290, 3, 1, 61)
[[42]
 [32]
 [32]]
[42]
(268, 3, 1, 61)
[[51]
 [51]
 [51]]
27
(220, 3, 1, 61)
[[ 7]
 [41]
 [ 7]]
[7]
(280, 3, 1, 61)
[[48]
 [48]
 [48]]
[48]
(324, 3, 1, 61)
[[57]
 [57]
 [57]]
[57]
(261, 3, 1, 61)
[[48]
 [48]
 [48]]
[48]
(430, 3, 1, 61)
[[30]
 [32]
 [32]]
[30]
(185, 3, 1, 6

[57]
(367, 3, 1, 61)
[[32]
 [32]
 [32]]
[32]
(238, 3, 1, 61)
[[34]
 [34]
 [34]]
[34]
(219, 3, 1, 61)
[[41]
 [41]
 [41]]
[41]
(341, 3, 1, 61)
[[14]
 [14]
 [14]]
[14]
(245, 3, 1, 61)
[[36]
 [45]
 [36]]
[36]
(197, 3, 1, 61)
[[58]
 [58]
 [58]]
[58]
(447, 3, 1, 61)
[[24]
 [24]
 [24]]
[24]
(217, 3, 1, 61)
[[36]
 [36]
 [36]]
[36]
(196, 3, 1, 61)
[[51]
 [51]
 [51]]
27
(149, 3, 1, 61)
[[12]
 [47]
 [47]]
[12]
(181, 3, 1, 61)
[[33]
 [11]
 [33]]
[33]
(240, 3, 1, 61)
[[12]
 [33]
 [12]]
[12]
(347, 3, 1, 61)
[[45]
 [45]
 [45]]
27
(209, 3, 1, 61)
[[1]
 [1]
 [1]]
[1]
(232, 3, 1, 61)
[[53]
 [41]
 [36]]
[53]
(270, 3, 1, 61)
[[2]
 [2]
 [2]]
[2]
(373, 3, 1, 61)
[[32]
 [32]
 [32]]
[32]
(308, 3, 1, 61)
[[39]
 [38]
 [38]]
[39]
(372, 3, 1, 61)
[[32]
 [32]
 [32]]
[32]
(510, 3, 1, 61)
[[57]
 [57]
 [57]]
[57]
(427, 3, 1, 61)
[[ 8]
 [31]
 [ 8]]
[8]
(258, 3, 1, 61)
[[23]
 [23]
 [23]]
[23]
(305, 3, 1, 61)
[[ 1]
 [30]
 [31]]
[1]
(232, 3, 1, 61)
[[51]
 [51]
 [51]]
27
(305, 3, 1, 61)
[[30]
 [32]
 [32]]
[30]
(345, 3, 1,

(368, 3, 1, 61)
[[31]
 [31]
 [31]]
[31]
(356, 3, 1, 61)
[[32]
 [32]
 [32]]
[32]
(438, 3, 1, 61)
[[43]
 [43]
 [43]]
[43]
(267, 3, 1, 61)
[[41]
 [41]
 [41]]
[41]
(172, 3, 1, 61)
[[31]
 [31]
 [31]]
[31]
(352, 3, 1, 61)
[[49]
 [49]
 [49]]
[49]
(207, 3, 1, 61)
[[14]
 [27]
 [27]]
27
(261, 3, 1, 61)
[[30]
 [30]
 [30]]
[30]
(293, 3, 1, 61)
[[45]
 [45]
 [45]]
27
(256, 3, 1, 61)
[[1]
 [1]
 [1]]
[1]
(296, 3, 1, 61)
[[32]
 [31]
 [32]]
[32]
(359, 3, 1, 61)
[[32]
 [49]
 [32]]
[32]
(201, 3, 1, 61)
[[30]
 [36]
 [36]]
[30]
(381, 3, 1, 61)
[[32]
 [32]
 [32]]
[32]
(223, 3, 1, 61)
[[1]
 [1]
 [1]]
[1]
(249, 3, 1, 61)
[[11]
 [11]
 [11]]
[11]
(339, 3, 1, 61)
[[27]
 [27]
 [27]]
27
(267, 3, 1, 61)
[[1]
 [1]
 [1]]
[1]
(306, 3, 1, 61)
[[29]
 [29]
 [32]]
[29]
(306, 3, 1, 61)
[[10]
 [10]
 [10]]
27
(284, 3, 1, 61)
[[41]
 [41]
 [41]]
[41]
(355, 3, 1, 61)
[[32]
 [32]
 [32]]
[32]
(375, 3, 1, 61)
[[ 9]
 [43]
 [ 9]]
[9]
(270, 3, 1, 61)
[[ 3]
 [47]
 [47]]
[3]
(460, 3, 1, 61)
[[29]
 [29]
 [29]]
[29]
(271, 3, 1, 61)
[[59]


[2]
(349, 3, 1, 61)
[[59]
 [59]
 [59]]
[59]
(241, 3, 1, 61)
[[30]
 [30]
 [30]]
[30]
(494, 3, 1, 61)
[[50]
 [50]
 [50]]
[50]
(287, 3, 1, 61)
[[3]
 [3]
 [3]]
[3]
(234, 3, 1, 61)
[[30]
 [16]
 [16]]
[30]
(500, 3, 1, 61)
[[47]
 [47]
 [47]]
[47]


In [199]:
def flatten(x):
    result = []
    for el in x:
        if hasattr(el, "__iter__") and not isinstance(el, str):
            result.extend(flatten(el))
        else:
            result.append(el)
    return result

In [200]:
y_true = list(flatten(y_true))
y_pred = list(flatten(y_pred))

In [201]:
y_true_p = [PHONEMES[x] for x in y_true]
y_pred_p = [PHONEMES[x] for x in y_pred]

In [202]:
len(y_pred_p)

496810

In [203]:
y_true_p_mapped = [map_phoneme(x) for x in y_true_p]
y_pred_p_mapped = [map_phoneme(x) for x in y_pred_p]

In [204]:
sklearn.metrics.accuracy_score(y_true_p, y_pred_p)

0.6417926370242145

In [205]:
sklearn.metrics.accuracy_score(y_true_p_mapped, y_pred_p_mapped)

0.7796461423884382