In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pathlib import Path
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [4]:
# This code is from https://fehiepsi.github.io/blog/grapheme-to-phoneme/
# Based on https://github.com/SeanNaren/deepspeech.pytorch/blob/master/decoder.py.
import Levenshtein  # https://github.com/ztane/python-Levenshtein/


def phoneme_error_rate(p_seq1, p_seq2):
    """Return the phoneme error rate (PER) of ``p_seq1`` measured against ``p_seq2``.

    Each distinct phoneme is mapped to a single unicode character so that both
    sequences can be compared as strings; the Levenshtein (edit) distance
    between those strings is then divided by the length of ``p_seq2``.

    Args:
        p_seq1: Sequence of hashable phoneme labels (list, tuple or 1-D numpy
            array) to score.
        p_seq2: Sequence of hashable phoneme labels used as the reference; its
            length is the normaliser.

    Returns:
        float: edit distance between the two sequences divided by
        ``len(p_seq2)``.

    Raises:
        ZeroDivisionError: if ``p_seq2`` is empty.
    """
    # Materialise both inputs as lists so every sequence type is handled the same way.
    seq1 = list(p_seq1)
    seq2 = list(p_seq2)

    # Build the vocabulary with a set union. `p_seq1 + p_seq2` only concatenates
    # for lists; for numpy arrays it adds element-wise, which yields a set of sums
    # rather than the labels actually present.
    p_vocab = set(seq1) | set(seq2)
    p2c = {p: chr(i) for i, p in enumerate(p_vocab)}

    c_seq1 = ''.join(p2c[p] for p in seq1)
    c_seq2 = ''.join(p2c[p] for p in seq2)
    return Levenshtein.distance(c_seq1, c_seq2) / len(seq2)

In [5]:
# Ordered phoneme inventory. A symbol's position in this list is the integer
# label it is encoded to, and the same position is used to decode it again,
# so the order must not change.
phones = (
    "aa ae ah ay aw b ch d dh dx "
    "eh er ey f g hh ih k iy jh "
    "l m n ng oy ow r s sh t "
    "th uw uh p v w y z sil"
).split()

In [6]:
# Directory containing the saved .npy feature and label arrays.
path = Path("MFCCS2")

# Load the features (X) and labels (y) of each split from disk.
trainX, trainY = (np.load(path / fname) for fname in ("X_train.npy", "y_train.npy"))
testX, testY = (np.load(path / fname) for fname in ("X_test.npy", "y_test.npy"))

In [7]:
# Display the dimensions of the loaded training features as a quick sanity check.
trainX.shape

(539007, 40, 45)

In [8]:
# Integer-encode the phoneme labels and then one-hot encode them.
# From https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/
from keras.utils import to_categorical

# Lookup tables between a phoneme symbol and its index in `phones`.
char_to_int = {c: i for i, c in enumerate(phones)}
int_to_char = {i: c for i, c in enumerate(phones)}

# NOTE: trainY/testY are overwritten here, so this cell cannot be re-run
# without first reloading the label arrays.
trainY = [char_to_int[phone] for phone in trainY]
testY = [char_to_int[phone] for phone in testY]

# Fix the one-hot width to the full phoneme inventory. Without `num_classes`
# the width is inferred from the largest label present in each array, so the
# train and test encodings could end up with different numbers of columns.
trainY = to_categorical(trainY, num_classes=len(phones))
testY = to_categorical(testY, num_classes=len(phones))

Using TensorFlow backend.


In [9]:
# Give every sample a trailing axis of size 1 so the arrays match the
# 3-dimensional per-sample input_shape the convolutional model is built with.
trainX = np.reshape(trainX, (-1, trainX.shape[1], trainX.shape[2], 1))
testX = np.reshape(testX, (-1, testX.shape[1], testX.shape[2], 1))

In [10]:
# Hold out 20% of the training data, using a fixed random_state so the split
# is the same on every run.
# NOTE(review): this rebinds testX/testY, so the arrays loaded from
# X_test.npy / y_test.npy are discarded and every later "test" metric is
# computed on this split of the training data instead - confirm this is
# intended. Re-running the cell also re-splits the already reduced
# trainX/trainY.
trainX, testX, trainY, testY = train_test_split(
        trainX, trainY, test_size=0.20, random_state=42)

In [11]:
from keras.models import Sequential, model_from_json
from keras.layers import Dense, Dropout, Activation, Convolution2D, MaxPooling2D, Flatten
from keras.optimizers import Adam, SGD

In [13]:
# Convolutional classifier: one conv + max-pool stage followed by two
# fully-connected layers and a softmax output over the one-hot label width.
output_dim = len(trainY[0])  # number of columns in the one-hot labels

model = Sequential()

# 150 filters with a 5x5 kernel over the single-channel input.
model.add(Convolution2D(150,
                        (5, 5),
                        input_shape=(trainX.shape[1], trainX.shape[2], 1),
                        activation='relu'))
model.add(MaxPooling2D((6, 6),
                       padding="same"))

model.add(Flatten())

model.add(Dense(1000, activation='relu'))
model.add(Dense(1000, activation='relu'))

# The layer width is passed positionally: `output_dim=` is the legacy Keras 1
# keyword for what is now `units`, and is not accepted by newer releases.
model.add(Dense(output_dim, activation='softmax'))
model.summary()

model.compile(loss='categorical_crossentropy',
              metrics=["accuracy"],
              # Learning rate is passed positionally because its keyword differs
              # between Keras versions (`lr` vs `learning_rate`).
              optimizer=Adam(0.001))

# 10% of the training data is set aside by Keras for per-epoch validation.
stats = model.fit(trainX, trainY,
                  shuffle=True,
                  batch_size=1024,
                  epochs=11,
                  validation_split=0.1,
                  verbose=1)




Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_2 (Conv2D)            (None, 36, 41, 150)       3900      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 6, 7, 150)         0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 6300)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 1000)              6301000   
_________________________________________________________________
dense_5 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
dense_6 (Dense)              (None, 39)                39039     
Total params: 7,344,939
Trainable params: 7,344,939
Non-trainable params: 0
____________________________________________

In [14]:

# Score `model` on testX/testY: PER from the arg-max class of each prediction,
# followed by whatever model.evaluate returns for the compiled loss and metrics.
predictions = model.predict(x=testX, batch_size=512, verbose=1)
print(
    f"PER is : {phoneme_error_rate(predictions.argmax(axis=1), testY.argmax(axis=1))} "
    f"Accuracy is : {model.evaluate(x=testX, y = testY, batch_size=512, verbose=1)}"
)

PER is : 0.461234485445539 Accuracy is : [1.544046977864942, 0.5382367968559265]


In [None]:

# Fully-connected baseline: the input is flattened and passed through dense
# layers, ending in a softmax over the one-hot label width.
output_dim = len(trainY[0])  # number of columns in the one-hot labels
epochs = 10


model2 = Sequential()

model2.add(Flatten())
model2.add(Dense(256, activation='relu'))
model2.add(Dense(256, activation='relu'))
# Layer widths are passed positionally: `output_dim=` is the legacy Keras 1
# keyword for what is now `units`, and is not accepted by newer releases.
model2.add(Dense(output_dim, activation='relu'))
model2.add(Dense(output_dim))
model2.add(Activation('softmax'))

model2.compile(loss='categorical_crossentropy',
               optimizer=Adam(),
               metrics=["accuracy"])

# `epochs=` replaces the legacy Keras 1 keyword `nb_epoch=`.
# NOTE: this rebinds `stats`, replacing the history of any earlier fit.
stats = model2.fit(trainX, trainY,
                   shuffle=True,
                   batch_size=2048,
                   epochs=epochs,
                   validation_split=0.1,
                   verbose=1)
model2.summary()

In [None]:
# Score `model` on testX/testY: PER from the arg-max class of each prediction,
# followed by whatever model.evaluate returns for the compiled loss and metrics.
# NOTE(review): this cell follows the model2 training cell but evaluates
# `model`, not `model2` - confirm that is intended.
predictions = model.predict(x=testX, batch_size=512, verbose=1)
# The closing parenthesis of model.evaluate(...) was missing inside the
# f-string, which made this cell a SyntaxError.
print(
    f"PER is : {phoneme_error_rate(predictions.argmax(axis=1), testY.argmax(axis=1))} "
    f"Accuracy is : {model.evaluate(x=testX, y = testY, batch_size=512, verbose=1)}"
)

In [None]:
# Mainly used to get the confusion matrix
# This code is copied from somewhere, but I can sadly not figure out where that was from.

print("[INFO] evaluating network...")
print(testX[0].shape)
print(trainX[0].shape)
# NOTE(review): the report below is computed from `model`, while the curves
# come from `stats`, which holds the history of the most recent fit call -
# confirm both refer to the network you mean to inspect.
predictions = model.predict(x=testX, batch_size=512, verbose=1)
print(classification_report(testY.argmax(axis=1),
                            predictions.argmax(axis=1)))

# Plot the training and validation accuracy per epoch (loss is not drawn).
# The x-axis is derived from the recorded history rather than the global
# `epochs`, so it always matches the number of epochs stored in `stats` and
# does not depend on another cell having defined `epochs`.
N = np.arange(len(stats.history["accuracy"]))
plt.style.use("ggplot")
plt.figure()
plt.plot(N, stats.history["accuracy"], label="train_acc")
plt.plot(N, stats.history["val_accuracy"], label="val_acc")
plt.title("Training Loss and Accuracy (Simple NN)")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend()

In [None]:

# Score `model2` on testX/testY: PER from the arg-max class of each prediction,
# followed by whatever model2.evaluate returns for the compiled loss and metrics.
predictions = model2.predict(x=testX, batch_size=512, verbose=1)
# The closing parenthesis of model2.evaluate(...) was missing inside the
# f-string, which made this cell a SyntaxError.
print(
    f"PER is : {phoneme_error_rate(predictions.argmax(axis=1), testY.argmax(axis=1))} "
    f"Accuracy is : {model2.evaluate(x=testX, y = testY, batch_size=512, verbose=1)}"
)

### PER of tests
#### CNN Test Result 8.24
0.6514674425125122

0.6436238884925842

0.6488529443740845

#### CNN Horizontal Test Result (40x1) 2.3M
0.6527880430221558

0.6515655517578125

0.6558755040168762

#### Small CNN Horizontal Test Result (40x1) 181k params
0.641758918762207

0.6443377733230591

0.6448642611503601

#### Small CNN 181k params (40x8)
0.6403490900993347

0.6396352052688599

0.6392336487770081

#### DNN Test Result 6,8M
0.6274282336235046

0.6198791861534119

0.6232432126998901

#### DNN 256 * 2 567k
0.5806347727775574

0.5770119428634644

0.575503945350647

In [None]:

# Hard-coded copies of the values recorded in the notes section of this
# notebook, three per model configuration.

# Values listed under "CNN Test Result 8.24".
cnn = [
    0.6514674425125122,
    0.6436238884925842,
    0.6488529443740845,
]

# Values listed under "CNN Horizontal Test Result (40x1) 2.3M".
cnn2 = [
    0.6527880430221558,
    0.6515655517578125,
    0.6558755040168762,
]

# Values listed under "Small CNN 181k params (40x8)".
cnn3 = [
    0.6403490900993347,
    0.6396352052688599,
    0.6392336487770081,
]

# Values listed under "Small CNN Horizontal Test Result (40x1) 181k".
cnn4 = [
    0.641758918762207,
    0.6443377733230591,
    0.6448642611503601,
]

# Values listed under "DNN Test Result 6,8M".
dnn = [
    0.6274282336235046,
    0.6198791861534119,
    0.6232432126998901,
]

# Values listed under "DNN 256 * 2 567k".
dnn2 = [
    0.5806347727775574,
    0.5770119428634644,
    0.575503945350647,
]