In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random

# Features
from feature_extraction import fit_normalizer
from feature_extraction import read_feat_file
#from feature_extraction import readFeatsAndLabsSingleWavFile
from sklearn import preprocessing

# Training and testing data
from sklearn.model_selection import train_test_split
from validation import read_feat_list
from validation import train_val_split

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Models
from net import Net
from net import initialize_weights

# Evaluation
from sklearn.metrics import confusion_matrix
from phone_mapping import phoneme2moa
from confusion_matrix import sort_classes

In [2]:
# Input files (paths are relative to the notebook's working directory)

# Utterance lists, one per data partition; each is read with read_feat_list
train_feat_list = "data/train.txt"
dev_feat_list = "data/dev.txt"
test_feat_list = "data/test.txt"

# Training-set feature file; used to fit the feature normalizer
train_feat_file = "features/mfcc13_train.txt"

In [3]:
# Read the utterance list of each partition (train, dev, test — in that order)
train_list, dev_list, test_list = (
    read_feat_list(list_path)
    for list_path in (train_feat_list, dev_feat_list, test_feat_list)
)

In [4]:
# Combine dev and test sets into one evaluation list.
# The test list is re-read from file instead of extending the existing
# `test_list`, so re-running this cell cannot append the dev set twice.
test_list = read_feat_list(test_feat_list) + dev_list

In [5]:
# Split list of utterances into training and validation sets.
# The split always starts from a fresh read of the full training list, so
# re-running this cell cannot carve a second validation set out of an
# already-reduced `train_list`.
# NOTE(review): 184 is presumably the size of the validation split — confirm
# against train_val_split.
valid_list, train_list = train_val_split(read_feat_list(train_feat_list), 184)

In [6]:
# Normalize features according to training data
# The normalizer is fit from the training-set feature file only.
# NOTE(review): `scaler` is passed to test() later in the notebook but not to
# train()/validate() — confirm those apply the same normalization internally.
scaler = fit_normalizer(train_feat_file)

In [7]:
# Phoneme inventory. The order of this list is the display order used when the
# confusion matrix is sorted and its axes are labelled.
# Rows presumably follow manner of articulation (silence, stops, affricates,
# fricatives, nasals, semivowels, vowels) — confirm against phoneme2moa.
labels = [
    phone
    for group in (
        ("sil",),
        ("b", "d", "g", "p", "t", "k", "dx"),
        ("ch", "jh"),
        ("s", "sh", "z", "zh", "f", "v", "th", "dh", "hh"),
        ("m", "n", "ng"),
        ("l", "r", "w", "y"),
        ("aa", "ae", "ah", "ao", "aw", "ay", "eh", "er", "ey", "ih", "iy", "ow", "oy", "uh", "uw"),
    )
    for phone in group
]

# Encoder mapping phoneme strings to integer class ids and back
le = preprocessing.LabelEncoder()
le.fit(labels)

LabelEncoder()

In [8]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Report which device was selected
print("Running on the GPU" if device.type == "cuda" else "Running on the CPU")

Running on the GPU


In [9]:
# Instantiate the network
# Input width of the first layer.
# TODO: derive this from the feature data instead of hard-coding it.
num_features = 26
# One output unit per class known to the label encoder (41 for `labels`)
num_classes = len(le.classes_)
net = Net(num_features, num_classes)

# Initialize weights
net.apply(initialize_weights)

# Send network to GPU (if applicable); left as the last expression so the
# cell still displays the network summary
net.to(device)

Net(
  (fc1): Linear(in_features=26, out_features=250, bias=True)
  (fc2): Linear(in_features=250, out_features=41, bias=True)
)

In [10]:
from train_and_test import train
from train_and_test import validate
from train_and_test import test

In [11]:
# Training
num_epochs = 10
learn_rate = 1e-5
m = 0.9
# Progress is printed every `log_every` epochs and always on the final epoch
log_every = 10

# Stochastic gradient descent with user-defined learning rate and momentum
optimizer = optim.SGD(net.parameters(), lr=learn_rate, momentum=m)

# Preallocate vectors to hold loss; an entry stays 0 until its epoch has run
train_loss = np.zeros((num_epochs, 1))
valid_loss = np.zeros((num_epochs, 1))

# Training
for epoch in range(num_epochs):
    # Randomly shuffle list of training and validation files (in place)
    random.shuffle(train_list)
    random.shuffle(valid_list)

    # Re-assert training mode every epoch.
    # NOTE(review): validate() may switch the network to eval mode — confirm;
    # setting the mode here keeps training correct either way.
    net.train()

    # Training
    train_loss[epoch] = train(net, optimizer, le, train_list)

    # Validation (written last, so a recorded value marks a completed epoch)
    valid_loss[epoch] = validate(net, le, valid_list)

    if epoch % log_every == 0 or epoch == num_epochs - 1:
        # Index the scalar explicitly: calling float() on a 1-element array
        # is deprecated in recent NumPy releases.
        print("Epoch: {}, Training Loss: {}, Validation Loss {}".
              format(epoch+1, round(float(train_loss[epoch, 0]),3), round(float(valid_loss[epoch, 0]),3)))


Epoch: 1, Training Loss: 3.518, Validation Loss 3.37


KeyboardInterrupt: 

In [None]:
# Loss Curves
# Number of completed epochs. The loss vectors are preallocated with zeros and
# the validation loss is the last value written in an epoch, so non-zero
# entries in `valid_loss` count the finished epochs. Unlike slicing with the
# loop variable `epoch`, this keeps the final epoch of a full run and still
# excludes a partially finished epoch after an interrupt.
num_done = int(np.count_nonzero(valid_loss))

epochs = np.arange(num_done)
plt.plot(epochs, train_loss[:num_done], 'b')
plt.plot(epochs, valid_loss[:num_done], 'r--')
plt.title("Cross Entropy Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend(["Training", "Validation"])
plt.show()

In [None]:
# Testing
summary = test(net, le, scaler, test_list)

# Flatten the label arrays collected in the summary into one vector each
y_true = np.concatenate(summary['y_true'])
y_pred = np.concatenate(summary['y_pred'])

# Calculate confusion matrix over every class known to the encoder, so the
# matrix has one row and column per class even when a class never occurs in
# the test data (the later sort and axis ticks rely on the full class set)
class_ids = np.arange(len(le.classes_))
cm = confusion_matrix(y_true, y_pred, labels=class_ids)

# Calculate accuracy (correct predictions lie on the diagonal)
accuracy = np.trace(cm)/float(np.sum(cm))

# Normalize confusion matrix row-wise; rows of classes without any true
# samples are left at zero instead of dividing by zero
row_totals = cm.sum(axis=1, keepdims=True)
cm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)

print("Accuracy: ", round(accuracy,3))

In [None]:
# Confusion matrix
classes_int = np.arange(len(labels))
# Class names in the encoder's own integer order
classes_str = le.inverse_transform(classes_int)

# Sort confusion matrix in specific order (that of `labels`).
# The result is stored under a new name so that re-running this cell does not
# re-sort an already sorted matrix.
cm_sorted = sort_classes(cm, classes_str, labels)

plt.figure(figsize=(10,10))
plt.imshow(cm_sorted)
plt.title("Percent Correct = {}%".format(round(accuracy*100,1)))
plt.xlabel("Predicted Class")
plt.ylabel("True Class")
plt.xticks(classes_int, labels)
plt.yticks(classes_int, labels)
plt.colorbar()
# Rows are normalized, so fix the colour scale to [0, 1] like the
# manner-of-articulation plot
plt.clim(0, 1)
plt.show()

In [None]:
# Convert phonemes to manner of articulation
moa_true_str = phoneme2moa(le.inverse_transform(np.concatenate(summary['y_true'])))
moa_pred_str = phoneme2moa(le.inverse_transform(np.concatenate(summary['y_pred'])))

# Convert manner of articulation string labels into integer labels.
# The encoder is fit on true and predicted labels together: fitting on the
# true labels alone makes transform() fail whenever the network predicts a
# category that never occurs among the true labels.
le2 = preprocessing.LabelEncoder()
le2.fit(np.concatenate([moa_true_str, moa_pred_str]))
moa_true = le2.transform(moa_true_str)
moa_pred = le2.transform(moa_pred_str)

# Calculate accuracy for manner of articulation (fraction of matching labels)
moa_accuracy = np.mean(moa_true == moa_pred)

In [None]:
# Confusion matrix
# Use every class known to the encoder so matrix size, class names and axis
# ticks always agree
moa_int = np.arange(len(le2.classes_))
moa_str = le2.inverse_transform(moa_int)

cm2 = confusion_matrix(moa_true, moa_pred, labels=moa_int)

# Sort in specific order, restricted to the categories actually present so the
# number of tick labels matches the matrix size
sort_order = ["silence","stop","affricate","fricative","nasal","semivowel","vowel"]
present = set(moa_str)
plot_order = [moa for moa in sort_order if moa in present]
cm2 = sort_classes(cm2, moa_str, plot_order)

# Normalize row-wise; rows without any true samples are left at zero instead
# of dividing by zero
row_totals = cm2.sum(axis=1, keepdims=True)
cm2 = np.divide(cm2, row_totals, out=np.zeros(cm2.shape, dtype=float), where=row_totals != 0)

plt.figure(figsize=(10,10))
plt.imshow(cm2)
plt.title("Percent Correct = {}%".format(round(moa_accuracy*100,1)))
plt.xlabel("Predicted Class")
plt.ylabel("True Class")
plt.xticks(moa_int, plot_order)
plt.yticks(moa_int, plot_order)
plt.colorbar()
plt.clim(0, 1)
plt.show()