In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F  
from DNN_utils import (flatten) 
from torch.utils.data import Dataset, DataLoader, sampler

import numpy as np 
import pandas as pd

# Import helper functions.
import mfcc_label 
import get_prob


df_train = pd.read_csv('processed_data/dnn_never_train.csv')

if isinstance(df_train.iloc[0]['mfcc'], str):
    df_train['mfcc'] = df_train['mfcc'].apply(lambda x: np.fromstring(x[1:-1], sep=' '))
if isinstance(df_train.iloc[0]['label'], str):
    df_train['label'] = df_train['label'].apply(lambda x: np.fromstring(x[1:-1], sep=' '))


# Configurations 
NUM_TRAIN = int(0.8*len(df_train)) # Number of training examples for splitting training and validation datasets. 
NUM_ROWS = len(df_train)
device = 'cpu'
dtype = torch.float32
print_every = 50

# DNN Architecture Hyperparameters
minibatch_size = 512

In [3]:
from joblib import load
load('processed_data/train_test_dataset_never.joblib')['test']

[('timit/data/TRAIN/DR3/FLTM0/SX440.WAV',
  'timit/data/TRAIN/DR3/FLTM0/SX440.PHN',
  'timit/data/TRAIN/DR3/FLTM0/SX440.WRD'),
 ('timit/data/TRAIN/DR1/FDAW0/SX146.WAV',
  'timit/data/TRAIN/DR1/FDAW0/SX146.PHN',
  'timit/data/TRAIN/DR1/FDAW0/SX146.WRD'),
 ('timit/data/TRAIN/DR3/MDBB1/SX376.WAV',
  'timit/data/TRAIN/DR3/MDBB1/SX376.PHN',
  'timit/data/TRAIN/DR3/MDBB1/SX376.WRD'),
 ('timit/data/TRAIN/DR8/FMBG0/SX440.WAV',
  'timit/data/TRAIN/DR8/FMBG0/SX440.PHN',
  'timit/data/TRAIN/DR8/FMBG0/SX440.WRD'),
 ('timit/data/TRAIN/DR2/FMMH0/SI907.WAV',
  'timit/data/TRAIN/DR2/FMMH0/SI907.PHN',
  'timit/data/TRAIN/DR2/FMMH0/SI907.WRD'),
 ('timit/data/TRAIN/DR3/FEME0/SX425.WAV',
  'timit/data/TRAIN/DR3/FEME0/SX425.PHN',
  'timit/data/TRAIN/DR3/FEME0/SX425.WRD'),
 ('timit/data/TRAIN/DR5/MCLM0/SX376.WAV',
  'timit/data/TRAIN/DR5/MCLM0/SX376.PHN',
  'timit/data/TRAIN/DR5/MCLM0/SX376.WRD'),
 ('timit/data/TRAIN/DR7/MDLC1/SI2065.WAV',
  'timit/data/TRAIN/DR7/MDLC1/SI2065.PHN',
  'timit/data/TRAIN/DR7/M

In [4]:
class DNN_FC(nn.Module):
    def __init__(self, input_size, num_classes):
        super().__init__()
        # assign layer objects to class attributes
        # We may write a loop if we use the same activation function for all layers.
        self.fc1 = nn.Linear(input_size, input_size)
        nn.init.kaiming_normal_(self.fc1.weight)
        self.fc2 = nn.Linear(input_size, input_size)
        nn.init.kaiming_normal_(self.fc2.weight) 
        self.fc3 = nn.Linear(input_size, input_size)
        nn.init.kaiming_normal_(self.fc3.weight)
        self.fc4 = nn.Linear(input_size, input_size)
        nn.init.kaiming_normal_(self.fc4.weight)
        self.fc5 = nn.Linear(input_size, num_classes)
        nn.init.kaiming_normal_(self.fc5.weight)
    
    def forward(self, x):
        x_temp = x
        x_temp = flatten(x_temp)
        x_temp = F.relu(self.fc1(x_temp))
        x_temp = F.relu(self.fc2(x_temp))
        x_temp = F.relu(self.fc3(x_temp))
        x_temp = F.relu(self.fc4(x_temp))
        scores = self.fc5(x_temp)
        return scores


def test_DNN_FC():
    input_size = 20  # Feature dimension for mfcc
    num_classes = 11 # Number of phoneme classes
    dtype = torch.float32
    x = torch.zeros((minibatch_size, input_size), dtype=dtype)  # minibatch size 64, feature dimension 20
    model = DNN_FC(input_size, num_classes)
    scores = model(x)
    print(scores.size())  # you should see [minibatch_size, num_classes]
test_DNN_FC()

torch.Size([512, 11])


In [5]:
print(type(df_train.iloc[0]['label']))
# If the type is str, convert it

display(df_train) 

<class 'numpy.ndarray'>


Unnamed: 0,mfcc,label,state_weights
0,"[-730.505798, 45.0450668, -24.2304726, -16.595...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",{'h#': 1.0}
1,"[-714.74677, 51.683174, -20.392345, -13.274165...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",{'h#': 1.0}
2,"[-720.12866, 46.353207, -18.542915, -15.895822...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",{'h#': 1.0}
3,"[-726.85284, 38.16919, -21.354225, -16.98072, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",{'h#': 1.0}
4,"[-733.83295, 39.958935, -22.630146, -19.272156...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",{'h#': 1.0}
...,...,...,...
15488,"[-773.1762, 4.0019016, 2.5322413, 2.8646927, 3...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",{'h#': 1.0}
15489,"[-773.2592, 4.076532, 2.7111504, 3.3019888, 4....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",{'h#': 1.0}
15490,"[-774.35565, 3.5023093, 3.4348507, 3.9783638, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",{'h#': 1.0}
15491,"[-774.29865, 4.247159, 3.8669136, 3.643158, 3....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",{'h#': 1.0}


In [6]:
#mfcc = df_train.iloc[0:5]['mfcc']
#mfcc = np.vstack(mfcc)
#mfcc 


labels = df_train.iloc[0:5]['label']
labels = np.vstack(labels)
display(labels)

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]])

In [7]:
# Convert dataset into a format that torch can read.
class CustomDataset(Dataset):
    def __init__(self, dataframe, transform=None, train=True):
        self.df = dataframe
        self.transform = transform
        self.train = train

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Get the rows that are selected by the idx.
        mfcc = self.df.iloc[idx]['mfcc']
        label = self.df.iloc[idx]['label']

        # Stack the rows for mfcc and label.
        # Stack a list of (1,n) dimensional np.ndarrays into (m,n) dimensional np.ndarray. 
        mfcc = np.vstack(mfcc)
        label = np.vstack(label)

        # Convert 2 dimensional np.ndarrays into torch tensors.
        mfcc = torch.tensor(mfcc, dtype=torch.float32)
        label = torch.tensor(label, dtype=torch.long)
        
        if self.transform:
            mfcc = self.transform(mfcc)

        return mfcc, label

# Create an instance of your dataset with your DataFrame
dataset_train = CustomDataset(df_train,train=True)  # Assuming df is your pandas DataFrame

# Create the DataLoader to handle batching
loader_train = DataLoader(dataset_train, batch_size=minibatch_size,
                          sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN)))

# Create the DataLoader to handle batching
loader_val = DataLoader(dataset_train, batch_size=1,
                        sampler=sampler.SequentialSampler(range(NUM_TRAIN, NUM_ROWS)))
display(dataset_train.__getitem__(range(5))) 

(tensor([[-7.3051e+02,  4.5045e+01, -2.4230e+01, -1.6596e+01,  7.8798e+00,
           6.0388e-01, -1.2444e+01,  1.5479e+00,  1.8096e+01,  1.0397e+01,
           9.5008e-01,  6.5883e+00,  1.3456e+01,  4.3042e+00, -9.5567e+00,
          -5.4092e+00,  2.0956e+00,  1.1521e+00,  4.0445e+00,  4.9519e+00],
         [-7.1475e+02,  5.1683e+01, -2.0392e+01, -1.3274e+01, -3.0518e+00,
          -3.8946e+00, -7.8653e+00,  4.2943e+00,  1.9027e+01,  5.3817e+00,
          -7.3529e+00,  3.3265e+00,  9.8732e+00,  4.8012e+00, -1.0586e+01,
          -8.3340e+00,  1.0399e+01,  1.2690e+01,  1.1761e+01,  8.7251e+00],
         [-7.2013e+02,  4.6353e+01, -1.8543e+01, -1.5896e+01, -1.6697e+01,
          -6.0928e+00, -1.3419e+00,  5.4095e+00,  2.1859e+01,  5.5194e+00,
          -1.4134e+01,  5.9534e-01,  9.5893e+00,  5.5199e+00, -6.0089e+00,
          -5.3524e+00,  1.0020e+01,  1.4292e+01,  1.3084e+01,  5.8263e+00],
         [-7.2685e+02,  3.8169e+01, -2.1354e+01, -1.6981e+01, -1.2307e+01,
          -1.3834e+00,

In [8]:
# Example code to print the contents of the first few batches in loader_train
for i, (inputs, labels) in enumerate(loader_train):
    print(f"Batch {i + 1}")
    print("Features (MFCCs) size:", inputs.size())
    print("Labels size:", labels.size())
    print("\n")
    
    # Optional: Stop after a few batches to avoid flooding the output
    if i == 2:  # Adjust this number based on how many batches you want to see
        break

Batch 1
Features (MFCCs) size: torch.Size([512, 20, 1])
Labels size: torch.Size([512, 14, 1])


Batch 2
Features (MFCCs) size: torch.Size([512, 20, 1])
Labels size: torch.Size([512, 14, 1])


Batch 3
Features (MFCCs) size: torch.Size([512, 20, 1])
Labels size: torch.Size([512, 14, 1])




In [9]:
def check_accuracy(loader, model):
    if loader.dataset.train:
        print('Checking accuracy on validation set')
    else:
        print('Checking accuracy on test set')
    num_correct = 0
    num_samples = 0
    model.eval()  # set model to evaluation mode
    with torch.no_grad():
        for x, y in loader:
            y = flatten(y) # Flatten y to convert dimension from (Nx1) to (N,)
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=dtype) 
            scores = model(x) 
            _, preds = scores.max(1) 
            true_class = y.argmax(dim=1) # True class is the one that has the highest probability in the data.
            num_correct += (preds == true_class).sum()
            num_samples += preds.size(0)
        acc = float(num_correct) / num_samples
        print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc))
    return acc 
        

In [10]:
device = 'cpu'
dtype = torch.float32

def train(model, optimizer, epochs=1):
    """
    Train the model using the PyTorch Module API.

    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model
    - epochs: (Optional) A Python integer giving the number of epochs to train for

    Returns: Nothing, but prints model accuracies during training.
    """
    model = model.to(device=device)  # move the model parameters to CPU/GPU
    accuracy_val_lst = []
    accuracy_cal_max = 0
    for e in range(epochs):
        for t, (x, y) in enumerate(loader_train):
            model.train()  # put model to training mode
            y = flatten(y) # Flatten y to convert the dimension from (Nx1) to (N,)
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=dtype)

            scores = model(x)
            criterion = nn.BCEWithLogitsLoss()
            loss = criterion(scores, y)

            # Zero out all of the gradients for the variables which the optimizer
            # will update.
            optimizer.zero_grad()

            # This is the backwards pass: compute the gradient of the loss with
            # respect to each  parameter of the model.
            loss.backward()

            # Actually update the parameters of the model using the gradients
            # computed by the backwards pass.
            optimizer.step()
            
            if t % print_every == 0:
                print('Iteration %d, loss = %.4f' % (t, loss.item()))
                print()
                accuracy_val = check_accuracy(loader_val, model)
                if accuracy_val > accuracy_cal_max:
                    accuracy_cal_max = accuracy_val
                    model_params = model.state_dict()
                accuracy_val_lst.append((t,accuracy_val))
        
    print('Training is complete. Accuracies on the validation set are:') 
    print(accuracy_val_lst)
    return 
    
    


In [11]:
input_size = len(df_train['mfcc'][0])
num_classes = len(df_train['label'][0])
learning_rate = 1e-2
model = DNN_FC(input_size, num_classes)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
train(model, optimizer,epochs = 10) 

# Accuracy on the validation set: 72.57%. 

Iteration 0, loss = 39.5990

Checking accuracy on validation set
Got 0 / 3099 correct (0.00)
Iteration 0, loss = 0.1602

Checking accuracy on validation set
Got 1910 / 3099 correct (61.63)
Iteration 0, loss = 0.0933

Checking accuracy on validation set
Got 1958 / 3099 correct (63.18)
Iteration 0, loss = 0.0768

Checking accuracy on validation set
Got 2005 / 3099 correct (64.70)
Iteration 0, loss = 0.0695

Checking accuracy on validation set
Got 2027 / 3099 correct (65.41)
Iteration 0, loss = 0.0773

Checking accuracy on validation set
Got 2010 / 3099 correct (64.86)
Iteration 0, loss = 0.0785

Checking accuracy on validation set
Got 2033 / 3099 correct (65.60)
Iteration 0, loss = 0.0734

Checking accuracy on validation set
Got 2029 / 3099 correct (65.47)
Iteration 0, loss = 0.0751

Checking accuracy on validation set
Got 2056 / 3099 correct (66.34)
Iteration 0, loss = 0.0722

Checking accuracy on validation set
Got 2047 / 3099 correct (66.05)
Training is complete. Accuracies on the val

In [20]:
def infer_probabilities(loader, model):
    if loader.dataset.train:
        print('Getting estimated probabilities on validation set')
    else:
        print('Getting estimated probabilities on test set') 
    model.eval()  # set model to evaluation mode
    probabilities_dict = {} 
    with torch.no_grad():
        for idx, (x, y) in enumerate(loader):
            y = flatten(y) # Flatten y to convert dimension from (Nx1) to (N,)
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=dtype) 
            scores = model(x) 
            probabilities = torch.softmax(scores, dim=1) 
            
            # Save the probabilities with the corresponding row index
            for i in range(len(probabilities)):
                probabilities_dict[idx * minibatch_size + i] = probabilities[i].numpy()
    
    return probabilities_dict


def find_emission(loader, model):
    '''
    Find emission probabilities for a given data loader and model.
    Consider changing this function if it takes too long. Currently: O(n)
    '''
    # Get the inferred probabilities for each class (12 states, background and silence)
    probabilities_dict = infer_probabilities(loader, model) 

    # Get the prior vector and the transition probabilities. We don't need the transition probabilities.
    prior_vector, _ = get_prob.main(log_space=False)

    # For each key=row_idx and val=prob_array, convert the inferred probabilities into emission.
    for key, val in probabilities_dict.items():
        # Slice val to exclude the probabilities for background and silence.
        probabilities_dict[key] = probabilities_dict[key][:-2]/prior_vector  
    
    return probabilities_dict



# File id: Keep path of the audio file, and probabilities with size (num_frames, num_classes).  
        

In [21]:
estimate_prob = infer_probabilities(loader_val, model)
emission_data = find_emission(loader_val, model)

prior_vector, _ = get_prob.main(log_space=False)



print(estimate_prob[0][:-2]/prior_vector)
print(emission_data[0])
print('THERE IS A NUMERICAL ERROR. CHECK THIS.')


Getting estimated probabilities on validation set
Getting estimated probabilities on validation set


KeyError: 'log_prior'