In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F  
from DNN_utils import (flatten) 
from torch.utils.data import Dataset, DataLoader, sampler

import numpy as np 
import pandas as pd
from joblib import load 

# Import helper functions.
import mfcc_label 
import get_prob


df_train = pd.read_csv('processed_data/dnn_never_train.csv')
df_test = pd.read_csv('processed_data/dnn_never_test.csv')

def column_str_to_numpy(df, colname:str):
    # Given pd.DataFrame df, convert the column colname from string to numpy array.
    if isinstance(df.iloc[0][colname], str):
        df[colname]=df[colname].apply(lambda x: np.fromstring(x[1:-1], sep=' '))

column_str_to_numpy(df_train, 'mfcc')
column_str_to_numpy(df_train, 'label')
column_str_to_numpy(df_test, 'mfcc')
column_str_to_numpy(df_test, 'label')

# Configurations 
NUM_TRAIN = int(0.8*len(df_train)) # Number of training examples for splitting training and validation datasets. 
NUM_ROWS = len(df_train)
device = 'cpu'
dtype = torch.float32
print_every = 50

# DNN Architecture Hyperparameters
minibatch_size = 512

In [3]:
isinstance(df_test.iloc[0]['mfcc'],str)
print(type(df_test.iloc[0]['label']))
# If the type is str, convert it

display(df_test) 

<class 'numpy.ndarray'>


Unnamed: 0,mfcc,label,state_weights
0,"[-861.406067, 25.538353, 28.6545143, 1.8917164...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",{'h#': 1.0}
1,"[-824.945, 20.405128, 15.508747, -2.5022912, 2...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",{'h#': 1.0}
2,"[-818.1322, 16.536594, 10.08112, -9.460713, 13...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",{'h#': 1.0}
3,"[-816.3057, 14.217917, 15.505737, -9.549528, 1...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",{'h#': 1.0}
4,"[-809.08112, 17.429758, 14.575484, -10.453241,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",{'h#': 1.0}
...,...,...,...
17244,"[-719.52673, 54.290421, -17.710644, -19.354218...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",{'h#': 1.0}
17245,"[-717.0269, 61.623383, -12.215822, -7.030402, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",{'h#': 1.0}
17246,"[-713.20428, 57.323204, -11.463234, -5.8994236...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",{'h#': 1.0}
17247,"[-718.0743, 46.839127, -18.039227, -11.385343,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",{'h#': 1.0}


In [4]:
load('processed_data/train_test_dataset_never.joblib')['test'] 

[('timit/data/TRAIN/DR4/MDCD0/SX425.WAV',
  'timit/data/TRAIN/DR4/MDCD0/SX425.PHN',
  'timit/data/TRAIN/DR4/MDCD0/SX425.WRD'),
 ('timit/data/TRAIN/DR4/MJLS0/SX376.WAV',
  'timit/data/TRAIN/DR4/MJLS0/SX376.PHN',
  'timit/data/TRAIN/DR4/MJLS0/SX376.WRD'),
 ('timit/data/TRAIN/DR1/FDAW0/SX146.WAV',
  'timit/data/TRAIN/DR1/FDAW0/SX146.PHN',
  'timit/data/TRAIN/DR1/FDAW0/SX146.WRD'),
 ('timit/data/TRAIN/DR7/MREM0/SX61.WAV',
  'timit/data/TRAIN/DR7/MREM0/SX61.PHN',
  'timit/data/TRAIN/DR7/MREM0/SX61.WRD'),
 ('timit/data/TRAIN/DR6/MBMA1/SI2214.WAV',
  'timit/data/TRAIN/DR6/MBMA1/SI2214.PHN',
  'timit/data/TRAIN/DR6/MBMA1/SI2214.WRD'),
 ('timit/data/TRAIN/DR4/MDMA0/SX440.WAV',
  'timit/data/TRAIN/DR4/MDMA0/SX440.PHN',
  'timit/data/TRAIN/DR4/MDMA0/SX440.WRD'),
 ('timit/data/TRAIN/DR4/FALR0/SX425.WAV',
  'timit/data/TRAIN/DR4/FALR0/SX425.PHN',
  'timit/data/TRAIN/DR4/FALR0/SX425.WRD'),
 ('timit/data/TEST/DR1/MDAB0/SI1039.WAV',
  'timit/data/TEST/DR1/MDAB0/SI1039.PHN',
  'timit/data/TEST/DR1/MDAB

In [5]:
class DNN_FC(nn.Module):
    def __init__(self, input_size, num_classes):
        super().__init__()
        # assign layer objects to class attributes
        # We may write a loop if we use the same activation function for all layers.
        self.fc1 = nn.Linear(input_size, input_size)
        nn.init.kaiming_normal_(self.fc1.weight)
        self.fc2 = nn.Linear(input_size, input_size)
        nn.init.kaiming_normal_(self.fc2.weight) 
        self.fc3 = nn.Linear(input_size, input_size)
        nn.init.kaiming_normal_(self.fc3.weight)
        self.fc4 = nn.Linear(input_size, input_size)
        nn.init.kaiming_normal_(self.fc4.weight)
        self.fc5 = nn.Linear(input_size, num_classes)
        nn.init.kaiming_normal_(self.fc5.weight)
    
    def forward(self, x):
        x_temp = x
        x_temp = flatten(x_temp)
        x_temp = F.relu(self.fc1(x_temp))
        x_temp = F.relu(self.fc2(x_temp))
        x_temp = F.relu(self.fc3(x_temp))
        x_temp = F.relu(self.fc4(x_temp))
        scores = self.fc5(x_temp)
        return scores


def test_DNN_FC():
    input_size = 20  # Feature dimension for mfcc
    num_classes = 11 # Number of phoneme classes
    dtype = torch.float32
    x = torch.zeros((minibatch_size, input_size), dtype=dtype)  # minibatch size 64, feature dimension 20
    model = DNN_FC(input_size, num_classes)
    scores = model(x)
    print(scores.size())  # you should see [minibatch_size, num_classes]
test_DNN_FC()

torch.Size([512, 11])


In [6]:
#mfcc = df_train.iloc[0:5]['mfcc']
#mfcc = np.vstack(mfcc)
#mfcc 


labels = df_train.iloc[0:5]['label']
labels = np.vstack(labels)
display(labels)

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]])

In [7]:
# Convert dataset into a format that torch can read.
class CustomDataset(Dataset):
    def __init__(self, dataframe, transform=None, train=True):
        self.df = dataframe
        self.transform = transform
        self.train = train

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Get the rows that are selected by the idx.
        mfcc = self.df.iloc[idx]['mfcc']
        label = self.df.iloc[idx]['label']

        # Stack the rows for mfcc and label.
        # Stack a list of (1,n) dimensional np.ndarrays into (m,n) dimensional np.ndarray. 
        mfcc = np.vstack(mfcc)
        label = np.vstack(label)

        # Convert 2 dimensional np.ndarrays into torch tensors.
        mfcc = torch.tensor(mfcc, dtype=torch.float32)
        label = torch.tensor(label, dtype=torch.long)
        
        if self.transform:
            mfcc = self.transform(mfcc)

        return mfcc, label

# Create an instance of your dataset with your DataFrame
dataset_train = CustomDataset(df_train,train=True)  # Assuming df is your pandas DataFrame
dataset_test = CustomDataset(df_test,train=False)

# Create the DataLoader to handle batching
loader_train = DataLoader(dataset_train, batch_size=minibatch_size,
                          sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN)))

# Create the DataLoader to handle batching
loader_val = DataLoader(dataset_train, batch_size=1,
                        sampler=sampler.SequentialSampler(range(NUM_TRAIN, len(df_train))))

loader_test = DataLoader(dataset_test, batch_size=1,
                        sampler=sampler.SequentialSampler(range(len(df_test))))


display(dataset_test.__getitem__(range(5))) 

(tensor([[-8.6141e+02,  2.5538e+01,  2.8655e+01,  1.8917e+00,  2.0691e+01,
           1.7659e+01,  1.6280e+01,  1.7272e+00,  1.8124e+01,  8.3629e+00,
           1.0131e+01,  6.0546e+00, -3.3920e+00,  3.4839e+00,  1.1448e+01,
           5.1237e-01,  6.1350e+00,  3.2503e+00,  9.1858e+00,  1.1667e+01],
         [-8.2495e+02,  2.0405e+01,  1.5509e+01, -2.5023e+00,  2.0970e+01,
           1.8829e+01,  5.9126e+00,  3.8527e+00,  1.6322e+01,  6.7955e+00,
           1.4075e+01,  6.0702e+00, -6.9586e+00,  3.7661e+00,  1.0885e+01,
           8.5535e+00,  1.2061e+01,  4.0659e+00,  6.1298e+00,  1.0815e+01],
         [-8.1813e+02,  1.6537e+01,  1.0081e+01, -9.4607e+00,  1.3550e+01,
           1.4193e+01,  1.7715e+00,  9.0556e+00,  2.0455e+01,  9.8571e+00,
           1.8602e+01,  9.4670e+00, -6.6779e+00,  2.9148e+00,  5.4056e+00,
           1.0185e+01,  8.6857e+00,  2.8653e+00,  4.7726e+00,  7.7956e+00],
         [-8.1631e+02,  1.4218e+01,  1.5506e+01, -9.5495e+00,  1.0859e+01,
           5.1730e+00,

In [8]:
# Example code to print the contents of the first few batches in loader_train
for i, (inputs, labels) in enumerate(loader_test):
    print(f"Batch {i + 1}")
    print("Features (MFCCs) size:", inputs.size())
    print("Labels size:", labels.size())
    print("\n")
    
    # Optional: Stop after a few batches to avoid flooding the output
    if i == 2:  # Adjust this number based on how many batches you want to see
        break

Batch 1
Features (MFCCs) size: torch.Size([1, 20, 1])
Labels size: torch.Size([1, 14, 1])


Batch 2
Features (MFCCs) size: torch.Size([1, 20, 1])
Labels size: torch.Size([1, 14, 1])


Batch 3
Features (MFCCs) size: torch.Size([1, 20, 1])
Labels size: torch.Size([1, 14, 1])




In [9]:
def check_accuracy(loader, model):
    if loader.dataset.train:
        print('Checking accuracy on validation set')
    else:
        print('Checking accuracy on test set')
    num_correct = 0
    num_samples = 0
    model.eval()  # set model to evaluation mode
    with torch.no_grad():
        for x, y in loader:
            y = flatten(y) # Flatten y to convert dimension from (Nx1) to (N,)
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=dtype) 
            scores = model(x) 
            _, preds = scores.max(1) 
            true_class = y.argmax(dim=1) # True class is the one that has the highest probability in the data.
            num_correct += (preds == true_class).sum()
            num_samples += preds.size(0)
        acc = float(num_correct) / num_samples
        print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc))
    return acc 
        

In [10]:
device = 'cpu'
dtype = torch.float32

def train(model, optimizer, epochs=1):
    """
    Train the model using the PyTorch Module API.

    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model
    - epochs: (Optional) A Python integer giving the number of epochs to train for

    Returns: Nothing, but prints model accuracies during training.
    """
    model = model.to(device=device)  # move the model parameters to CPU/GPU
    accuracy_val_lst = []
    accuracy_cal_max = 0
    for e in range(epochs):
        for t, (x, y) in enumerate(loader_train):
            model.train()  # put model to training mode
            y = flatten(y) # Flatten y to convert the dimension from (Nx1) to (N,)
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=dtype)

            scores = model(x)
            criterion = nn.BCEWithLogitsLoss()
            loss = criterion(scores, y)

            # Zero out all of the gradients for the variables which the optimizer
            # will update.
            optimizer.zero_grad()

            # This is the backwards pass: compute the gradient of the loss with
            # respect to each  parameter of the model.
            loss.backward()

            # Actually update the parameters of the model using the gradients
            # computed by the backwards pass.
            optimizer.step()
            
            if t % print_every == 0:
                print('Iteration %d, loss = %.4f' % (t, loss.item()))
                print()
                accuracy_val = check_accuracy(loader_val, model)
                if accuracy_val > accuracy_cal_max:
                    accuracy_cal_max = accuracy_val
                    model_params = model.state_dict()
                accuracy_val_lst.append((t,accuracy_val))
        
    print('Training is complete. Accuracies on the validation set are:') 
    print(accuracy_val_lst)
    return 
    
    


In [11]:
input_size = len(df_train['mfcc'][0])
num_classes = len(df_train['label'][0])
learning_rate = 1e-2
model = DNN_FC(input_size, num_classes)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
train(model, optimizer,epochs = 10) 

# Accuracy on the validation set: 72.57%. 

Iteration 0, loss = 38.2585

Checking accuracy on validation set
Got 2 / 3099 correct (0.06)
Iteration 0, loss = 0.1570

Checking accuracy on validation set
Got 1694 / 3099 correct (54.66)
Iteration 0, loss = 0.0808

Checking accuracy on validation set
Got 1980 / 3099 correct (63.89)
Iteration 0, loss = 0.0716

Checking accuracy on validation set
Got 2001 / 3099 correct (64.57)
Iteration 0, loss = 0.0757

Checking accuracy on validation set
Got 1996 / 3099 correct (64.41)
Iteration 0, loss = 0.0737

Checking accuracy on validation set
Got 2022 / 3099 correct (65.25)
Iteration 0, loss = 0.0835

Checking accuracy on validation set
Got 1997 / 3099 correct (64.44)
Iteration 0, loss = 0.0751

Checking accuracy on validation set
Got 2031 / 3099 correct (65.54)
Iteration 0, loss = 0.0739

Checking accuracy on validation set
Got 2023 / 3099 correct (65.28)
Iteration 0, loss = 0.0686

Checking accuracy on validation set
Got 2026 / 3099 correct (65.38)
Training is complete. Accuracies on the val

In [12]:
def infer_probabilities(loader, model):
    if loader.dataset.train:
        print('Getting estimated probabilities on validation set')
    else:
        print('Getting estimated probabilities on test set') 
    model.eval()  # set model to evaluation mode
    probabilities_dict = {} 
    batch_size = loader.batch_size
    with torch.no_grad():
        for idx, (x, y) in enumerate(loader):
            y = flatten(y) # Flatten y to convert dimension from (Nx1) to (N,)
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=dtype) 
            scores = model(x) 
            probabilities = torch.softmax(scores, dim=1) 
            
            # Save the probabilities with the corresponding row index
            for i in range(len(probabilities)):
                probabilities_dict[idx * batch_size + i] = probabilities[i].numpy()
    
    return probabilities_dict


def find_emission(loader, model):
    '''
    Find emission probabilities for a given data loader and model.
    Consider changing this function if it takes too long. Currently: O(n)
    '''
    # Get the inferred probabilities for each class (12 states, background and silence)
    probabilities_dict = infer_probabilities(loader, model) 
    emission = probabilities_dict
    # Get the prior vector and the transition probabilities. We don't need the transition probabilities.
    prior_vector, _ = get_prob.main(rerun=False) 

    # For each key=row_idx and val=prob_array, convert the inferred probabilities into emission.
    for key, val in emission.items():
        # Slice val to exclude the probabilities for background and silence.
        log_prob = np.where(val > 0, np.log(val), -np.inf)   # Get the log probabilities. 
        log_prob = log_prob[:-2]  # Exclude the background and silence in the emission probability calculation. 
        emission[key] = [log_prob-prior_vector]  # Divide by prior vector in the log space. 
    
    emission_df = pd.DataFrame.from_dict(emission, orient='index', columns=['Emission']) 
    return emission_df 

In [13]:
estimate_prob = infer_probabilities(loader_test, model)
emission_data = find_emission(loader_test, model)

prior_vector, _ = get_prob.main(rerun=False)


Getting estimated probabilities on test set
Getting estimated probabilities on test set


In [14]:
def path_to_emission(file_path_wav: str, file_path_phn: str):
    '''
    Given the path of a file, get the emission probabilities.
    Args:
        file_path: Path of the audio file as a string.
    Returns:
        emit: pd.dataframe
            Emission probabilities for each frame in the audio file.
    '''
    df_test = mfcc_label.prepare_data(file_path_phn,file_path_wav)
    column_str_to_numpy(df_test, 'mfcc')
    column_str_to_numpy(df_test, 'label')
    # Convert dataframe into a loader so that torch can work with.
    dataset_test = CustomDataset(df_test,train=False)
    loader_test = DataLoader(dataset_test, batch_size=1,
                        sampler=sampler.SequentialSampler(range(len(df_test))))

    emission_data = find_emission(loader_test, model)
    return emission_data
    

path_to_emission('timit/data/TRAIN/DR4/MDCD0/SX425.WAV','timit/data/TRAIN/DR4/MDCD0/SX425.PHN')

Getting estimated probabilities on test set


Unnamed: 0,Emission
0,"[-15.582437007565192, -24.97697559242876, -15...."
1,"[-21.774905650753666, -28.727770956808644, -13..."
2,"[-22.352752177853276, -28.232329520041066, -12..."
3,"[-24.73541208996265, -31.144221457296926, -14...."
4,"[-23.565106837887456, -30.57955470925005, -14...."
...,...
402,"[-14.78414294018238, -23.28407588844927, -15.1..."
403,"[-11.245132892269782, -19.383465918356496, -12..."
404,"[-9.937634914059332, -17.298281821066457, -10...."
405,"[-12.550544231075934, -20.39851108437212, -12...."


In [17]:
def get_emission_all_paths(path_type: str = 'test'):
    paths = load('processed_data/train_test_dataset_never.joblib')[path_type]
    data = {}
    for i in range(len(paths)):
        file_path_wav, file_path_phn, file_path_word = paths[i]
        emission_data = path_to_emission(file_path_wav, file_path_phn)
        data[(file_path_wav, file_path_phn, file_path_word)] = emission_data

    return data
data = get_emission_all_paths() 

from joblib import dump
dump(data, "processed_data/test_data_for_hmm.joblib")

Getting estimated probabilities on test set
Getting estimated probabilities on test set
Getting estimated probabilities on test set
Getting estimated probabilities on test set
For start 49040 and end 49440, there is no time-overlapping row.
Caution: There is no time-overlapping rows for start 49040 and end 49440
Getting estimated probabilities on test set
Getting estimated probabilities on test set
Getting estimated probabilities on test set
For start 62960 and end 63360, there is no time-overlapping row.
Caution: There is no time-overlapping rows for start 62960 and end 63360
Getting estimated probabilities on test set
Getting estimated probabilities on test set
For start 43840 and end 44240, there is no time-overlapping row.
Caution: There is no time-overlapping rows for start 43840 and end 44240
Getting estimated probabilities on test set
For start 60640 and end 61040, there is no time-overlapping row.
Caution: There is no time-overlapping rows for start 60640 and end 61040
Getting 

['processed_data/test_data_for_hmm.joblib']