# Speech Processing Task

In [1]:
# import essential libraries
from zipfile import ZipFile

# Libraries for NN
import torch
import torch.nn as nn
import torch.nn.functional as F


import time
import numpy as np

cuda = torch.cuda.is_available()
cuda

# questions: how to load data from data source & how to use cuda? & how to use instance? 

True

## 1. Loading data from data files

In [2]:
# read zip files
from zipfile import ZipFile
file_name = "11-785-s20-hw1p2.zip"

# opening the zip file in READ mode 
with ZipFile(file_name, 'r') as zp: 
    # extracting all the files 
    print('Extracting all the files now...') 
    zp.extractall() 
    print('Done!') 

Extracting all the files now...
Done!


In [3]:
# check zipfile object
print(zp.namelist())


['dev.npy', 'dev_labels.npy', 'hw1p2_sample_submission.csv', 'test.npy', 'train.npy', 'train_labels.npy']


In [4]:
# loading all data from files
dev = np.load("dev.npy")
dev_labels = np.load("dev_labels.npy")

test = np.load("test.npy")

train = np.load("train.npy")
train_labels = np.load("train_labels.npy")


In [5]:
# print out and check the shape of training set
print("The number of training samples: ", train.shape)
print("The frames in utterance 1: ",train[0].shape)
train_temp = train

The number of training samples:  (24500,)
The frames in utterance 1:  (477, 40)


## 2. Creat Dataset object

In [6]:
class Dataset():
    def __init__(self, data, labels, context_len, train_mode = True):
        self.labels = labels
        self.context_len = context_len
        self.data = data
        # padding all the utterance
        self.length_list = [0]
        count = 0
        self.train_mode = train_mode
        for i in range(data.shape[0]):
            count += self.data[i].shape[0]
            self.data[i] = np.pad(self.data[i],((context_len,context_len),(0,0)), 'constant', constant_values=0)
            self.length_list.append(count)
    def __len__(self):
        # len should be total frames
        return self.length_list[-1]

    def binary_search(self, target):
        low = 0
        high = len(self.length_list) - 1
        while (low < high - 1):
            mid = (low + high) // 2
            if target > self.length_list[mid]:
                low = mid
            elif target < self.length_list[mid]:
                high = mid
            elif target == self.length_list[mid]:
                # corner case
                low = mid
                break
        # low -- u_idx
        # target - f_idx
        return low, target - self.length_list[low]

    
    def __getitem__(self, idx):
        # convert it to a regular python int.
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # how to get the utterance binary search
            frames = np.zeros((len(idx), (2 * self.context_len + 1) * 40))
            if self.train_mode:
                labels = np.zeros(len(idx))
    
            # idx should list, np.array
            for i in range(len(idx)):
                u_idx, f_idx = self.binary_search(idx[i])

                # change frames and labels
                frames[i] = self.data[u_idx][f_idx:f_idx + 2 * self.context_len + 1].reshape(1, -1)
                if self.train_mode:
                    labels[i] = self.labels[u_idx][f_idx]
        else:
            u_idx, f_idx = self.binary_search(idx)
            
            # change frames and labels
            frames = self.data[u_idx][f_idx:f_idx + 2 * self.context_len + 1].reshape(1, -1).squeeze()
            if self.train_mode:
                labels = self.labels[u_idx][f_idx]
            else:
                labels = []
        # return tensor
        return {"frames": frames, "labels": labels}



## 3. DataLoader
PyTorch provides convenient wrappers in torch.utils.data to load, shuffle, batch, and iterate over data. You should use it whenever possible because it has several advantages:  
1. Bulit-in logic for shuffling and batching. (Don't reinvent wheels.)  
2. Asynchronous data loading. (Load and preprocess data for the next batch while training the current batch.)
3. On-the-fly data preprocessing. (No need to store the whole preprocessed dataset.)    

The basic usage involves two classes: Dataset and Dataloader.  
1. The abstract class Dataset represents a dataset whose samples can be randomly accessed via its __getitem__ method. We can use one of the provided implementation like TensorDataset, or inherit Dataset ourselves and implement loading and preprocessing in __getitem__.  
2. Dataloader turns a Dataset into an iterator of mini-batches, handling shuffling, batching and multiprocessing under the hood.  
Refer to the documentation for more usage and details: https://pytorch.org/docs/stable/data.html

In [7]:
# step 1, read and load the data, use Dataset() function

# step 2, create classes to preprocess the data, or cat the data

# step 3, use DataLoader to shuffle, customize batch size

# step 4, once training, use DataLoader object (iterator)
## looping with two for loops, for epoch & for i in dataloader

## 4. Construct the Network and Define the model
define the network object

To reach the baseline, we use following steps:
Baseline model:
* Layers -> [input_size, 2048, 1024, output_size]
* ReLU activations
* Context size k = 12 frames on both sides
* Adam optimizer, with the default learning rate 1e-3
* Zero padding of k frames on both sides of each utterance
This should get you to around the cutoffs for B.

### note: How much time would the model take to run an epoc
Depends on network size and context and batch size. Could vary from a few minutes to 20 mins.
I would advise that if an epoch is taking you an hour on training data or something similar, there is something wrong with the way you are making batches (dataset, dataloader)

### Input Description
The utterance, context and the sliding window. The sliding window will help differentiate the phoe in a utterance
but considering the fact that the length of the sliding window, we also want to create the context for the model.
the dimension of data (3, Y_x , 40 ) -- means there are three utterance



In [8]:
# Hyperparameters
context_len = 15
input_size = (2 * context_len + 1) * 40
hiddens = [2048, 1024, 512, 256, 256]
output_size  = 138

# Load data into pre-defined datasets.
# Training Data
train_dataset = Dataset(train, train_labels, context_len)

# Dev Data
dev_dataset = Dataset(dev, dev_labels, context_len)

# Test Data
test_dataset = Dataset(test, None, context_len, train_mode = False)



In [9]:
print(train[0].shape)
print((train_dataset[15388712]['frames']))
print((train_dataset[15388712]['labels']))
print(train_dataset.binary_search(9001))

(507, 40)
[-377.28824994 -297.82954848 -329.87172842 ...    0.            0.
    0.        ]
110
(18, 368)


## 5. Model Training

Train the model for several epochs, and validate performance on validation dataset.

In [10]:
# create the model and optimizer, criterion for training
# define the network
# model = MLP(input_size, output_size, hiddens)
# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters(),lr = 0.001)
# device = torch.device("cuda" if cuda else "cpu")
# model.to(device)
# print(model)

In [11]:
def test_model(model, test_loader, criterion):
    with torch.no_grad():
        model.eval()

        running_loss = 0.0
        total_predictions = 0.0
        correct_predictions = 0.0

        for batch_idx, data1 in enumerate(test_loader): 
            
            data = data1['frames'].float().to(device)
            target = data1['labels'].to(device)

            outputs = model(data)

            _, predicted = torch.max(outputs.data, 1)
            total_predictions += target.size(0)
            correct_predictions += (predicted == target).sum().item()

            loss = criterion(outputs, target).detach()
            running_loss += loss.item()


        running_loss /= len(test_loader)
        acc = (correct_predictions/total_predictions)*100.0
        print('Testing Loss: ', running_loss)
        print('Testing Accuracy: ', acc, '%')
        return running_loss, acc
    

In [12]:
# print(model)


In [13]:
train_dataset[15]

{'frames': array([-575.51613546, -390.27229869, -449.75549712, ...,  168.07380721,
         140.17706286,  177.80356023]), 'labels': 24}

In [14]:

def train_epoch(model, train_dataset, criterion, optimizer, batch_size):
    model.train()
    running_loss = 0.0
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
    start_time = time.time()
    
    # change of the learning rate
#     optimizer = torch.optim.Adam(model.parameters(),lr = lr)
    
    # print current lr
    print("Current lr is : " , optimizer.param_groups[0]['lr'])
    for data1 in train_loader:   
        optimizer.zero_grad()   # .backward() accumulates gradients
        data = data1["frames"].float().to(device)
        target = data1["labels"].to(device) # all data & model on same device

        outputs = model(data)
        loss = criterion(outputs, target)
        running_loss += loss.item()
        
        # backward is only applicable after the forward 
        # call, so if you want to call backward again, 
        # it should follow the forward call.
        loss.backward()
        # optimizer step update params after backward
        # based on current gradient, and that is why we
        # should call optimizer.zero_grad() each time
        optimizer.step()
    
    end_time = time.time()
    
    running_loss /= len(train_loader)
    print('Training Loss: ', running_loss, 'Time: ',end_time - start_time, 's')
    return model, running_loss

In [15]:
# import torch.nn as nn
# # MLP model construction
class MLP1(nn.Module):
    def __init__(self,input_size, output_size, hiddens):
        super(MLP1, self).__init__()
        # remember to add the hiddens into nn.ModuleList()
        # it depends on the way you define a network
        self.layers = nn.Sequential(
            nn.Linear(input_size,hiddens[0]),
            nn.ReLU(),
            nn.Linear(hiddens[0],hiddens[1]),
            nn.BatchNorm1d(num_features = hiddens[1], momentum = 0.1),
            nn.ReLU(),
            nn.Linear(hiddens[1],hiddens[2]),
            nn.BatchNorm1d(num_features = hiddens[2], momentum = 0.1),
            nn.ReLU(),
            nn.Linear(hiddens[2],hiddens[3]),
            nn.BatchNorm1d(num_features = hiddens[3], momentum = 0.1),
            nn.ReLU(),
            nn.Linear(hiddens[3],hiddens[4]),
            nn.BatchNorm1d(num_features = hiddens[4], momentum = 0.1),
            nn.ReLU(),
            nn.Linear(hiddens[4],hiddens[5]),
            nn.BatchNorm1d(num_features = hiddens[5], momentum = 0.1),
            nn.ReLU(),
            nn.Linear(hiddens[5],output_size)
        )
        
    def forward(self,X):
#         for i in self.layers:
#             print("X shape is :", X.shape)
        X = self.layers(X)
        self.output = X
        return X
    

In [16]:
# Init the model to find good training method
input_size = 1240
output_size = 138
hiddens = [2048,1024,512,512,256,256]
model1 = MLP1(input_size, output_size, hiddens)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model1.parameters())
device = torch.device("cuda" if cuda else "cpu")
model1.to(device)
print(model1)

# Model settings
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
batch_size = 256
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size = batch_size)

MLP1(
  (layers): Sequential(
    (0): Linear(in_features=1240, out_features=2048, bias=True)
    (1): ReLU()
    (2): Linear(in_features=2048, out_features=1024, bias=True)
    (3): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): ReLU()
    (5): Linear(in_features=1024, out_features=512, bias=True)
    (6): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): ReLU()
    (8): Linear(in_features=512, out_features=512, bias=True)
    (9): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Linear(in_features=512, out_features=256, bias=True)
    (12): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (13): ReLU()
    (14): Linear(in_features=256, out_features=256, bias=True)
    (15): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (16): ReLU()
    (17): Linear(in_features=256, out_f

In [17]:
# Start the training process and inspect changes of parmas
for i in range(20):

    model1,_ = train_epoch(model1, train_dataset, criterion, optimizer, batch_size)
    # evaluate the model
    eval_loss,_  = test_model(model1,dev_loader, criterion)
    scheduler.step(eval_loss)

Current lr is :  0.001
Training Loss:  2.0475332946997704 Time:  521.7506194114685 s
Testing Loss:  1.7618870750295392
Testing Accuracy:  52.10539243248362 %
Current lr is :  0.001
Training Loss:  1.7391971089971494 Time:  521.7650563716888 s
Testing Loss:  1.6719749860691302
Testing Accuracy:  54.379760770364406 %
Current lr is :  0.001
Training Loss:  1.6417104922991188 Time:  523.4633958339691 s
Testing Loss:  1.57223868162343
Testing Accuracy:  56.93422664670127 %
Current lr is :  0.001
Training Loss:  1.5834612108368804 Time:  528.5029680728912 s
Testing Loss:  1.5012852238208958
Testing Accuracy:  58.700631514154324 %
Current lr is :  0.001
Training Loss:  1.5434074055687859 Time:  524.707864522934 s
Testing Loss:  1.4854004507940828
Testing Accuracy:  59.14008724009967 %
Current lr is :  0.001
Training Loss:  1.5132051880607575 Time:  522.5344154834747 s
Testing Loss:  1.4607314886694605
Testing Accuracy:  59.741416556679425 %
Current lr is :  0.001
Training Loss:  1.48866718659

In [18]:
# inspect training params
print(optimizer.param_groups[0]['lr'])
print(scheduler.state_dict())

0.001
{'factor': 0.1, 'min_lrs': [0], 'patience': 10, 'verbose': False, 'cooldown': 0, 'cooldown_counter': 0, 'mode': 'min', 'threshold': 0.0001, 'threshold_mode': 'rel', 'best': 1.366493466011051, 'num_bad_epochs': 0, 'mode_worse': inf, 'eps': 1e-08, 'last_epoch': 19}


In [19]:
# Continue training to get better performance
for i in range(10):
    #################### change the learning rate rule. #####################
    model1,_ = train_epoch(model1, train_dataset, criterion, optimizer, batch_size)
    ########################################################################
    # evaluate the model
    eval_loss,_  = test_model(model1,dev_loader, criterion)
    scheduler.step(eval_loss)


Current lr is :  0.001
Training Loss:  1.3392689984417037 Time:  521.8033604621887 s
Testing Loss:  1.366871906884692
Testing Accuracy:  62.44488307814322 %
Current lr is :  0.001
Training Loss:  1.3337235155165559 Time:  521.2782769203186 s
Testing Loss:  1.3699928627772764
Testing Accuracy:  62.337608532247465 %
Current lr is :  0.001
Training Loss:  1.3281757631321394 Time:  525.1525976657867 s
Testing Loss:  1.362134625390172
Testing Accuracy:  62.66949378251528 %
Current lr is :  0.001
Training Loss:  1.3227056846645544 Time:  528.6545436382294 s
Testing Loss:  1.3644574579528788
Testing Accuracy:  62.45257725246953 %
Current lr is :  0.001
Training Loss:  1.317948128487257 Time:  532.6425511837006 s
Testing Loss:  1.3692855260588905
Testing Accuracy:  62.35447652980901 %
Current lr is :  0.001
Training Loss:  1.3133519661624258 Time:  522.7350435256958 s
Testing Loss:  1.3598318353746877
Testing Accuracy:  62.66949378251528 %
Current lr is :  0.001
Training Loss:  1.3087506615986

In [20]:
# Continue training...
for i in range(3):
    #################### change the learning rate rule. #####################
    model1,_ = train_epoch(model1, train_dataset, criterion, optimizer, batch_size)
    ########################################################################
    # evaluate the model
    eval_loss,_  = test_model(model1,dev_loader, criterion)
    scheduler.step(eval_loss)

Current lr is :  0.001
Training Loss:  1.2924058641059992 Time:  522.4022789001465 s
Testing Loss:  1.3546084003014998
Testing Accuracy:  62.88596641788836 %
Current lr is :  0.001
Training Loss:  1.2883698746579109 Time:  523.164671421051 s
Testing Loss:  1.3575674547277616
Testing Accuracy:  62.83580631987642 %
Current lr is :  0.001
Training Loss:  1.2849656797348963 Time:  524.7252948284149 s
Testing Loss:  1.355825785028212
Testing Accuracy:  62.90638557283128 %


In [21]:
# Change LR manually to get better performance
optimizer = torch.optim.Adam(model1.parameters(), lr=0.0001)
model1,_ = train_epoch(model1, train_dataset, criterion, optimizer, batch_size)

##############################ZZ##########################################
# evaluate the model
eval_loss,_  = test_model(model1,dev_loader, criterion)

Current lr is :  0.0001
Training Loss:  1.2112064915954062 Time:  526.798223733902 s
Testing Loss:  1.3167077033582961
Testing Accuracy:  64.0867015074663 %


In [22]:
# save model dicts for future learning
torch.save(model1.state_dict(), 'mode1.pth')

## Optional: load and resume learning model

In [23]:
# model1 = MLP1(input_size,output_size, hiddens)
# model1.load_state_dict(torch.load('mode1.pth'))
# model1.to(device)

## 6. Dev Dataset

After defining all the procedures, then in this part, generate labels for submission.

In [24]:
test = np.load("test.npy")
test_dataset = Dataset(test, None, context_len,  False)
# makeing predictions and construct labels and stored results into files
import pandas as pd

def predict(model, test_dataset, filename = None):
#     model.eval()
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size = batch_size)
    y_predicts = []
    for data1 in test_loader:  
#         print(data1)
        data = data1['frames'].float().to(device)

        outputs = model(data)

        # make predictions
        _, predicted = torch.max(outputs.data, 1)
        y_predicts += predicted.tolist()
            
    # if filename, then output the predicted labels
    if filename:
        # before writing files, put onto cpu
        pred = np.array(y_predicts)
        columns = np.arange(len(pred))
        # write file
        df = pd.DataFrame({"id" :columns, "label" : pred})
        df.to_csv(filename, index=False)
    return df

In [25]:
# Store the result...
predicted = predict(model1, test_dataset, filename="submission1.csv")