Model: 
- A CNN layer of 256 units with kernel size = 3 is used to extract features, followed by batch norm and hardtan.  
- 5 stacked BiGRU layers, each of 256 units.
- This is followed by one linear layer of 47 (num_classes) units.

Training: 
- Adam optimizer with default learning rate of 1e-3, weight_decay of 5e-5 and ReduceLROnPlateau scheduler.
- Trained for 24 epoches. 

Loss Function:
-  CTCLoss

Inference:
- Decoding with beam search and a beam width of 30.



In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
DATA_PATH = "/content/drive/My Drive/DL HW3/data/"
MODEL_PATH = "/content/drive/My Drive/DL HW3/model/"
num_workers = 4
batch_size = 64

In [0]:
# get ctcdecode
!git clone --recursive https://github.com/parlance/ctcdecode.git
!pip install wget
import os
os.chdir('/content')
os.chdir('ctcdecode')
!ls
!pip install .
os.chdir('/content')
!pip install editdistance

fatal: destination path 'ctcdecode' already exists and is not an empty directory.
build.py   LICENSE    requirements.txt	tests
ctcdecode  README.md  setup.py		third_party
Processing /content/ctcdecode
Building wheels for collected packages: ctcdecode
  Building wheel for ctcdecode (setup.py) ... [?25l[?25hdone
  Created wheel for ctcdecode: filename=ctcdecode-0.4-cp36-cp36m-linux_x86_64.whl size=12178337 sha256=47dc4c6fa833be5a447ba02693fec69f0facfcadda9b42109cb0f886045eece7
  Stored in directory: /tmp/pip-ephem-wheel-cache-_74yh0wn/wheels/c3/6c/94/7d57d4f20a87a22ef1722eaad22052b4c435892b55400e5f4e
Successfully built ctcdecode
Installing collected packages: ctcdecode
  Found existing installation: ctcdecode 0.4
    Uninstalling ctcdecode-0.4:
      Successfully uninstalled ctcdecode-0.4
Successfully installed ctcdecode-0.4


In [0]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook as tqdm
from torch import nn
from torch.nn.utils.rnn import *
import os
from torch.autograd import Variable
import torch.nn.functional as F
import time
from ctcdecode import CTCBeamDecoder
import sys
sys.path.append(DATA_PATH)
from phoneme_list import *
import editdistance

In [0]:
device = torch.device('cuda:0')
# set SEED
os.environ["SEED"] = "999"
torch.manual_seed(999)
np.random.seed(0)

In [0]:
!nvidia-smi

Sat Apr  4 23:19:07 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
def load_data(x_path,y_path=""):
    x = np.load(x_path, allow_pickle=True)
    print ("X:")
    print ("Number of utterances " + str(x.shape[0]))
    print ("Number of dimentions " + str(x[0].shape[1]))
    print ("Avg length of utterances " + str(np.mean([i.shape[0] for i in x])))
    if y_path:
        print ("Y:")
        y = np.load(y_path, allow_pickle=True)
        print ("Avg length of phonemes " + str(np.mean([i.shape[0] for i in y])))
        return x, y
    return x

In [0]:
dev_x, dev_y = load_data(DATA_PATH+"wsj0_dev.npy",y_path=DATA_PATH+"wsj0_dev_merged_labels.npy")

X:
Number of utterances 1106
Number of dimentions 40
Avg length of utterances 651.8164556962025
Y:
Avg length of phonemes 78.09493670886076


In [0]:
train_x, train_y = load_data(DATA_PATH+"wsj0_train.npy",y_path=DATA_PATH+"wsj0_train_merged_labels.npy")

X:
Number of utterances 24724
Number of dimentions 40
Avg length of utterances 651.6843552823168
Y:
Avg length of phonemes 77.74033327940462


In [0]:
print(train_x[0].shape)
print(train_y[0].shape)

(548, 40)
(57,)


In [0]:
class MyDataset(Dataset):
    def __init__ (self, x, y=None):
        self.x = x
        if y != None:
            self.y = y + 1
        else:
            self.y = None

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self, index):
        if self.y != None:
            return torch.from_numpy(self.x[index]), torch.from_numpy(self.y[index])
        else:
            return torch.from_numpy(self.x[index])


In [0]:
def collate_pad(batch):
    if len(batch[0]) == 2:
        x, y = zip(*batch)
    else:
        x = batch
        y = None
    x_len = torch.LongTensor([i.shape[0] for i in x])
    x = pad_sequence(x)
    if y != None:
        y_len = torch.LongTensor([i.shape[0] for i in y])
        y = pad_sequence(y, batch_first=True)
        return x, x_len, y, y_len
    return x, x_len

In [0]:
train_dataloader = DataLoader(MyDataset(train_x, train_y), 
                              shuffle=True, 
                              batch_size=batch_size, 
                              collate_fn = collate_pad,
                              pin_memory=True)
dev_dataloader = DataLoader(MyDataset(dev_x, dev_y), 
                            shuffle=False, 
                            batch_size=batch_size, 
                            collate_fn = collate_pad,
                            pin_memory=True)


  after removing the cwd from sys.path.


In [0]:
for x, _, _, _ in train_dataloader:
    print (x.shape)
    break

torch.Size([1505, 64, 40])


  del sys.path[0]


In [0]:
class Model(nn.Module):
    def __init__(self, in_dim, out_vocab):
        super(Model, self).__init__()
        self.cnn = torch.nn.Sequential(
            nn.BatchNorm1d(in_dim), 
            nn.Conv1d(in_dim, 256, 3, padding=1),
            nn.BatchNorm1d(256),
            nn.Hardtanh(inplace=True)
          )
        
        self.rnn = nn.GRU(256, 256, 5, bidirectional=True)
        self.output = nn.Sequential(
            nn.Linear(512, out_vocab)
        )
    
    def forward(self, X, lengths):
        X = self.cnn(X.permute(1, 2, 0).contiguous()).permute(2, 0, 1)
        packed_X = pack_padded_sequence(X, lengths, enforce_sorted=False)
        packed_out = self.rnn(packed_X)[0]
        out, out_lens = pad_packed_sequence(packed_out)
        # Log softmax after output layer is required since`nn.CTCLoss` expects log probabilities.
        out = self.output(out).log_softmax(2)
        return out, out_lens

def init_model(m):
    if isinstance(m, nn.Linear):
        nn.init.kaiming_normal_(m.weight.data)
        nn.init.normal_(m.bias.data)
    if isinstance(m, nn.LSTMCell) or isinstance(m, nn.GRUCell):
        for name, param in m.named_parameters():
            if 'weight' in name:
                nn.init.orthogonal_(param.data)
            if 'bias' in name:
                nn.init.normal_(param.data)

ERROR! Session/line number was not unique in database. History logging moved to new session 62


In [0]:
model = Model(train_x[0].shape[1],len(PHONEME_MAP)+1)
model.apply(init_model)
model.cuda()
print (model)

Model(
  (cnn): Sequential(
    (0): BatchNorm1d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Conv1d(40, 256, kernel_size=(3,), stride=(1,), padding=(1,))
    (2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Hardtanh(min_val=-1.0, max_val=1.0, inplace=True)
  )
  (rnn): GRU(256, 256, num_layers=5, bidirectional=True)
  (output): Sequential(
    (0): Linear(in_features=512, out_features=47, bias=True)
  )
)


In [0]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=1, threshold=0.01, verbose=True)
loss_fn = nn.CTCLoss()

In [0]:
def predict(out, out_len, beam_width = 30):
    decoder = CTCBeamDecoder([" "] + PHONEME_MAP, beam_width=beam_width, log_probs_input=True)
    out = out.to("cpu")
    out_len = out_len.to("cpu")
    y, _, _, y_len = decoder.decode(out.transpose(0, 1), out_len)
    result = []
    for i in range(y.shape[0]):
        best_seq = y[i, 0, :y_len[i, 0]]
        best_pron = ''.join((['']+PHONEME_MAP)[i] for i in best_seq)
        result.append(best_pron)
    #print (best_pron)
    return result

In [0]:
def idx_to_phoneme(y, y_len):
    y = y.to("cpu")
    result = []
    for i in range(y.shape[0]):
        seq = y[i, :y_len[i]]
        pron = ''.join((['']+PHONEME_MAP)[i] for i in seq)
        result.append(pron)
    #print (result[0])
    return result

In [0]:
def get_distance(predict,real):
    distance = 0
    total = len(predict)
    for i in range(total):
        distance += editdistance.eval(predict[i], real[i])
    return distance/total


In [0]:
mean_train_losses = []
mean_valid_losses = []
mean_valid_distance = []
epochs = 24
best_model = None
for epoch in range(epochs):
    model.train()
    train_losses = []
    for x_batch, x_len, y_batch, y_len in tqdm(train_dataloader):
        optimizer.zero_grad()
        x_batch = x_batch.cuda()
        y_batch = y_batch.cuda()
        out, out_len = model(x_batch, x_len)
        loss = loss_fn(out, y_batch, out_len, y_len)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
    
    model.eval()
    valid_losses = []
    valid_distance = []
    with torch.no_grad():
        for x_batch, x_len, y_batch, y_len in tqdm(dev_dataloader):
            x_batch = x_batch.cuda()
            y_batch = y_batch.cuda()
            out, out_len = model(x_batch, x_len)
            loss = loss_fn(out, y_batch, out_len, y_len)
            valid_losses.append(loss.item())
            y_predict_ph = predict(out, out_len)
            y_real_ph = idx_to_phoneme(y_batch, y_len)
            valid_distance.append(get_distance(y_predict_ph,y_real_ph))
    
    mean_train_losses.append(np.mean(train_losses))
    mean_valid_losses.append(np.mean(valid_losses))
    scheduler.step(np.mean(valid_losses))
    if (best_model is None) or (np.mean(valid_distance) < min(mean_valid_distance)):
        best_model = model

    mean_valid_distance.append(np.mean(valid_distance))
    print('epoch {}: train loss : {:.4f}, valid loss : {:.4f}, valid distance : {:.2f}'\
         .format(epoch+1, np.mean(train_losses), np.mean(valid_losses), np.mean(valid_distance)))
torch.save(best_model.state_dict(), MODEL_PATH+'best_%d.pt'%int(time.time()))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))

  del sys.path[0]





Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 1: train loss : 3.5128, valid loss : 1.9302, valid distance : 48.81


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 2: train loss : 1.2535, valid loss : 0.9361, valid distance : 20.74


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 3: train loss : 0.8030, valid loss : 0.7211, valid distance : 16.29


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 4: train loss : 0.6545, valid loss : 0.6318, valid distance : 14.24


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 5: train loss : 0.5759, valid loss : 0.5776, valid distance : 13.29


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 6: train loss : 0.5277, valid loss : 0.5509, valid distance : 12.57


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 7: train loss : 0.4910, valid loss : 0.5206, valid distance : 11.93


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 8: train loss : 0.4625, valid loss : 0.5069, valid distance : 11.48


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 9: train loss : 0.4376, valid loss : 0.5146, valid distance : 11.68


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 10: train loss : 0.4273, valid loss : 0.4823, valid distance : 11.06


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 11: train loss : 0.4245, valid loss : 0.4693, valid distance : 10.59


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 12: train loss : 0.3969, valid loss : 0.4530, valid distance : 10.36


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 13: train loss : 0.3839, valid loss : 0.4340, valid distance : 9.87


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 14: train loss : 0.3725, valid loss : 0.4406, valid distance : 9.99


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


Epoch    15: reducing learning rate of group 0 to 1.0000e-04.
epoch 15: train loss : 0.3666, valid loss : 0.4604, valid distance : 10.51


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 16: train loss : 0.2729, valid loss : 0.3463, valid distance : 7.97


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 17: train loss : 0.2362, valid loss : 0.3359, valid distance : 7.71


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 18: train loss : 0.2228, valid loss : 0.3361, valid distance : 7.58


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 19: train loss : 0.2127, valid loss : 0.3318, valid distance : 7.53


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 20: train loss : 0.2042, valid loss : 0.3334, valid distance : 7.45


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


Epoch    21: reducing learning rate of group 0 to 1.0000e-05.
epoch 21: train loss : 0.1975, valid loss : 0.3327, valid distance : 7.46


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 22: train loss : 0.1838, valid loss : 0.3284, valid distance : 7.39


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 23: train loss : 0.1811, valid loss : 0.3279, valid distance : 7.35


HBox(children=(IntProgress(value=0, max=387), HTML(value='')))




HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


epoch 24: train loss : 0.1796, valid loss : 0.3277, valid distance : 7.34


In [0]:
final_model = best_model

In [0]:
test_x = load_data(DATA_PATH+"wsj0_test.npy")
test_dataloader = DataLoader(MyDataset(test_x), 
                            shuffle=False, 
                            batch_size=batch_size, 
                            collate_fn = collate_pad,
                            num_workers=num_workers)

X:
Number of utterances 523
Number of dimentions 40
Avg length of utterances 658.9560229445507


In [0]:
final_model.eval()
test_preds = []

with torch.no_grad():
    for x_batch, x_len in tqdm(test_dataloader):
        x_batch = x_batch.cuda()
        out, out_len = final_model(x_batch, x_len)
        pred = predict(out, out_len)
        test_preds.extend(pred)

out_df = pd.DataFrame()
out_df['Id'] = np.arange(0, len(test_x))
out_df['Predicted'] = test_preds
out_df.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




Unnamed: 0,Id,Predicted
0,0,.!inshvhvHalIWhzbestelhn+HAdbinekstyldhRinpRiz...
1,1,..DhsOvIetskamhmInWylkamh_HAvbighnWiTgRiGtRUts...
2,2,.pRyvhtfRIskUlzWrfoRmd_bOTinpuRnEbrHudzAndinmi...
3,3,.gREnzindsYbIniskOlhn_DhTRethvREnhndfEvrhvhlsp...
4,4,..DI?UnytidstEts.hndrtuk.dUdifAndWestrn?uRhp.h...


In [0]:
SUBMISSION_PATH = "/content/drive/My Drive/DL HW3/submission/"
file_name = SUBMISSION_PATH+"submission_%d.csv"%int(time.time())
out_df.to_csv(file_name,index=False)