# Predicting speech tags 

Refer to `Frame Level Classification of Speech.doc` for the problem description.

Why is it a bad idea to just pad the phonemes?

The sequence lengths vary widely, from 50 frames to about 1000 frames.

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os

In [12]:
# Base directory of the project: the kernel's current working directory
# (not the user's home). The data folder, model checkpoints and the
# training history file are all resolved relative to it.
PATH = os.getcwd()

In [3]:
# Report whether PyTorch can see a CUDA-capable GPU in this workspace.
import torch
cuda = torch.cuda.is_available()
print(cuda)

False


In [4]:
# Folder (under PATH) that holds the .npy dataset files.
data_folder = os.path.join(PATH, 'data')

In [5]:
def _load_split(file_name):
    """Load one .npy file from `data_folder`.

    allow_pickle=True lets NumPy unpickle object arrays (presumably needed
    because utterances differ in length -- TODO confirm). Unpickling can run
    arbitrary code, so only load files from a trusted source.
    """
    return np.load(os.path.join(data_folder, file_name), allow_pickle=True)


# Training split: features and labels.
train = _load_split('train.npy')
train_labels = _load_split('train_labels.npy')

# Development (validation) split: features and labels.
dev = _load_split('dev.npy')
dev_labels = _load_split('dev_labels.npy')

In [6]:
train.shape

(24500,)

In [14]:
# # Using subset of complete data when modifying

# dev = dev[:20]
# dev_labels = dev_labels[:20]
# train = train[:10]
# train_labels = train_labels[:10]

In [8]:
from torch.utils.data import DataLoader, Dataset, TensorDataset

class TensorDataset(Dataset):
    """Pairs each entry of `x` with the entry of `y` at the same index.

    Items are converted to float tensors lazily, on access.

    NOTE(review): this class shadows `torch.utils.data.TensorDataset`, which
    is imported in the same cell; the name is kept because later cells use it.
    """

    def __init__(self, x, y):
        super().__init__()
        # Every sample needs a matching label entry.
        assert len(x) == len(y)
        self._x, self._y = x, y

    def __len__(self):
        """Number of (x, y) pairs."""
        return len(self._x)

    def __getitem__(self, index):
        """Return the pair at `index` as two `torch.FloatTensor`s."""
        features = torch.FloatTensor(self._x[index])
        labels = torch.FloatTensor(self._y[index])
        return features, labels

In [9]:
# One utterance per batch: the notebook notes that sequence lengths vary
# widely, and the default collate function cannot stack tensors of
# different lengths.
#
# Pinned host memory only helps host-to-GPU copies, so it is enabled only
# when CUDA is available (avoids a pointless request and a warning on
# CPU-only machines) and applied to both loaders consistently.
train_dataset = TensorDataset(train, train_labels)

load_train = DataLoader(
    train_dataset,
    batch_size = 1,
    shuffle=False,   # NOTE(review): training order is fixed; consider shuffle=True
    pin_memory=torch.cuda.is_available()
)

dev_dataset = TensorDataset(dev, dev_labels)

load_valid = DataLoader(
    dev_dataset,
    batch_size = 1,
    pin_memory=torch.cuda.is_available()
)

In [6]:
import torch

# Device used for the model and every batch: the GPU when CUDA is
# available, otherwise the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
DEVICE

device(type='cpu')

## Notes:
- 3 layers: loss plateaus at 2–2.4
- 5 layers: starts over-fitting

The hyperparameters below are the recommended values.
The number of layers was determined by hyperparameter optimization.

In [8]:
embedding_dim = 40   # size of each input frame vector fed to the LSTM
hidden_dim = 10      # LSTM hidden size per direction
vocab_size = 138 # [0-137]
layers=4             # number of stacked LSTM layers

def hidden_init(batch_size=1):
    """Return a random initial `(h_0, c_0)` pair for the bidirectional LSTM.

    Args:
        batch_size: number of sequences in the batch. Defaults to 1, which
            matches the loaders used in this notebook.

    Returns:
        Tuple of two tensors, each of shape
        `(layers * 2, batch_size, hidden_dim)`, with values drawn uniformly
        from [0, 1) and allocated directly on `DEVICE`. The factor 2 accounts
        for the two directions of the bidirectional LSTM.
    """
    shape = (layers * 2, batch_size, hidden_dim)
    # Allocate on the target device directly instead of building on the CPU
    # and copying.
    return (torch.rand(shape, device=DEVICE),
            torch.rand(shape, device=DEVICE))

hidden_init()

(tensor([[[0.2261, 0.0635, 0.4993, 0.8936, 0.1026, 0.4744, 0.9558, 0.9520,
           0.9811, 0.8996]],
 
         [[0.6332, 0.8928, 0.3691, 0.7337, 0.3118, 0.7843, 0.3708, 0.8421,
           0.6584, 0.3892]],
 
         [[0.1393, 0.9138, 0.9894, 0.3627, 0.1949, 0.1461, 0.8501, 0.8128,
           0.1747, 0.1738]],
 
         [[0.6443, 0.4553, 0.3866, 0.0910, 0.4398, 0.9565, 0.2513, 0.8180,
           0.1458, 0.6562]],
 
         [[0.4320, 0.3839, 0.3742, 0.1623, 0.1179, 0.5242, 0.2380, 0.8052,
           0.2964, 0.8655]],
 
         [[0.9025, 0.7097, 0.8623, 0.2287, 0.2640, 0.7032, 0.6808, 0.3681,
           0.8246, 0.3216]],
 
         [[0.2206, 0.1264, 0.4175, 0.6466, 0.4449, 0.1084, 0.9714, 0.8738,
           0.7677, 0.4911]],
 
         [[0.4271, 0.3124, 0.3283, 0.8620, 0.8813, 0.5015, 0.6065, 0.1876,
           0.8247, 0.9766]]]),
 tensor([[[0.2657, 0.9852, 0.5736, 0.5801, 0.6973, 0.6188, 0.4101, 0.1752,
           0.9324, 0.9191]],
 
         [[0.1456, 0.2634, 0.4260, 0.6772, 0.1

In [9]:
class LSTM_model(torch.nn.Module):
    """Bidirectional multi-layer LSTM that scores every frame of a sequence.

    Args:
        vocab_size: number of output classes (size of the final linear layer).
        embedding_dim: size of each input frame vector.
        hidden_dim: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers. When omitted, the
            notebook-level `layers` value is used, as before.

    The module is built on the default device; move it with `.to(device)`.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=None):
        super(LSTM_model, self).__init__()
        # Honour the argument instead of a hard-coded 138.
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = layers if num_layers is None else num_layers

        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, num_layers=self.num_layers,
                                  dropout=0.2, bidirectional=True)
        self.linear = torch.nn.Linear(hidden_dim*2, vocab_size)       # *2 applied if bidir = true
        # Kept for backward compatibility; not used by forward().
        self.softmax = torch.nn.functional.softmax

    def forward(self, encrypted):
        """Score each frame of each sequence.

        Args:
            encrypted: tensor of shape `(batch, frames, embedding_dim)`.

        Returns:
            Unnormalised class scores of shape `(frames, vocab_size, batch)`,
            i.e. with the class axis in position 1.
        """
        # nn.LSTM defaults to (frames, batch, features).
        lstm_in = encrypted.transpose(0, 1).float()
        batch_size = lstm_in.size(1)

        # Random uniform initial (h_0, c_0), sized for the actual batch and
        # created on the input's device.
        # NOTE(review): the state is redrawn on every call, so outputs are not
        # deterministic even in eval mode -- confirm this is intended.
        state_shape = (self.num_layers * 2, batch_size, self.hidden_dim)
        initial_state = (torch.rand(state_shape, device=lstm_in.device),
                         torch.rand(state_shape, device=lstm_in.device))

        lstm_out, _ = self.lstm(lstm_in, initial_state)

        scores = self.linear(lstm_out)
        scores = scores.transpose(1, 2)

        return scores

model = LSTM_model(vocab_size, embedding_dim, hidden_dim)

In [10]:
model = model.to(DEVICE)

In [0]:
# Restore the weights saved after epoch 5. The path is built with
# os.path.join, as it is when checkpoints are saved (plain concatenation
# dropped the separator between PATH and the file name). map_location lets a
# checkpoint saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load(os.path.join(PATH, 'model_5.sav'), map_location=DEVICE))

<All keys matched successfully>

In [None]:
# Number of training samples between validation passes (about 20 per epoch).
# Integer division keeps the `i % validation_time == 0` check exact for any
# dataset size, and the floor of 1 avoids a zero or fractional interval when
# training on a small subset.
validation_time = max(1, len(train) // 20)

In [30]:
losses = []

class LSTM_Trainer():
    """Trains a model with Adam and cross-entropy loss.

    `train` reads the notebook-level `load_train`, `load_valid`,
    `validation_time` and `PATH`: it logs validation loss to
    `PATH/history.csv` and saves a checkpoint after every epoch.
    """

    def __init__(self, model):
        self.model = model
        self.loss_fn = torch.nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    def get_loss(self, encrypted, original):
        """Return the cross-entropy loss of the model on one batch.

        Args:
            encrypted: input features, `(batch, frames, features)`.
            original: target class indices, `(batch, frames)`.

        Returns:
            Scalar loss tensor.
        """
        # Use whatever device the model currently lives on.
        device = next(self.model.parameters()).device

        # Inputs stay floating point: casting them to long (as before)
        # truncated every feature value before it reached the model.
        encrypted = encrypted.to(device).float()
        # Targets are class indices; transpose to (frames, batch) to line up
        # with the model's (frames, classes, batch) scores.
        original = original.to(device).long().transpose(0, 1)

        scores = self.model(encrypted)
        return self.loss_fn(scores, original)

    def _validation_loss(self):
        """Average loss over `load_valid`, without dropout or gradient tracking."""
        was_training = self.model.training
        self.model.eval()
        batch_losses = []
        with torch.no_grad():
            for val_encrypted, val_original in load_valid:
                batch_losses.append(self.get_loss(val_encrypted, val_original).item())
        # Restore whichever mode the model was in.
        self.model.train(was_training)
        if not batch_losses:
            return float('nan')
        return sum(batch_losses) / len(batch_losses)

    def train(self, num_epochs):
        """Train for `num_epochs` epochs.

        Every `validation_time` training steps the validation loss is
        computed, printed and appended to `history.csv` as
        `epoch,step,train_loss,validation_loss`. A checkpoint
        `model_<epoch+1>.sav` is written under `PATH` after each epoch.
        """
        # Guard against a fractional or zero interval.
        interval = max(1, int(validation_time))

        with open(os.path.join(PATH, 'history.csv'), 'w') as writer:
            for N in range(num_epochs):
                print('Epoch: {}'.format(N))
                self.model.train()
                loss = None
                for i, (encrypted, original) in enumerate(load_train):
                    self.optimizer.zero_grad()

                    loss = self.get_loss(encrypted, original)  # <- Training loss
                    loss.backward()

                    self.optimizer.step()

                    # Periodic validation
                    if i % interval == 0:
                        print('Validation:' + str(i))
                        avg_loss = self._validation_loss()
                        print('Training Loss: {:6.4f}'.format(loss.item()))
                        print('Validation Loss: {:6.4f}'.format(avg_loss))
                        writer.write('{},{},{},{}\n'.format(N, i, loss.item(), avg_loss))
                        # Keep the log usable if training is interrupted.
                        writer.flush()

                # Saving the model after an epoch
                model_saved = os.path.join(PATH, 'model_' + str(N+1) + '.sav')
                torch.save(self.model.state_dict(), model_saved)

                # `loss` is None only when the training loader was empty.
                if loss is not None:
                    print('Train Loss at end of epoch: {:6.4f}'.format(loss.item()))

In [31]:
trainer = LSTM_Trainer(model)

Working on GPU

In [None]:
n_epochs = 15

In [None]:
# Below call starts training models for required epochs

trainer.train(n_epochs)

The cell above trains `model`, which is then used below to generate predictions on the test set.

In [None]:
from torch.utils.data import DataLoader, Dataset, TensorDataset

class TestDataset(Dataset):
    """Dataset of unlabeled samples, each returned as a float tensor.

    Args:
        x: indexable collection of samples.
        y: optional and unused. It is accepted only for backward
            compatibility; when given, its length must match `x`.
    """

    def __init__(self, x, y=None):
        super().__init__()
        # Test data has no labels, so `y` must not be required.
        if y is not None:
            assert len(x) == len(y)
        self._x = x

    def __len__(self):
        """Number of samples."""
        return len(self._x)

    def __getitem__(self, index):
        """Return the sample at `index` as a `torch.FloatTensor`."""
        x_item = self._x[index]
        return torch.FloatTensor(x_item)

In [None]:
# Load the unlabeled test set. allow_pickle=True unpickles object arrays and
# can run arbitrary code, so only load files from a trusted source.
test = np.load(os.path.join(data_folder, 'test.npy'), allow_pickle=True)

test_dataset = TestDataset(test)

# Wrap the *test* dataset (the loader previously wrapped train_dataset, so
# "test" predictions were computed on training data). Order is preserved so
# output ids line up with the frames in test.npy; memory is pinned only when
# a GPU is available.
load_test = DataLoader(
    test_dataset,
    batch_size = 1,
    shuffle=False,
    pin_memory=torch.cuda.is_available()
)

In [18]:
# Write one "id,label" row per test frame to hw1_submission.csv.
#
# The predicted label is the argmax over the class axis. Softmax is
# monotonic, so it is not needed to pick the best class.
was_training = model.training
model.eval()   # disable dropout while predicting

# Open for writing (the default mode is read-only) and skip autograd.
with open('hw1_submission.csv', 'w') as output, torch.no_grad():
    output.write('id,label\n')
    output_id = 0
    for encrypted in load_test:
        encrypted = encrypted.to(DEVICE)
        # The model's forward transposes its scores so that the class axis
        # is dim 1: (frames, classes, batch), with batch == 1 for this loader.
        scores = model(encrypted)

        # Best class for every frame of the utterance, flattened to 1-D.
        predictions = scores.argmax(dim=1).reshape(-1)
        for prediction in predictions.tolist():
            output.write('{},{}\n'.format(output_id, prediction))
            output_id += 1

# Leave the model in the mode it was in before predicting.
model.train(was_training)
        

KeyboardInterrupt: ignored