<a href="https://colab.research.google.com/github/hai105178362/colab_seq2seq/blob/master/cmu_11785_seq2seq_without_pad.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Prints a fixed greeting; nothing else in the notebook depends on this cell.
print("hello")

hello


In [0]:
#from google.colab import drive
#drive.mount('/content/drive')

In [0]:
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import *

  

import torch.nn as nn
import torch.nn.utils as utils


from torch import Tensor
from torch.distributions.categorical import Categorical
from torch.nn.utils.rnn import pad_sequence

import time

import torch.optim as optim
from torch.utils.data import DataLoader
import pandas as pd

In [0]:
'''
Loading all the numpy files containing the utterance information and text information
'''


def load_data(data_dir='/content/drive/My Drive/Colab Notebooks/Data Sets/11 785/hw4'):
    """
    Load the utterance and transcript arrays from ``data_dir``.

    :param data_dir: directory holding the five ``.npy`` files; defaults to the
                     Google Drive location used by this notebook
    :return: (speech_train, speech_valid, speech_test, transcript_train, transcript_valid)
    """
    import os

    def _load(filename):
        # NOTE: allow_pickle=True executes pickled payloads while loading;
        # only point this at files from a trusted source.
        return np.load(os.path.join(data_dir, filename), allow_pickle=True, encoding='bytes')

    speech_train = _load('train_new.npy')
    speech_valid = _load('dev_new.npy')
    speech_test = _load('test_new.npy')

    transcript_train = _load('train_transcripts.npy')
    transcript_valid = _load('dev_transcripts.npy')

    return speech_train, speech_valid, speech_test, transcript_train, transcript_valid


'''
Transforms alphabetical input to numerical input, replace each letter by its corresponding 
index from letter2index
'''


def transform_letter_to_index(transcript, letter2index):
    """
    Convert every transcript sentence into an array of vocabulary indices.

    Each sentence becomes ``<sos>``, the characters of its words separated by
    single spaces, then ``<eos>``.

    :param transcript :(N, ) sentences, each an iterable of words (bytes or str)
    :param letter2index: letter2index dict
    :return letter_to_index_list: (N, ) object array; element i is the int64
                                  index array of sentence i
    """
    sos_index = letter2index["<sos>"]
    eos_index = letter2index["<eos>"]

    # Sentences have different lengths, so the rows are stored in an explicit
    # object array: recent NumPy versions refuse to build a ragged array
    # implicitly, and equal-length rows must not silently collapse into 2-D.
    full_res = np.empty(len(transcript), dtype=object)

    for row, cur_sentence in enumerate(transcript):
        words = [w.decode("utf-8") if isinstance(w, bytes) else str(w) for w in cur_sentence]
        # Joining puts spaces only between words, so an empty sentence still
        # yields a well-formed [<sos>, <eos>] pair.
        letters = " ".join(words)
        cur_res = [sos_index] + [letter2index[c] for c in letters] + [eos_index]
        full_res[row] = np.array(cur_res, dtype=np.int64)

    return full_res


def transform_index_to_letter(index_arr, letter_list):
    """
    Turn a sequence of vocabulary indices back into text.

    The first and the last element of ``index_arr`` are dropped before the
    lookup (the callers in this file place ``<sos>`` first and normally
    ``<eos>`` last).

    :param index_arr :(N, ) sequence of indices
    :param letter_list: lookup from index to letter
    :return transcript: the decoded string
    """
    inner_indices = index_arr[1:-1]

    letters = []
    for idx in inner_indices:
        letters.append(letter_list[idx])

    return "".join(letters)


'''
Optional, create dictionaries for letter2index and index2letter transformations
'''


def create_dictionaries(letter_list):
    """
    Build both lookup directions for a vocabulary.

    :param letter_list: vocabulary symbols; a symbol's position is its index
    :return: (letter2index, index2letter) dictionaries
    """
    letter2index = {}
    index2letter = {}
    for position, letter in enumerate(letter_list):
        letter2index[letter] = position
        index2letter[position] = letter
    return letter2index, index2letter


class Speech2TextDataset(Dataset):
    """
    Dataset pairing utterances with (optional) index-encoded transcripts.

    With ``is_train=True`` an item is
    ``(speech, speech_len, text, text_len - 1)``; otherwise it is
    ``(speech, speech_len)``.
    """
    def __init__(self, speech, text=None, is_train=True):
        """
        :param speech: sequence of per-utterance feature arrays
        :param text: sequence of per-utterance index arrays, or None
        :param is_train: whether ``__getitem__`` also returns the transcript
        """
        self.speech = speech
        self.speech_len = [len(x) for x in speech]

        self.is_train = is_train
        # Always define the text attributes so a missing transcript is
        # reported explicitly instead of surfacing as an AttributeError.
        self.text = text
        self.text_len = None if text is None else [len(x) for x in text]

    def __len__(self):
        # len() works for both numpy arrays and plain Python sequences.
        return len(self.speech)

    def __getitem__(self, index):
        features = torch.tensor(np.asarray(self.speech[index], dtype=np.float32))

        if not self.is_train:
            return features, self.speech_len[index]

        if self.text is None:
            raise ValueError("Speech2TextDataset was created with is_train=True but without text")

        # The reported length is one less than the stored transcript length:
        # the training loop predicts token i + 1 from position i, so there is
        # one target fewer than there are tokens.
        return features, self.speech_len[index], \
               torch.tensor(self.text[index]), self.text_len[index] - 1


def collate_train(batch_data):
    """
    Collate (speech, speech_len, text, text_len) samples into one batch.

    Speech is padded time-major, text is padded batch-major, and both length
    collections are returned as int64 tensors.
    """
    speech_seqs, speech_lens, text_seqs, text_lens = zip(*batch_data)

    padded_speech = pad_sequence(speech_seqs)
    padded_text = pad_sequence(text_seqs, batch_first=True)

    speech_len_tensor = torch.tensor(speech_lens, dtype=torch.int64)
    text_len_tensor = torch.tensor(text_lens, dtype=torch.int64)

    return padded_speech, speech_len_tensor, padded_text, text_len_tensor


def collate_test(batch_data):
    """
    Collate (speech, speech_len) samples into one batch.

    Speech is padded time-major and the lengths are returned as an int64 tensor.
    """
    speech_seqs, speech_lens = zip(*batch_data)

    padded_speech = pad_sequence(speech_seqs)
    speech_len_tensor = torch.tensor(speech_lens, dtype=torch.int64)

    return padded_speech, speech_len_tensor

In [0]:
# Device used by the training/evaluation code below: GPU when one is available, CPU otherwise.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Vocabulary indices of the start/end-of-sentence tokens; they match the
# positions of '<sos>' and '<eos>' in LETTER_LIST.
SOS_INDEX = 33
EOS_INDEX = 34


class Attention(nn.Module):
    """
    Dot-product attention over the encoder outputs:
        energy = bmm(key, query)
        attention = softmax(energy)   (padded time steps masked out)
        context = bmm(attention, value)
    """
    def __init__(self):
        super(Attention, self).__init__()

    def forward(self, query, key, value, lens):
        """
        :param query :(N, key_size) decoder state used as the query
        :param key: (T, N, key_size) key projection of the encoder outputs
        :param value: (T, N, value_size) value projection of the encoder outputs
        :param lens: (N) number of valid time steps per batch element
        :return context: (N, value_size) attention-weighted sum of the values
        :return mask: (N, T) boolean padding mask, True where a time step is
                      padding (note: this is the mask, not the attention weights)
        """
        key = torch.transpose(key, 0, 1)      # (N, T, key_size)
        value = torch.transpose(value, 0, 1)  # (N, T, value_size)

        energy = torch.bmm(key, query.unsqueeze(2)).squeeze(2)  # (N, T)

        # Build the mask on the device of the tensors it is applied to rather
        # than on a module-level device, so the layer works wherever its
        # inputs live.
        lens = torch.as_tensor(lens, device=energy.device)
        steps = torch.arange(energy.shape[1], device=energy.device)
        mask = steps.unsqueeze(0) >= lens.unsqueeze(1)  # (N, T)

        # A large negative energy drives the softmax weight of padding to zero.
        energy = energy.masked_fill(mask, -1e9)
        attention = torch.softmax(energy, dim=1)  # (N, T)

        context = torch.bmm(attention.unsqueeze(1), value).squeeze(1)  # (N, value_size)

        return context, mask


class pBLSTM(nn.Module):
    """
    Pyramidal BiLSTM.

    Three stacked stages; each stage merges every two consecutive time steps
    into one (halving the time resolution and doubling the feature size) and
    then runs a bidirectional LSTM. Utterances can be hundreds to thousands of
    frames long, and the paper reports that a plain LSTM encoder converges
    slowly and gives inferior results because the attention has to pick
    information out of too many input steps.
    """
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        # Stage 1 sees pairs of input frames; stages 2 and 3 see pairs of
        # bidirectional outputs (2 * hidden_dim each, hence hidden_dim * 4).
        self.lstm_1 = nn.LSTM(input_size=input_dim * 2, hidden_size=hidden_dim, num_layers=1, bidirectional=True)
        self.lstm_2 = nn.LSTM(input_size=hidden_dim * 4, hidden_size=hidden_dim, num_layers=1, bidirectional=True)
        self.lstm_3 = nn.LSTM(input_size=hidden_dim * 4, hidden_size=hidden_dim, num_layers=1, bidirectional=True)

        self.lstm_layers = nn.ModuleList([self.lstm_1, self.lstm_2, self.lstm_3])

    @staticmethod
    def _halve_time(packed):
        """Unpack, merge each pair of adjacent time steps, and re-pack."""
        padded, lengths = utils.rnn.pad_packed_sequence(packed, batch_first=False)

        batch_major = torch.transpose(padded, 0, 1)
        # Drop a trailing odd frame so the time axis splits evenly into pairs.
        even_steps = batch_major.shape[1] // 2 * 2
        batch_major = batch_major[:, :even_steps, :]

        n_batch, n_steps, n_feat = batch_major.shape
        merged = batch_major.reshape((n_batch, n_steps // 2, n_feat * 2))
        time_major = torch.transpose(merged, 0, 1)

        return utils.rnn.pack_padded_sequence(time_major, lengths=lengths // 2, batch_first=False,
                                              enforce_sorted=False)

    def forward(self, x):
        """
        :param x: PackedSequence of time-major inputs
        :return output: PackedSequence produced by the last stage, with the
                        time axis reduced by a factor of 8 overall
        """
        for stage in self.lstm_layers:
            x, _ = stage(self._halve_time(x))

        return x


class Encoder(nn.Module):
    """
    Encoder takes the utterances as inputs and returns the key and value.
    Key and value are linear projections of the output of a bidirectional
    LSTM followed by the pBLSTM network.
    """
    def __init__(self, input_dim, hidden_dim, value_size, key_size):
        super(Encoder, self).__init__()
        self.base_lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=1, bidirectional=True)
        self.pblstm = pBLSTM(hidden_dim * 2, hidden_dim)

        self.key_network = nn.Linear(hidden_dim * 2, key_size)
        self.value_network = nn.Linear(hidden_dim * 2, value_size)

    def forward(self, x, lens):
        """
        :param x: (T, N, input_dim) padded, time-major utterances
        :param lens: (N) utterance lengths (tensor on any device, or a list)
        :return keys: (T', N, key_size) key projection
        :return value: (T', N, value_size) value projection
        :return lens_out: (N) valid lengths of the encoded sequences
        """
        # pack_padded_sequence only accepts lengths that live on the CPU, while
        # the training/evaluation loops move them to the compute device.
        cpu_lens = torch.as_tensor(lens).cpu()
        rnn_inp = utils.rnn.pack_padded_sequence(x, lengths=cpu_lens, batch_first=False, enforce_sorted=False)

        packed_out, _ = self.base_lstm(rnn_inp)

        # The pyramid shortens the time axis; the result is still packed.
        packed_out = self.pblstm(packed_out)

        # linear_input: (T', N, hidden_dim * 2); lens_out: (N)
        linear_input, lens_out = utils.rnn.pad_packed_sequence(packed_out)

        keys = self.key_network(linear_input)
        value = self.value_network(linear_input)

        return keys, value, lens_out


class Decoder(nn.Module):
    """
    Character decoder that is unrolled one time step at a time, which is why
    it uses LSTMCell instead of LSTM.
    The output of the second LSTMCell is the attention query; the attended
    context is fed both to the character classifier and to the next step.
    Gumbel noise is not implemented here.
    """
    # Number of decoding steps when no transcript bounds the loop.
    MAX_DECODE_LEN = 250

    def __init__(self, vocab_size, hidden_dim, value_size, key_size, is_attended):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=0)
        self.lstm1 = nn.LSTMCell(input_size=hidden_dim + value_size, hidden_size=hidden_dim)
        self.lstm2 = nn.LSTMCell(input_size=hidden_dim, hidden_size=key_size)

        self.is_attended = is_attended
        if is_attended:
            self.attention = Attention()

        # NOTE(review): the ground-truth character is fed when the random draw
        # is >= this value, i.e. with probability 1 - teacher_forcing_rate.
        # Confirm that this is the intended meaning of the rate.
        self.teacher_forcing_rate = 0.6

        self.character_prob = nn.Linear(key_size + value_size, vocab_size)

    def forward(self, key, values, lens, text=None, is_train=True):
        """
        :param key :(T, N, key_size) Output of the Encoder Key projection layer
        :param values: (T, N, value_size) Output of the Encoder Value projection layer
        :param lens: (N) lens for key and values
        :param text: (N, text_len) Batch input of text with text_length
        :param is_train: Train or eval mode
        :return predictions: (N, steps, vocab_size) unnormalized character scores
        :raises ValueError: if ``is_train`` is True but ``text`` is None
        """
        batch_size = key.shape[1]
        embeddings = None

        if is_train:
            if text is None:
                raise ValueError("text is required when is_train=True")
            max_len = text.shape[1]
            # embeddings (batch_size, text_len, hidden_size)
            embeddings = self.embedding(text)
        else:
            max_len = self.MAX_DECODE_LEN

        predictions = []
        hidden_states = [None, None]

        # Token fed when the ground truth is not used. It starts as <sos>:
        # taking argmax over a (N, 1) tensor filled with SOS_INDEX would always
        # give 0, i.e. <pad>, as the very first input.
        prev_token = torch.full((batch_size,), SOS_INDEX, dtype=torch.long, device=key.device)

        # Until the first query exists, the mean of the values stands in for
        # the context (it stays in use throughout when attention is disabled).
        attention_score = values.mean(dim=0)

        for i in range(max_len):
            # The random draw only happens in training mode.
            if is_train and np.random.rand() >= self.teacher_forcing_rate:
                char_embed = embeddings[:, i, :]
            else:
                char_embed = self.embedding(prev_token)

            inp = torch.cat([char_embed, attention_score], dim=1)
            hidden_states[0] = self.lstm1(inp, hidden_states[0])

            inp_2 = hidden_states[0][0]
            hidden_states[1] = self.lstm2(inp_2, hidden_states[1])

            # query
            output = hidden_states[1][0]

            if self.is_attended:
                attention_score, _ = self.attention(output, key, values, lens)

            prediction = self.character_prob(torch.cat([output, attention_score], dim=1))
            predictions.append(prediction.unsqueeze(1))

            prev_token = prediction.argmax(dim=-1)

        return torch.cat(predictions, dim=1)


class Seq2Seq(nn.Module):
    """
    End-to-end sequence-to-sequence model: a thin wrapper that feeds the
    Encoder's keys, values and lengths into the Decoder.
    """
    def __init__(self, input_dim, vocab_size, hidden_dim, value_size, key_size, is_attended=True):
        super().__init__()
        self.encoder = Encoder(input_dim, hidden_dim, value_size, key_size)
        self.decoder = Decoder(vocab_size, hidden_dim, value_size, key_size, is_attended)

    def forward(self, speech_input, speech_len, text_input=None, is_train=True):
        """
        Encode the utterances and decode character predictions.

        :param speech_input: padded utterances passed to the encoder
        :param speech_len: utterance lengths passed to the encoder
        :param text_input: transcripts handed to the decoder when ``is_train`` is True
        :param is_train: selects the decoder's training or inference path
        :return predictions: the decoder output
        """
        key, value, lens = self.encoder(speech_input, speech_len)

        if not is_train:
            return self.decoder(key, value, lens, text=None, is_train=False)

        return self.decoder(key, value, lens, text_input)


def greedy_search_gen(outputs):
    """
    Greedily decode per-step character scores into index sequences.

    :param outputs: (N, steps, vocab_size) character scores
    :return cur_text: list of N 1-D index tensors, each starting with
                      SOS_INDEX and cut right after its first EOS_INDEX (or
                      kept in full when no EOS_INDEX was produced)
    :return cur_text_len: (N) int64 tensor with the lengths of those sequences
    """
    decoded_outputs = torch.argmax(outputs, dim=2)

    # Allocate on the device of the scores instead of a module-level device.
    sos_column = torch.full((decoded_outputs.shape[0], 1), SOS_INDEX, dtype=torch.int64,
                            device=decoded_outputs.device)
    decoded_outputs = torch.cat([sos_column, decoded_outputs], dim=1)
    total_steps = decoded_outputs.shape[1]

    # Positions strictly before the first EOS have a running EOS count of 0,
    # so their number equals the index of the first EOS (or total_steps when
    # there is none). Adding 1 keeps the EOS itself; the clamp covers the
    # no-EOS case.
    before_eos = (decoded_outputs == EOS_INDEX).cumsum(dim=1) == 0
    cur_text_len = torch.clamp(before_eos.sum(dim=1) + 1, max=total_steps)

    cur_text = [decoded_outputs[row, :length] for row, length in enumerate(cur_text_len.tolist())]

    return cur_text, cur_text_len

In [0]:
def train(model, train_loader, criterion, optimizer, epoch):
    """
    Run one training epoch.

    For every batch the loss is the sum of ``criterion`` over all decoding
    steps (restricted to the non-padded targets of each step), divided by the
    number of target tokens in the batch. After the epoch the perplexity of
    the whole epoch is printed.

    :param model: model called as ``model(speech, speech_len, text, True)``
    :param train_loader: yields (speech, speech_len, text, text_len) batches
    :param criterion: loss called on (scores, targets); the per-token
                      normalization assumes a sum-reducing criterion
    :param optimizer: optimizer over the model parameters
    :param epoch: epoch number, used for logging only
    """
    model.train()
    model.to(DEVICE)

    criterion = criterion.to(DEVICE)

    print("train start")

    total_loss = 0.0
    total_tokens = 0

    for cur_speech, cur_speech_len, cur_text, cur_text_len in train_loader:
        cur_speech = cur_speech.to(DEVICE)  # (max_seq_len, batch_size, feature_dim)
        cur_text = cur_text.to(DEVICE)  # (batch_size, max_text_len)
        cur_text_len = cur_text_len.to(DEVICE)  # (batch_size)
        # cur_speech_len deliberately stays on the CPU: it is used to pack the
        # padded utterances, and packing requires CPU lengths.

        optimizer.zero_grad()

        outputs = model(cur_speech, cur_speech_len, cur_text, True)

        # outputs_mask[i, n] is True while step i still has a target for sample n.
        steps = torch.arange(cur_text.shape[1], device=cur_text.device).reshape((-1, 1))
        outputs_mask = steps < cur_text_len

        n_tokens = cur_text_len.sum()

        loss = None
        for i in range(cur_text.size(1) - 1):
            active = outputs_mask[i, :]
            # The output of step i is scored against the token at position i + 1.
            step_loss = criterion(outputs[active, i, :], cur_text[active, i + 1])
            loss = step_loss if loss is None else loss + step_loss

        if loss is None or n_tokens.item() == 0:
            # Nothing to predict in this batch.
            continue

        total_loss += loss.item()
        total_tokens += n_tokens.item()

        loss = loss / n_tokens
        loss.backward()
        optimizer.step()

    # Perplexity over the whole epoch rather than over the last batch only.
    pp_loss = torch.exp(torch.tensor(total_loss / total_tokens)) if total_tokens > 0 else None

    print("training loss of {} epoch".format(epoch))
    print(pp_loss)


def val(model, val_loader, criterion, epoch):
    """
    Evaluate the model on the validation set.

    The loss of each batch is computed as in training (sum of ``criterion``
    over the non-padded targets of every decoding step). The returned value is
    the total loss divided by the total number of target tokens of the whole
    validation set, so it does not depend on which batch happens to come last.

    :param model: model called as ``model(speech, speech_len, text, True)``
    :param val_loader: yields (speech, speech_len, text, text_len) batches
    :param criterion: loss called on (scores, targets); the per-token
                      normalization assumes a sum-reducing criterion
    :param epoch: epoch number, used for logging only
    :return: 0-dim tensor with the per-token validation loss (``inf`` when the
             loader yields no targets)
    """
    model.eval()
    model.to(DEVICE)

    criterion = criterion.to(DEVICE)

    total_loss = 0.0
    total_tokens = 0

    # No gradients are needed for evaluation; this avoids building autograd graphs.
    with torch.no_grad():
        for cur_speech, cur_speech_len, cur_text, cur_text_len in val_loader:
            cur_speech = cur_speech.to(DEVICE)  # (max_seq_len, batch_size, feature_dim)
            cur_text = cur_text.to(DEVICE)  # (batch_size, max_text_len)
            cur_text_len = cur_text_len.to(DEVICE)  # (batch_size)
            # cur_speech_len deliberately stays on the CPU: it is used to pack
            # the padded utterances, and packing requires CPU lengths.

            # NOTE: the model is still called with is_train=True, so the decoder
            # receives the transcript and applies its teacher-forcing schedule.
            outputs = model(cur_speech, cur_speech_len, cur_text, True)

            # outputs_mask[i, n] is True while step i still has a target for sample n.
            steps = torch.arange(cur_text.shape[1], device=cur_text.device).reshape((-1, 1))
            outputs_mask = steps < cur_text_len

            for i in range(cur_text.size(1) - 1):
                active = outputs_mask[i, :]
                # The output of step i is scored against the token at position i + 1.
                total_loss += criterion(outputs[active, i, :], cur_text[active, i + 1]).item()

            total_tokens += cur_text_len.sum().item()

    if total_tokens > 0:
        loss = torch.tensor(total_loss / total_tokens)
    else:
        loss = torch.tensor(float('inf'))

    pp_loss = torch.exp(loss)

    print("validation loss of {} epoch".format(epoch))
    print(pp_loss)

    return loss


# NOTE(review): not referenced by any code visible in this file; the decoder
# applies its own step limit at inference time.
TEST_SEQ_LEN = 200


def test(model, test_loader):
    """
    Decode every utterance of the test set greedily.

    :param model: model called as ``model(speech, speech_len, None, False)``
    :param test_loader: yields (speech, speech_len) batches
    :return: list with one index tensor per utterance, in loader order
    """
    model.eval()
    model.to(DEVICE)

    full_text = []

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for cur_speech, cur_speech_len in test_loader:
            cur_speech = cur_speech.to(DEVICE)  # (max_seq_len, batch_size, feature_dim)
            # cur_speech_len deliberately stays on the CPU: it is used to pack
            # the padded utterances, and packing requires CPU lengths.

            outputs = model(cur_speech, cur_speech_len, None, False)

            cur_text, _ = greedy_search_gen(outputs)

            full_text += cur_text

    return full_text

In [0]:
# Output vocabulary. A symbol's position in this list is its index: '<pad>' is
# 0, and '<sos>' / '<eos>' are the last two entries (33 and 34).
LETTER_LIST = ['<pad>', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
               'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '-', "'", '.', '_', '+', ' ', '<sos>', '<eos>']
# Lookup tables in both directions, derived from LETTER_LIST.
LETTER2INDEX, INDEX2LETTER = create_dictionaries(LETTER_LIST)



def epoch_time(start_time, end_time):
    """
    Split the span between two timestamps into whole minutes and seconds.

    :param start_time: start timestamp in seconds
    :param end_time: end timestamp in seconds
    :return: (minutes, seconds), both truncated to integers
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - 60 * minutes)
    return minutes, seconds

def main():
    """
    Train the seq2seq model, checkpoint the best validation epoch, and write
    the test-set predictions.

    The state dict is saved to '11785-seq2seq-model.pt' whenever the
    validation loss improves; after the last epoch ``result_gen`` is called on
    the test loader.
    """
    model = Seq2Seq(
        input_dim=40,
        vocab_size=len(LETTER_LIST),
        hidden_dim=128,
        value_size=128,
        key_size=256,
        is_attended=True,
    )

    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss(reduction="sum")
    n_epochs = 50
    # A batch size of 1 is used when no GPU is available.
    batch_size = 64 if DEVICE == 'cuda' else 1

    speech_train, speech_valid, speech_test, transcript_train, transcript_valid = load_data()
    character_text_train = transform_letter_to_index(transcript_train, LETTER2INDEX)
    character_text_valid = transform_letter_to_index(transcript_valid, LETTER2INDEX)

    train_loader = DataLoader(
        Speech2TextDataset(speech_train, character_text_train),
        batch_size=batch_size, shuffle=True, collate_fn=collate_train,
    )
    val_loader = DataLoader(
        Speech2TextDataset(speech_valid, character_text_valid),
        batch_size=batch_size, shuffle=True, collate_fn=collate_train,
    )
    test_loader = DataLoader(
        Speech2TextDataset(speech_test, None, False),
        batch_size=batch_size, shuffle=False, collate_fn=collate_test,
    )

    best_valid_loss = float('inf')

    for epoch in range(n_epochs):
        start_time = time.time()

        train(model, train_loader, criterion, optimizer, epoch)
        valid_loss = val(model, val_loader, criterion, epoch)

        # Keep only the weights of the best epoch seen so far.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), '11785-seq2seq-model.pt')

        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {torch.exp(valid_loss):7.3f}')

    result_gen(test_loader, 0)
    print("finish")


def result_gen(test_loader, model_num):
    """
    Decode the test set with the saved checkpoint and write the result as CSV.

    A fresh model is built, the weights are loaded from
    '11785-seq2seq-model.pt', and the decoded transcripts are written to
    'result_<model_num + 1>.csv' with the row index and without a header.

    :param test_loader: loader yielding (speech, speech_len) batches
    :param model_num: number used (plus one) in the output file name
    """
    model = Seq2Seq(
        input_dim=40,
        vocab_size=len(LETTER_LIST),
        hidden_dim=128,
        value_size=128,
        key_size=256,
        is_attended=True,
    )

    state = torch.load('11785-seq2seq-model.pt')
    model.load_state_dict(state)
    model.eval()

    model = model.to(DEVICE)

    decoded = test(model, test_loader)
    test_text_str = [transform_index_to_letter(indices, LETTER_LIST) for indices in decoded]

    output_name = 'result_{}.csv'.format(model_num + 1)
    pd.DataFrame(test_text_str).to_csv(output_name, index=True, header=False)


# Run the full train / validate / predict pipeline when executed as a script
# (or as a notebook cell, where __name__ is '__main__' as well).
if __name__ == '__main__':
    main()

train start


# 新段落