In [1]:
import torch
import torchaudio
import torch.nn as nn #neural network modules
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchaudio.functional as F
import torchaudio.transforms as T
import matplotlib.pyplot as plt

target = torch.empty(3, dtype=torch.long).random_(5)
print(target)

In [2]:
train_dataset = torchaudio.datasets.LIBRISPEECH("./", url="train-clean-100", download=True)
# test_dataset = torchaudio.datasets.LIBRISPEECH("./", url="test-clean", download=True)
# dev_dataset = torchaudio.datasets.LIBRISPEECH("./", url="dev-clean", download=True)

In [5]:
waveforms = []
labels = []
for i in range(10000):
    sample = train_dataset.__getitem__(i)
    waveform, _, label, _, _, _ = sample
    specgram = T.MelSpectrogram(sample_rate=16000, win_length=400, hop_length=160, n_mels=80)(waveform)
    waveforms.append(torch.squeeze(specgram))
    labels.append(label)
    

print(waveforms[0].shape)

In [6]:
from torch.nn.utils.rnn import pad_sequence
xtrain = []
for i in range(len(waveforms)):
    xtrain.append(waveforms[i].permute(1,0))
print(xtrain[1].shape)
padded_sequences = pad_sequence(xtrain, batch_first=True)

torch.Size([1595, 80])


In [17]:
print(type(padded_sequences), padded_sequences[0:8000].shape)
print(len(labels), len(labels[0:8000]))

<class 'torch.Tensor'> torch.Size([8000, 1930, 80])
10000 8000


print(len(labels))

In [None]:
# ['sos'] + labels[0].lower().split() + ['eos']els

In [11]:
def yield_tokens(labels):
    for item in labels:
        yield item.lower().strip().split()

In [12]:
tokens = yield_tokens(labels)
print(type(tokens))

<class 'generator'>


In [13]:
from torchtext.vocab import build_vocab_from_iterator
vocab = build_vocab_from_iterator(tokens,min_freq=1,specials=['<sos>', '<pad>'])
vocab.append_token('<eos>')

In [14]:
print(vocab.__len__())
print(vocab.__getitem__('<pad>'))

20376
1


l = labels[0].lower().strip().split()
print(l)
print(vocab.lookup_indices(l))

In [15]:
class LibriDataset(Dataset):
    def __init__(self, waveforms, labels, vocab):
        self.waveforms = waveforms
        self.labels = labels
        self.vocab = vocab

    def __len__(self):
        return len(waveforms)

    def __getitem__(self, index):
        waveform = self.waveforms[index]
#         print(waveform.shape)
        label = self.labels[index]
        numericalized_caption = []
        numericalized_caption.append(self.vocab.__getitem__('<sos>'))
        numericalized_caption+=self.vocab.lookup_indices(label.lower().strip().split())
        numericalized_caption.append(self.vocab.__getitem__('<eos>'))

        return waveform, torch.tensor(numericalized_caption)

numericalized_caption = []
numericalized_caption.append(vocab.__getitem__('<sos>'))
numericalized_caption+=vocab.lookup_indices(labels[0].lower().strip().split())
numericalized_caption.append(vocab.__getitem__('<eos>'))
print(numericalized_caption)

In [16]:
class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
#         print(len(batch), print(batch[0]))
#         for item in batch:
#             print(item[0].shape)
        waves = [item[0] for item in batch]
#         print(waves[0].shape, waves[1].shape)
        waves = torch.stack(waves, dim=0)
#         print(type(waves), waves.shape)
        targets = [item[1] for item in batch]
        targets = pad_sequence(targets, batch_first=True, padding_value=self.pad_idx)

        return waves, targets

In [27]:
dataset = LibriDataset(padded_sequences, labels, vocab)
# train_loader = DataLoader(dataset=dataset, batch_size=64, shuffle=True)

In [28]:
loader = DataLoader(
        dataset=dataset,
        batch_size=64,
        shuffle=True,
        collate_fn=MyCollate(pad_idx=1))
# print(len(labels[1].strip().split()))

In [29]:
from conformer_encoder import ConformerBlock
#conv subsampling
class Conv2dSubampling(nn.Module):
    """
    Convolutional 2D subsampling (to 1/4 length)
    Args:
        in_channels (int): Number of channels in the input image
        out_channels (int): Number of channels produced by the convolution
    Inputs: inputs
        - **inputs** (batch, time, dim): Tensor containing sequence of inputs
    Returns: outputs, output_lengths
        - **outputs** (batch, time, dim): Tensor produced by the convolution
        - **output_lengths** (batch): list of sequence output lengths
    """
    def __init__(self, in_channels: int, out_channels: int) -> None:
        super(Conv2dSubampling, self).__init__()
        self.sequential = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=2),
            nn.ReLU(),
        )

    def forward(self, inputs, input_lengths):
        outputs = self.sequential(inputs.unsqueeze(1))
#         print(outputs.size())
        batch_size, channels, subsampled_lengths, sumsampled_dim = outputs.size()

        outputs = outputs.permute(0, 2, 1, 3)
        outputs = outputs.contiguous().view(batch_size, subsampled_lengths, channels * sumsampled_dim)

        output_lengths = input_lengths >> 2
        output_lengths -= 1

        return outputs, output_lengths

#Encoder
class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        self.conv2dss = Conv2dSubampling(1, 80)
        self.linear = nn.Linear(731120, 768)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.conformer = ConformerBlock(dim = 1520, dim_head = 64, heads = 4, ff_mult = 2, conv_expansion_factor = 2,
                                        conv_kernel_size = 31, attn_dropout = 0.2, ff_dropout = 0.1, conv_dropout = 0.2)

    def forward(self, waves):
        features = self.conv2dss(waves, torch.tensor([]))
        features = self.conformer(features[0])
        return self.dropout(self.relu(self.linear(features.reshape(features.shape[0], -1))))
#Decoder
class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size*2, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(0.5)

    def forward(self, features, captions):
        embeddings = self.dropout(self.embed(captions))
        embeddings = torch.cat((features.unsqueeze(1).repeat(1, embeddings.shape[1], 1), embeddings), dim=2)
        hiddens, _ = self.lstm(embeddings)
        outputs = self.linear(hiddens)
        return outputs
#model    
class ConformerEncDec(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(ConformerEncDec, self).__init__()
        self.encoderCNN = EncoderCNN(embed_size)
        self.decoderRNN = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)

    def forward(self, images, captions):
        features = self.encoderCNN(images)
#         print(features.shape)
        outputs = self.decoderRNN(features, captions)
        return outputs

    def predict_caption(self, waves, vocab, max_length=88):
        result_caption = []

        with torch.no_grad():
            x = self.encoderCNN(waves).unsqueeze(0)
            states = None

            for _ in range(max_length):
                hiddens, states = self.decoderRNN.lstm(x, states)
                output = self.decoderRNN.linear(hiddens.squeeze(0))
                predicted = output.argmax(2)
                result_caption.append(predicted.item())
                x = self.decoderRNN.embed(predicted).unsqueeze()

                if vocab.__getitem__(predicted.item()) == "<eos>":
                    break

        return vocab.lookup_indices(result_caption)

In [21]:
def save_checkpoint(state, filename="my_model.pth.tar"):
    print("=> saving checkpoint")
    torch.save(state, filename)

In [30]:
model = ConformerEncDec(768, 320, vocab.__len__(), 1)
criterion = nn.CrossEntropyLoss(ignore_index=1)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=2, verbose=True)

In [33]:
for epoch in range(5):
    losses = []
    print("epoch", epoch)
    if epoch==2:
        checkpoint = {'state_dict':model.state_dict(), 'optimizer': optimizer.state_dict()}
        save_checkpoint(checkpoint)
    for batch_idx, (data, targets) in enumerate(loader):
        print(epoch, batch_idx)
        o = model(data, targets)
        loss = criterion(o.reshape(-1, o.shape[2]), targets.reshape(-1))
        losses.append(loss.item())
        optimizer.zero_grad()
        loss.backward(loss)
        optimizer.step()
    mean_loss = sum(losses)/len(losses)
    scheduler.step(mean_loss)
#     print(o.shape)

epoch 0
0 0
0 1
0 2
0 3
0 4
0 5
0 6
0 7
0 8
0 9
0 10
0 11
0 12
0 13
0 14
0 15
0 16
0 17
0 18
0 19
0 20
0 21
0 22
0 23
0 24
0 25
0 26
0 27
0 28
0 29
0 30
0 31
0 32
0 33
0 34
0 35
0 36
0 37
0 38
0 39
0 40
0 41
0 42
0 43
0 44
0 45
0 46
0 47
0 48
0 49
0 50
0 51
0 52
0 53
0 54
0 55
0 56
0 57
0 58
0 59
0 60
0 61
0 62
0 63
0 64
0 65
0 66
0 67
0 68
0 69
0 70
0 71
0 72
0 73
0 74
0 75
0 76
0 77
0 78
0 79
0 80
0 81
0 82
0 83
0 84
0 85
0 86
0 87
0 88
0 89
0 90
0 91
0 92
0 93
0 94
0 95
0 96
0 97
0 98
0 99
0 100
0 101
0 102
0 103
0 104
0 105
0 106
0 107
0 108
0 109
0 110
0 111
0 112
0 113
0 114
0 115
0 116
0 117
0 118
0 119
0 120
0 121
0 122
0 123
0 124
0 125
0 126
0 127
0 128
0 129
0 130
0 131
0 132
0 133
0 134
0 135
0 136
0 137
0 138
0 139
0 140
0 141
0 142
0 143
0 144
0 145
0 146
0 147
0 148
0 149
0 150
0 151
0 152
0 153
0 154
0 155
0 156
epoch 1
1 0
1 1
1 2
1 3
1 4
1 5
1 6
1 7
1 8
1 9
1 10
1 11
1 12
1 13
1 14
1 15
1 16
1 17
1 18
1 19
1 20
1 21
1 22
1 23
1 24
1 25
1 26
1 27
1 28
1 29
1 30
1 31
1 

KeyboardInterrupt: 

In [36]:
print(mean_loss, loss)

6.823648811145953 tensor(6.7516, grad_fn=<NllLossBackward>)
