In [2]:
import torch
import torch.nn as nn
from conformer import ConformerBlock

In [3]:
# Sizes of the synthetic batch used for the shape experiments below.
batch_size, sequence_length, dim = 64, 1930, 80

# Device selection is disabled for now, so every tensor lives on the default device.
# cuda = torch.cuda.is_available()
# device = torch.device('cuda' if cuda else 'cpu')

# Uniform random features laid out as (batch_size, sequence_length, dim).
inputs = torch.rand((batch_size, sequence_length, dim))
# No per-sequence lengths are supplied: an empty int32 tensor stands in for them.
input_lengths = torch.tensor([], dtype=torch.int32)

# Example target batch, kept for reference (disabled):
# targets = torch.LongTensor([[1, 3, 3, 3, 3, 3, 4, 5, 6, 2],
#                             [1, 3, 3, 3, 3, 3, 4, 5, 2, 0],
#                             [1, 3, 3, 3, 3, 3, 4, 2, 0, 0]]).to(device)
# target_lengths = torch.LongTensor([9, 8, 7])




In [4]:
class Conv2dSubampling(nn.Module):
    """
    Two stacked 3x3, stride-2 convolutions (each followed by ReLU) that shrink
    the time axis to roughly 1/4 of its original length.

    Args:
        in_channels (int): Number of channels of the input feature map
        out_channels (int): Number of channels produced by each convolution

    Inputs: inputs, input_lengths
        - **inputs** (batch, time, dim): Tensor containing sequence of inputs
        - **input_lengths** (batch): Tensor of sequence lengths before subsampling

    Returns: outputs, output_lengths
        - **outputs** (batch, subsampled_time, out_channels * subsampled_dim):
          Convolution output with channels and features merged into one axis
        - **output_lengths** (batch): ``(input_lengths >> 2) - 1``
    """
    def __init__(self, in_channels: int, out_channels: int) -> None:
        super().__init__()
        # Both convolutions share the same kernel/stride; no padding is applied.
        conv_kwargs = dict(kernel_size=3, stride=2)
        self.sequential = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, **conv_kwargs),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels, **conv_kwargs),
            nn.ReLU(),
        )

    def forward(self, inputs, input_lengths):
        # (batch, time, dim) -> (batch, 1, time, dim): add the channel axis Conv2d expects.
        feature_maps = self.sequential(inputs.unsqueeze(1))
        n_batch, n_channels, n_frames, n_feats = feature_maps.size()

        # Put time ahead of channels, then merge channels and features into one axis.
        frames_first = feature_maps.permute(0, 2, 1, 3).contiguous()
        outputs = frames_first.view(n_batch, n_frames, n_channels * n_feats)

        # Integer divide the lengths by 4, minus one; a new tensor is returned,
        # so the caller's input_lengths is left untouched.
        output_lengths = (input_lengths >> 2) - 1

        return outputs, output_lengths

In [5]:
# Confirm the (batch, time, dim) layout of the random input batch.
print(inputs.shape)

torch.Size([64, 1930, 80])


In [6]:
# Invoke the module instance instead of .forward() so nn.Module's __call__
# machinery (registered hooks) is not bypassed.
# `outputs` is the (outputs, output_lengths) tuple returned by Conv2dSubampling.forward.
outputs = Conv2dSubampling(1, 80)(inputs, input_lengths)

In [7]:
# Flatten everything after the batch axis to read off the per-sample feature count,
# and show the subsampled lengths next to it.
print(outputs[0].reshape(len(outputs[0]), -1).shape, outputs[1])

torch.Size([64, 731120]) tensor([], dtype=torch.int32)


In [14]:
# Flatten each sample's subsampled features and project them to 768 dimensions.
# in_features is read from the tensor rather than hard-coded (it is 731120 for the
# batch built above), and the layer is invoked via __call__ instead of .forward().
flat_features = outputs[0].reshape(outputs[0].shape[0], -1)
outp = nn.Linear(flat_features.shape[1], 768)(flat_features)
print(outp.shape)

torch.Size([64, 768])


In [7]:
# One Conformer block applied to the subsampled features.
# dim=1520 equals the last axis of outputs[0]: 80 conv channels x 19 features
# left after two kernel-3 / stride-2 convolutions over an 80-wide input.
block = ConformerBlock(
    dim=1520,
    dim_head=64,
    heads=4,
    ff_mult=2,
    conv_expansion_factor=2,
    conv_kernel_size=31,
    attn_dropout=0.2,
    ff_dropout=0.1,
    conv_dropout=0.2,
)
# presumably the same shape as outputs[0] — TODO confirm (an earlier note here said (1, 1024, 512))
y = block(outputs[0])

In [30]:
# print(y.size())
# Scratch arithmetic: 499 * 1520 elements (displays 758480).
1520 * 499

758480

In [15]:
# Dummy token ids: a 64 x 63 matrix in which every id equals 1.
outs = torch.ones(64, 63, dtype=torch.long)
# Shape of the encoder vector once a singleton token axis is inserted.
print(outp[:, None, :].shape)
# Look the ids up in a freshly initialised 20000 x 768 embedding table.
embed = nn.Embedding(num_embeddings=20000, embedding_dim=768)(outs)
print(embed.shape)

torch.Size([64, 1, 768])
torch.Size([64, 20, 768])


In [21]:
# Tile the encoder vector along the token axis so it can be concatenated with
# every token embedding. The repeat count is taken from embed.shape[1] for both
# the preview and the concat (the preview previously used a hard-coded 20, which
# need not match the number of tokens in `embed`).
outp_tiled = outp.unsqueeze(1).repeat(1, embed.shape[1], 1)
print(outp_tiled.shape)
# Concatenate on the feature axis: encoder features first, token embeddings second.
embeddings = torch.cat((outp_tiled, embed), dim=2)
print(embeddings.shape)

torch.Size([64, 20, 768])
torch.Size([64, 20, 1536])


In [27]:
# `embeddings` is laid out (batch, tokens, features), but nn.LSTM defaults to
# (seq, batch, features). batch_first=True makes the recurrence run over the
# token axis instead of across the samples of the batch.
# `lst` is the (output, (h_n, c_n)) tuple returned by nn.LSTM.
lst = nn.LSTM(1536, 320, 1, batch_first=True)(embeddings)
print(lst[0].shape)

torch.Size([64, 20, 320])


In [46]:
class EncoderCNN(nn.Module):
    """
    Encoder: Conv2dSubampling -> ConformerBlock -> flatten -> Linear -> ReLU -> Dropout.

    Args:
        embed_size (int): Size of the vector produced for every input sample

    Inputs: waves
        - **waves** (batch, time, dim): Batch of feature sequences. The layer sizes
          are hard-wired for time=1930 and dim=80 (see the class constants); other
          sizes will not match the linear layer.

    Returns:
        - Tensor of shape (batch, embed_size)
    """
    # 80 conv channels x 19 features left after two kernel-3 / stride-2
    # convolutions over an 80-wide input.
    CONFORMER_DIM = 1520
    # 481 frames (1930 after the same two convolutions) x CONFORMER_DIM.
    # NOTE: with embed_size=768 the linear layer holds 731120 * 768 (~561M) weights.
    FLATTENED_DIM = 731120

    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        self.conv2dss = Conv2dSubampling(1, 80)
        # Project to embed_size; the output width used to be fixed at 768
        # regardless of the constructor argument.
        self.linear = nn.Linear(self.FLATTENED_DIM, embed_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.conformer = ConformerBlock(dim = self.CONFORMER_DIM, dim_head = 64, heads = 4, ff_mult = 2,
                                        conv_expansion_factor = 2, conv_kernel_size = 31,
                                        attn_dropout = 0.2, ff_dropout = 0.1, conv_dropout = 0.2)

    def forward(self, waves):
        # The encoder does not use sequence lengths. Conv2dSubampling right-shifts
        # whatever it receives, and shift operators are only reliably defined for
        # integer dtypes, so an empty *integer* tensor is passed as a placeholder.
        no_lengths = torch.tensor([], dtype=torch.int32, device=waves.device)
        features, _ = self.conv2dss(waves, no_lengths)
        features = self.conformer(features)
        # Flatten (time, feature) per sample before the projection.
        flat = features.reshape(features.shape[0], -1)
        return self.dropout(self.relu(self.linear(flat)))

In [37]:
# Run a freshly initialised encoder on the random batch and check the result's shape.
enc = EncoderCNN(embed_size=768)(inputs)
print(enc.shape)

torch.Size([64, 768])


In [39]:
# NOTE: DecoderRNN is defined in a later cell of this notebook; that cell has to be run first.
# Decode the dummy token ids conditioned on the encoder output and check the logits' shape.
dec = DecoderRNN(embed_size=768, hidden_size=320, vocab_size=20000, num_layers=1)(enc, outs)
print(dec.shape)

torch.Size([64, 20, 20000])


In [24]:
class DecoderRNN(nn.Module):
    """
    LSTM decoder that conditions every token on the encoder features.

    Each token embedding is concatenated with the (repeated) encoder feature
    vector, which is why the LSTM input width is ``embed_size * 2``.

    Args:
        embed_size (int): Width of the token embeddings and of the encoder features
        hidden_size (int): LSTM hidden size
        vocab_size (int): Number of tokens in the vocabulary
        num_layers (int): Number of stacked LSTM layers

    Inputs: features, captions
        - **features** (batch, embed_size): Encoder output, one vector per sample
        - **captions** (batch, tokens): Integer token ids

    Returns:
        - Tensor of shape (batch, tokens, vocab_size) with unnormalised scores
    """
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        # forward() builds (batch, tokens, features) inputs. nn.LSTM defaults to
        # (seq, batch, features), which would run the recurrence across the samples
        # of the batch; batch_first=True makes it run over the token axis.
        self.lstm = nn.LSTM(embed_size*2, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(0.5)

    def forward(self, features, captions):
        embeddings = self.dropout(self.embed(captions))
        # Repeat the encoder vector once per token and join it to each token embedding.
        tiled_features = features.unsqueeze(1).repeat(1, embeddings.shape[1], 1)
        embeddings = torch.cat((tiled_features, embeddings), dim=2)
        hiddens, _ = self.lstm(embeddings)
        outputs = self.linear(hiddens)
        return outputs

In [45]:
class ConformerEncDec(nn.Module):
    """
    Encoder-decoder wrapper: EncoderCNN produces one feature vector per sample and
    DecoderRNN turns that vector plus a token sequence into per-token vocabulary scores.

    Args:
        embed_size (int): Width of the encoder output / token embeddings
        hidden_size (int): LSTM hidden size of the decoder
        vocab_size (int): Number of tokens in the vocabulary
        num_layers (int): Number of stacked LSTM layers in the decoder
    """
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        # The previous super(CNNtoRNN, self) referenced a class that does not exist
        # in this notebook and raised NameError on construction.
        super().__init__()
        self.encoderCNN = EncoderCNN(embed_size)
        self.decoderRNN = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)

    def forward(self, images, captions):
        """Encode `images` and score `captions`; returns the decoder's output tensor."""
        features = self.encoderCNN(images)
        outputs = self.decoderRNN(features, captions)
        return outputs

    def predict_caption(self, waves, vocab, max_length=88, sos_token="<sos>", eos_token="<eos>"):
        """
        Greedily decode a token sequence for a single input.

        The decoder is re-run on the whole token prefix at every step, so this
        method relies only on ``DecoderRNN.forward`` (which concatenates the
        encoder features with every token embedding).

        Args:
            waves: Encoder input holding exactly one sample (batch size 1)
            vocab: Vocabulary object. Assumes a torchtext-style ``Vocab``
                (``vocab[token] -> index``, ``vocab.lookup_tokens(indices)``) —
                TODO confirm against the vocabulary actually used.
            max_length (int): Maximum number of tokens to generate
            sos_token (str): Token fed to the decoder as the first input.
                NOTE(review): the "<sos>" default is an assumption — confirm it
                exists in the vocabulary.
            eos_token (str): Token that stops decoding once it is predicted

        Returns:
            The generated tokens (ending with `eos_token` if it was predicted),
            as returned by ``vocab.lookup_tokens``.

        Raises:
            ValueError: If `waves` does not contain exactly one sample.
        """
        result_caption = []

        # Dropout must be off during inference; restore the previous mode afterwards.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                features = self.encoderCNN(waves)
                if features.shape[0] != 1:
                    raise ValueError(
                        "predict_caption expects a single sample, got batch size %d" % features.shape[0]
                    )

                eos_index = vocab[eos_token]
                tokens = torch.tensor([[vocab[sos_token]]], dtype=torch.long, device=features.device)

                for _ in range(max_length):
                    logits = self.decoderRNN(features, tokens)
                    # Scores at the last position -> most likely next token.
                    predicted = logits[:, -1].argmax(dim=-1)
                    result_caption.append(predicted.item())

                    if result_caption[-1] == eos_index:
                        break

                    tokens = torch.cat((tokens, predicted.unsqueeze(1)), dim=1)
        finally:
            self.train(was_training)

        # Map indices back to tokens (lookup_indices goes the other way: tokens -> indices).
        return vocab.lookup_tokens(result_caption)

In [42]:
# NOTE: despite its name, `model` is bound to the forward-pass output for
# (inputs, outs), not to the ConformerEncDec module itself.
model = ConformerEncDec(embed_size=768, hidden_size=320, vocab_size=20000, num_layers=1)(inputs, outs)

In [1]:
# Sanity-check the shapes of the encoder input and of the decoder token ids.
print(inputs.size())
print(outs.size())

NameError: name 'inputs' is not defined

In [9]:
# 3 x 5 standard-normal tensor that tracks gradients.
# NOTE: the name `input` shadows the Python builtin of the same name from here on.
input = torch.randn((3, 5), requires_grad=True)
print(input)

tensor([[-0.4995,  0.0689, -0.2849,  0.7602, -0.0645],
        [-0.0924, -0.5724, -0.4970,  1.3827,  0.6878],
        [ 1.8441,  1.1221, -1.1771,  1.1200, -0.7552]], requires_grad=True)
