### Only run to clear dataset from colab storage

In [None]:
# WARNING: destructive. Recursively deletes the dataset/ directory relative to the
# current working directory, without confirmation. Only run this to force a fresh download.
!rm -rf dataset/

## Package Installation

In [None]:
# Use the %pip magic so packages are installed into the running kernel's environment,
# and pin the versions that the recorded install log resolved to, for reproducibility.
%pip install torchinfo==1.8.0 torchviz==0.0.3

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Collecting torchviz
  Downloading torchviz-0.0.3-py3-none-any.whl.metadata (2.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->torchviz)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->torchviz)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->torchviz)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->torchviz)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->torchviz)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cu

# Variational Auto-Encoder
Based on https://arxiv.org/pdf/2107.03298

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchaudio import datasets, transforms
from torchinfo import summary
import os
from torchviz import make_dot
import collections

## Dataset Loading
Uses LJ Speech, a public-domain speech and text dataset. The dataset is downloaded to the Colab session's storage and stays available for the rest of that session, but it must be re-downloaded in each separate session. If this block times out and leaves a partial download behind, clear the existing dataset with the utility cell at the top of this notebook and run the block again. IMPORTANT NOTE: wait for this block to finish running; if it times out, only a subset of the data will be available.

Source: https://keithito.com/LJ-Speech-Dataset/

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Device: {device}')
# The dataset lives in ./dataset/ relative to the current working directory.
path = os.path.join(os.getcwd(), 'dataset/')
os.makedirs(path, exist_ok=True)
# download=True asks torchaudio to fetch the archive into `path`.
rawData = datasets.LJSPEECH(path, download=True)
# NOTE(review): utterances presumably differ in length, in which case the default
# collate function cannot stack the waveforms of a batch; confirm and add a padding
# collate_fn before iterating over this loader.
datasetLoader = torch.utils.data.DataLoader(rawData, batch_size=32, shuffle=True)
# Each item is a 4-tuple: waveform tensor, sample rate, and two transcript strings.
print(f'Sample Data: {rawData[0]}')

Device: cpu


100%|██████████| 2.56G/2.56G [00:33<00:00, 81.1MB/s]


Sample Data: (tensor([[-7.3242e-04, -7.6294e-04, -6.4087e-04,  ...,  7.3242e-04,
          2.1362e-04,  6.1035e-05]]), 22050, 'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition', 'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition')


## Text Tokenization
Create a dictionary based on the training text.

In [None]:
class Vocabulary():
  '''
  Builds a pair of dictionaries that map text tokens to integers and back,
  so that text can be fed to the model as integer indices.

  max_size: despite its name, this is used as a minimum frequency threshold;
  only tokens whose count is >= max_size are kept.
  '''
  def __init__(self, max_size):
    self.max_size = max_size
    self.vocab = []

  def create_dictionary(self, freq):
    '''
    freq: mapping of token -> occurrence count (e.g. a collections.Counter)
    Returns: (word_to_index, index_to_word), indexed in the iteration order of freq.
    Also stores the list of retained tokens on self.vocab.
    '''
    kept_tokens = []
    for token, occurrences in freq.items():
      if occurrences >= self.max_size:
        kept_tokens.append(token)
    self.vocab = kept_tokens

    # Number the retained tokens once, then invert to get the reverse lookup.
    index_to_word = dict(enumerate(kept_tokens))
    word_to_index = {token: position for position, token in index_to_word.items()}

    return (word_to_index, index_to_word)

# Count how often each whitespace-separated token occurs in the transcripts.
# Every metadata line is pipe-delimited; field index 2 holds the text used here.
count = collections.Counter()
# The context manager closes the file even if a line fails to parse; the explicit
# encoding keeps non-ASCII transcript characters from depending on the locale.
with open('dataset/LJSpeech-1.1/metadata.csv', encoding='utf-8') as f:
  for line in f:
    text = line.split('|')[2]
    count.update(text.split())

# A threshold of 1 keeps every token that appears at least once.
vocab = Vocabulary(1)
(word_to_index, index_to_word) = vocab.create_dictionary(count)
print(word_to_index)



## Text Encoder
Convolution layers with dropout, batch normalization and ReLU activation. Follow this with positional encoding, and then self attention blocks.

In [None]:
def sinosoidal_position_encoding(token_size, embedding_dim, batch_size, device=None):
  '''
  Sinusoidal Positional Encoding (as defined in Attention is All You Need)

  token_size: number of sequence positions to encode
  embedding_dim: number of embedding channels (odd values are supported)
  batch_size: number of identical copies stacked along the leading batch axis
  device: optional device for the returned tensor (PyTorch's default when None)

  Output: batch x embed len x seq len (channels-first), so the result can be
  added directly to a tensor in Conv1d layout. Even channels hold
  sin(pos / 10000^(2i/d)); odd channels hold the matching cosine.
  '''
  pos = torch.arange(0, token_size, device=device).unsqueeze(1)
  emb = torch.zeros(token_size, embedding_dim, device=device)

  # One frequency per (sin, cos) channel pair. An odd embedding_dim has one more
  # even-indexed channel than odd-indexed ones, so round the pair count up here
  # and trim the cosine half below.
  pair_count = (embedding_dim + 1) // 2
  angles = pos / torch.pow(10000, 2 * torch.arange(0, pair_count, device=device) / embedding_dim)

  emb[:, 0::2] = torch.sin(angles)
  emb[:, 1::2] = torch.cos(angles[:, :embedding_dim // 2])

  # permute to channels-first and repeat for each batch element
  return emb.permute(1, 0).unsqueeze(0).repeat(batch_size, 1, 1)

class ConvStack(torch.nn.Module):
  '''
  Convolutional Layer Stack for Text Encoder
  Input: batch x embed len x seq len (channels-first, as Conv1d requires)
  Output: batch x embed len x seq len (same shape as the input)

  Applies a 1D convolution along the sequence, then batch norm, ReLU and
  channel-wise dropout.

  D: number of input and output channels
  K: convolution kernel size
  '''
  def __init__(self, D, K):
    super(ConvStack, self).__init__()
    # padding='same' keeps the sequence length unchanged. Without padding every
    # stack would shorten the sequence by K - 1 positions, breaking the
    # same-shape contract and failing on inputs shorter than the kernel.
    self.conv = torch.nn.Conv1d(D, D, K, padding='same')
    self.norm = torch.nn.BatchNorm1d(D)
    self.relu = torch.nn.ReLU()
    self.dropout = torch.nn.Dropout1d()

  def forward(self, X):
    convolution = self.conv(X)
    normalization = self.norm(convolution)
    relu = self.relu(normalization)
    output = self.dropout(relu)
    return output

class SelfAttentionLayer(torch.nn.Module):
  '''
  Self Attention Layer
  Input: batch x seq len x embed len
  Output: batch x seq len x embed len

  Scaled dot-product attention as defined in Attention is All You Need:
  softmax(Q K^T / sqrt(d)) V, with the softmax taken over the key positions.

  Q, V and K may be passed in directly (each batch x seq len x embed len);
  any of them left as None is computed from X with this layer's own projection.
  '''
  def __init__(self, data_dim):
    super(SelfAttentionLayer, self).__init__()
    self.query = nn.Linear(data_dim, data_dim)
    self.value = nn.Linear(data_dim, data_dim)
    self.key = nn.Linear(data_dim, data_dim)
    self.d = data_dim
    # normalise over the last axis (key positions) so each query's weights sum to 1
    self.softmax = nn.Softmax(dim=-1)

  def forward(self, X, Q=None, V=None, K=None):
    # Fill in each missing tensor independently, so an explicitly supplied
    # Q, V or K is never silently discarded.
    if Q is None:
      Q = self.query(X)
    if V is None:
      V = self.value(X)
    if K is None:
      K = self.key(X)

    # batch x seq len x seq len similarity scores, scaled BEFORE the softmax
    scores = (Q @ K.mT) / (self.d ** 0.5)
    weights = self.softmax(scores)
    return weights @ V

class TextEncoder(torch.nn.Module):
  '''
  Text Encoder Block
  Input: batch x seq len (integer token indices)
  Output: batch x seq len x embed len, where seq len is whatever length the
  convolution stacks produce

  Embeds the tokens, runs five convolution stacks, adds the sinusoidal
  positional encoding and then applies four self attention layers. The tensor is
  permuted between channels-last (embedding, attention) and channels-first
  (convolution, positional encoding) layouts along the way.

  embedding_size: number of rows in the embedding table (vocabulary size)
  conv_size: embedding width, also the channel count of every conv stack
  K: convolution kernel size
  '''
  def __init__(self, embedding_size, conv_size, K):
    super(TextEncoder, self).__init__()
    self.embedding = torch.nn.Embedding(embedding_size, conv_size)
    self.stack1 = ConvStack(conv_size, K)
    self.stack2 = ConvStack(conv_size, K)
    self.stack3 = ConvStack(conv_size, K)
    self.stack4 = ConvStack(conv_size, K)
    self.stack5 = ConvStack(conv_size, K)
    self.attention1 = SelfAttentionLayer(conv_size)
    self.attention2 = SelfAttentionLayer(conv_size)
    self.attention3 = SelfAttentionLayer(conv_size)
    self.attention4 = SelfAttentionLayer(conv_size)

  def forward(self, X):
    embed = self.embedding(X)
    # permute from [batch, seq len, embed len] to [batch, embed len, seq len] for Conv1d
    embed = embed.permute(0, 2, 1)
    conv1 = self.stack1(embed)
    conv2 = self.stack2(conv1)
    conv3 = self.stack3(conv2)
    conv4 = self.stack4(conv3)
    conv5 = self.stack5(conv4)

    # Add the positional encoding. Sizes are read from the conv output so they
    # always match it, and the encoding is moved to the activations' device and
    # dtype so the model also works after .to('cuda').
    pos = sinosoidal_position_encoding(conv5.size(2), conv5.size(1), conv5.size(0))
    conv_pos = conv5 + pos.to(device=conv5.device, dtype=conv5.dtype)
    # back to [batch, seq len, embed len] for the attention layers
    conv_pos = conv_pos.permute(0, 2, 1)

    attn1 = self.attention1(conv_pos)
    attn2 = self.attention2(attn1)
    attn3 = self.attention3(attn2)
    attn4 = self.attention4(attn3)

    return attn4

# Size the embedding table from the vocabulary itself: token indices come from
# word_to_index, so a hard-coded table size would raise an IndexError for any
# index beyond it.
model = TextEncoder(len(word_to_index), 512, 5)
# Tokenise exactly as the vocabulary was built (split on any whitespace), so
# repeated spaces cannot produce an empty-string token missing from word_to_index.
input_text = rawData[0][3].split()
input_tensor = torch.tensor([word_to_index[item] for item in input_text]).unsqueeze(0)
summary(model, input_data=input_tensor)

## Posterior Encoder
Fully connected layers w/ dropout and ReLU activation followed by sinusoidal positional encoding.

Attention layers are then stacked upon one another with Q = encoded spectrogram, K, V = encoded text. This should take the form of a self attention layer, a cross-attention layer, and a feed-forward NN with a hidden layer of 1024 and output size of 256.


In [None]:
class PosteriorSelfAttentionLayer(torch.nn.Module):
  '''
  Scaled dot-product attention layer for the Posterior Encoder
  Q: batch x query len x embed len
  V, K: batch x embed len x key len (channels-first)
  Output: batch x query len x embed len

  Computes softmax(Q K / sqrt(d)) V^T as in Attention is All You Need, with the
  softmax taken over the key positions.

  NOTE(review): the query/value/key Linear projections are constructed but are
  not applied in forward; confirm whether Q, V and K should be projected here.
  '''
  def __init__(self, data_dim):
    super(PosteriorSelfAttentionLayer, self).__init__()
    self.query = nn.Linear(data_dim, data_dim)
    self.value = nn.Linear(data_dim, data_dim)
    self.key = nn.Linear(data_dim, data_dim)
    self.d = data_dim
    self.softmax = nn.Softmax(dim=-1)

  def forward(self, Q, V, K):
    # K is already channels-first, so Q @ K yields batch x query len x key len.
    # The scores are scaled BEFORE the softmax so the weights of each query
    # still sum to 1.
    scores = (Q @ K) / (self.d ** 0.5)
    weights = self.softmax(scores)
    # V.mT is batch x key len x embed len, giving batch x query len x embed len
    return weights @ V.mT

class AttentionBlock(torch.nn.Module):
  '''
  Attention Block for Posterior Encoder
  Q: batch x query len x data_dim
  V, K: batch x data_dim x key len (channels-first)
  Output: batch x query len x output_dim

  Runs the attention layer held in attn_self, then a multi-head cross attention
  layer, then a feed-forward network (data_dim -> hidden_dim -> output_dim).

  data_dim: feature width of the queries and of the key/value channels
  head_num: number of heads of the cross attention layer
  hidden_dim: width of the feed-forward hidden layer
  output_dim: width of the block output (default 256). Set it to data_dim when
  another AttentionBlock consumes this block's output.

  NOTE(review): the Q/V/K Linear projections and self.d are constructed but not
  used in forward; they are kept so existing attribute names stay available.
  '''
  def __init__(self, data_dim, head_num, hidden_dim, output_dim=256):
    super(AttentionBlock, self).__init__()
    self.d = data_dim
    self.attn_self = PosteriorSelfAttentionLayer(data_dim)
    self.Q = nn.Linear(data_dim, data_dim)
    self.V = nn.Linear(data_dim, data_dim)
    self.K = nn.Linear(data_dim, data_dim)
    # batch_first=True: every tensor in this notebook has the batch axis first,
    # while MultiheadAttention defaults to (seq, batch, embed).
    self.attn_cross = nn.MultiheadAttention(data_dim, head_num, batch_first=True)
    self.linear1 = nn.Linear(data_dim, hidden_dim)
    self.relu1 = nn.ReLU()
    self.linear2 = nn.Linear(hidden_dim, output_dim)

  def forward(self, Q, V, K):
    attn_s = self.attn_self(Q, V, K)
    # MultiheadAttention expects channels-last tensors in (query, key, value) order
    key = K.permute(0, 2, 1)
    value = V.permute(0, 2, 1)
    # it returns (output, attention weights); only the output is needed here
    attn_c, _ = self.attn_cross(attn_s, key, value)
    lin1 = self.linear1(attn_c)
    relu = self.relu1(lin1)
    return self.linear2(relu)


class PosteriorEncoder(torch.nn.Module):
  '''
  Posterior Encoder
  spectrogram: batch x frames x wav_dim
  text: batch x 512 x text len (channels-first encoded text)
  Output: batch x frames x 256

  Projects every spectrogram frame to 512 features through two fully connected
  layers with ReLU and dropout, then attends over the encoded text with two
  stacked attention blocks (Q = transformed spectrogram, K = V = text).

  NOTE(review): the accompanying notes mention a sinusoidal positional encoding
  after the fully connected layers; none is applied here yet.
  NOTE(review): text must be channels-first; an encoder output shaped
  batch x seq len x embed len presumably needs a permute(0, 2, 1) first.
  '''
  def __init__(self, wav_dim):
    super(PosteriorEncoder, self).__init__()
    self.linear1 = nn.Linear(wav_dim, 256)
    self.relu1 = torch.nn.ReLU()
    self.dropout1 = torch.nn.Dropout()
    self.linear2 = nn.Linear(256, 512)
    self.relu2 = torch.nn.ReLU()
    self.dropout2 = torch.nn.Dropout()
    # The first block keeps the 512-wide features so the second block can use
    # them as queries; only the last block narrows the output to 256.
    self.attn1 = AttentionBlock(512, 4, 1024, output_dim=512)
    self.attn2 = AttentionBlock(512, 4, 1024)

  def forward(self, spectrogram, text):
    lin1 = self.linear1(spectrogram)
    relu1 = self.relu1(lin1)
    drop1 = self.dropout1(relu1)
    lin2 = self.linear2(drop1)
    relu2 = self.relu2(lin2)
    transformed_spec = self.dropout2(relu2)
    attn1 = self.attn1(transformed_spec, text, text)
    attn2 = self.attn2(attn1, text, text)
    return attn2


# Smoke test for the posterior encoder using one real utterance.
# NOTE: despite its name, sampleSpec is element 0 of the dataset item, i.e. the raw waveform.
sampleSpec = rawData[1][0]
# 128 input features per frame, matching the last axis of the mel spectrogram fed in below.
model = PosteriorEncoder(128)
mel_transformer = transforms.MelSpectrogram(sample_rate=22050, n_fft=1024, win_length=512, hop_length=256)
# Swap the last two axes so the 128 mel features end up on the last axis.
input_waveform_tensor = mel_transformer(sampleSpec).permute(0, 2, 1)
# Random stand-in for the encoded text, shaped (1, 512, 32).
input_tensor = torch.randint(0, 2000, (1, 512, 32), dtype=torch.float)
summary(model, input_data=[input_waveform_tensor, input_tensor])



Spectrogram Shape: torch.Size([1, 164, 128])
Text Shape: torch.Size([1, 512, 32])
Transformed Spectrogram Shape: torch.Size([1, 164, 512])
Q shape: torch.Size([1, 164, 512])
K shape: torch.Size([1, 512, 32])
V shape: torch.Size([1, 512, 32])
QK Shape: torch.Size([1, 164, 32])
Y shape: torch.Size([1, 164, 32])
Self Attn shape: torch.Size([1, 164, 512])
Permuted V: torch.Size([1, 32, 512])


RuntimeError: Failed to run torchinfo. See above stack traces for more details. Executed layers up to: [Linear: 1, ReLU: 1, Dropout: 1, Linear: 1, ReLU: 1, Dropout: 1, PosteriorSelfAttentionLayer: 2, Softmax: 3]

## Prior Encoder
Utilizes Glow Blocks, which contain an actnorm layer, an invertible 1x1 convolution layer, and an affine-coupling layer.

In [None]:
class ActNormLayer(torch.nn.Module):
  def __init__(self, data_dim, data_initialized=False):


## Length Predictor
The length of the mel spectrogram is predicted from the encoded text representation. The loss for this module is not propagated back to the text encoder.


In [None]:
class LengthPredictor(torch.nn.Module):
  '''
  Length Predictor
  Input: batch x seq len x text_dim
  Output: batch x seq len (one scalar per sequence position)

  Two-layer feed-forward network (text_dim -> 256 -> 1) with a ReLU in between;
  the trailing singleton feature axis is squeezed away.
  '''
  def __init__(self, text_dim):
    super().__init__()
    self.linear1 = nn.Linear(text_dim, 256)
    self.relu = nn.ReLU()
    self.linear2 = nn.Linear(256, 1)

  def forward(self, X):
    hidden = self.relu(self.linear1(X))
    per_position = self.linear2(hidden)
    # drop the size-1 feature axis: batch x seq len x 1 -> batch x seq len
    return per_position.squeeze(2)

## Decoder Block

In [None]:
class Decoder(torch.nn.Module):
  '''
  Decoder Block (work in progress)

  Currently only constructs a single transformer decoder layer of width 512;
  no forward pass is defined yet.

  Z, X: accepted but not used yet.
  '''
  def __init__(self, Z, X):
    super(Decoder, self).__init__()
    # nhead is a required argument of TransformerDecoderLayer; omitting it raises
    # a TypeError on construction. Four heads matches the head count used by the
    # posterior encoder's attention blocks.
    # NOTE(review): confirm the intended head count, and whether batch_first=True
    # is wanted once a forward pass is added.
    self.deco1 = nn.TransformerDecoderLayer(512, nhead=4)

## Training