In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio
import sys
import numpy as np
import torch.utils.data as data
import librosa

import matplotlib.pyplot as plt
import IPython.display as ipd

from tqdm import tqdm

### Importing the Data

In [4]:
from torchaudio.datasets import SPEECHCOMMANDS


In [5]:
import os

class SubsetSC(SPEECHCOMMANDS):
    def __init__(self, subset: str = None):
        super().__init__("./", download=True)

        def load_list(filename):
            filepath = os.path.join(self._path, filename)
            with open(filepath) as fileobj:
                return [os.path.normpath(os.path.join(self._path, line.strip())) for line in fileobj]

        if subset == "validation":
            self._walker = load_list("validation_list.txt")
        elif subset == "testing":
            self._walker = load_list("testing_list.txt")
        elif subset == "training":
            excludes = load_list("validation_list.txt") + load_list("testing_list.txt")
            excludes = set(excludes)
            self._walker = [w for w in self._walker if w not in excludes]

Load the data locally and create training and testing split of the data.

In [None]:
train_set = SubsetSC("training")
test_set = SubsetSC("testing")

waveform, sample_rate, label, speaker_id, utterance_number = train_set[0]

A data point in the SPEECHCOMMANDS dataset is a tuple made of a waveform (the audio signal), the sample rate, the utterance (label), the ID of the speaker, the number of the utterance.

In [None]:
print("Shape of waveform: {}".format(waveform.size()))
print("Sample rate of waveform: {}".format(sample_rate))

plt.plot(waveform.t().numpy())

### Formatting the Data

Downsample the audio for faster processing with the hope of not losing too much of the classification power.

In [None]:
new_sample_rate = 8000
transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=new_sample_rate)
transformed = transform(waveform)

ipd.Audio(transformed.numpy(), rate=new_sample_rate)

In [None]:
def pad_sequence(batch):
    # Make all tensor in a batch the same length by padding with zeros
    batch = [item.t() for item in batch]
    batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=0.)
    return batch.permute(0, 2, 1)

To turn a list of data point made of audio recordings and utterances into two batched tensors for the model, we implement a collate function which is used by the PyTorch DataLoader that allows us to iterate over a dataset by batches. 

In [None]:
labels = sorted(list(set(datapoint[2] for datapoint in train_set)))

In [None]:
def label_to_index(word):
    # Return the position of the word in labels
    return torch.tensor(labels.index(word))

In [None]:
def collate_fn(batch):

    # A data tuple has the form:
    # waveform, sample_rate, label, speaker_id, utterance_number

    tensors, targets = [], []

    # Gather in lists, and encode labels as indices
    for waveform, _, label, *_ in batch:
        tensors += [waveform]
        targets += [label_to_index(label)]

    # Group the list of tensors into a batched tensor
    tensors = pad_sequence(tensors)
    targets = torch.stack(targets)

    return tensors, targets

In [None]:
batch_size = 256
num_workers = 0
pin_memory = False

In [None]:
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=num_workers,
    pin_memory=pin_memory,
)
test_loader = torch.utils.data.DataLoader(
    test_set,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=collate_fn,
    num_workers=num_workers,
    pin_memory=pin_memory,
)

In [6]:
def load_audio(filename, sample_rate=16000, trim=True, trim_frame_length=2048):
    audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    audio = audio.reshape(-1, 1)

    if trim > 0:
        audio, _ = librosa.effects.trim(audio, frame_length=trim_frame_length)

    return audio


def one_hot_encode(data, channels=256):
    one_hot = np.zeros((data.size, channels), dtype=float)
    one_hot[np.arange(data.size), data.ravel()] = 1

    return one_hot


def one_hot_decode(data, axis=1):
    decoded = np.argmax(data, axis=axis)

    return decoded


def mu_law_encode(audio, quantization_channels=256):
    """
    Quantize waveform amplitudes.
    Reference: https://github.com/vincentherrmann/pytorch-wavenet/blob/master/audio_data.py
    """
    mu = float(quantization_channels - 1)
    quantize_space = np.linspace(-1, 1, quantization_channels)

    quantized = np.sign(audio) * np.log(1 + mu * np.abs(audio)) / np.log(mu + 1)
    quantized = np.digitize(quantized, quantize_space) - 1

    return quantized


def mu_law_decode(output, quantization_channels=256):
    """
    Recovers waveform from quantized values.
    Reference: https://github.com/vincentherrmann/pytorch-wavenet/blob/master/audio_data.py
    """
    mu = float(quantization_channels - 1)

    expanded = (output / quantization_channels) * 2. - 1
    waveform = np.sign(expanded) * (
                   np.exp(np.abs(expanded) * np.log(mu + 1)) - 1
               ) / mu

    return waveform


class Dataset(data.Dataset):
    def __init__(self, data_dir, sample_rate=16000, in_channels=256, trim=True):
        super(Dataset, self).__init__()

        self.in_channels = in_channels
        self.sample_rate = sample_rate
        self.trim = trim

        self.root_path = data_dir
        self.filenames = [x for x in sorted(os.listdir(data_dir))]

    def __getitem__(self, index):
        filepath = os.path.join(self.root_path, self.filenames[index])

        raw_audio = load_audio(filepath, self.sample_rate, self.trim)

        encoded_audio = mu_law_encode(raw_audio, self.in_channels)
        encoded_audio = one_hot_encode(encoded_audio, self.in_channels)

        return encoded_audio

    def __len__(self):
        return len(self.filenames)


class DataLoader(data.DataLoader):
    def __init__(self, data_dir, receptive_fields,
                 sample_size=0, sample_rate=16000, in_channels=256,
                 batch_size=1, shuffle=True):
        """
        DataLoader for WaveNet
        :param data_dir:
        :param receptive_fields: integer. size(length) of receptive fields
        :param sample_size: integer. number of timesteps to train at once.
                            sample size has to be bigger than receptive fields.
                            |-- receptive field --|---------------------|
                            |------- samples -------------------|
                            |---------------------|-- outputs --|
        :param sample_rate: sound sampling rates
        :param in_channels: number of input channels
        :param batch_size:
        :param shuffle:
        """
        dataset = Dataset(data_dir, sample_rate, in_channels)

        super(DataLoader, self).__init__(dataset, batch_size, shuffle)

        if sample_size <= receptive_fields:
            raise Exception("sample_size has to be bigger than receptive_fields")

        self.sample_size = sample_size
        self.receptive_fields = receptive_fields

        self.collate_fn = self._collate_fn

    def calc_sample_size(self, audio):
        return self.sample_size if len(audio[0]) >= self.sample_size\
                                else len(audio[0])

    @staticmethod
    def _variable(data):
        tensor = torch.from_numpy(data).float()

        if torch.cuda.is_available():
            return torch.autograd.Variable(tensor.cuda())
        else:
            return torch.autograd.Variable(tensor)

    def _collate_fn(self, audio):
        audio = np.pad(audio, [[0, 0], [self.receptive_fields, 0], [0, 0]], 'constant')

        if self.sample_size:
            sample_size = self.calc_sample_size(audio)

            while sample_size > self.receptive_fields:
                inputs = audio[:, :sample_size, :]
                targets = audio[:, self.receptive_fields:sample_size, :]

                yield self._variable(inputs),\
                      self._variable(one_hot_decode(targets, 2))

                audio = audio[:, sample_size-self.receptive_fields:, :]
                sample_size = self.calc_sample_size(audio)
        else:
            targets = audio[:, self.receptive_fields:, :]
            return self._variable(audio),\
                   self._variable(one_hot_decode(targets, 2))

In [21]:
class ResLayer(nn.Module):
    def __init__(self, resChannels, skipChannels, dilation):
        super(ResLayer, self).__init__()
        self.dconv = nn.Conv1d(resChannels, resChannels,
                                    kernel_size=2, stride=1,
                                    dilation=dilation,
                                    padding=0, 
                                    bias=False)
        self.resConv = nn.Conv1d(resChannels, resChannels, 1)
        self.skipConv = nn.Conv1d(resChannels, skipChannels, 1)

    def forward(self, x, skipSize):
        out = self.dconv(x)
        tanh = nn.Tanh()
        tan = tanh(out)
        sigmoid = nn.Sigmoid()
        sig = sigmoid(out)
        gated = tan * sig
        output = self.resConv(gated)
        skipOutput = self.skipConv(gated)
        skipOutput = skipOutput[:, :, -skipSize:]
        return output, skipOutput
        

In [31]:
class WaveNetModel(nn.Module):
    def __init__(self, layers, layerSize, resChannels, skipChannels):
        super(WaveNetModel, self).__init__()
        self.resChannels = resChannels
        self.skipChannels = skipChannels
        dilations = []
        #build dilation numbers
        [[dilations.append(2**x) for x in range(layerSize)] for i in range(layers)]
        #build residual layers
        self.resLayers = []
        [self.resLayers.append(ResLayer(resChannels, skipChannels, d)) for d in dilations]
        #calculate receptive fields
        l = []
        for i in range(layerSize):
            l.append(2**i)
        l = l * layers
        self.recFields = int(np.sum(l))


    def forward(self, x):
        output = x.transpose(1, 2)
        size = int(output.size(2)) - self.recFields
        #Causal convolution
        causal = nn.Conv1d(self.skipChannels, self.resChannels, kernel_size=2, stride=1, padding=1, bias=False)
        output = causal(output)
        output = output[:, :, :-1]
        #residual layers
        skip = []
        for layer in self.resLayers:
            output, skipOutput = layer(output, size)
            skip.append(skipOutput)
        output = torch.stack(skip)
        output = torch.sum(output, dim=0)
        #skip connections blocks
        relu = nn.ReLU()
        output = relu(output)
        conv1 = nn.Conv1d(self.skipChannels, self.skipChannels, 1)
        output = conv1(output)
        output = relu(output)
        conv2 = nn.Conv1d(self.skipChannels, self.skipChannels, 1)
        output = conv2(output)
        softmax = nn.Softmax(dim=1)
        output = softmax(output)
        return output.transpose(1, 2).contiguous()

In [33]:
def trainModel(model, lr, inputs, targets):
    outputs = model.forward(inputs)
    loss = nn.CrossEntropyLoss()
    model_loss = loss(outputs.view(-1, model.skipChannels), targets.long().view(-1))
    optimizer = optim.Adam(model.parameters(), lr=lr)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.data[0]

def batch(dataL):
    while True:
        for dataset in dataL:
            for inputs, targets in dataset:
                yield inputs, targets

@staticmethod
def get_model_path(model_dir, step=0):
    basename = 'WaveNet'
    if step:
        return os.path.join(model_dir, '{0}_{1}.pkl'.format(basename, step))
    else:
        return os.path.join(model_dir, '{0}.pkl'.format(basename))


def train(model, lr, numSteps, dataL, saveDir):
    steps = 0
    for inputs, targets in batch(dataL):
        loss = trainModel(model, lr, inputs, targets)
        steps += 1
        if steps > num_steps:
            break
    path = get_model_path(saveDir, 0)
    torch.save(model.state_dict(), path)
    


In [34]:
wavenet = WaveNetModel(layers=5, layerSize=10, resChannels=512, skipChannels=256)
sampleSize = 100000
sampleRate = 16000
loader = DataLoader("./Files", wavenet.recFields, sampleSize, sampleRate, wavenet.skipChannels)

train(wavenet, lr=0.002, numSteps=100000, dataL=loader, saveDir = "./outputs")

ValueError: optimizer got an empty parameter list