## Data Augmentation

Here we use the librosa library to implement the data augmentation recommended by the Facebook FAIR A Universal Music Translation Network paper. We import music, split it into (by default) 1 second clips, and then randomly shift [0.25, .5] second clips up or down by [-0.5, .5] half-steps. 

This is implemented as a PyTorch Dataset, which can be fed (as shown) into a dataloader.

In [1]:
import librosa as libr
import numpy as np
import torch
import os
import torch.utils.data

In [23]:
class MusicDataset(torch.utils.data.Dataset):
    """Music"""

    def __init__(self, root_dir, sr = 22050, clip_length = 1, range = 0.5):
        """
        Args:
            root_dir (string): Directory with all the music.
            sr (int): Sampling rate (all music will be resampled to this rate by default. Default = 22050)
            clip_length (float): Clip length in seconds
        """
        self.root_dir = root_dir
        self.sr = sr
        self.clip_length = clip_length
        self.range = range
        
        allowed_formats = ['.m4a', '.wav', '.mp3']
        
        data = []
        
        for file in os.listdir(self.root_dir):
            print(file)
            if not any((file.endswith(ext) for ext in allowed_formats)):
                continue
    
            try:
                X, sr = libr.load("{}\{}".format(root_dir, file), self.sr)
                assert(sr == self.sr)
                Y = libr.util.frame(X, self.sr * self.clip_length) # split into 1 second clips
                data.append(Y)
                print("successfully loaded {} {}-second ({} sample) clip(s) from {}".format(Y.shape[1], self.clip_length, self.clip_length * self.sr, file))
            except AssertionError as e:
                print("unable to load {}".format(file))
                
        self.data = np.concatenate(data, axis = 1).T
        
        # to speed this up, maybe something like this, i.e. augment first
        
#         pitch = np.random.random_sample(self.data.shape[1]) - 0.5 # how much to raise/lower by
#         dur = np.random.random_sample(self.data.shape[1]) / 4 + 0.25 # duration of subsample between [0.25, .5]
#         low = min(np.random.random_sample(self.data.shape[1]), 1 - dur) # lower bound
        
#         a = np.round(self.sr * low, 0)
#         b = np.round(self.sr * dur, 0) + a
        
#         clip[:, a : b] = libr.effects.pitch_shift(clip[:, a : b], self.sr, n_steps = pitch) # may modify data matrix, not a huge deal
        
        
    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        pitch = self.range * 2 * (np.random.random_sample() - 0.5) # how much to raise/lower by
        dur = (np.random.random_sample() / 4 + 0.25) * self.clip_length # duration of subsample between [0.25, .5]
        low = min(self.clip_length * np.random.random_sample(), self.clip_length - dur) # lower bound
        
        clip = self.data[idx]
        
        a = int(self.sr * low)
        b = int(self.sr * dur) + a
        
        clip[a : b] = libr.effects.pitch_shift(clip[a : b], self.sr, n_steps = pitch) # may modify data matrix, not a huge deal
        print('hi')
        return torch.Tensor(clip)

## To create a dataset:

Pass it the path to the audio files and optionally some command line arguments. 

Generating the dataset takes a while. This takes so long because it uses librosa which resamples automatically. If you are using a bunch of audio with the same sampling rate, you can use scipy instead. 

In [34]:
d1 = MusicDataset('music_data/Hahn', clip_length = 1)
d2 = MusicDataset('music_data/Milstein', clip_length = 1)
d2[1000]


09 Bach_ Violin Partita #2 In D Mino.m4a
successfully loaded 5517 1-second (22050 sample) clip(s) from 09 Bach_ Violin Partita #2 In D Mino.m4a
12 Bach_ Violin Partita #2 In D Mino.m4a
successfully loaded 46125 1-second (22050 sample) clip(s) from 12 Bach_ Violin Partita #2 In D Mino.m4a
13 Bach_ Sonata #3 In C For Solo Vio.m4a
successfully loaded 12666 1-second (22050 sample) clip(s) from 13 Bach_ Sonata #3 In C For Solo Vio.m4a
15 Bach_ Sonata #3 In C For Solo Vio.m4a
successfully loaded 10157 1-second (22050 sample) clip(s) from 15 Bach_ Sonata #3 In C For Solo Vio.m4a


MemoryError: 

In [33]:
with open('hahn_dataset.pkl', 'wb') as f1:
    pkl.dump(d1, f, pkl.HIGHEST_PROTOCOL)
with open('milstein_dataset.pkl', 'wb') as f2:
    pkl.dump(d2, f, pkl.HIGHEST_PROTOCOL)

MemoryError: 

### Pickle the object

In [30]:
import pickle as pkl
with open('perlman_dataset.pkl', 'wb') as f:
    pkl.dump(d, f, pkl.HIGHEST_PROTOCOL)

In [27]:
libr.output.write_wav('example2.wav', d[1900].numpy(), 22050)
type(d[5])

10142 19298 -0.446739963880348
4376 15172 -0.4925592750895442


torch.Tensor

## DataLoader

PyTorch DataLoaders allow you do load a bunch of data and iterate over it. For example, you can do

```
for minibatch in dataloader:
    train
 ```

In [123]:
dataloader = torch.utils.data.DataLoader(d, batch_size=10, shuffle=True, num_workers=1)

In [20]:
def foo(self, idx):
    pitch = self.range * 2 * (np.random.random_sample() - 0.5) # how much to raise/lower by
    dur = (np.random.random_sample() / 4 + 0.25) * self.clip_length # duration of subsample between [0.25, .5]
    low = min(self.clip_length * np.random.random_sample(), self.clip_length - dur) # lower bound

    clip = self.data[idx]

    a = int(self.sr * low)
    b = int(self.sr * dur) + a
    
    print(a, b, pitch)
    
    clip[a : b] = libr.effects.pitch_shift(clip[a : b], self.sr, n_steps = pitch) # may modify data matrix, not a huge deal

    return torch.Tensor(clip)

In [21]:
MusicDataset.__getitem__ = foo

### Extract a sample

In [22]:
sample.shape

NameError: name 'sample' is not defined

### Write an example to a wav file

In [115]:
libr.output.write_wav('example.wav', sample[3].numpy(), 22050)