### Install `mmk` and its dependencies

In [None]:
# Clone the mmk sources into the current working directory, then install
# the packages listed in the repository's requirements file.
!git clone https://github.com/antoinedaurat/mmk.git
!pip install -r mmk/requirements.txt

### load the api token of your neptune account

In [None]:
from getpass import getpass
# getpass reads the token without echoing it, so it is not left visible in the cell
api_token = getpass('Enter your private Neptune API token: ')

In [None]:
# Move the working directory one level up; relative paths used by later
# cells (e.g. "../gould.h5", "freqnet_test") are resolved from there.
%cd ..

### Imports

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd
plt.rcParams['figure.figsize'] = (20, 6)
import torch
import numpy as np
import os
import sys

# Remove the first two entries of the module search path and print what is left.
# NOTE(review): the entries are dropped blindly by position — presumably to keep
# a local directory from shadowing the `mmk` package; confirm this is still
# needed in your environment before running.
sys.path.pop(0)
sys.path.pop(0)
print(sys.path)

# FreqNet model classes and the dict of pre-defined layer functions
from mmk.models.freqnet import FreqNetModel, layer_funcs
from mmk.kit import get_trainer
from mmk.models.freqnet import FreqNet

# generate function 
from mmk.modules.generate import generate

# data utils
from mmk.data import Database

# debug, interact
from mmk.utils import show, audio

### download some data and load it here

In [None]:
# path of the database file, relative to the current working directory
db_name = "../gould.h5"
# Uncomment to fetch the database from Neptune first.
# NOTE(review): `download_database` is not imported in any cell shown here.
# download_database(api_token, project_name="k-tonal/data", experiment_id="DAT-1", database_name=db_name)
db = Database(db_name)
# last expression of the cell: the notebook displays the database's metadata
db.metadata

# Model's Definition and Training

## FreqNet's Parameters :

### Layer Functions

`FreqNet` is, similar to Wavenet, a model with stacked dilated convolution layers. The layer function of FreqNet allows you to specify different options for the implementation of those stacked dilated convolution layers, which result in models with different properties.

Options are :
- `strict` can be `True` or `False`. It specifies whether each layer is to be defined as an autoregressive function or not. Originally, Wavenet considers the whole network to be such a function. Accordingly, for an input of the length of Wavenet's receptive field, Wavenet outputs a single (future) time-step. Setting `strict` to `True` makes each layer output a (future) time-step, which results in an output of `#of-layers` time-steps for an input with length equal to the receptive field.
- `accum_outputs` adds residual to the layer's definition. Possible values are `[-1, 0, 1]` for residuals aligned to the left, no residuals, and residuals aligned to the right respectively.
- `concat_outputs` also takes `[-1, 0, 1]` as possible values for left-, no and right-concatenation respectively. Since convolution layers output fewer time-steps than they receive, this option concatenates some of the input (left or right) to make the output of the layer have the same length as the input.
- `pad_input` also takes `[-1, 0, 1]` as possible values for left-, None and right-padding of the input.

`layer_funcs` is a `dict` holding pre-defined layer functions. Just take a look at this object to see some possible combinations of these options.


### Other Parameters

- `database` must be the `Database` object holding the data you want to use for training.
- `train_set` must be a `pandas.DataFrame` specifying some subset of the data.
- `gate_c`, `residuals_c` and `skips_c` are the number of channels to use for each of these convolutions.
- `conv_kwargs` is a `dict` of parameters for the `torch.nn.Conv1d` objects.
- `lf` is the layer function. It should be a `partial` packed in a tuple (otherwise it mysteriously disappears from the hyperparameters...)
- `layers` is a tuple of integers. Each integer `k` defines a block of `k` layers that will have `2**k` time-steps in its receptive field.
- `learn_padding` lets you make the input padding learnable whenever you use `pad_input` in the layer function.
- `lr` is the max learning rate. Values between 1e-3 and 1e-4 work in most cases.
- `batch_size` and `sequence_length` let you specify `N` and `S` for batches of shape `(N x S x D)`, i.e. (number of sequences, number of steps per sequence, time-step dimensionality).
- `root_dir`, `name` and `version` specify where to store the model files. The path to all the files will be `root_dir/name/version`. If you don't specify a `version` and `overwrite` is `False` a new folder is added in `root_dir/name`
- `era_duration` is the frequency at which you want to make training checkpoints. `era_duration=50` means you'll have one checkpoint every 50 epochs. 

In [None]:
# Data settings: train on the first piece listed in the database metadata,
# split 80/20, in batches of 64 sequences of 64 time-steps each.
data_config = dict(
    inputs=db.fft.get(db.metadata.iloc[[0, ]]),
    input_dim=db.fft.shape[-1],
    splits=[.8, .2],
    batch_size=64,
    input_seq_length=64,
    to_gpu=True,
)

# Network settings: a single block of log2(16) = 4 layers using the
# pre-defined "residuals_left" layer function.
network_config = dict(
    model_dim=512,
    conv_kwargs=dict(groups=1),
    lf=layer_funcs["residuals_left"],
    layers=(int(np.log2(16)),),
)

# Optimisation settings.
# NOTE(review): presumably forwarded to the optimizer and a one-cycle
# learning-rate schedule — confirm in FreqNet.
optim_config = dict(
    max_lr=5e-4,
    betas=(.9, .9),
    div_factor=3.,
    final_div_factor=1.,
    pct_start=.25,
    cycle_momentum=False,
)

fnet = FreqNet(**data_config, **network_config, **optim_config)

# Everything the trainer writes goes under "freqnet_test".
trainer = get_trainer(
    root_dir="freqnet_test",
    version=None,
    epochs=[1],
    max_epochs=3,
)

trainer.fit(fnet)

In [None]:
# Restore a model from a checkpoint file under the training root_dir
# ("freqnet_test"). The result is not assigned to a name: it is only
# displayed as the cell's output.
FreqNet.load_from_checkpoint("freqnet_test/states/epoch=1.ckpt")

In [None]:
# Display the state dict of the model currently bound to `fnet`.
fnet.state_dict()

### upload your model to neptune

In [None]:
# Upload `fnet` to the Neptune project "k-tonal/model-upload-test" using the
# token entered earlier.
# NOTE(review): `upload_model` is not imported in any cell shown here — import
# it from mmk before running this cell.
upload_model(fnet, api_token, "k-tonal/model-upload-test")

### Either load a checkpoint ...

In [None]:
# The checkpoint folder is "<root_dir><name>/v<version>/".
root_dir = "mmk_test_model/"
name = "mmk_test_model"
version = "0"
# NOTE(review): `None` presumably selects a default epoch — confirm in FreqNet.load
epoch = None

# assemble the folder path, then load the checkpoint stored there
checkpoint_dir = f"{root_dir}{name}/v{version}/"
fnet = FreqNet.load(FreqNet, checkpoint_dir, epoch)

### ... or download a model from neptune

In [None]:
# Which Neptune experiment to fetch the model from.
# NOTE(review): `download_model` is not imported in any cell shown here.
neptune_source = dict(
    project_name="k-tonal/model-upload-test",
    experiment_id="MOD-2",
)
model_path = download_model(api_token, **neptune_source)

# load the downloaded files into a FreqNet instance
epoch = None
fnet = FreqNet.load(FreqNet, model_path, epoch)

## Generate

In [None]:
# number of time-steps used as prompt, and number of time-steps to generate
input_length = 64
n_steps = 512

# pick a random input slice from the trainset
piece = db.metadata.iloc[[0, ]].sample(1)

# Only draw start positions that leave room for a full `input_length` window
# inside the piece; drawing up to `stop` could return a prompt that runs past
# the end of the piece.
# NOTE(review): assumes `stop` is an exclusive end index — confirm in Database.
first_start = piece.start.min()
last_start = max(piece.stop.max() - input_length, first_start)
# randint's upper bound is exclusive, hence the + 1
input_start = np.random.randint(first_start, last_start + 1)

inpt = db.fft[input_start:input_start+input_length]

# run on the GPU when one is available
if torch.cuda.is_available():
    fnet.to("cuda")

generated = generate(fnet.eval(), inpt, n_steps, *fnet.generation_slices())

# listen to and display the generated result
audio(generated.T)
show(generated.T)

### Plot the latent variables (skips) for each layer

In [None]:
from mmk.utils import numcpu, to_torch

# Use the GPU only when one is present (same guard as the generation cell);
# an unconditional .to("cuda") fails on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"
fnet = fnet.to(device).eval()

# take one (input, target) batch from the model's dataloader
inpt, target = next(iter(fnet.dl))

# helper func: swap axes 1 and 2 and make the result contiguous
permute = lambda x: x.transpose(1, 2).contiguous()

# Re-run the forward pass by hand to collect the latents:
# latents[0] is the output of the input module, latents[i] (i >= 1) is the
# `skips` value returned by the i-th layer of the first block.
with torch.no_grad():
    latents = []
    # keep the input on the same device as the model (no-op if already there)
    x = inpt.clone().to(device)
    x = fnet.inpt(x)
    latents += [permute(x.clone())]
    skips = None
    block = fnet.blocks[0]
    for layer in block.block:
        x, skips = block.layer_func(layer, x, skips)
        latents += [permute(skips.clone())]
    # the output module is applied to the skips of the last layer
    y = fnet.outpt(skips)

# From here on only the first example of the batch is shown, transposed for display.
inpt = numcpu(inpt[0].squeeze()).T
plt.figure()
show(inpt, db_scale=True, title="input")
print("input:")
audio(inpt)

latents = [numcpu(h[0].squeeze()).T for h in latents]

for i, h in enumerate(latents):
    plt.figure()
    show(h, db_scale=False, title="latent_" + str(i))


y = numcpu(y[0].squeeze()).T
plt.figure()
show(y, title="output")
print("output:")
audio(y)


# crop the target to the number of columns of the output before showing it
target = numcpu(target[0].squeeze()).T[:, :y.shape[1]]
plt.figure()
show(target, title="target")
print("target:")
audio(target)

In [None]:
# Plot columns 38 and 13 of latents[1]; each selected column is drawn as one line.
plt.plot(latents[1][:, [38,13]])

In [None]:
X = latents[3]

# Size the accumulator from the data instead of hard-coding 512, so the cell
# keeps working when the model is built with a different `model_dim`.
n_rows = X.shape[0]
cov = np.zeros((n_rows, n_rows))

# Sum the covariance matrices computed from each pair of adjacent columns
# (np.cov treats every row as a variable and every column as an observation).
for t in range(X.shape[1]-1):
    cov += np.cov(X[:, [t, t+1]])

show(cov, db_scale=False)
plt.figure()
# for comparison: the covariance computed over all columns at once
show(np.cov(X), db_scale=False)

In [None]:
# Transposed weight matrices of the two linear layers of the input module.
# The original cell read `Wf` from `fcg` and `Wg` from `fcf`, so the plots
# titled "filter"/"gate" showed the other layer's weights.
# NOTE(review): assumes `fcf` / `fcg` are the filter / gate layers, as their
# suffixes suggest — confirm in the input module's definition.
Wf = numcpu(fnet.inpt.fcf.weight.detach().T)
Wg = numcpu(fnet.inpt.fcg.weight.detach().T)

show(Wf, db_scale=False, title="filter components")
plt.figure()
show(Wg, db_scale=False, title="gate components")

In [None]:
# Norm of every column of Wf and of Wg, drawn as two curves on the same axes.
plt.plot(np.linalg.norm(Wf, axis=0))
plt.plot(np.linalg.norm(Wg, axis=0))