In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 1

import torch
import torch.nn.functional as F

import numpy as np
from scipy import interpolate
import librosa
from soundfile import write
import music21

import random
from copy import deepcopy

import math
from itertools import permutations
import csv
from fractions import Fraction

from performer.models.ddsp_module import DDSP
from performer.datamodules.components.ddsp_dataset import DDSPDataset

from IPython.display import Audio, Image
from matplotlib import pyplot as plt

In [None]:
from easing_functions import *

In [None]:
# Path to the DDSP checkpoint that the next cell loads (relative to the notebook's working directory).
flt_ckpt = '../checkpoints/flute_longrun.ckpt'

In [None]:
# Restore the DDSP model from the checkpoint, place it on the GPU and switch it to eval mode.
with torch.inference_mode():
    model = DDSP.load_from_checkpoint(flt_ckpt, map_location='cuda').to('cuda')
    model.eval()

## Another mess

In [None]:
%aimport performer.composition.score

In [None]:
# Build an Envelope and plot it over t in [-1, 4].
# NOTE(review): the meaning of the four constructor arguments is defined in
# performer.composition.score — confirm there.
expo = performer.composition.score.Envelope(0.1, 3.5, 0., 0.8)
t = np.linspace(-1, 4, 1250)  # 1250 points over 5 units of t (about 250 per unit)
plt.plot(t, expo(t))

In [None]:
# Build eight notes whose first argument advances by `gap` from `start`; the last
# Note argument is 110 * (i + 3), i.e. 330, 440, ..., 1100.
# NOTE(review): presumably (start time, duration, amplitude, frequency in Hz) — confirm
# against performer.composition.score.Note.
notes = performer.composition.score.NoteList()
start, duration, gap = 1.0, 0.2, 0.2
for i in range(8):
    notes.append(performer.composition.score.Note(start + gap * i, duration, 0.6, 110. * (i+3)))
t, env, f0 = notes.render()

# Affine map of the rendered envelope: 0 -> -100 and 1 -> -10.
# NOTE(review): presumably env lies in [0, 1] and the result is loudness in dB — confirm.
adsr = env * 90 - 100

# Both curves are passed to the model as float32 CUDA tensors with two leading singleton axes.
with torch.inference_mode():
    y = model(torch.from_numpy(f0[None, None, :].astype('float32')).cuda(), torch.from_numpy(adsr[None, None, :].astype('float32')).cuda())

plt.plot(t, env)
plt.show()

Audio(y.cpu().squeeze(), rate=48000, normalize=False)

## Examine real loudness and f0 envelopes

In [None]:
# Recording analysed in this section.
# NOTE(review): absolute, machine-specific path — the cells below only run where this file exists.
sample_path = "/home/kureta/Music/Flute Samples/02. Fantasia No. 1 in A Major, TWV 40_2.wav"

In [None]:
from performer.utils.features import Loudness, get_f0
from performer.utils.constants import N_FFT
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Loudness feature extractor; its get_amp method is applied to the audio below.
ld = Loudness()

In [None]:
# Load the first 15 s of the sample at 48 kHz (channels kept separate), add a leading
# axis, and pad half an FFT window on each side of the last axis.
audio, _ = librosa.load(sample_path, sr=48000, mono=False, dtype='float32', duration=15.)
half_window = N_FFT // 2
audio = F.pad(torch.from_numpy(audio).unsqueeze(0), (half_window, half_window))

In [None]:
# Extract the two control signals from the padded audio.
# NOTE(review): fmin is presumably a lower frequency bound in Hz — confirm against get_f0.
loudness = ld.get_amp(audio)
f0 = get_f0(audio, fmin=31.7)

In [None]:
# Move both control curves to NumPy for plotting.
amp = loudness.squeeze().cpu().numpy()
freq = f0.squeeze().cpu().numpy()
# Time axis in seconds: frame index divided by 250.
t = np.arange(len(amp)) / 250

# The same three series gathered by name (not referenced again in the visible cells).
source = {
    'time': t, 
    'f0': freq,
    'amp': amp,
}

In [None]:
# Plot F0 and loudness against time on a shared x-axis with two independent y-axes.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# One trace per control signal: F0 on the primary axis, loudness on the secondary axis.
f0_trace = go.Scatter(x=t, y=freq, name="F0 (Hz)")
loudness_trace = go.Scatter(x=t, y=amp, name="Loudness (dB)")
fig.add_trace(f0_trace, secondary_y=False)
fig.add_trace(loudness_trace, secondary_y=True)

# Titles for the figure and for each axis.
fig.update_layout(title_text="Flute control parameters")
fig.update_xaxes(title_text="time (seconds)")
fig.update_yaxes(title_text="F0 (Hz)", secondary_y=False)
fig.update_yaxes(title_text="Loudness (dB)", secondary_y=True)

fig.show()

In [None]:
# Play index 0 of the squeezed audio tensor.
# NOTE(review): presumably the first channel of a multi-channel file — confirm; for a mono
# file the squeeze would leave a 1-D tensor and [0] would select a single sample.
Audio(audio.squeeze().cpu()[0], rate=48000)

In [None]:
# Resynthesise audio from the f0 and loudness curves extracted from the recording.
with torch.inference_mode():
    y = model(f0, loudness)
Audio(y.cpu().squeeze(), rate=48000, normalize=False)

# Lilypond event stream parser

## Plan

### Loudness envelope

- Set default values for `attack_time`, `decay_time`, `release_time`.
  - If duration is less than the sum of those:
    - If duration is also less than `attack_time` + `decay_time`, set both to half the duration.
    - Else, keep `attack_time` and extend `decay_time` to the end if necessary.
  - Extend duration with a sustain as necessary, if all of the above fits into the duration
- Set `peak_amp` to 1.0 and set a default `sustain_amp`
  - Multiply the resulting envelope by a constant if there are any dynamic indicators (e.g. ___pp___, ___f___, ...).
  - If there are no dynamic indicators at the start of the piece, assume ___mf___.
  - If there are hairpins, multiply with a line from starting dynamic to ending dynamic
  - ___sfz___ modifies `peak_amp` / `sustain_amp` ratio.
  - Decide what to do with other dynamic indicators when they come up.
  - Finally, map the envelope values from 0-1 to `min_db`-`max_db`.
- If there is a slur or tie, total duration of the envelope will be equal to the duration of the slur.
  - Maybe add tiny attack/decays if note changes inside the slur.

__Note__: not enough information in events to support phrasing slurs (slurs within slurs)

In [None]:
# Event-stream file read by `parser` below (tab-separated rows: moment, event kind, values...).
a_f1 = './CanisMajoris2022-Flute I.notes'

In [None]:
def whole_note_sec(tempo):
    """Return the length of a whole note in seconds, computed as 60 * 16 / tempo.

    NOTE(review): the factor 16 implies `tempo` is not plain quarter-note BPM
    (which would use 4) — confirm the unit against the tempo rows of the event file.
    """
    numerator = 60 * 16
    return numerator / tempo


def moment_to_sec(moment, tempo):
    """Convert `moment`, a position or length in whole-note units, to seconds."""
    seconds_per_whole = whole_note_sec(tempo)
    return seconds_per_whole * moment

In [None]:
def midi_to_hz(midi):
    """Convert a MIDI note number to frequency in Hz (A4 = MIDI 69 = 440 Hz).

    Uses plain arithmetic only, so `midi` may be a Python number, a NumPy array
    or a torch tensor; the result has the matching kind.
    """
    return 440. * 2 ** ((midi - 69) / 12)


def hz_to_midi(hz):
    """Convert a frequency in Hz to a (fractional) MIDI note number.

    Accepts a Python number, NumPy array or torch tensor and always returns a
    torch tensor. The input is converted with `torch.as_tensor` first because
    `torch.log2` rejects plain Python numbers.
    """
    hz = torch.as_tensor(hz)
    return 12 * torch.log2(hz / 440) + 69


def ratio_to_interval(ratio):
    """Convert a frequency ratio to an interval in semitones (2 -> 12).

    Accepts a Python number, NumPy array or torch tensor and always returns a
    torch tensor.
    """
    ratio = torch.as_tensor(ratio)
    # Integer inputs are promoted explicitly so log2 always operates on floats.
    if not (ratio.is_floating_point() or ratio.is_complex()):
        ratio = ratio.to(torch.get_default_dtype())
    return 12 * torch.log2(ratio)

In [None]:
def map_from_unit(value, low, high):
    """Linearly map `value` from the unit interval onto [low, high] (0 -> low, 1 -> high)."""
    return value * (high - low) + low

In [None]:
def adsr(ta, td, tr, zero, peak, sustain, dur, fps=250, device='cuda'):
    """Build a piecewise-linear attack/decay/sustain/release envelope.

    Args:
        ta, td, tr: attack, decay and release times in seconds.
        zero: value at the start of the attack and at the end of the release.
        peak: value reached at the end of the attack.
        sustain: value held between the decay and the release.
        dur: total duration in seconds; the sustain fills whatever remains
            after attack, decay and release.
        fps: envelope frames per second (default 250).
        device: device the result is moved to (default 'cuda').

    Returns:
        A 1-D tensor on `device`. Each segment's length is rounded to whole
        frames independently.

    Raises:
        ValueError: if `dur` is too short to fit attack, decay and release.
    """
    ts = dur - ta - td - tr
    n_sustain = round(ts * fps)
    if n_sustain < 0:
        raise ValueError(
            f'duration {dur} is shorter than attack + decay + release ({ta + td + tr})'
        )

    segments = [
        torch.linspace(zero, peak, round(ta * fps)),
        torch.linspace(peak, sustain, round(td * fps)),
        torch.ones(n_sustain) * sustain,
        torch.linspace(sustain, zero, round(tr * fps)),
    ]

    return torch.cat(segments).to(device)

In [None]:
def to_float(val: float | str):
    if isinstance(val, str):
        return float(val)
    else:
        return val

class Event:
    """Base class for one row of the event stream.

    `row[0]` is the event's moment (its position in whole-note units). `tempo`
    is the value used to convert moments to seconds; it stays None until a
    subclass or caller provides one.
    """

    def __init__(self, row):
        self.moment = to_float(row[0])
        self.tempo = None

    @property
    def time(self) -> float:
        """Start of the event in seconds; requires `tempo` to be set."""
        return moment_to_sec(self.moment, self.tempo)

    def _seconds_repr(self, moment) -> str:
        """Format `moment` as seconds with two decimals, or 'n/a' while no tempo is known."""
        if self.tempo is None:
            return 'n/a'
        return f'{moment_to_sec(moment, self.tempo):.2f}'

    def __repr__(self):
        # Events created before any tempo row carry tempo=None; show 'n/a' instead
        # of failing on the float format / tempo division.
        tempo = 'n/a' if self.tempo is None else f'{self.tempo:.2f}'
        return f'<{self.__class__.__name__.upper()}>\ttime: {self._seconds_repr(self.moment)} tempo: {tempo}'


class Tempo(Event):
    """Tempo change; the tempo value is read from `row[2]`."""

    def __init__(self, row):
        super().__init__(row)
        self.tempo = to_float(row[2])


class NoteOrRest(Event):
    """Common base of timed events.

    Subclasses must set `dur_moment` (duration in whole-note units); `tempo` is
    the tempo in effect when the event was read and may be None.
    """

    def __init__(self, row, tempo):
        super().__init__(row)
        self.tempo = tempo

    @property
    def dur(self):
        """Duration in seconds; requires `tempo` to be set."""
        return moment_to_sec(self.dur_moment, self.tempo)

    def __repr__(self):
        parent_repr = super().__repr__()
        return f'{parent_repr} duration: {self._seconds_repr(self.dur_moment)}'


class Note(NoteOrRest):
    """Sounding note: pitch from `row[2]`, duration moment from `row[4]`."""

    def __init__(self, row, tempo):
        super().__init__(row, tempo)
        self.pitch = to_float(row[2])
        self.dur_moment = to_float(row[4])

    def __repr__(self):
        parent_repr = super().__repr__()
        return f'{parent_repr} pitch: {self.pitch:.2f}'


class Rest(NoteOrRest):
    """Rest: duration moment from `row[3]`."""

    def __init__(self, row, tempo):
        super().__init__(row, tempo)
        self.dur_moment = to_float(row[3])

## Implementation details

- We need 3 objects: pitch, gate, amp


In [None]:
def parser(path: str):
    with open(path) as csvfile:        
        current_tempo = None
        
        for row in csv.reader(csvfile, delimiter='\t'):
            match row[1]:
                case 'tempo':
                    tempo = Tempo(row)
                    current_tempo = tempo.tempo
                    yield tempo
                case 'note':
                    yield Note(row, current_tempo)
                case 'rest':
                    yield Rest(row, current_tempo)
                case default:
                    yield f'<NA>\ttime: {moment_to_sec(to_float(row[0]), current_tempo):.2f} kind: {row[1]} values: {" - ".join(row[2:])}'

In [None]:
# Collect breakpoints of piecewise-linear pitch and amplitude curves: every note or
# rest contributes one point at its start and one just before its end.
events = []  # not filled in this cell
pitch_vals = []
amp_vals = []
t_vals = []
gates = []  # not filled in this cell
for event in parser(a_f1):
    print(event)
    if isinstance(event, Tempo):
        # Nothing to do yet
        continue
    if isinstance(event, Note):
        point_pitch, point_amp = event.pitch, 0.7
    elif isinstance(event, Rest):
        # A rest holds the previous pitch at zero amplitude. Before the first note
        # there is no previous pitch, so a placeholder is stored and filled in below.
        point_pitch, point_amp = (pitch_vals[-1] if pitch_vals else None), 0.0
    else:
        # Unhandled event kinds (yielded as strings) carry no pitch or amplitude.
        continue
    for point_t in (event.time, event.time + event.dur - 1e-10):
        pitch_vals.append(point_pitch)
        amp_vals.append(point_amp)
        t_vals.append(point_t)

# Leading rests take the pitch of the first note (they are silent anyway).
first_pitch = next((p for p in pitch_vals if p is not None), None)
if first_pitch is None:
    raise ValueError(f'no note events found in {a_f1!r}')
pitch_vals = [first_pitch if p is None else p for p in pitch_vals]

pitch = np.array(pitch_vals, dtype='float32')
amp = np.array(amp_vals, dtype='float32')
# NOTE(review): float32 cannot resolve the 1e-10 offset for typical times, so an
# event's end and the next event's start collapse onto the same x value.
t = np.array(t_vals, dtype='float32')

interp_pitch = interpolate.interp1d(t, pitch)
interp_amp = interpolate.interp1d(t, amp)

# 250 samples per second over the covered span (not over t[-1] alone, which would
# over-sample whenever the first event does not start at 0).
n_frames = int(round(float(t[-1] - t[0]) * 250))
t_new = np.linspace(t[0], t[-1], n_frames, dtype='float32')

pitch_new = interp_pitch(t_new)
amp_new = interp_amp(t_new)

In [None]:
# Convert the interpolated curves to model inputs on the GPU: MIDI pitch -> Hz,
# and unit amplitude -> the range [-100, -15].
freq = torch.from_numpy(midi_to_hz(pitch_new)).cuda()
loudness = torch.from_numpy(map_from_unit(amp_new, -100, -15)).cuda()

In [None]:
# Sanity check: value ranges of the mapped loudness and of the unit amplitude curve.
loudness.min(), loudness.max(), amp_new.min(), amp_new.max()

In [None]:
# Synthesise audio from the two curves, adding two leading singleton axes for the model.
with torch.inference_mode():
    y = model(freq[None, None, :], loudness[None, None, :]).squeeze()

In [None]:
# Play the result at 48 kHz; unlike the earlier players this one normalises the signal.
Audio(data=y.cpu(), rate=48000, normalize=True)

In [None]:
# Plot the loudness curve that was fed to the model (x-axis is the frame index).
plt.plot(loudness.cpu())

# Old method

In [None]:
# Point music21 at the MuseScore executable it uses for PNG output.
# NOTE(review): absolute, machine-specific path — adjust on other systems.
us = music21.environment.UserSettings()
us['musescoreDirectPNGPath'] = '/usr/bin/musescore'

In [None]:
# NOTE: these three helpers are also defined in the "Lilypond event stream parser"
# section; this later definition is the one in effect for the cells below.
def midi_to_hz(midi):
    """Convert a MIDI note number to frequency in Hz (A4 = MIDI 69 = 440 Hz).

    Uses plain arithmetic only, so `midi` may be a Python number, a NumPy array
    or a torch tensor; the result has the matching kind.
    """
    return 440. * 2 ** ((midi - 69) / 12)


def hz_to_midi(hz):
    """Convert a frequency in Hz to a (fractional) MIDI note number.

    Accepts a Python number, NumPy array or torch tensor and always returns a
    torch tensor. The input is converted with `torch.as_tensor` first because
    `torch.log2` rejects plain Python numbers.
    """
    hz = torch.as_tensor(hz)
    return 12 * torch.log2(hz / 440) + 69


def ratio_to_interval(ratio):
    """Convert a frequency ratio to an interval in semitones (2 -> 12).

    Accepts a Python number, NumPy array or torch tensor and always returns a
    torch tensor.
    """
    ratio = torch.as_tensor(ratio)
    # Integer inputs are promoted explicitly so log2 always operates on floats.
    if not (ratio.is_floating_point() or ratio.is_complex()):
        ratio = ratio.to(torch.get_default_dtype())
    return 12 * torch.log2(ratio)

In [None]:
# NOTE: `adsr` is also defined in the "Lilypond event stream parser" section; this
# later definition is the one in effect for the cells below.
def adsr(ta, td, tr, zero, peak, sustain, dur, fps=250, device='cuda'):
    """Build a piecewise-linear attack/decay/sustain/release envelope.

    Args:
        ta, td, tr: attack, decay and release times in seconds.
        zero: value at the start of the attack and at the end of the release.
        peak: value reached at the end of the attack.
        sustain: value held between the decay and the release.
        dur: total duration in seconds; the sustain fills whatever remains
            after attack, decay and release.
        fps: envelope frames per second (default 250).
        device: device the result is moved to (default 'cuda').

    Returns:
        A 1-D tensor on `device`. Each segment's length is rounded to whole
        frames independently.

    Raises:
        ValueError: if `dur` is too short to fit attack, decay and release.
    """
    ts = dur - ta - td - tr
    n_sustain = round(ts * fps)
    if n_sustain < 0:
        raise ValueError(
            f'duration {dur} is shorter than attack + decay + release ({ta + td + tr})'
        )

    segments = [
        torch.linspace(zero, peak, round(ta * fps)),
        torch.linspace(peak, sustain, round(td * fps)),
        torch.ones(n_sustain) * sustain,
        torch.linspace(sustain, zero, round(tr * fps)),
    ]

    return torch.cat(segments).to(device)

In [None]:
def _sine_frames(n_frames, f, fps, device):
    """Return sin(2*pi*f*t) sampled at `n_frames` consecutive frames, `fps` frames per second."""
    t = torch.arange(n_frames, dtype=torch.float32, device=device) / fps
    return torch.sin(2 * np.pi * f * t)


def sin(ts: float, f: float, fps=250, device='cuda'):
    """Sine wave of frequency `f` (Hz) lasting `ts` seconds.

    The frame count is int(ts * fps), i.e. truncated rather than rounded.
    `fps` defaults to 250 and `device` to 'cuda'.
    """
    return _sine_frames(int(ts * fps), f, fps, device)


def sin_like(ts: torch.Tensor, f: float, fps=250, device='cuda'):
    """Sine wave of frequency `f` (Hz) with as many frames as the last axis of `ts`.

    Only the shape of `ts` is used; the result is created on `device`
    (default 'cuda') regardless of where `ts` lives.
    """
    return _sine_frames(ts.shape[-1], f, fps, device)

In [None]:
def show(music):
    """Write `music` out via its `write("lily.png")` method and display the resulting image inline."""
    png_path = music.write("lily.png")
    display(Image(str(png_path)))

In [None]:
def add_microtone(note):
    """Annotate `note` with its microtonal deviation in cents.

    Reads `note.pitch.microtone.cents`; deviations smaller than 10 cents in
    magnitude are ignored. Otherwise the value, rounded to an integer and
    prefixed with '+' when positive, is added as a raw lyric.
    """
    cents = note.pitch.microtone.cents
    if not abs(cents) >= 10:
        return
    sign = '+' if cents > 0 else ''
    note.addLyric(f'{sign}{int(np.round(cents))}', applyRaw=True)

In [None]:
# Seeded random generator (not used in the visible cells).
rand = random.Random(123)
beat = 0.75  # 1 beat is 0.75 seconds
fps = 250  # frames per second used for the envelope and f0 tensors built below

In [None]:
def build_measure(p1, p2):
    """Build one music21 measure holding a slurred two-note figure followed by rests.

    The measure contains, in order: an accented note at frequency `p1` and a
    staccato note at frequency `p2` (each a third of a quarter note, joined by
    a slur), a rest of the same length, and a four-quarter rest. Both notes get
    a cents lyric via `add_microtone`. The measure number is always 1.
    """
    third = 1/3

    first = music21.note.Note(quarterLength=third)
    first.articulations.append(music21.articulations.Accent())
    first.pitch.frequency = p1
    add_microtone(first)

    second = music21.note.Note(quarterLength=third)
    second.pitch.frequency = p2
    second.articulations.append(music21.articulations.Staccato())
    add_microtone(second)

    slur = music21.spanner.Slur([first, second])

    short_rest = music21.note.Rest(third)
    long_rest = music21.note.Rest(4)

    measure = music21.stream.Measure(number=1)

    # A leading 'sfz' dynamic (music21.dynamics.Dynamic('sfz')) is currently disabled.
    for element in (first, second, slur, short_rest, long_rest):
        measure.append(element)

    return measure

In [None]:
# Frequency ratios; the cells below convert them to semitone intervals and select them by index.
constant = [2, 3, 5, 7, 11/2, 13/2, 17/4, 19/4]

In [None]:
# Notate every ratio in `constant`: the ratio is converted to semitones, offset by 52
# (so a ratio of 1 would land on MIDI 52), converted to Hz and assigned to a note.
mm = music21.stream.Measure()
for val in constant:
    nn = music21.note.Note()
    freq = midi_to_hz(ratio_to_interval(torch.tensor(val)) + 52)
    nn.pitch.frequency = freq
    mm.append(nn)
mm.show()

In [None]:
# Two melodic lines; each entry is a pair of indices into `constant`.
# The same name is assigned again in the score-rendering cell further down.
lines = [
    [(0, 1), (2, 7), (2, 4), (5, 2), (7, 5), (7, 6), (7, 6), (2, 5)],
    [(7, 5), (4, 3), (7, 3), (3, 0), (1, 0), (3, 1), (4, 5), (3, 0)],
]

In [None]:
# Show the second line as pairs of ratios, with each entry of `constant` converted to a Fraction.
cons = [Fraction(c) for c in constant]
[(cons[i], cons[j]) for i, j in lines[1]]    

In [None]:
from torchaudio.functional.filtering import lowpass_biquad

In [None]:
%aimport performer.canis

In [None]:
# The ADSR object gets its own name: binding it to `adsr` would shadow the
# `adsr(...)` envelope function that a later cell still calls.
staccato_adsr = performer.canis.ADSR(0.2, 4.0)
staccato_adsr.set_staccato()
# staccato_adsr.set_sforzando()
env = staccato_adsr.get_envelope_func()
# NOTE(review): 250*5 points over 8.0 s is about 156 points per second, not the
# 250 per second used elsewhere in this notebook — confirm this is intended.
t = np.linspace(0, 8.0, 250*5)

In [None]:
# Plot the envelope function evaluated on the time grid from the previous cell.
plt.plot(t, env(t))

In [None]:
# Synthesise a constant 440 Hz tone shaped by the envelope.
with torch.inference_mode():
    amp = env(t)
    # Affine map of the envelope: 0 -> -110 and 1 -> -20.
    # NOTE(review): presumably loudness in dB — confirm.
    amp = amp * 90 - 100 - 10.
    amp = torch.from_numpy(amp.astype('float32')).cuda()

    f0 = torch.ones_like(amp, device='cuda') * 440.

    y = model(f0[None, None, :], amp[None, None, :])
Audio(y.cpu().squeeze(), rate=48000, normalize=False)

In [None]:
# Five ADSR notes placed back to back: each one starts where the previous one ends.
many = performer.canis.ADSRList()
delta = 0.25  # not used in the visible cells
duration = 1.5
start = 0.5
for idx in range(5):
    many.notes.append(performer.canis.ADSR(start, duration))
    start += duration
env = many.get_envelope_func()

In [None]:
# Synthesise the five-note envelope, scaled by a line over the span of all five notes.
with torch.inference_mode():
    amp = env(t)
    # NOTE(review): presumably a line from 1 down to 0.5 starting at the first note's
    # start and lasting duration * 5 — confirm against performer.canis.get_line.
    dynamics = performer.canis.get_line(many.notes[0].start, duration * 5, 1, 0.5)
    def dynamo(t):
        # Clamp the line to the range [0, 1].
        return np.minimum(1, np.maximum(dynamics(t), 0))
    amp = amp * dynamo(t)
    # Affine map: 0 -> -110 and 1 -> -20.
    amp = amp * 90 - 100 - 10.
    amp = torch.from_numpy(amp.astype('float32')).cuda()

    f0 = torch.ones_like(amp, device='cuda') * 440.

    y = model(f0[None, None, :], amp[None, None, :])
Audio(y.cpu().squeeze(), rate=48000, normalize=False)

## WHAT

In [None]:
# Loudness envelope shared by every measure of the two melodic parts: a 1.5-beat
# ADSR followed by 3.5 beats held at -100, i.e. 5 beats in total.
# NOTE(review): `adsr` must be the seven-argument envelope function here — make sure
# no earlier cell has rebound the name to something else.
amp = adsr(0.1, 0.7, 0.01, -100, -8, -48, 1.5 * beat)
silence = torch.ones(round(3.5 * beat * fps), device='cuda') * -100.
env = torch.cat([amp, silence], dim=-1)


s = music21.stream.Score(id='mainScore')
part0 = music21.stream.Part(id='part0')
part1 = music21.stream.Part(id='part1')


# One rendered waveform per voice; they are summed for playback at the end.
ys = []
parts = [part0, part1]
# Each entry is a pair of indices into `constant` (first note, second note).
lines = [
#     [(0, 1), (2, 4), (7, 4), (5, 2), (7, 5), (7, 6), (7, 6), (7, 6)],
#     [(7, 5), (2, 3), (7, 3), (3, 0), (1, 0), (3, 1), (4, 5), (3, 0)],
    [(0, 1), (2, 7), (2, 4), (5, 2), (7, 5), (7, 6), (7, 6), (2, 5)],
    [(7, 5), (4, 3), (7, 3), (3, 0), (1, 0), (3, 1), (4, 5), (3, 0)],
]
for part, line in zip(parts, lines):
    oll = []
    for idx1, idx2 in line:
        with torch.inference_mode():
            # Ratio -> semitones above MIDI 52 -> Hz, for both notes of the measure.
            p1 = midi_to_hz(ratio_to_interval(torch.tensor(constant[idx1])) + 52)
            p2 = midi_to_hz(ratio_to_interval(torch.tensor(constant[idx2])) + 52)
            mezura = build_measure(p1, p2)
            # if j % 3 == 2:
            #     mezura.append(music21.layout.SystemLayout(isNew=True))
            part.append(mezura)
            
            # f0 curve: p1 for roughly the first third of a beat, p2 for the rest.
            f0 = torch.ones_like(env) * p2
            f0[:int(beat*0.333*fps)] = p1
            y = model(f0[None, None, :], env[None, None, :])
            oll.append(y)

    # Concatenate the part's measures along the last axis into one waveform.
    ys.append(torch.cat(oll, dim=-1).cpu().numpy().squeeze())


tempo = music21.tempo.MetronomeMark(referent=1.0, number=90.0)

# The same tempo mark and a 5/4 time signature go into every part.
for part in parts:
    part.measure(1).insert(tempo)
    part.insert(0, music21.meter.TimeSignature('5/4'))
    s.insert(0, part)

# Third voice (audio only, not added to the score): operator precedence makes the
# argument ones * 51 - 12, i.e. a constant MIDI pitch of 39. f0 takes its length from
# the first `env`, which is only replaced two lines below.
f0 = midi_to_hz(torch.ones_like(env, device='cuda') * 51-12)
amp = adsr(0.1, 0.7, 0.01, -100, -3, -48, 2.5 * beat)
silence = torch.ones(round(2.5 * beat * fps), device='cuda') * -100.
env = torch.cat([amp, silence], dim=-1)
oll = []
# Eight identical renderings, one per measure of the melodic parts.
for _ in range(8):
    with torch.inference_mode():
        y = model(f0[None, None, :], env[None, None, :])  # * (torch.randn(1, device='cuda') * 0.25 + 1))
        oll.append(y)

ys.append(torch.cat(oll, dim=-1).cpu().numpy().squeeze())

s.show()
# Mix the three voices by summing their waveforms.
Audio(data=sum(ys), rate=48000, normalize=True)

In [None]:
# Sanity check: value range of the most recently built envelope.
env.min(), env.max()

In [None]:
# Plot the most recently built envelope (x-axis is the frame index).
plt.plot(env.cpu())

## Braids

In [None]:
# Print four random orderings of the tones 1..3.
# A seeded Generator makes the printed permutations reproducible across kernel restarts.
tones = np.arange(1, 4)
braid_rng = np.random.default_rng(123)
print(tones)
for i in range(4):
    print(braid_rng.permutation(tones))