# Concatenative Synthesis Notebook

This notebook provides an interface for some simple and approximate concatenative synthesis. It is split into several chunks:

- Corpus Analysis: Analyze a corpus of input sounds.
- Input Matching: Match an input sound to similar sound sources from the corpus.
- Synthesis: Re-synthesize the input using the corpus, with control over the granular parameters.

In [1]:
# first, some imports!
import librosa
import numpy as np
import os
import scipy
import IPython
import signalflow as sf
from scipy import interpolate
from ipycanvas import Canvas
from ipywidgets import Output

# Corpus Analysis

First, some parameters we care about. This is designed to work in my pretty particular directory layout:
```
481-final/
-- code/
   -- ConcatReSynth.ipynb
-- sounds-1/ <- this directory is one we could use for corpus_dir!
   -- file.wav
   -- ...
   -- another_file.wav
```

In [543]:
corpus_dir = "sounds-1" # corpus directory name; resolved against the parent of the current working directory
sr = 44100              # sample rate in Hz used when loading audio and computing spectral features
N = 16                   # blocksize exponent: every analysis segment holds 2^N samples

We load in the corpus and split all the files into an array of $2^N$-sized segments. 

In [544]:
# Number of samples in one analysis segment.
blocksize = int(2**N)
# The corpus directory sits one level above the current working directory.
path = os.path.join(os.getcwd(), "../", corpus_dir)
# os.listdir returns entries in arbitrary order; sorting keeps the mapping from
# corpus segment index to source file stable between runs and platforms.
corpus_wav_paths = [os.path.join(path, p) for p in sorted(os.listdir(path)) if p.endswith(".wav")]

def segment(y, n):
    """Split a 1-D signal into consecutive segments of ``n`` samples each.

    The signal is zero-padded at the end so that its length becomes a
    multiple of ``n``; row ``i`` of the result is ``y[i * n:(i + 1) * n]``.

    Args:
        y: 1-D array-like of samples.
        n: Number of samples per segment.

    Returns:
        Array of shape ``(ceil(len(y) / n), n)``.
    """
    y = np.asarray(y)
    # number of segments, rounding up when there is a partial final segment
    count = len(y) // n + (len(y) % n > 0)
    pad_amt = count * n - len(y)
    # A plain reshape keeps each row contiguous in time. (np.split(y, n)
    # would instead cut the signal into n pieces, so the transposed rows
    # would hold every count-th sample rather than a block of n samples.)
    return np.pad(y, (0, pad_amt)).reshape(count, n)

def load_and_segment(files, blocksize, sr):
    """Load audio files and pool their segments into one 2-D array.

    Each file is loaded with librosa at sample rate ``sr`` and cut into
    ``blocksize``-sample segments with ``segment``; the segments of all files
    are stacked in file order.

    Args:
        files: Iterable of audio file paths.
        blocksize: Number of samples per segment.
        sr: Sample rate passed to ``librosa.load``.

    Returns:
        Array of shape ``(total_segments, blocksize)``; ``(0, blocksize)``
        when ``files`` is empty.
    """
    sounds = [librosa.load(p, sr=sr)[0] for p in files]
    sound_segs = [segment(s, blocksize) for s in sounds]
    if not sound_segs:
        # float32 matches librosa.load's default output dtype
        return np.empty((0, blocksize), dtype=np.float32)
    # Files of different lengths yield different segment counts, so the
    # per-file arrays cannot be stacked into one rectangular 3-D array;
    # concatenating along the segment axis works for any mix of lengths.
    return np.concatenate(sound_segs, axis=0)

# Build the corpus: a 2-D array with one row per blocksize-sample segment,
# pooled across every .wav file found in the corpus directory.
corpus = load_and_segment(corpus_wav_paths, blocksize, sr)

Next, we perform some analysis on the corpus. For each entry in `corpus`, we find the:

- centroid
- bandwidth
- contrast
- flatness

These are used to create a feature matrix `corpus_features`, where, for `n` segments, the shape is `(n, 4)`. 

In [545]:
def compute_features(ys, blocksize):
    """Compute one row of spectral descriptors per segment in ``ys``.

    For every segment the spectral centroid, bandwidth, contrast and flatness
    are computed with an FFT size of ``blocksize`` and each is averaged down
    to a single number. The sample rate comes from the module-level ``sr``.

    Args:
        ys: Batch of segments, one per row (assumed shape
            ``(n, blocksize)`` — TODO confirm against callers).
        blocksize: FFT size handed to librosa as ``n_fft``.

    Returns:
        Array of shape ``(n, 4)`` with columns
        ``[centroid, bandwidth, contrast, flatness]``.
    """
    count = len(ys)

    def _segment_mean(descriptor):
        # average away every axis except the leading (segment) axis and
        # return the result as a column vector
        return np.mean(descriptor, axis=(1, 2)).reshape(count, 1)

    centroid = librosa.feature.spectral_centroid(y=ys, sr=sr, n_fft=blocksize)
    bandwidth = librosa.feature.spectral_bandwidth(y=ys, sr=sr, n_fft=blocksize)
    contrast = librosa.feature.spectral_contrast(y=ys, sr=sr, n_fft=blocksize)
    flatness = librosa.feature.spectral_flatness(y=ys, n_fft=blocksize)

    columns = (
        _segment_mean(centroid),
        _segment_mean(bandwidth),
        _segment_mean(contrast),
        _segment_mean(flatness),
    )
    return np.hstack(columns)

# Feature matrix for the whole corpus: row i describes corpus[i].
corpus_features = compute_features(corpus, blocksize)

This next bit begins our crossing into matching. We want to find a set of similar corpus indices for some input sample. Working on a block of $2^N$ samples, we find its features, then find close-matching entries in `corpus_features`. The indices of these entries are the indices into the corpus to use.

In [546]:
def compute_feature(y, blocksize):
    """Compute the four spectral descriptors of a single segment.

    Each descriptor (centroid, bandwidth, contrast, flatness) is computed
    with an FFT size of ``blocksize`` and averaged over all of its values.
    The sample rate comes from the module-level ``sr``.

    Args:
        y: One segment of audio samples.
        blocksize: FFT size handed to librosa as ``n_fft``.

    Returns:
        Length-4 array ``[centroid, bandwidth, contrast, flatness]``.
    """
    descriptors = (
        librosa.feature.spectral_centroid(y=y, sr=sr, n_fft=blocksize),
        librosa.feature.spectral_bandwidth(y=y, sr=sr, n_fft=blocksize),
        librosa.feature.spectral_contrast(y=y, sr=sr, n_fft=blocksize),
        librosa.feature.spectral_flatness(y=y, n_fft=blocksize),
    )
    # collapse every descriptor to one scalar
    return np.array([np.mean(d) for d in descriptors])

def find_matches(y, blocksize, src):
    """Find the rows of ``src`` whose features lie closest to those of ``y``.

    The L1 distance between the features of ``y`` and every row of ``src`` is
    rescaled to ``[0, 1]``; rows whose rescaled distance falls below a
    threshold just above the minimum are reported as matches.

    Args:
        y: One segment of audio samples.
        blocksize: FFT size used for the feature analysis.
        src: Feature matrix, one row per candidate (same column layout as
            the output of ``compute_feature``).

    Returns:
        The tuple produced by ``np.nonzero``; element ``[0]`` holds the
        indices of the matching rows.
    """
    features = compute_feature(y, blocksize)

    # L1 distance from the target features to every row of src, in one call
    distances = np.linalg.norm(np.asarray(src) - features, ord=1, axis=1)

    # Rescale to [0, 1]. When every distance is equal (for example a corpus
    # with a single row) or the span is not finite, dividing would produce
    # NaNs and an empty match set, so treat all rows as equally close.
    nearest = np.min(distances)
    span = np.max(distances) - nearest
    if span > 0:
        distances = (distances - nearest) / span
    else:
        distances = np.zeros_like(distances)

    # threshold: the minimum nudged slightly toward the mean, plus a small
    # constant so that the closest row always qualifies
    blend = 1e-2
    threshold = (1 - blend) * np.min(distances) + blend * np.mean(distances) + blend
    return np.nonzero(distances < threshold)

# Input Matching

Now that we have an analyzed corpus, we can reconstruct some other input sound with these fragments!

In [643]:
# Path of the target sound to reconstruct from corpus segments (passed to librosa.load as-is).
input_file = "inputs/s1-l5.wav"

Note that this can take a while!

In [644]:
# load input into array of 2^N sized chunks
targ, _ = librosa.load(input_file, sr=sr)
target_segments = segment(targ, blocksize)

# Rescale the target to [-1, 1]. The extrema get their own names so that the
# builtins `max` and `min` are not rebound for the rest of the notebook.
target_max = np.max(target_segments)
target_min = np.min(target_segments)
target_range = target_max - target_min
if target_range > 0:
    target_segments = ((target_segments - target_min) / target_range) * 2 - 1
else:
    # constant input: there is no range to stretch, and dividing would give NaNs
    target_segments = np.zeros_like(target_segments)

# find matches for each chunk, then pick one matching corpus index at random
target_matches = [find_matches(y, blocksize, corpus_features) for y in target_segments]
target_matches = [np.random.choice(m[0]) for m in target_matches]
target_matches = np.array(target_matches).flatten()

# make an array of matching segments and a set of unique segments
reconstruction_unique = corpus[np.unique(target_matches)]
corpus_len = len(corpus)
# each reconstruction entry is the matched segment followed by the next two
# corpus segments, wrapping around at the end of the corpus
rsegs = [
    np.concatenate((corpus[m], corpus[(m + 1) % corpus_len], corpus[(m + 2) % corpus_len]))
    for m in target_matches
]
reconstruction_segments = np.array(rsegs)

Now, we have to match the indices in `target_matches` to sounds in `corpus_wav_paths`...

Or not! We can just load them as buffers. But loading the full files later would be smart.

# Resynthesis

Now we use `signalflow` to synthesize the output.

In [655]:
# Tear down the graph left over from a previous run of this cell before
# creating a new one.
if 'graph' in locals():
    graph.destroy()

# NOTE(review): AudioOut_Dummy(2) is presumably a 2-channel output that is not
# tied to audio hardware, for offline rendering — confirm against the
# signalflow documentation.
graph = sf.AudioGraph(output_device=sf.AudioOut_Dummy(2))
# one signalflow buffer per entry of reconstruction_segments
buffers = [sf.Buffer(b) for b in reconstruction_segments]
# duration of one blocksize-sample segment in seconds, and its reciprocal
s_per_seg = blocksize / sr
seg_per_s = 1 / s_per_seg

Now we use a canvas to let us put in sequence data. This will be used to parameterize the granular synthesis.

In [659]:
# Drawing surface for the parameter editor, plus an output widget that
# captures anything printed or raised inside the mouse callback.
canvas = Canvas(width=800, height=600)
out = Output()

# Each parameter curve is defined by `pts` control points spread evenly over [0, 1].
pts = 10
x = np.linspace(0, 1, pts)
# interpolation kind handed to scipy's interp1d
interp = 'nearest'
# quadratic, linear, nearest are good!

# One editable lane per key. params['arr'][k] holds the control-point values
# and params['fn'][k] the interpolator built from them; every lane starts as
# a ramp from 0 to 1.
keys = ['ratio', 'rate', 'clock', 'noise']
params = {'arr': {}, 'fn': {}}
for k in keys:
    params['arr'][k] = np.linspace(0, 1, pts)
    params['fn'][k] = interpolate.interp1d(x, params['arr'][k], interp)

def draw(canvas, param_dict):
    """Redraw the parameter editor: one horizontal lane per parameter.

    Every lane shows its control points as filled bars rising from the lane's
    bottom edge, the interpolated curve as a row of small outlined
    rectangles, and the parameter name in the lane's top-left corner.

    Args:
        canvas: Drawing surface providing ``width``, ``height``, ``clear``,
            ``fill_rects``, ``stroke_rects`` and ``fill_text``.
        param_dict: Dict with ``'arr'`` (name -> array of control-point
            values) and ``'fn'`` (name -> callable evaluated on [0, 1]).
    """
    canvas.clear()
    arrays = param_dict['arr']
    if not arrays:
        # nothing to draw, and the lane height would divide by zero
        return

    lane_h = canvas.height / len(arrays)

    # control points as bars
    canvas.fill_style = 'white'
    top = 0
    for values in arrays.values():
        values = np.asarray(values)
        bar_w = canvas.width / len(values)
        # Build exactly one x position per bar; np.arange with a float step
        # can produce one element too many because of rounding.
        lefts = np.arange(len(values)) * bar_w
        # negative heights make the bars grow upward from the lane's bottom edge
        canvas.fill_rects(lefts, top + lane_h, bar_w, -values * lane_h)
        top += lane_h

    # interpolated curves as thin outlined markers
    canvas.fill_style = 'red'
    canvas.stroke_style = 'red'
    top = 0
    for fn in param_dict['fn'].values():
        samples = fn(np.linspace(0, 1, 200))
        step = canvas.width / len(samples)
        lefts = np.arange(len(samples)) * step
        canvas.stroke_rects(lefts, top + lane_h + (-samples * lane_h), step, 3)
        top += lane_h

    # Lane labels come from param_dict itself rather than a module-level
    # list, so they always match the lanes that were drawn.
    canvas.font = '12px sans-serif'
    for lane, name in enumerate(arrays):
        canvas.fill_text(name, 0, lane * lane_h + 20)

@out.capture()
def handle_mouse_down(x, y):
    """Set the control point under the mouse and redraw the editor.

    The canvas is treated as ``len(keys)`` stacked lanes with ``pts`` columns
    each. The click's column selects the control point, its lane selects the
    parameter, and its height within the lane becomes the new value
    (1 at the top of the lane, 0 at the bottom).

    Args:
        x: Horizontal click position in canvas pixels.
        y: Vertical click position in canvas pixels.
    """
    # A click exactly on the right or bottom edge maps to an index one past
    # the end; clamp so it lands in the last column / lane instead of raising.
    # np.clip is used rather than the min/max builtins because an earlier
    # cell of this notebook rebinds those names.
    x_idx = int(np.clip(int((x / canvas.width) * pts), 0, pts - 1))
    y_idx = int(np.clip(int((y / canvas.height) * len(keys)), 0, len(keys) - 1))

    lane_h = canvas.height / len(params['arr'])
    lane_top = y_idx * lane_h
    # invert: canvas y grows downward, parameter values grow upward
    pct = 1 - (y - lane_top) / lane_h
    pct = float(np.clip(pct, 0, 1))

    key = keys[y_idx]
    grid = np.linspace(0, 1, pts)
    params['arr'][key][x_idx] = pct
    # rebuild this lane's interpolator from the edited control points
    params['fn'][key] = interpolate.interp1d(grid, params['arr'][key], interp)
    draw(canvas, params)

# Send mouse-down events on the canvas to the editing callback.
canvas.on_mouse_down(handle_mouse_down)
    
# Draw the initial state, show the output widget that captures the callback's
# output, and make the canvas the cell's displayed result.
draw(canvas, params)
display(out)
canvas

Output()

Canvas(height=600, width=800)

In [664]:
# Playback rate for the control-curve buffers; the output buffer length is
# scaled by 1 / rate.
rate = 0.5
# NOTE(review): presumably sf.Buffer(channels, frames) — confirm against the
# signalflow docs. Also note each entry of `buffers` holds three concatenated
# segments while this length allows for one blocksize per entry.
out_buf = sf.Buffer(2, int(len(buffers) * blocksize * (1 / rate)))

# Sample the four drawn curves at one point per (buffer, sample) position and
# clamp them to [0, 1].
xs = np.linspace(0, 1, len(buffers) * blocksize)
ratio_arr = np.clip(params['fn']['ratio'](xs), 0, 1)
rate_arr = np.clip(params['fn']['rate'](xs), 0, 1)
clock_arr = np.clip(params['fn']['clock'](xs), 0, 1)
noise_arr = np.clip(params['fn']['noise'](xs), 0, 1)

# triangle, linear-decay, hanning, rectangular
grain_shape = 'linear-decay'

# can do transformations (quantization etc) to arrays here
# noise_arr *= 1e-2
# The next two lines discard the drawn 'clock' curve and replace it with the
# constant 7/40, which the `* 40 + 1` scaling below turns into a clock rate of 8.
clock_arr *= 0
clock_arr += (8 - 1) / 40

# wrap the control curves in buffers so the graph can play them back
grain_ratio = sf.Buffer(ratio_arr)
grain_rate = sf.Buffer(rate_arr)
grain_clockrate = sf.Buffer(clock_arr)
grain_noise = sf.Buffer(noise_arr)

# noise amount, clamped to [0, 1]; it scales every random deviation below
noise_amp = sf.Clip(sf.BufferPlayer(grain_noise, rate=rate), 0, 1)

# grain trigger rate: curve value scaled to 1..41, jittered by the noise
# amount, driving a regular and a random impulse train that are summed and
# clamped to [0, 1]
clockrate = sf.BufferPlayer(grain_clockrate, rate=rate) * 40 + 1
clockrate += sf.RandomGaussian(0, noise_amp * clockrate / 2)
clock = sf.Clip(sf.Impulse(clockrate) + sf.RandomImpulse(clockrate), 0, 1)

# NOTE(review): from here on `rate` is rebound from the float 0.5 to a
# signalflow node, so the BufferPlayers created for `ratio` and `idx` below
# receive this node as their rate rather than 0.5 — confirm that is intended.
# Assuming sf.Round rounds to the nearest integer, the first line quantizes
# the curve to 0.5 / 1.0 / 1.5 before the 0.25 scaling.
rate = sf.Round(sf.BufferPlayer(grain_rate, rate=rate) * 2) * 0.5 + 0.5
rate *= 0.25
rate += sf.RandomGaussian(0, noise_amp * 1e-5, clock=clock)

# grain length as a multiple of the clock period (0.1 .. 40.1)
ratio = sf.BufferPlayer(grain_ratio, rate=rate) * 40 + 0.1
ratio += sf.RandomGaussian(0, noise_amp * 1e-10, clock=clock)

# grain duration: clock period times the ratio
dur = (1 / clockrate) * ratio

# per-grain random pan whose spread follows the noise amount
pan = sf.RandomGaussian(0, noise_amp, clock=clock)

# one granulator per reconstruction buffer, every grain read from position 0
# (alternative with randomised read positions kept below for reference)
# grains = [sf.Granulator(b, clock=clock, pos=sf.RandomUniform(0, b.duration, clock=clock) * sf.Abs(pan), rate=rate, pan=pan, duration=dur) for b in buffers]
grains = [sf.Granulator(b, clock=clock, pos=0, rate=rate, pan=pan, duration=dur) for b in buffers]

# apply the chosen grain envelope to every granulator
for g in grains:
    g.set_buffer("envelope", sf.EnvelopeBuffer(grain_shape))

# ramp from 0 to len(grains), played back as the crossfade index that selects
# which granulator is heard
grain_sel = np.linspace(0, 1, len(buffers) * blocksize) * len(grains)
idx = sf.BufferPlayer(sf.Buffer(grain_sel), rate=rate)

# attenuate by 6 dB, pass through tanh, then crossfade between granulators by idx
grains = sf.Tanh(sf.ChannelArray(grains) * sf.DecibelsToAmplitude(-6))
output = sf.ChannelCrossfade(grains, idx, 2)
graph.play(output)
graph.render_to_buffer(out_buf)

# NOTE(review): this rebinds `out`, which earlier in the notebook held the
# ipywidgets Output used by the canvas callback.
out = "out.wav"
out_buf.save(out)
IPython.display.Audio(out)

In [665]:
!cp out.wav ../selected-sounds/s1-l5-6.wav