# Corpus Analysis

This notebook takes in a corpus of audio files & uses `librosa` to analyze various features wrt. time.

In [51]:
import librosa
import numpy as np
import os
import scipy
import IPython
from scipy import interpolate

In [3]:
dirname = "sounds-1"

path = os.path.join(os.getcwd(), "../", dirname)
files = [os.path.join(path, p) for p in os.listdir(path) if p.endswith(".wav")]

In [4]:
print(files)

['/Users/keatonkowal/Documents/School/dxarts/481-final/code/../sounds-1/clear2glitch.wav', '/Users/keatonkowal/Documents/School/dxarts/481-final/code/../sounds-1/overfit.wav', '/Users/keatonkowal/Documents/School/dxarts/481-final/code/../sounds-1/solo2all.wav', '/Users/keatonkowal/Documents/School/dxarts/481-final/code/../sounds-1/moving.wav', '/Users/keatonkowal/Documents/School/dxarts/481-final/code/../sounds-1/who.wav', '/Users/keatonkowal/Documents/School/dxarts/481-final/code/../sounds-1/wavering.wav', '/Users/keatonkowal/Documents/School/dxarts/481-final/code/../sounds-1/crones.wav', '/Users/keatonkowal/Documents/School/dxarts/481-final/code/../sounds-1/rytm.wav', '/Users/keatonkowal/Documents/School/dxarts/481-final/code/../sounds-1/spoken.wav', '/Users/keatonkowal/Documents/School/dxarts/481-final/code/../sounds-1/amen.wav', '/Users/keatonkowal/Documents/School/dxarts/481-final/code/../sounds-1/chant.wav', '/Users/keatonkowal/Documents/School/dxarts/481-final/code/../sounds-1/g

In [5]:
sr = 22050
N = 6
sound_files = [librosa.load(p, sr=sr) for p in files]

In [6]:
def segment(y, n=10):
    n = int(2**n)
    s = len(y) // n + (len(y) % n > 0)
    pad_amt = s * n - len(y)
    y = np.pad(y, (0, pad_amt))
    segs = np.array(np.split(y, n))
    return segs.T

def get_ffts(y, n=10):
    # take divide y into (2**n, )-shape segments
    segs = segment(y, n)
    
    # take fft of each segment
    return np.fft.fft(segs, n=n)

In [7]:
# list of sample's fft series
# axis in samples are segment, fft 
stfts = [get_ffts(y, N) for y, _ in sound_files]

In [8]:
# next - would like to create, for each sample
# - piecewise function (scipy?) mapping the dot product with the unit vector to an index into the segments
#   e.g. a function from a fft's dot product to the unit vector -> closest-match segment
# no wait that but with the cosine distance:
# 1 - uv / (||u||_2 ||v||_2)
# returns value in [0, 1]
# first, find each segment's distance to 1 vector:
def stft_id_distances(y):
    d = lambda u : scipy.spatial.distance.cosine(np.abs(u), np.ones(u.shape))
    return np.apply_along_axis(d, 1, y)

In [9]:
stft_distances = [stft_id_distances(s) for s in stfts]
stft_distances_idxs = [np.argsort(n) for n in stft_distances]

In [10]:
mean_distances = [np.mean(n) for n in stft_distances]
mean_distances_idxs = np.argsort(mean_distances)

Ok! So, to determine maximum similarity within a sample, we take the stft_id_distances on our input, then find the closest matching distance in stft_distances using binary search on stft_distances_idxs. We can then use that index to identify the corresponding time within the source sample.

We now also want to do this at a meta-level. I think I'll take the mean of each sample's stft distances such that I have an (n, ) shape list of mean/sample. I can then do a similar thing to find the most similar mean to our input.

That way we have a sort of index-by-input. Take the input, find the most similar samples. Then, within those samples, find the most similar timestamps.

Parameters like grain size, play rate, etc. should be user-controllable wrt time.

# Reconstruction

In [11]:
corpus_dict = {
    "corpus": stfts,
    "sr": sr,
    "stft_d": (stft_distances, stft_distances_idxs),
    "mean_d": (mean_distances, mean_distances_idxs),
}

In [12]:
def closest_match(corpus_dict, target_sample):
    # try to find closest match to target sample
    # return (sample idx, sample time offset in seconds)
    stft_d, stft_di = corpus_dict["stft_d"]
    mean_d, mean_di = corpus_dict["mean_d"]
    stft_d = np.array(stft_d)
    stft_di = np.array(stft_di)
    mean_d = np.array(mean_d)
    mean_di = np.array(mean_di)

    targ_ft_d = stft_id_distances(get_ffts(target_sample))
    targ_mean_d = np.mean(targ_ft_d)
    
    # first, find the closest matching sample:
    close_sample = np.searchsorted(mean_d[mean_di], targ_mean_d, 'right')
    sample_idx = mean_di[np.clip(close_sample, 0, len(mean_di) - 1)]

    sample_offset = np.searchsorted(stft_d[sample_idx], targ_mean_d)
    samps = len(corpus_dict["corpus"][sample_idx][np.clip(sample_offset, 0, len(corpus_dict["corpus"][sample_idx]) - 1)])
    sample_offset *= samps
    time_offset = sample_offset / corpus_dict["sr"]
    
    return (sample_idx, time_offset)

In [13]:
closest_match(corpus_dict, sound_files[1][0] + np.random.random(sound_files[4][0].shape) * 0.01)

(11, 0.1273469387755102)

See above - this is how we can do a lookup! Next is to do this for several segments of some input sample and return an array of sample, offset pairs. We'll then want to make a signalflow script that can take this array and render it!

In [14]:
test_target, _ = librosa.load("test-target.wav", sr=sr)
target_segments = segment(test_target, N)
matches = [closest_match(corpus_dict, seg) for seg in target_segments]
match_names = [(files[i], p) for i, p in matches]

In [15]:
print(matches)

[(16, 1.0345578231292516), (11, 0.1273469387755102), (1, 0.34476190476190477), (0, 0.2236734693877551), (1, 0.8451700680272108), (1, 0.8451700680272108), (1, 0.0838095238095238), (11, 0.1273469387755102), (1, 0.8663945578231292), (3, 1.4974149659863945), (11, 1.0035374149659864), (11, 0.12761904761904763), (1, 0.8666666666666667), (16, 1.0345578231292516), (16, 1.0345578231292516), (3, 1.497142857142857), (11, 1.0035374149659864), (1, 0.34993197278911564), (11, 1.0035374149659864), (1, 0.8666666666666667), (11, 0.12761904761904763), (0, 0.2236734693877551), (11, 1.0035374149659864), (11, 0.12761904761904763), (1, 0.8666666666666667), (6, 1.4272108843537414), (16, 1.0345578231292516), (15, 1.3194557823129252), (1, 0.8666666666666667), (1, 0.8666666666666667), (7, 0.8397278911564626), (1, 0.8663945578231292), (1, 0.34476190476190477), (1, 0.8590476190476191), (1, 0.8663945578231292), (1, 0.34476190476190477), (1, 0.8666666666666667), (1, 0.8663945578231292), (1, 0.8590476190476191), (1, 

# Resynthesis
Now, using the matches, we re-synthesize the input.

In [16]:
graph.destroy()

NameError: name 'graph' is not defined

In [17]:
from signalflow import *
graph = AudioGraph(output_device=AudioOut_Dummy(2))

In [18]:
buffers = [Buffer(f) for f in files]

In [19]:
targ_buf = Buffer("test-target.wav")
out_buf = Buffer(2, len(targ_buf))

In [20]:
segment_clock = Impulse(1 / (64 / sr))
idxs, times = zip(*matches)
grain_pos = Sequence(times, segment_clock) + WhiteNoise(0, -1e-2, 1e-2)
idx = Sequence(idxs, segment_clock)
grain_clock = RandomImpulse(10)
grain_dur = (1024 / sr) * WhiteNoise(0, 1, 4) * 4
grain_rate = (1 - WhiteNoise(0, -1e-1, 1e-1))
players = [Granulator(buffer=b, clock=grain_clock, pos=grain_pos, duration=grain_dur, rate=grain_rate) for b in buffers]
for g in players:
    g.set_buffer("envelope", EnvelopeBuffer("triangle"))
players = ChannelArray(players)
output = ChannelCrossfade(players, idx, 2)
graph.play(output)
graph.render_to_buffer(out_buf)

In [21]:
output.stop()

In [22]:
out_name = "out.wav"
out_buf.save(out_name)
IPython.display.Audio(out_name)

Pretty cool! Need to get a better interface to define the following parameters WRT time:

- Grain ratio: e.g. grain dur, but as a function of grain rate too.
- Grain playback rate: `rate` parameter in `Granulator`
- Grain clock rate: rate of impulse grain_clock above.
- Grain deviation: amount of noise to each or all parameters

# Canvas Input

Can I make a little canvas thing to control these parameters?

In [35]:
from ipycanvas import Canvas
from ipywidgets import Output

In [66]:
canvas = Canvas(width=800, height=100)
out = Output()

def draw(canvas, point_vals, color="green", clear=True):
    if clear:
        canvas.clear()
    num_points = len(point_vals)
    ws = canvas.width // num_points
    point_idxs = np.arange(num_points)
    hs = point_vals
    xs = point_idxs * ws
    ys = canvas.height - hs
    
    canvas.fill_style = color
    canvas.fill_rects(xs, ys, ws, hs)

ps = np.random.rand(20) * canvas.height
draw(canvas, ps)
mouse_down = True

# @out.capture()
# def handle_mouse_down(x, y):
#     mouse_down = True

# canvas.on_mouse_down(handle_mouse_down)

# @out.capture()
# def handle_mouse_up(x, y):
#     mouse_down = False

# canvas.on_mouse_up(handle_mouse_up)

@out.capture()
def handle_mouse_move(x, y):
    if mouse_down:
        idx = int((x / canvas.width) * len(ps))
        ps[idx] = canvas.height - y
        draw(canvas, ps)
        rg = np.arange(len(ps))
        f = interpolate.interp1d(rg, ps, 'quadratic')
        ys = f(np.arange(0, len(ps) - 1, 0.1))
        draw(canvas, ys, "red", False)

canvas.on_mouse_down(handle_mouse_move)


display(out)
canvas

Output()

Canvas(height=100, width=800)