In [2]:
from scipy import signal
from pydub import AudioSegment
import os
import librosa
import soundfile as sf
import tensorflow as tf
import numpy as np
import pydub
import glob
import random
#from py3fst.audio_load import load_audio_from_files, audio2spec
import IPython.display as ipd
import wave
import copy
from scipy import linalg, fftpack
from numpy.lib.stride_tricks import as_strided
from audio_utilities import reconstruct_signal_griffin_lim

ModuleNotFoundError: No module named 'audio_utilities'

In [3]:
# Sample rate in Hz; used as the playback rate for every ipd.Audio call and by
# generate() to turn a frequency into a delay-line length.
sr = 16000

In [41]:
# TFRecord file that the loading cell below iterates over.
tfrecord_file = './data/_clean_speech_00_0000.tfrecord'

In [42]:
# Decode at most `limit` audio examples from the TFRecord file into `examples`.
# NOTE(review): tf.python_io.tf_record_iterator is a TensorFlow 1.x API.
examples = []
limit = 100
for i, example in enumerate(tf.python_io.tf_record_iterator(tfrecord_file)):
    # Stop *before* decoding record number `limit` so exactly `limit` examples
    # are kept; testing `i > limit` after the append kept `limit + 2` of them.
    if i >= limit:
        break
    eg_np = tf.train.Example.FromString(example)
    # The "audio" feature bytes are interpreted as 16-bit mono PCM at `sr` Hz,
    # the same rate every later cell uses for playback.
    audio_segment = pydub.AudioSegment(
        eg_np.features.feature["audio"].bytes_list.value[0],
        frame_rate=sr,
        sample_width=2,
        channels=1
    )
    y = audio_segment.get_array_of_samples()
    examples.append(y)

In [43]:
def spec2wav(mag, n_fft, win_length, hop_length, num_iters=30, phase=None):
    """
    Get a waveform from the magnitude spectrogram by Griffin-Lim Algorithm.

    Each iteration inverts the current complex spectrogram to a waveform,
    re-analyses that waveform, keeps only the resulting phase and recombines
    it with the given magnitude.

    Parameters
    ----------
    mag : np.ndarray [shape=(1 + n_fft/2, t)]
        Magnitude spectrogram.
    n_fft : int > 0 [scalar]
        FFT window size.
    win_length  : int <= n_fft [scalar]
        The window will be of length `win_length` and then padded
        with zeros to match `n_fft`.
    hop_length : int > 0 [scalar]
        Number of audio samples between STFT columns.
    num_iters: int > 0 [scalar]
        Number of iterations of Griffin-Lim Algorithm.
    phase : np.ndarray [shape=(1 + n_fft/2, t)]
        Initial phase spectrogram in radians. When omitted, a random phase
        drawn uniformly from [-pi, pi) is used.

    Returns
    -------
    wav : np.ndarray [shape=(n,)]
        The real-valued waveform.
    """
    assert (num_iters > 0)
    if phase is None:
        # Start from a phase spread over the whole circle. Scaling rand() by
        # pi alone only ever produced angles in [0, pi).
        phase = np.random.uniform(-np.pi, np.pi, size=mag.shape)
    spec = mag * np.exp(1.j * phase)
    wav = None
    for i in range(num_iters):
        wav = librosa.istft(spec, win_length=win_length, hop_length=hop_length)
        # The last waveform is the result, so skip the re-analysis after it.
        if i != num_iters - 1:
            spec = librosa.stft(wav, n_fft=n_fft, win_length=win_length, hop_length=hop_length)
            # Keep the estimated phase, discard its magnitude in favour of `mag`.
            phase = np.angle(spec)
            spec = mag * np.exp(1.j * phase)
    return wav

In [44]:
def stft(X, fftsize=128, step="half", mean_normalize=True, real=False,
         compute_onesided=True):
    """
    Compute STFT for 1D real valued input X

    The input is optionally mean-removed, cut into frames, multiplied by a
    Hamming window of `fftsize` points and transformed frame by frame.
    The caller's array is never modified.

    Parameters
    ----------
    X : array-like
        1D real valued signal.
    fftsize : int
        Frame length, also the length of the window.
    step : "half" or other
        "half" frames the signal with `halfoverlap(X, fftsize)`; any other
        value is forwarded to `overlap(X, fftsize, step)`.
    mean_normalize : bool
        Subtract the mean of X before framing.
    real : bool
        Use `fftpack.rfft` instead of `fftpack.fft`.
    compute_onesided : bool
        Keep only the first `fftsize // 2 + 1` columns of the transform.
        When set, this overrides the cut chosen by `real`.

    Returns
    -------
    np.ndarray
        Transformed frames, one frame per row.

    NOTE(review): `halfoverlap` and `overlap` are not defined in the visible
    notebook cells; presumably they return a 2-D array with `fftsize` samples
    per row -- confirm where they come from.
    """
    if real:
        local_fft = fftpack.rfft
        cut = -1
    else:
        local_fft = fftpack.fft
        cut = None
    if compute_onesided:
        cut = fftsize // 2 + 1
    X = np.asarray(X)
    if mean_normalize:
        # Out-of-place on purpose: `X -= X.mean()` overwrote the caller's data
        # and raised for integer arrays (float result cannot be cast in place).
        X = X - X.mean()
    if step == "half":
        X = halfoverlap(X, fftsize)
    else:
        X = overlap(X, fftsize, step)
    # np.hamming is the same 0.54 - 0.46*cos(2*pi*n/(M-1)) window.
    win = np.hamming(fftsize)
    X = X * win[None]
    X = local_fft(X)[:, :cut]
    return X

In [58]:
# STFT settings shared by the analysis and reconstruction cells below, all in
# samples: FFT size, window length and hop (400 and 160 samples are 25 ms and
# 10 ms at sr = 16000).
n_fft, win_length, hop_length = 512, 400, 160

In [59]:
# Pick one decoded example uniformly at random, convert it to a float array
# and keep the magnitude of its STFT as the input for the reconstructions.
idx = random.randrange(len(examples))
p_ex = np.array(examples[idx], dtype=float)
p_spec = np.abs(
    librosa.stft(
        p_ex,
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
    )
)
n = p_ex.shape[0]

In [60]:
# Play the selected example unchanged, as the reference for the two
# reconstructions that follow.
print("Original audio")
ipd.Audio(p_ex,rate=sr)

Original audio


In [62]:
# Reconstruct a waveform from the magnitude spectrogram with the hand-written
# Griffin-Lim loop and measure the sample-wise error against the original.
spec2aud = spec2wav(p_spec, n_fft, win_length, hop_length, num_iters=500)
# The inverse STFT is not guaranteed to return exactly len(p_ex) samples (its
# length follows from the frame count), so compare only the samples both
# signals have instead of relying on the two arrays lining up.
n_cmp = min(len(p_ex), len(spec2aud))
mse = np.mean((p_ex[:n_cmp] - spec2aud[:n_cmp]) ** 2)
# NOTE(review): Griffin-Lim only estimates the phase, so a large waveform MSE
# does not by itself mean the reconstruction sounds wrong.
print('MSE for spec2wav = ' + str(mse))
print("Reconstructed audio from spec2wav")
ipd.Audio(spec2aud,rate=sr)

MSE for spec2wav = 669577.9586384845
Reconstructed audio from spec2wav


In [63]:
# Same reconstruction with librosa's built-in Griffin-Lim, for comparison with
# the hand-written loop (note: 50 iterations here versus 500 there).
spec2aud2 = librosa.core.griffinlim(p_spec, n_iter=50, hop_length=hop_length, win_length=win_length)
# The reconstruction is not guaranteed to have exactly len(p_ex) samples, so
# compare only the samples both signals have.
n_cmp2 = min(len(p_ex), len(spec2aud2))
mse = np.mean((p_ex[:n_cmp2] - spec2aud2[:n_cmp2]) ** 2)
print('MSE for librosa griffinlim = ' + str(mse))
print("Reconstructed audio with librosa")
ipd.Audio(spec2aud2,rate=sr)

MSE for librosa griffinlim = 651288.6901039367
Reconstructed audio with librosa


In [None]:
# TODO: compare the estimated phase with the true phase
# TODO: check the augmentation methods for distortion

In [4]:
"""
The following code was ripped off from this stack exchange post:
https://codereview.stackexchange.com/questions/155510/karplus-strong-pluck-generation
"""

# Decay factor applied to every averaged sample pair inside generate().
damping = 0.99
def generate(f, vol, nsamples):
    """Generate a Karplus-Strong pluck.

    A delay line of `sr // f` random samples is read out cyclically; after
    each read the sample is replaced by the damped average of itself and its
    successor. Reads the module-level `sr` and `damping`.

    Arguments:
    f -- the frequency in Hz, 0 < f <= sr. Integers and floats are both
    accepted; the delay-line length is floor(sr / f).
    vol -- volume, a float in [0.0, 1.0]
    nsamples -- the number of samples to generate. To generate a pluck t
    seconds long, one needs t * sample_rate samples. Converted with int().

    Return value:
    A numpy array of floats with length nsamples.

    Raises:
    ValueError -- if f is not positive or is larger than sr (the delay line
    would be empty).
    """
    if f <= 0:
        raise ValueError("frequency must be positive, got %r" % (f,))
    # int() keeps the delay-line length usable as an array size when f is a
    # float; for integer f this is the same value as before.
    N = int(sr // f)
    if N < 1:
        raise ValueError("frequency %r exceeds the sample rate %r" % (f, sr))
    nsamples = int(nsamples)

    buf = np.random.rand(N) * 2 - 1
    samples = np.empty(nsamples, dtype=float)

    for i in range(nsamples):
        pos = i % N
        samples[i] = buf[pos]
        # Low-pass and decay the delay line in place for the next pass.
        buf[pos] = damping * 0.5 * (buf[pos] + buf[(pos + 1) % N])

    return samples * vol

In [5]:
def generate_list(L, fr, vol, length):
    """Generate list of karplus_strong plucks.
    Arguments:
    L -- number of notes
    fr -- list of L frequencies, each passed to generate() unchanged
    vol -- list of L volumes, (floats in [0.0, 1.0])
    length -- fixed length of all notes, in seconds (may be fractional)

    Return value:
    A list of np  arrays of floats, i.e a list of
    guitar string plucks
    """
    # Convert the duration to a whole number of samples once. `length * sr`
    # is a float for fractional durations and cannot be used as an array size.
    nsamples = int(round(length * sr))
    return [generate(fr[i], vol[i], nsamples) for i in range(L)]

In [10]:
def play_freq(string, fret):
    """Return the frequency in Hz sounded by `fret` on guitar string `string`.

    `string` must be one of 1..6 (a KeyError is raised otherwise); the table
    holds the open-string frequencies, string 1 being the highest. Every fret
    raises the open-string frequency by a factor of 2 ** (1 / 12).
    """
    open_string_hz = {
        1: 329.63,
        2: 246.94,
        3: 196.00,
        4: 146.83,
        5: 110.00,
        6: 82.41,
    }
    open_hz = open_string_hz[string]
    semitone_ratio = 2 ** (fret / 12.0)
    return open_hz * semitone_ratio

In [66]:
# TODO: make a minute-long track for Frank
# TODO: handle double-digit frets
# TODO: handle chords
# One-string tab: every character is one time step of 0.15 s. A digit plucks
# that fret on string 4; any other character ('-') adds nothing.
input_str = '0-0-----0-0--3--5-5-----7-7--3--0-0-----0-0--3--5-5-----7-7--3--'
input_str = input_str * 6
# Use the shared sample rate rather than a literal 16000, so the step length
# and pluck length stay consistent with generate(), which derives pitch from sr.
dash_length = int(0.15*sr)
sound_out = np.zeros(dash_length*len(input_str))
for idx, c in enumerate(input_str):
    start = idx * dash_length
    if c in '0123456789':
        freq = play_freq(4,int(c))
        # Each pluck rings for one second and overlaps the following steps.
        pluck = generate(int(freq), 1, sr)
        # The slice is a view clipped at the end of the track, so a pluck that
        # would run past the end is truncated instead of raising.
        target = sound_out[start:start+len(pluck)]
        target += pluck[:len(target)]

In [68]:
# Frank's notebook: play the rendered track.
# Single-note check, kept for reference:
#note = generate(int(play_freq(1,0)), 1, 16000)
ipd.Audio(sound_out, rate=sr)

In [69]:
# Save the rendered track at the shared sample rate (was a literal 16000).
# NOTE(review): overlapping plucks are summed without normalisation, so samples
# may exceed [-1, 1] -- confirm how soundfile handles that for the WAV subtype used.
sf.write('karp_strong_joan_jett.wav', sound_out, sr)