In [None]:
from IPython.display import display, HTML
from IPython.display import clear_output
#--- inject a CSS rule that sets the width of the `.container` element to 85%
display(HTML("<style>.container { width:85% !important; }</style>"))

In [None]:
import soundfile as sf
import IPython.display as ipd
import numpy as np
import pandas as pd
import glob
import os
import sys
import matplotlib
import mido
from mido import MidiFile
from scipy.interpolate import interp1d
import torch

import matplotlib.pyplot as plt
import librosa.display

from prepare_sax_phrase_dataset import *

In [None]:
#--- configuration: pitch range, sampling rate, and the list of raw recordings to process
range_notes = ['C3', 'A#5'] # alto sax range is ['Db3', 'A5'], take half-step below/above
alto_sax_range = librosa.note_to_hz(range_notes)
TEST_MODE = False #--- mode for experimenting on a small dataset
tgt_sr = 44100

#--- glob returns files in arbitrary order; sort so that "the last processed file" is reproducible between runs
if not TEST_MODE:
    data_folder = '/home/mlspeech/itamark/ssynth/git_repos/DeepLearningExamples/PyTorch/SpeechSynthesis/HiFiGAN/data_ssynth/wavs_raw' #'/home/itamar/ssynth/data/wavs'
    flist = sorted(glob.glob(f'{data_folder}/*Free*dynamic_mic.wav'))
else:
    data_folder = '/home/mlspeech/itamark/ssynth/git_repos/DeepLearningExamples/PyTorch/SpeechSynthesis/HiFiGAN/data_ssynth_TMP/wavs_raw' #'/home/itamar/ssynth/data/wavs'
    flist = sorted(glob.glob(f'{data_folder}/*dynamic_mic.wav'))

#--- phrase wavs are written next to the raw folder ('wavs_raw' -> 'wavs')
out_dir = data_folder.replace('wavs_raw', 'wavs')
os.makedirs(out_dir, exist_ok = True)

print_info = True
dur = 0
if print_info:
    print(f'found {len(flist)} files in wavs_raw folder:')
    print(pd.Series([os.path.basename(fl) for fl in flist]))
    print('\n')
    for fnm in flist:
        #--- use a context manager so the file handle is released right after the header fields are read
        with sf.SoundFile(fnm) as f:
            n_frames = f.frames
            file_sr = f.samplerate
            subtype_info = f.subtype_info
        sec = n_frames / file_sr
        dur += sec
        #print(f'samples = {n_frames}')
        print(f'file {os.path.basename(fnm)}')
        print(f'\tsample rate = {file_sr}, sample format = {subtype_info}, seconds = {sec:.1f}')
#--- NOTE: `dur` is only accumulated when print_info is True
print(f'total recording duration {dur / 60:.1f} minutes')
print(f'NOTE: output files will be saved to {out_dir}')

In [None]:
def split_audio_to_phrases(y, sr):
    """Segment an audio signal into phrases delimited by detected pauses.

    The pause detection, long-phrase splitting and short-phrase merging are
    delegated to helpers imported from `prepare_sax_phrase_dataset`
    (presumably they return pause start/end positions in samples -- confirm
    against that module).

    Args:
        y:  audio samples.
        sr: sampling rate of `y`, Hz.

    Returns:
        (phrase_inds, seg_dur_sec):
            phrase_inds - 2-column array; row i is (end of pause i, start of pause i+1),
                          i.e. the span between two consecutive pauses.
            seg_dur_sec - length of every such span divided by `sr`.
    """
    seg_cfg = PhraseSegmentCfg(sr)

    #--- initial pause detection (the third returned value is not needed here)
    pause_st, pause_en, _ = get_audio_pauses(y, seg_cfg, verbose = False)
    #assert(pause_st[0] == seg_cfg.win_hop_samples) #--- make sure first pause starts from audio's start
    print(f'found {pause_st.shape[0]} pauses')

    #--- phrases that are too long get split
    pause_st, pause_en = split_long_phrases(y, pause_st, pause_en, seg_cfg)
    print(f'After splitting long phrases: {pause_st.shape[0]} pauses')

    #--- phrases that are too short get merged
    pause_st, pause_en = merge_short_phrases(pause_st, pause_en, seg_cfg)
    print(f'After merging short phrases: {pause_st.shape[0]} pauses')

    #--- a phrase runs from the end of one pause to the start of the next one
    phrase_begin = pause_en[:-1]
    phrase_finish = pause_st[1:]
    phrase_inds = np.c_[phrase_begin, phrase_finish]
    seg_dur_sec = (phrase_finish - phrase_begin) / sr

    return phrase_inds, seg_dur_sec

In [None]:
#--- phrases are written to disk only in TEST_MODE
save_to_disk = bool(TEST_MODE)
process_if_exists = False #True

#--- WARNING this will overwrite existing raw audio files!
write_resampled_files_to_disk = False # True 

#--- one dict per detected phrase; turned into a dataframe once, after the loop
phrase_records = []

for file_nm in flist:
    #--- check for previous results
    file_nm_base = os.path.basename(file_nm)
    exist_wavs = glob.glob(f'{out_dir}/{file_nm_base[:-4]}*.wav')
    
    if not process_if_exists and len(exist_wavs) > 0:
        print(f'found {len(exist_wavs)} phrases from file {file_nm_base}. Skipping.')
        continue
  
    print(f'>>> loading {file_nm_base}')
    #--- read the source format and close the handle immediately: the same file may be re-opened for writing below
    with sf.SoundFile(file_nm) as f:
        src_sr = f.samplerate
        src_subtype = f.subtype
    resampling = src_sr != tgt_sr
    if resampling:
        print(f'file sr is {src_sr}, resampling to {tgt_sr}')
    
    y, sr = librosa.load(file_nm, sr = tgt_sr) #--- NOTE librosa converts 24 bit audio (which is int32) to float in [-1, 1]
    if resampling and write_resampled_files_to_disk:
        print('Warning: overwriting existing file with resampled one')
        with sf.SoundFile(file_nm, 'w', tgt_sr, 1, src_subtype) as fout:
            fout.write(y)
    
    phrase_inds, seg_dur_sec = split_audio_to_phrases(y, sr)
    
    print(f'Total phrases: {len(phrase_inds)}')
    if len(phrase_inds) == 0:
        #--- nothing to report or save; min()/max() on an empty array would raise
        continue
    print(f'phrase durations sec: min {seg_dur_sec.min():.1f} max {seg_dur_sec.max():.1f} mean {seg_dur_sec.mean():.1f}')

    #--- save phrases to disk
    
    if save_to_disk:
        print(f'saving phrases to {out_dir}')
    for k, pind in enumerate(phrase_inds):
        yout = y[pind[0]:pind[1]]
        ifnm = file_nm_base.replace('.wav', f'_phrase{k:03d}.wav')
        fnm_out = f'{out_dir}/{ifnm}'
        phrase_records.append(
            dict(file_nm = file_nm_base, 
                 phrase_id = ifnm.replace('.wav', ''), 
                 sample_start = pind[0], 
                 sample_end = pind[1]))
        if save_to_disk:
            sf.write(fnm_out, yout, sr, subtype = src_subtype)
            
print('done')
if len(phrase_records) > 0:
    phrase_df = pd.DataFrame(phrase_records).sort_values(by = 'phrase_id')
else:
    #--- nothing was processed in this run (kept as an empty list, as before)
    phrase_df = []
all_files = glob.glob(f'{out_dir}/*.wav')
print(f'data set size: {len(all_files)} phrases')

In [None]:
#--- phrase table cache: an existing csv always wins over the table computed in this session
phrase_df_fnm = f'{data_folder}/../phrase_df.csv'

if os.path.isfile(phrase_df_fnm):
    phrase_df = pd.read_csv(phrase_df_fnm)
elif isinstance(phrase_df, pd.DataFrame) and len(phrase_df) > 0:
    phrase_df.to_csv(phrase_df_fnm)
    print(f'saved phrase dataframe to {phrase_df_fnm}')
else:
    #--- no cached csv, and the extraction cell produced no phrases: fail with an explicit message
    raise RuntimeError(f'no phrases were extracted in this session and {phrase_df_fnm} does not exist; '
                       'run the phrase extraction cell with process_if_exists = True')

#display(phrase_df.head())

## write metadata files

In [None]:
#--- metadata.csv used by HiFiGan training script
#--- one line per phrase, of the form "<phrase_id>||"; an existing file is never overwritten
metadata_fnm = f'{data_folder}/../metadata.csv'
if not os.path.isfile(metadata_fnm):
    print(f'writing metadata.csv to {metadata_fnm}')
    (phrase_df['phrase_id'] + '||').to_csv(metadata_fnm, index=False, header=False)
else:
    print(f'file {metadata_fnm} exists, not writing a new one')

#--- disabled on purpose (`if False`): this branch never runs
if False:
    #--- write it below, when we have midi data as well
    #--- filelist used by FastPitch (in HiFiGan there's a script that create file lists)
    filelist_fnm = f'{data_folder}/../filelists_fastpitch/ssynth_audio.txt'
    if not os.path.isfile(filelist_fnm):
        ('wavs/' + phrase_df['phrase_id'] + '.wav|').to_csv(filelist_fnm, index=False, header=False)
    else:
        print(f'file {filelist_fnm} exists, not writing a new one')

## Validation - compare the detected phrases in phrase_df to the actual phrase files on disk

In [None]:
#--- for every raw recording: reload it, select its rows in phrase_df, and check that each
#--- phrase wav in out_dir has exactly (sample_end - sample_start) samples.
#--- NOTE `y`, `sr` and `pdf` of the last file stay defined after the loop and are used by the plotting cell below
for file_nm in flist:
    file_nm_base = os.path.basename(file_nm)
    print(f'>>>> file {file_nm_base}')
    y, sr = librosa.load(file_nm, sr = tgt_sr) 
    pdf = phrase_df.query("file_nm == @file_nm_base")
    print(f'total file duration {y.shape[0] / tgt_sr / 60:.1f} min')
    print(f'total phrase duration {(pdf.sample_end - pdf.sample_start).sum()/tgt_sr/60:.1f} min')
    print(f'phrases: {pdf.shape[0]}')

    #--- accumulated length (in samples) of the phrase files read from disk
    tlen = 0
    for k in range(pdf.shape[0]):
        p = pdf.iloc[k]
        p_fnm = f'{out_dir}/{p.phrase_id}.wav'
        y_p, _ = librosa.load(p_fnm, sr = tgt_sr) 
        tlen += len(y_p)
        if p.sample_end - p.sample_start != len(y_p):
            #pass
            print(f'[{k}] mismatch between detected phrase and pharse-file on disk')

        #--- debugging aid, disabled by the constant `False`: play phrase 24 from disk and from the raw signal
        if False and k==24:
            print(f'phrase from {os.path.basename(p_fnm)} ({len(y_p)} samples)')
            ipd.display(ipd.Audio(y_p, rate=sr))
            print(f'phrase from detection ({p.sample_end-p.sample_start} samples)')
            ipd.display(ipd.Audio(y[p.sample_start:p.sample_end], rate=sr))
            break
    print(f'done, total len {tlen/tgt_sr/60:.1f} min')

# Analysis of specific phrases
## plot N seconds of signal, and detected phrases. Play audio of a selected phrase
### file is selected in the cell above (if not selected - it will be the last file processed in the for-loop)

In [None]:
%matplotlib notebook
from matplotlib.patches import Rectangle
#--- plot N seconds of the signal `y` (left over from the validation loop above), starting at T1
N = 70
T1 = 10*60 # start of plot, sec
T2 = T1 + N # end of plot, sec
y0 = y[T1*sr:T2*sr]
ts = T1 + np.arange(y0.shape[0]) / sr
fig, ax = plt.subplots(figsize = (12,4))
ax.plot(ts,y0)
#--- keep only phrases that lie entirely inside the plotted window
plist = pdf[(pdf.sample_start >= T1*sr) & (pdf.sample_end <= T2*sr)].reset_index()
print(f'{len(plist)} phrases in segment')
#--- mark every phrase with a semi-transparent rectangle spanning its start..end time
for ip, p in plist.iterrows():
    ax.add_patch(Rectangle((p.sample_start/sr,-.2), (p.sample_end-p.sample_start)/sr, 0.4, edgecolor='blue', facecolor='red',alpha=.3 ))

### Select a phrase and play it

In [None]:
#--- pick the 4th phrase of the plotted window (`plist` comes from the plotting cell above), load its wav and play it
p = plist.iloc[3]
p_fnm = f'{out_dir}/{p.phrase_id}.wav'
y_p, _ = librosa.load(p_fnm, sr = tgt_sr)
print(f'phrase start {p.sample_start/sr:.2f} sec')
ipd.display(ipd.Audio(y_p, rate=sr))

## Create MIDI phrases and write to metadata
## NOTE: only needed if we synthesize from note notation using FastPitch

In [None]:
def binary_array_to_seg_inds(arr, shift_end_ind = True):
    """Locate the runs of non-zero (True) values in a 1-D binary array.

    Args:
        arr:           1-D array of booleans / 0-1 values.
        shift_end_ind: if True the second column holds the index of the last
                       element of each run (inclusive); if False it holds the
                       index one past the run (exclusive, slice-style).

    Returns:
        Integer array with one row per run and two columns: (start, end).
    """
    #--- pad with a zero on both sides so that runs touching an edge still produce two transitions
    padded = np.r_[0, np.int_(arr), 0]
    transitions = np.flatnonzero(np.diff(padded))
    #--- transitions alternate rise, fall, rise, fall, ... so each consecutive pair delimits one run
    seg_inds = transitions.reshape((transitions.shape[0] // 2, 2))
    if not shift_end_ind:
        return seg_inds
    seg_inds[:, 1] -= 1
    return seg_inds

def read_midi_to_df(midi_fnm, try_to_fix_note_order = True):
    """Read a MIDI file into a dataframe of note events with absolute timestamps.

    All tracks are merged, a `ts_sec` column (seconds from the start of the file)
    is computed from the cumulative tick count, and every non-note message type
    listed below is dropped. Columns that contain any NaN are dropped as well.

    Args:
        midi_fnm:              path of the MIDI file.
        try_to_fix_note_order: if True and `verify_midi` fails, try to repair
                               "on-on-off-off" sequences by moving the misplaced
                               note_off 1 msec before the note_on that overtook it.
                               If the repair does not help, the unrepaired frame is returned.

    Returns:
        DataFrame with one row per remaining message, indexed 0..n-1.

    Raises:
        ValueError: if the file contains more than one distinct tempo.
    """
    mid = MidiFile(midi_fnm)
    
    #assert(len(mid.tracks) == 1)
    tr = mido.merge_tracks(mid.tracks)
    df = pd.DataFrame([m.dict() for m in tr])

    #--- a single tempo is supported. A file without any set_tempo message uses the
    #--- MIDI default of 500000 usec per beat (120 bpm)
    tempo_rows = df[df['type'] == 'set_tempo']
    if tempo_rows.empty:
        tempo = 500000
    else:
        uniq_tempo = tempo_rows['tempo'].unique()
        if len(uniq_tempo) > 1:
            raise ValueError('multiple tempo changes not supported')
        tempo = uniq_tempo[0]
            
    df['ts_sec'] = mido.tick2second(df.time.cumsum(), mid.ticks_per_beat, tempo)
    #--- some meta-messages like "channel prefix" contain non-zero time value. so remove them *after* calculating 'ts_sec'
    non_note_types = ['channel_prefix', 'track_name', 'instrument_name', 'time_signature', 'key_signature', 
                      'smpte_offset', 'set_tempo', 'end_of_track', 'midi_port', 'program_change', 'control_change', 'pitchwheel', 'marker']
    df = df[~df['type'].isin(non_note_types)]
    
    df = df.dropna(axis = 1).reset_index(drop = True)
    
    #--- sometimes, instead of a sequence of on-off notes, we get on-on-off-off. try to fix that
    if try_to_fix_note_order:
        try:
            verify_midi(df)
        except AssertionError:
            print(f'{midi_fnm}: note order problem in midi dataframe, trying to fix...')
            df_copy = df.copy()
            #--- indices of where we expect to see "note off" and see "note on"
            off_err_ind = df[((df.index % 2) == 1) & (df.type == 'note_on')].index
            for ind in off_err_ind:
                #--- the misplaced note_on may be the very last row, in which case there is nothing to swap with
                if (ind + 1) not in df.index:
                    continue
                curr_note = df.loc[ind]
                next_note = df.loc[ind + 1]
                prev_note = df.loc[ind - 1]
                if next_note.type == 'note_off' and next_note.note == prev_note.note:
                    df.loc[ind + 1, 'ts_sec'] = curr_note.ts_sec - 0.001
            #--- stable sort: rows with equal timestamps keep their original relative order
            df = df.sort_values(by = 'ts_sec', kind = 'stable').reset_index(drop = True)
            try:
                verify_midi(df)
                print('fixed')
            except AssertionError:
                print('fix failed, calling verify_midi() on returned dataframe will fail')
                #--- if fix failed, return the original copy
                df = df_copy
                
    return df

def verify_midi(midi_df):
    """Check that the dataframe is a strict sequence of (note_on, note_off) pairs of the same note.

    Rows at even positions must be 'note_on', rows at odd positions must be
    'note_off', and every note_off must carry the same note as the note_on before it.
    An odd number of rows always fails: without the explicit length check, the
    note comparison below would compare arrays of different lengths, which numpy
    either broadcasts (letting 1- or 3-row frames pass) or rejects with an error
    that is not an AssertionError.

    Args:
        midi_df: DataFrame with (at least) the columns 'type' and 'note'.

    Raises:
        AssertionError: if any of the conditions above is violated.
    """
    #--- validate the assumption that we have series of note-on/note-off events
    n_events = len(midi_df)
    if n_events % 2 != 0:
        raise AssertionError(f'odd number of note events ({n_events}), a note_on/note_off is unpaired')

    on_rows = midi_df.iloc[::2]
    off_rows = midi_df.iloc[1::2]
    if not (on_rows['type'] == 'note_on').all():
        raise AssertionError('expected note_on at every even position')
    if not (off_rows['type'] == 'note_off').all():
        raise AssertionError('expected note_off at every odd position')
    if not (on_rows['note'].to_numpy() == off_rows['note'].to_numpy()).all():
        raise AssertionError('note_off does not match the note of the preceding note_on')

def midi_phrase_from_dataframe(p, midi_df, sr):
    """Select the midi events that fall inside an audio phrase.

    The phrase boundaries (`p.sample_start`, `p.sample_end`, in samples) are
    converted to seconds using `sr` and compared against `midi_df.ts_sec`.
    If the selection starts with a note_off whose note_on is the row just before
    it in `midi_df`, that note_on is prepended; likewise a trailing note_on gets
    its note_off appended when it is the row just after it.

    Args:
        p:       phrase record with `sample_start` and `sample_end`.
        midi_df: midi events with the columns 'type', 'note', 'ts_sec'; the
                 neighbour lookup uses `index label +/- 1`, so integer labels
                 are assumed (as produced by `reset_index`).
        sr:      sampling rate used to convert the phrase boundaries to seconds.

    Returns:
        DataFrame with the events of the phrase. It is empty when no event falls
        inside the phrase.
    """
    t0 = p.sample_start / sr
    t1 = p.sample_end / sr
    midi_p = midi_df[(midi_df.ts_sec >= t0) & (midi_df.ts_sec <= t1)]

    #--- no midi events in this phrase: nothing to complete
    if midi_p.empty:
        return midi_p
    
    #--- check for missing note_off (at end) or note_on (at start)
    first_note = midi_p.iloc[0]
    if first_note['type'] == 'note_off':
        prev_label = first_note.name - 1
        #--- the phrase may start at the very first midi event, which has no predecessor
        if prev_label in midi_df.index:
            candidate = midi_df.loc[prev_label]
            if candidate['type'] == 'note_on' and candidate['note'] == first_note['note']:
                midi_p = pd.concat([candidate.to_frame().T, midi_p])
            
    last_note = midi_p.iloc[-1]
    if last_note['type'] == 'note_on':
        next_label = last_note.name + 1
        #--- the phrase may end at the very last midi event, which has no successor
        if next_label in midi_df.index:
            candidate = midi_df.loc[next_label]
            if candidate['type'] == 'note_off' and candidate['note'] == last_note['note']:
                midi_p = pd.concat([midi_p, candidate.to_frame().T])
    
    return midi_p
    
def phrase_to_midi_string(p, midi_df, sr):    
    """Build the file-list line "wavs/<phrase_id>.wav|<note> <note> ..." for one phrase.

    Args:
        p:       phrase record with `phrase_id`, `sample_start`, `sample_end`.
        midi_df: midi events of the recording the phrase belongs to.
        sr:      sampling rate used to convert the phrase boundaries to seconds.

    Returns:
        The line (without a trailing newline), or '' when the midi events of the
        phrase cannot be extracted, fail `verify_midi`, or contain no note at all.
        In these cases the reason is printed.
    """
    #--- extraction is inside the try as well, so one bad phrase cannot abort a whole batch
    try:
        midi_p = midi_phrase_from_dataframe(p, midi_df, sr)
        verify_midi(midi_p)
    except Exception as e:
        print(f'phrase {p.phrase_id} verification failed: {e}')
        return ''
    
    note_on = midi_p.loc[midi_p.type == 'note_on']
    if note_on.empty:
        #--- a line without notes carries no information for training
        print(f'phrase {p.phrase_id} has no midi notes')
        return ''

    notes_str = ' '.join(note_on.note.astype(int).astype(str).to_list())
    return f"wavs/{p.phrase_id}.wav|{notes_str}"

In [None]:
#--- midi file reading to data-frame fails on the midi files in TEST_MODE (these are parallel recordings done by Elad, not the auto-midi files)
#--- NOTE `midi_folder`, and the `fnm_base` / `midi_df` of the last file, are used by later cells
midis_lns = []
midi_folder = 'midi' if TEST_MODE else 'auto_midi'
for fnm in flist:
    fnm_base = os.path.basename(fnm)
    #--- the midi file sits in a sibling folder of wavs_raw, under the same base name
    midi_fnm = fnm.replace('/wavs_raw/', f'/{midi_folder}/').replace('.wav', '.mid')
    if TEST_MODE and '_dynamic_mic' in midi_fnm:
        midi_fnm = midi_fnm.replace('_dynamic_mic', '')

    print(f'reading midi file {os.path.basename(midi_fnm)}')
    midi_df = read_midi_to_df(midi_fnm)
    verify_midi(midi_df)
    p_df = phrase_df.query("file_nm == @fnm_base").reset_index(drop = True)
    print(f'processing {p_df.shape[0]} phrases')
    for ip, p in p_df.iterrows():
        midis = phrase_to_midi_string(p, midi_df, tgt_sr)
        #--- phrase_to_midi_string returns '' for phrases it could not convert; don't put blank lines in the file list
        if midis:
            midis_lns.append(midis)

#--- write metadata to file
filelist_fnm = f'{data_folder}/../filelists_fastpitch/ssynth_audio.txt'
if not os.path.isfile(filelist_fnm):
    print(f'writing {len(midis_lns)} lines to file')
    with open(filelist_fnm, 'w') as fout:
        fout.writelines([ln + '\n' for ln in midis_lns])
else:
    print(f'file {filelist_fnm} exists, not writing a new one')

## Example: choose a phrase, and:
* ### find its midi counterpart
* ### play the phrase
* ### plot the phrase and the midi note-on/note-off marks
* ### plot a spectrogram, and midi notes and detected pitch on top of it

In [None]:
#--- `midi_df` holds the midi of the last file read in the cell above (`fnm_base`), so the
#--- example phrase must be taken from the phrases of that same file, not from the whole table
pdf = phrase_df.query("file_nm == @fnm_base")
p = pdf.iloc[2] # pdf.iloc[250]
#--- phrase start inside the recording, sec (tgt_sr is the rate all audio is loaded at)
t0 = p.sample_start / tgt_sr
wav_fnm = f'{out_dir}/{p.phrase_id}.wav'
seg, sr = librosa.load(wav_fnm, sr = tgt_sr)

#--- midi_p is None when the midi of the phrase is missing or inconsistent; later cells check for that
try:
    midi_p = midi_phrase_from_dataframe(p, midi_df, sr)   
    #print(midi_p)
    verify_midi(midi_p)
except Exception as e:
    midi_p = None
    print(f'Warning: failed to load midi pharse: {e}')

In [None]:
#--- (commented-out alternative: manually pick another file / phrase)
#fnm0 = file_nm #flist[-1]
#midi_fnm = fnm0.replace('/wavs_raw/', '/auto_midi/').replace('.wav', '.mid')
#midi_df = read_midi_to_df(midi_fnm)
#p =  pdf.iloc[99]
#t0 = p.sample_start / tgt_sr
#t1 = p.sample_end / tgt_sr

#print(t0.round(2),t1.round(2),p) #midi_p

#p = ph_df.iloc[0] #phrase_df.query('phrase_id == "05_Free_Improv_dynamic_mic_phrase181"').iloc[0]
#--- split the phrase's midi events into note_on / note_off, and shift their timestamps by t0
#--- so that they are relative to the start of the phrase
if midi_p is not None:
    note_on = midi_p.loc[midi_p.type == 'note_on']
    note_off = midi_p.loc[midi_p.type == 'note_off']

    #--- frequency of every note, Hz
    note_hz = librosa.midi_to_hz(note_on.note)
    note_on_ts = note_on['ts_sec'].values - t0
    note_off_ts = note_off['ts_sec'].values - t0

    #--- NOTE the returned string is not stored or displayed here
    phrase_to_midi_string(p, midi_df, tgt_sr)

In [None]:
#--- pitch detection
#--- NOTE `win`, `ac_win` and `hop` are also read as globals inside phrase_to_synth() below
win = 1024
ac_win = 512 # autocorrelation window
hop = 256
#--- Note librosa.pyin is the method used by the hifigan example for speech synthesis
#--- the search range is limited to the (slightly extended) alto sax range
f0, vflag, vprob = librosa.pyin(seg, 
                                fmin = alto_sax_range[0], 
                                fmax = alto_sax_range[1], 
                                sr = sr, 
                                frame_length=win, 
                                win_length=ac_win, 
                                hop_length=hop, 
                                center=False,
                                max_transition_rate=100)
#--- frames are not centered (center=False), so shift the frame times by half a window
times = librosa.times_like(f0, sr = sr, hop_length=hop)
times += (win / 2) / sr # center at windows mid point

In [None]:
#--- waveform of the example phrase, with the midi note marks and the pitch-detection voicing probability on top
fig, ax = plt.subplots(figsize = (12,2))
print(wav_fnm)
librosa.display.waveshow(seg, sr=sr)
#--- optional: draw a rectangle over the time span of every midi note, labeled with the note number
plot_midi_notes_segs = False
if plot_midi_notes_segs and midi_p is not None:
    for k in range(len(note_on)):
        xk = note_on_ts[k]
        yk = -0.2
        ax.add_patch(Rectangle((xk, yk), note_off_ts[k] - note_on_ts[k], 0.4, edgecolor='blue', facecolor='red',alpha=.3))
        ax.text(xk,yk,note_on.iloc[k].note)
        
#--- playback rate; dividing by a value other than 1 changes the playback speed
sr_play = sr / 1
ipd.display(ipd.Audio(seg, rate=sr_play))
if midi_p is not None:
    n = note_on_ts.shape[0]
    ax.plot(note_on_ts, [0.01]*n, 'ro', label = 'auto-midi on')
    ax.plot(note_off_ts, [-0.01]*n, 'cx', label = 'auto-midi off')

#--- voicing probability (scaled by 0.2); frames flagged as unvoiced are over-plotted with red x
ax.plot(times, 0.2*vprob, 'k.')
no_note = (~vflag) # | (vprob < 0.15)
ax.plot(times[no_note], 0.2*vprob[no_note], 'rx')
#ax.plot(note_on_ts, seg[(note_on_ts*sr).astype(int)], 'ro', label = 'auto-midi on')
#ax.plot(note_off_ts, seg[(note_off_ts*sr).astype(int)], 'cx', label = 'auto-midi off')
#ax.set_title(f'seg {ind} at {(phrase_inds[ind] / sr).round(1)} sec')

#--- optional: play the audio of every midi note separately
wav_of_all_notes = False
if wav_of_all_notes and midi_p is not None:
    for k in range(len(note_on)):
        st = int(note_on_ts[k] * sr)
        en = int(note_off_ts[k] * sr)
        print(note_on_ts[k])
        ipd.display(ipd.Audio(seg[st:en], rate=sr_play))

In [None]:
%matplotlib notebook 
#--- spectrogram (dB relative to the maximum) of the example phrase, with the detected f0 and the midi notes on top
D = librosa.amplitude_to_db(np.abs(librosa.stft(seg)), ref=np.max)
fig, ax = plt.subplots(figsize = (12,8))
yscale = 'log' # 'linear'
img = librosa.display.specshow(D, x_axis='time', y_axis = yscale, ax=ax, sr=sr)
ax.set(title='pYIN fundamental frequency estimation')
fig.colorbar(img, ax=ax, format="%+2.f dB")
#--- only used by the commented-out line below
th = 0.25
#ax.plot(times[vprob >= th], f0[vprob >= th], 'o',label='voiced', color='red')
ax.plot(times, f0, '.',label='f0', color='cyan', linewidth=3)
#--- note on / note off marks are both drawn at the frequency of the note
if midi_p is not None:
    ax.plot(note_on_ts, note_hz, 'ro', label = 'auto-midi on')
    ax.plot(note_off_ts, note_hz, 'gx', label = 'auto-midi off')

ax.legend(loc='upper right')

#ax.set_ylim([120,1700])
#ax.set_xlim([4.5,5.5])

## play some random phrases

In [None]:
## play a few phrase files: every 50th file, starting at index k (at most 5 files)
k = 100
#--- clip the range to the number of available files, so a small data set does not raise IndexError
for ind in range(k, min(k + 201, len(all_files)), 50):
    print(all_files[ind].split('/')[-1])
    ipd.display(ipd.Audio(all_files[ind], rate=sr))

## toy model of fastpitch + hifigan from "raw midi notes" (sounds bad)

In [None]:
#--- load and play model output of complete loop (fastpitch + hifigan)
#--- this sounds horrible, as expected. fastpitch was trained on 'raw midi notes' (no timing, pitch, or energy info)
out_files = glob.glob('/home/mlspeech/itamark/ssynth/git_repos/DeepLearningExamples/PyTorch/SpeechSynthesis/FastPitch/results/2022-12-20_fastpitch_ssynth/audio_devset10_fp32_fastpitch_hifigan_SAX_denoise-0.01/*.wav')
for fl in out_files:
    print(os.path.basename(fl))
    ipd.display(ipd.Audio(fl, rate=sr))

# Synthesize a sine wave with pitch and env from signal

In [None]:
def get_num_harmonics(min_freq_src_hz, max_freq_src_hz, sr, max_freq_tgt_hz, max_sr_factor = 6):
    """Find how many harmonics to synthesize, and how much to upsample, to reach a target bandwidth.

    The number of harmonics is chosen so that the harmonics of the *lowest*
    source frequency reach `max_freq_tgt_hz`. The same number of harmonics of
    the *highest* source frequency must stay below Nyquist, which sets the
    required sampling rate; the smallest integer multiple of `sr` that is
    strictly above it is selected.

    Args:
        min_freq_src_hz: lowest fundamental frequency in the signal, Hz (must be > 0).
                         (can't naively use the minimum of an interpolated pitch track,
                         since it may contain f=0 Hz)
        max_freq_src_hz: highest fundamental frequency in the signal, Hz.
        sr:              current sampling rate, Hz.
        max_freq_tgt_hz: highest frequency that should be synthesized, Hz.
        max_sr_factor:   largest upsampling factor that is tried
                         (6 is enough for 16 kHz at sr=44100, assuming max_freq_src_hz <= 932 Hz).

    Returns:
        (num_harmonics, new_sr_factor)

    Raises:
        ValueError: if `min_freq_src_hz` is not positive, or if no factor up to
                    `max_sr_factor` gives a high enough sampling rate.
    """
    if min_freq_src_hz <= 0:
        raise ValueError(f'min_freq_src_hz must be positive, got {min_freq_src_hz}')

    num_harmonics = int(max_freq_tgt_hz / min_freq_src_hz)
    #--- Nyquist: twice the highest synthesized frequency
    required_sr = 2 * max_freq_src_hz * num_harmonics
    #--- take the smallest multiple of sr which is high enough
    for new_sr_factor in range(1, max_sr_factor + 1):
        if new_sr_factor * sr > required_sr:
            return num_harmonics, new_sr_factor

    raise ValueError(f'{num_harmonics} harmonics of {max_freq_src_hz} Hz need a sampling rate above {required_sr} Hz, '
                     f'which is more than {max_sr_factor} x {sr} Hz')

In [None]:
from scipy.signal import decimate, butter, dlti # resample_poly
from scipy.interpolate import UnivariateSpline

#num_harmonics = 10; #None #
#verbose = False; max_freq_hz=None #8000

#if True:
def phrase_to_synth(seg, sr, midi_p, t0, num_harmonics = None, max_freq_hz = None, spline_smoothing = None, verbose = False):
    ''' Synthesize a saw-tooth like signal that follows the pitch and the amplitude envelope of an audio phrase.
    
        The pitch is detected with librosa.pyin (using the module-level settings `alto_sax_range`, `win`, `ac_win`, `hop`),
        missing pitch values are filled in (inside midi notes, and over short gaps), the phase is integrated from the 
        pitch, harmonics are added, and the result is multiplied by the RMS envelope of the phrase and scaled to
        the RMS of the phrase.
    
        Exactly one of these should be given (and the other set to None):
            - num harmonics: how many harmonics (inc the fundamental) are used in the saw-tooth additive synthesis
                             in this case the max-freq is note-dependent (f0*num_harmonics) and the caller is responsible
                             to make sure that (highest note in hz) * (num_harmonics) < nyquist
            - max_freq_hz:   synthesize up to this frequency. This is done by upsampling, synthesizing the required amount of harmonics,
                             and downsampling back to sr
                             
        Other arguments:
            - seg:              audio samples of the phrase
            - sr:               sampling rate of seg, Hz
            - midi_p:           midi events of the phrase (columns 'type', 'ts_sec'), or None to skip the midi-based pitch filling
            - t0:               start time of the phrase inside the recording, sec (midi timestamps are relative to the recording)
            - spline_smoothing: smoothing factor `s` of a degree-2 UnivariateSpline fitted to the envelope, or None for no smoothing
            - verbose:          print progress information
            
        Returns (x, env, f1): the synthesized signal, its amplitude envelope (both at rate sr), and the filled-in 
        pitch track on the pyin frame grid (0 where no pitch is available)
        
        Raises AssertionError if both num_harmonics and max_freq_hz are given, and ValueError if none is given, 
        or if max_freq_hz is given and no pitch was detected
    '''
    if verbose:
        print(f'pitch detection range: {alto_sax_range.round(1)} Hz, {(sr/alto_sax_range).astype(int)} samples')
        print(f'pitch detection: frame len {win}, auto-corr len {ac_win} (min freq of {sr/ac_win:.1f} Hz), hop len {hop}')
    
    assert(num_harmonics is None or max_freq_hz is None)
    if num_harmonics is None and max_freq_hz is None:
        raise ValueError('exactly one of num_harmonics and max_freq_hz must be given')
        
    f1, vflag1, vprob1 = librosa.pyin(seg, 
                                      fmin = alto_sax_range[0], 
                                      fmax = alto_sax_range[1], 
                                      sr = sr, 
                                      frame_length=win, 
                                      win_length=ac_win, 
                                      hop_length=hop, 
                                      center=True, 
                                      max_transition_rate=100)
    times1 = librosa.times_like(f1, sr = sr, hop_length = hop)
    no_note1 = (~vflag1)
    tmin = times1[0]
    tmax = times1[-1]
    
    #--- midi note boundaries, relative to the start of the phrase. Without midi data, step A below is skipped
    if midi_p is None:
        note_on_ts = np.array([])
        note_off_ts = np.array([])
    else:
        note_on = midi_p.loc[midi_p.type == 'note_on']
        note_off = midi_p.loc[midi_p.type == 'note_off']
        #note_hz = librosa.midi_to_hz(note_on.note)
        note_on_ts = note_on['ts_sec'].values - t0
        note_off_ts = note_off['ts_sec'].values - t0
    
    #-------------------------------------------------------------------------------------------------------------------
    #--- interpolate missing pitch, where possible. otherwise, set to 0 (in order to accumulate 0 phase when integrating)
    #-------------------------------------------------------------------------------------------------------------------
    #--- step A, interpolate within (intra-) midi notes
    #--- midi_p is not necessarily verified, so only use notes that have both an on and an off event
    n_notes = min(len(note_on_ts), len(note_off_ts))
    if verbose:
        print(f'samples with non-detected pitch: {np.isnan(f1).sum()}')
    for k in range(n_notes):
        #--- first, find missing pitch samples which are inside a detected midi note
        midi_note_span = (times1 >= note_on_ts[k]) & (times1 <= note_off_ts[k])
        
        #--- if no missing pitch samples are in the midi note span, we don't need this note, so skip
        if not (midi_note_span & no_note1).any():
            continue
        
        #--- if we don't have at least 2 pitch samples in the note span, we can't extrapolate, so skip
        if (midi_note_span & ~no_note1).sum() < 2:
            continue
            
        #--- build the interpolating function from detected pitch samples
        pitch_intrp = interp1d(times1[midi_note_span & ~no_note1], 
                               f1[midi_note_span & ~no_note1], 
                               fill_value = 'extrapolate', 
                               kind = 'nearest',
                               assume_sorted = True)
        #--- the time samples where we want to interpolate: inside midi note AND missing pitch
        t_intrp = times1[midi_note_span & no_note1]
        f1[midi_note_span & no_note1] = pitch_intrp(t_intrp)

    if verbose:
        print(f'after interpolating using midi notes: samples with non-detected pitch: {np.isnan(f1).sum()}')

    #--- step B, interpolate across (inter-) midi notes
    max_gap_to_interpolate_sec = 0.1 #--- don't interpolate gaps above this interval in seconds
    no_note1 = np.isnan(f1)
    #--- each row is [first missing frame, first frame after the gap)
    seg_inds = binary_array_to_seg_inds(no_note1, shift_end_ind = False)
    seg_lens_sec = np.diff(seg_inds, 1)[:,0] * hop / sr
    for k, inds in enumerate(seg_inds):
        #--- don't interpolate head or tail of signal, or if gap is too long
        #--- TODO check energy envelope in gap (interpolate only above env threshold)
        gap_len = seg_lens_sec[k]
        if (inds[0] == 0) or (inds[1] == len(f1)) or gap_len > max_gap_to_interpolate_sec:
            continue
        gap_len_samples = inds[1] - inds[0]
        if verbose:
            print(f'interpolating over {gap_len_samples} samples over gap of {gap_len:.3f} sec')
        #--- linear interpolation using 1 sample before and after
        new_freqs = np.linspace(f1[inds[0] - 1], f1[inds[1]], gap_len_samples + 2)
        f1[inds[0]:inds[1]] = new_freqs[1:-1]

    if verbose:
        print(f'after interpolating over small gaps: samples with non-detected pitch: {np.isnan(f1).sum()}')
    #--- lastly, fill with zeros the samples that are still missing
    f1[np.isnan(f1)] = 0.
    
    #--- set number of harmonics of sawtooth wave
    if num_harmonics is not None:
        additive_synth_k = num_harmonics # 10
        should_downsample = False
    else:
        #--- ignore the zero-filled frames when looking for the lowest pitch
        detected_f = f1[f1 > 20]
        if detected_f.size == 0:
            raise ValueError('no pitch was detected in this phrase, cannot set the number of harmonics')
        num_harmonics, new_sr_factor = get_num_harmonics(detected_f.min(), f1.max(), sr, max_freq_hz)
        #--- make sure we stay below new nyquist
        assert f1.max() * num_harmonics < 0.5 * sr * new_sr_factor, f'Nyquist says you cannot synthesize {num_harmonics} harmonics at {new_sr_factor} X (current sampling rate)'
        additive_synth_k = num_harmonics
        sr *= new_sr_factor
        should_downsample = True
    
    #--- now interpolate to sampling-rate grid
    dt = 1 / sr
    fintrp = interp1d(times1, f1)
    tnew = np.arange(tmin, tmax, dt)
    fnew = fintrp(tnew)
    
    #--- phase is the integral of instantanous freq
    phi = np.cumsum(2 * np.pi * fnew * dt)
    # to wrap: phi = (phi + np.pi) % (2 * np.pi) - np.pi 
        
    #--- additive synthesis: harmonic k has amplitude 1/k and alternating sign
    x = np.sin(phi) #(np.sin(phi) + .5*np.sin(2*phi) + .333*np.sin(3*phi) + .25*np.sin(4*phi))
    for k in range(2, additive_synth_k + 1):
        x += (-1)**(k-1) * np.sin(k*phi) / k
    
    #--- if we upsampled, go back to original rate
    if should_downsample:
        #--- for x, give a "anti-alias" filter to "decimate", but actually use it to filter above the desired max_freq_hz
        zpk = butter(12, max_freq_hz, output = 'zpk', fs = sr)
        aa_filt = dlti(*zpk) 
        x = decimate(x, new_sr_factor, ftype = aa_filt)
        #--- fnew is just used to zero the envelope where it is exactly 0, so take every new_sr_factor-th sample
        #--- (same length as the decimated x). Filtering it, as decimate() does, would turn the zeros into small ripples
        fnew = fnew[::new_sr_factor]
        sr = int(sr / new_sr_factor)
    
    #--- amplitude envelope: RMS of the phrase around every sample (the factor of sqrt(2) converts the RMS of a sine to its amplitude)
    env = librosa.feature.rms(y = seg, frame_length = 512, hop_length = 1, center = True)
    env = 1.3 * np.sqrt(2)*env[0, :len(x)]
    env[fnew == 0] = 0. # don't apply envelope where there was no pitch found

    #--- make envelope go to zero smoothly. This also takes care of the non-continous phase at jumps of f1 to 0
    env_segments = binary_array_to_seg_inds(env == 0)
    decay_time_sec = 0.05 #--- 50 msec decay time
    decay_time_samples = int(decay_time_sec * sr)
    for env_seg in env_segments:
        if env_seg[0] == 0:
            continue
        #--- linear fade-out over the decay time that precedes every zero segment
        ind_start = max(0, env_seg[0] - decay_time_samples)
        decay_len = env_seg[0] - ind_start
        decay_factor = np.linspace(1, 0, decay_len)
        env[ind_start: env_seg[0]] *= decay_factor    
    
    if spline_smoothing is not None:
        ts = t0 + np.arange(0, len(env)) / sr
        spl = UnivariateSpline(ts, env, s = spline_smoothing, k = 2)
        env = spl(ts)
         
    x *= env
    #--- scale the synthesized signal to the RMS of the phrase. Skip the scaling when one of the
    #--- signals is all zeros, otherwise the division fills x and env with NaN
    x_rms = np.sqrt((x**2).mean())
    seg_rms = np.sqrt((seg**2).mean())
    if x_rms > 0 and seg_rms > 0:
        gain = x_rms / seg_rms
        x /= gain
        env /= gain
    
    return x, env, f1

In [None]:
#--- synthesize the example phrase with a fixed number of harmonics (30), and play it
x, env, _ = phrase_to_synth(seg, sr, midi_p, t0, num_harmonics = 30, verbose=True)
#--- two identical rows (presumably played as a stereo signal with the same content on both channels)
xnew = np.c_[x, x].T #.3*seg[:len(x)]].T
ipd.display(ipd.Audio(xnew, rate=sr))

In [None]:
#--- synthesize the example phrase up to 16 kHz, with a spline-smoothed envelope, and play it
#--- NOTE this overwrites `env` returned by the previous cell
x1, env, _ = phrase_to_synth(seg, sr, midi_p, t0, max_freq_hz = 16000, spline_smoothing = 0.5, verbose=True)
xnew = np.c_[x1, x1].T #.3*seg[:len(x)]].T
ipd.display(ipd.Audio(xnew, rate=sr))

In [None]:
%matplotlib notebook 
fig, ax = plt.subplots(figsize = (12,2))
n = min(len(seg), len(x))
tsec = np.arange(n) / sr
plt.plot(tsec, seg[:n],'b.')
plt.plot(tsec, x[:n],'g.')
plt.plot(tsec,env,'r:')
#plt.xlim([0,0.05])
plt.show()

if True:
    for xx in [x,x1]:
        D = librosa.amplitude_to_db(np.abs(librosa.stft(xx, n_fft=2048, win_length=1024, hop_length=256)))
        fig, ax = plt.subplots(figsize = (8,4))
        yscale =  'linear' #'log' #
        img = librosa.display.specshow(D, x_axis='time', y_axis = yscale, ax=ax, sr=sr)
        fig.colorbar(img, ax=ax, format="%+2.f dB")

# Create synthetic (saw tooth) wavs for all phrases and save to disk (inc. pitch)

In [None]:
#--- exactly one of these should be None:
num_harmonics = None # 10
max_freq_hz = 16000
spline_smoothing = 0.5 # set to None for no smoothing of amplitude envelope

#--- the output folder name encodes the synthesis settings, e.g. '16k_spl0.5' or '10h'
suffix = f'{num_harmonics}h' if max_freq_hz is None else f'{int(max_freq_hz / 1000)}k'
suffix += '' if spline_smoothing is None else f'_spl{spline_smoothing}'

synth_out_dir = data_folder.replace('wavs_raw', f'wavs_synth_{suffix}')
pitch_out_dir = data_folder.replace('wavs_raw', 'pitch_synth')
print(f'writing synthesized wavs to {synth_out_dir}')
print(f'writing extracted pitch to {pitch_out_dir}')

os.makedirs(synth_out_dir, exist_ok = True)
os.makedirs(pitch_out_dir, exist_ok = True)

#--- iterate over files, and over phrases in an inner loop 
for ifnm, fnm in enumerate(flist):
    #if ifnm < 2:
    #    continue
    fnm_base = os.path.basename(fnm)
    #--- `midi_folder` is set in the cell that writes the FastPitch file list
    midi_fnm = fnm.replace('/wavs_raw/', f'/{midi_folder}/').replace('.wav', '.mid')
    if TEST_MODE and '_dynamic_mic' in midi_fnm:
        midi_fnm = midi_fnm.replace('_dynamic_mic', '')
        
    print(f'[{ifnm}] reading midi file {os.path.basename(midi_fnm)}')
    midi_df = read_midi_to_df(midi_fnm)
    verify_midi(midi_df)
    p_df = phrase_df.query("file_nm == @fnm_base").reset_index(drop = True)
    print(f'processing {p_df.shape[0]} phrases')
    for iphrs, phrs in p_df.iterrows():             
        #if iphrs < 610:
        #    continue
        wav_fnm = f'{out_dir}/{phrs.phrase_id}.wav'
        seg, sr = librosa.load(wav_fnm, sr = tgt_sr)
        t0 = phrs.sample_start / sr
        #--- the midi extraction is inside the try as well: a phrase whose midi cannot be
        #--- extracted is reported and skipped, instead of aborting the whole run
        try:
            midi_p = midi_phrase_from_dataframe(phrs, midi_df, sr)
            seg_synth, env, pitch = phrase_to_synth(seg, sr, midi_p, t0, 
                                                    num_harmonics = num_harmonics, 
                                                    max_freq_hz = max_freq_hz,
                                                    spline_smoothing = spline_smoothing,  
                                                    verbose = False)
        except Exception as e:
            print(f'phrase {iphrs} failed with error: {e}')
            continue
        #--- save synth signal and pitch
        fnm_out = f'{synth_out_dir}/{phrs.phrase_id}.wav'
        sf.write(fnm_out, seg_synth, sr, subtype = 'PCM_24')
        pitch_fnm_out = f'{pitch_out_dir}/{phrs.phrase_id}.pt'
        #--- the pitch is saved as a float32 tensor of shape (1, number of frames)
        pitch = torch.tensor(pitch[np.newaxis,:].astype(np.float32))
        torch.save(pitch, pitch_fnm_out)
        #break
    #break

In [None]:
#--- play the last processed phrase: synthesized signal (scaled by 0.5) in one row, original (scaled by 0.4) in the other
n = min(len(seg), len(seg_synth))
xnew = np.c_[.5*seg_synth[:n], .4*seg[:n]].T
ipd.display(ipd.Audio(xnew, rate=sr))

## Analyze the difference between pitch extracted from whole phrase vs. a segment
The algorithm uses probability of pitch-jumps so results may vary, depending on the segment start/end

In [None]:
#--- show the sample type of the last synthesized phrase
seg_synth.dtype

In [None]:
#--- play 'ewimididemo.wav' (a relative path, resolved against the notebook's working directory)
ipd.display(ipd.Audio('ewimididemo.wav', rate=sr))