In [64]:
import sound_to_midi as sm
import sys
import numpy as np
import librosa
import midiutil
import os
from scipy import signal
import scipy
import random
import time
from datetime import timedelta as td
import music21
import scipy.io.wavfile
import wave
import struct
import soundfile
import torch
import torchaudio

In [3]:
def transition_matrix(
        note_min: str,
        note_max: str,
        p_stay_note: float,
        p_stay_silence: float) -> np.array:

    midi_min = librosa.note_to_midi(note_min)
    midi_max = librosa.note_to_midi(note_max)
    n_notes = midi_max - midi_min + 1
    p_l = (1 - p_stay_silence) / n_notes
    p_ll = (1 - p_stay_note) / (n_notes + 1)

    # Transition matrix:
    # State 0 = silence
    # States 1, 3, 5... = onsets
    # States 2, 4, 6... = sustains
    transmat = np.zeros((2 * n_notes + 1, 2 * n_notes + 1))

    # State 0: silence
    transmat[0, 0] = p_stay_silence
    for i in range(n_notes):
        transmat[0, (i * 2) + 1] = p_l

    # States 1, 3, 5... = onsets
    for i in range(n_notes):
        transmat[(i * 2) + 1, (i * 2) + 2] = 1

    # States 2, 4, 6... = sustains
    for i in range(n_notes):
        transmat[(i * 2) + 2, 0] = p_ll
        transmat[(i * 2) + 2, (i * 2) + 2] = p_stay_note
        for j in range(n_notes):
            transmat[(i * 2) + 2, (j * 2) + 1] = p_ll

    return transmat


def prior_probabilities(
        audio_signal: np.array,
        note_min: str,
        note_max: str,
        srate: int,
        frame_length: int = 2048,
        hop_length: int = 512,
        pitch_acc: float = 0.9,
        voiced_acc: float = 0.9,
        onset_acc: float = 0.9,
        spread: float = 0.2) -> np.array:

    fmin = librosa.note_to_hz(note_min)
    fmax = librosa.note_to_hz(note_max)
    midi_min = librosa.note_to_midi(note_min)
    midi_max = librosa.note_to_midi(note_max)
    n_notes = midi_max - midi_min + 1

    # pitch and voicing
    pitch, voiced_flag, _ = librosa.pyin(
        y=audio_signal, fmin=fmin * 0.9, fmax=fmax * 1.1,
        sr=srate, frame_length=frame_length, win_length=int(frame_length / 2),
        hop_length=hop_length)
    tuning = librosa.pitch_tuning(pitch)
    f0_ = np.round(librosa.hz_to_midi(pitch - tuning)).astype(int)
    onsets = librosa.onset.onset_detect(
        y=audio_signal, sr=srate,
        hop_length=hop_length, backtrack=True)

    priors = np.ones((n_notes * 2 + 1, len(pitch)))

    for n_frame in range(len(pitch)):
        # probability of silence or onset = 1-voiced_prob
        # Probability of a note = voiced_prob * (pitch_acc) (estimated note)
        # Probability of a note = voiced_prob * (1-pitch_acc) (estimated note)
        if not voiced_flag[n_frame]:
            priors[0, n_frame] = voiced_acc
        else:
            priors[0, n_frame] = 1 - voiced_acc

        for j in range(n_notes):
            if n_frame in onsets:
                priors[(j * 2) + 1, n_frame] = onset_acc
            else:
                priors[(j * 2) + 1, n_frame] = 1 - onset_acc

            if j + midi_min == f0_[n_frame]:
                priors[(j * 2) + 2, n_frame] = pitch_acc

            elif np.abs(j + midi_min - f0_[n_frame]) == 1:
                priors[(j * 2) + 2, n_frame] = pitch_acc * spread

            else:
                priors[(j * 2) + 2, n_frame] = 1 - pitch_acc

    return priors


def states_to_pianoroll(states: list, note_min: str, hop_time: float) -> list:
    midi_min = librosa.note_to_midi(note_min)

    states_ = np.hstack((states, np.zeros(1)))

    # possible types of states
    silence = 0
    onset = 1
    sustain = 2

    my_state = silence
    output = []

    last_onset = 0
    last_offset = 0
    last_midi = 0
    for i, _ in enumerate(states_):
        if my_state == silence:
            if int(states_[i] % 2) != 0:
                # Found an onset!
                last_onset = i * hop_time
                last_midi = ((states_[i] - 1) / 2) + midi_min
                last_note = librosa.midi_to_note(last_midi)
                my_state = onset

        elif my_state == onset:
            if int(states_[i] % 2) == 0:
                my_state = sustain

        elif my_state == sustain:
            if int(states_[i] % 2) != 0:
                # Found an onset.
                # Finish last note
                last_offset = i * hop_time
                my_note = [last_onset, last_offset, last_midi, last_note]
                output.append(my_note)

                # Start new note
                last_onset = i * hop_time
                last_midi = ((states_[i] - 1) / 2) + midi_min
                last_note = librosa.midi_to_note(last_midi)
                my_state = onset

            elif states_[i] == 0:
                # Found silence. Finish last note.
                last_offset = i * hop_time
                my_note = [last_onset, last_offset, last_midi, last_note]
                output.append(my_note)
                my_state = silence

    return output


def pianoroll_to_midi(bpm: float, pianoroll: list) -> midiutil.MIDIFile():
    quarter_note = 60 / bpm

    onsets = np.array([p[0] for p in pianoroll])
    offsets = np.array([p[1] for p in pianoroll])

    onsets = onsets / quarter_note
    offsets = offsets / quarter_note
    durations = offsets - onsets

    midi = midiutil.MIDIFile(1)
    midi.addTempo(0, 0, bpm)

    for i, _ in enumerate(onsets):
        midi.addNote(
            0, 0, int(pianoroll[i][2]), onsets[i], durations[i], 100)

    return midi

def wave_to_midi(
        audio_signal: np.array,
        srate: int = 16000,
        frame_length: int = 2048,
        hop_length: int = 512,
        note_min: str = "A2",
        note_max: str = "E8",
        p_stay_note: float = 0.9,
        p_stay_silence: float = 0.7,
        pitch_acc: float = 0.9,
        voiced_acc: float = 0.9,
        onset_acc: float = 0.9,
        spread: float = 0.2) -> midiutil.MIDIFile():
    transmat = transition_matrix(note_min, note_max, p_stay_note, p_stay_silence)
    priors = prior_probabilities(
        audio_signal,
        note_min,
        note_max,
        srate,
        frame_length,
        hop_length,
        pitch_acc,
        voiced_acc,
        onset_acc,
        spread)
    p_init = np.zeros(transmat.shape[0])
    p_init[0] = 1
    states = librosa.sequence.viterbi(priors, transmat, p_init=p_init)

    pianoroll = states_to_pianoroll(states, note_min, hop_length / srate)
    bpm = librosa.beat.tempo(y=audio_signal)[0]
    midi = pianoroll_to_midi(bpm, pianoroll)

    return midi

def f_high(y,sr):
    b,a = signal.butter(10, 2000/(sr/2), btype='highpass')
    yf = signal.lfilter(b,a,y)
    return yf

def _stft(y, n_fft, hop_length, win_length):
    return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)

def _istft(y, hop_length, win_length):
    return librosa.istft(y, hop_length, win_length)

def _amp_to_db(x):
    return librosa.core.amplitude_to_db(x, ref=1.0, amin=1e-20, top_db=80.0)

def _db_to_amp(x,):
    return librosa.core.db_to_amplitude(x, ref=1.0)

def removeNoise(
    audio_clip,
    noise_clip,
    n_grad_freq=2,
    n_grad_time=4,
    n_fft=2048,
    win_length=2048,
    hop_length=512,
    n_std_thresh=1.5,
    prop_decrease=1.0,
    verbose=False,
    visual=False,
):
    if verbose:
        start = time.time()
    noise_stft = _stft(noise_clip, n_fft, hop_length, win_length)
    noise_stft_db = _amp_to_db(np.abs(noise_stft))  # convert to dB
    mean_freq_noise = np.mean(noise_stft_db, axis=1)
    std_freq_noise = np.std(noise_stft_db, axis=1)
    noise_thresh = mean_freq_noise + std_freq_noise * n_std_thresh
    if verbose:
        print("STFT on noise:", td(seconds=time.time() - start))
        start = time.time()
    if verbose:
        start = time.time()
    sig_stft = _stft(audio_clip, n_fft, hop_length, win_length)
    sig_stft_db = _amp_to_db(np.abs(sig_stft))
    if verbose:
        print("STFT on signal:", td(seconds=time.time() - start))
        start = time.time()
    mask_gain_dB = np.min(_amp_to_db(np.abs(sig_stft)))
    smoothing_filter = np.outer(
        np.concatenate(
            [
                np.linspace(0, 1, n_grad_freq + 1, endpoint=False),
                np.linspace(1, 0, n_grad_freq + 2),
            ]
        )[1:-1],
        np.concatenate(
            [
                np.linspace(0, 1, n_grad_time + 1, endpoint=False),
                np.linspace(1, 0, n_grad_time + 2),
            ]
        )[1:-1],
    )
    smoothing_filter = smoothing_filter / np.sum(smoothing_filter)
    db_thresh = np.repeat(
        np.reshape(noise_thresh, [1, len(mean_freq_noise)]),
        np.shape(sig_stft_db)[1],
        axis=0,
    ).T
    sig_mask = sig_stft_db < db_thresh
    if verbose:
        print("Masking:", td(seconds=time.time() - start))
        start = time.time()
    sig_mask = scipy.signal.fftconvolve(sig_mask, smoothing_filter, mode="same")
    sig_mask = sig_mask * prop_decrease
    if verbose:
        print("Mask convolution:", td(seconds=time.time() - start))
        start = time.time()
    sig_stft_db_masked = (
        sig_stft_db * (1 - sig_mask)
        + np.ones(np.shape(mask_gain_dB)) * mask_gain_dB * sig_mask
    )  # mask real
    sig_imag_masked = np.imag(sig_stft) * (1 - sig_mask)
    sig_stft_amp = (_db_to_amp(sig_stft_db_masked) * np.sign(sig_stft)) + (
        1j * sig_imag_masked
    )
    if verbose:
        print("Mask application:", td(seconds=time.time() - start))
        start = time.time()
    # recover the signal
    recovered_signal = _istft(sig_stft_amp, hop_length, win_length)
    recovered_spec = _amp_to_db(
        np.abs(_stft(recovered_signal, n_fft, hop_length, win_length))
    )
    if verbose:
        print("Signal recovery:", td(seconds=time.time() - start))
    return recovered_signal

In [10]:
file_in = 'C:/Users/jakes/Music/PythonStems/Eros/vocals.wav'
path_out = 'C:/Users/jakes/Music/PythonMIDIs/Eros/'
path_out2 = 'C:/Users/jakes/Music/PythonStems/Eros/'
file_out = 'vocalMIDI.mid'
file_out2 = 'cleanvocals.wav'
y, sr = librosa.load(file_in, sr=16000)
print("Audio file loaded!")
noise1 = y[0:1*sr]
yadj = removeNoise(audio_clip=y, noise_clip=noise1,
    n_grad_freq=2,
    n_grad_time=4,
    n_fft=2048,
    win_length=2048,
    hop_length=512,
    n_std_thresh=5,
    prop_decrease=1.0,
    verbose=False,
    visual=False)
try:
    os.chdir(path_out2)
except (FileNotFoundError,FileExistsError):
    os.makedirs(path_out2)
soundfile.write(path_out2+file_out2, yadj, 16000)
print("Noise removal finished!")
midi = wave_to_midi(yadj, srate=sr)
print("Conversion finished!")
try:
    os.chdir(path_out)
except (FileNotFoundError,FileExistsError):
    os.makedirs(path_out)
with open(os.path.join(path_out, file_out), 'wb') as fp:
    midi.writeFile(fp)
print("Done generating MIDI")
score = music21.converter.parse('C:/Users/jakes/Music/PythonMIDIs/Calling/vocalMIDI.mid')
key = score.analyze('key')
print(key.tonic.name, key.mode)

Audio file loaded!


  return librosa.istft(y, hop_length, win_length)


Noise removal finished!


  elif np.abs(j + midi_min - f0_[n_frame]) == 1:


Conversion finished!
Done generating MIDI
C# minor
