In [21]:
import librosa
import numpy as np
import os
import argparse
import fnmatch
from tqdm import tqdm

In [16]:
# Command-line configuration for the notebook. Every option has a default, so
# the cell also works when no arguments are supplied.
parser = argparse.ArgumentParser(
    description='Split audio into multiple files and build fingerprints.')

# (short flag, long flag, value type, default) for each supported option.
_OPTION_TABLE = (
    ('-i', '--input', str, 'input'),         # folder scanned for source audio
    ('-o', '--output', str, 'output'),       # folder receiving samples and fingerprints
    ('-s', '--sr', int, 44100),              # sample rate
    ('-l', '--hop_length', int, 2048),       # hop length handed to the fingerprint transform
    ('-d', '--duration', float, 1),          # fingerprint length in seconds
)
for _short, _long, _kind, _fallback in _OPTION_TABLE:
    parser.add_argument(_short, _long, type=_kind, default=_fallback)

# parse_known_args() tolerates unrecognised arguments instead of exiting,
# which matters when the host process adds arguments of its own.
args, _ = parser.parse_known_args()

In [10]:
def list_all_files(directory, extensions=None):
    """Lazily yield the path of every file found beneath ``directory``.

    The tree is traversed with ``os.walk``. When ``extensions`` is given, a
    file is yielded only if its lower-cased extension (leading dot included,
    e.g. ``'.wav'``) is contained in ``extensions``; with ``None`` every file
    is yielded.
    """
    for folder, _subfolders, names in os.walk(directory):
        for name in names:
            suffix = os.path.splitext(name)[1]
            # Guard clause: skip files rejected by the extension filter.
            if extensions is not None and suffix.lower() not in extensions:
                continue
            yield os.path.join(folder, name)

In [11]:
def tweak_onsets(onset_frames, o_env):
    """Move each onset frame backwards to where the envelope stops descending.

    Starting from every frame in ``onset_frames``, step one frame back for as
    long as the preceding value of ``o_env`` is not greater than the current
    one, stopping at frame 0 at the latest.

    Returns a NumPy array with one adjusted frame index per input onset.
    """
    def _backtrack(frame):
        # Keep stepping back until frame 0 or until the previous envelope
        # value is strictly greater than the current one.
        while frame != 0 and not (o_env[frame - 1] > o_env[frame]):
            frame = frame - 1
        return frame

    return np.array([_backtrack(frame) for frame in onset_frames])

In [19]:
def get_fingerprint(y, duration=1, hop_length=2048, sr=44100):
    """Build a fixed-length constant-Q fingerprint for an audio clip.

    The signal is mixed down to mono, padded or trimmed to ``duration``
    seconds, normalised, and transformed with ``librosa.cqt``. The magnitude
    of the transform is flattened in column-major ('F') order.

    Parameters
    ----------
    y : np.ndarray
        Audio samples, passed through ``librosa.to_mono``.
    duration : float
        Target clip length in seconds. Fractional values are accepted.
    hop_length : int
        Hop length forwarded to ``librosa.cqt``.
    sr : int
        Sample rate of ``y``.

    Returns
    -------
    np.ndarray
        One-dimensional real-valued array of CQT magnitudes.
    """
    y = librosa.to_mono(y)
    # fix_length needs an integer sample count; duration * sr is a float as
    # soon as duration is (e.g. when supplied through a float-typed CLI flag).
    n_samples = int(round(duration * sr))
    # `size` is passed by keyword because newer librosa releases reject it
    # positionally; older releases accept the keyword form as well.
    y = librosa.util.fix_length(y, size=n_samples)
    y = librosa.util.normalize(y)
    cqt = librosa.cqt(y, sr=sr, hop_length=hop_length)
    # Take the magnitude so that a later cast to a real dtype does not
    # silently drop the imaginary part of a complex-valued transform. This
    # is a no-op on non-negative real output.
    return np.abs(cqt).flatten('F')

In [37]:
# Cut every input file at its detected onsets, write each slice to
# <output>/samples/<index>.wav and collect one fingerprint per slice.
fingerprints = []
filenames_original = []
filenames_samples = []

# Create the samples folder once, before any file is processed. An explicit
# existence check replaces the bare `except: pass`, so genuine failures
# (permissions, a file in the way, ...) are no longer swallowed.
samples_folder = os.path.join(args.output, 'samples')
if not os.path.isdir(samples_folder):
    os.makedirs(samples_folder)

for fn in list_all_files(args.input):
    # Load at the configured sample rate. The original read `sr` before it
    # was ever assigned, which only works with leftover kernel state.
    y, sr = librosa.load(fn, sr=args.sr)

    # `y` is passed by keyword: newer librosa releases require it.
    o_env = librosa.onset.onset_strength(y=y, sr=sr)
    onset_frames = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr)
    onset_frames = tweak_onsets(onset_frames, o_env)

    # Segment boundaries in samples: every onset plus the end of the file,
    # so that the last onset also gets a closing boundary.
    onset_samples = librosa.frames_to_samples(onset_frames)
    onset_samples = np.append(onset_samples, len(y))
    starts = onset_samples[0:-1]
    stops = onset_samples[1:]

    # zip() has no length on Python 3, so give tqdm the total explicitly.
    for (start, stop) in tqdm(zip(starts, stops), total=len(starts), desc=fn, leave=True):
        sample = y[start:stop]
        # Slices are numbered globally across all input files.
        i = len(fingerprints)
        sample_fn = os.path.join(samples_folder, str(i) + '.wav')
        # NOTE(review): librosa.output.write_wav is not available in newer
        # librosa releases; confirm the installed version provides it.
        librosa.output.write_wav(sample_fn, sample, sr)
        fingerprint = get_fingerprint(sample, duration=args.duration, hop_length=args.hop_length, sr=sr)
        fingerprints.append(fingerprint.astype(np.float32))
        filenames_original.append(fn)
        filenames_samples.append(sample_fn)

# Stack the per-slice fingerprints into one array, one row per slice.
fingerprints = np.array(fingerprints)

input/crosstown-traffic.ogg: 100%|██████████| 344/344 [00:08<00:00, 41.13it/s]
input/freedom.ogg: 100%|██████████| 493/493 [00:12<00:00, 40.07it/s]
input/spanish-castle-magic.ogg: 100%|██████████| 253/253 [00:06<00:00, 41.05it/s]


In [38]:
# Persist the results in the output folder: the fingerprint matrix as a binary
# .npy file and the two filename lists as text files, one entry per line.
np.save(os.path.join(args.output, 'fingerprints.npy'), fingerprints)
for _txt_name, _entries in (
        ('filenames_original.txt', filenames_original),
        ('filenames_samples.txt', filenames_samples)):
    np.savetxt(os.path.join(args.output, _txt_name), _entries, fmt='%s')