# Audio similarity - prototype

## MFCC descriptor + DTW distance

In [1]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
from dtw import dtw, accelerated_dtw
from numpy.linalg import norm
import numpy as np
import math
import time

### Extract MFCC feature

In [4]:
def extract_mfcc(file):
    signal, sr = librosa.load('data/' + file)
    mfcc = librosa.feature.mfcc(signal, n_mfcc=13, sr=sr)
    return mfcc

### Split one audio track into multiple samples

In [306]:
N = 400 # Sample size

def split_mfcc(mfcc):
    n_samples = math.floor(mfcc.shape[1] / SAMPLE_SIZE)
    last_sample_size = mfcc.shape[1] % SAMPLE_SIZE
#    print('Splitting into %d samples of size %d. Last sample has size %d.' % (n_samples, SAMPLE_SIZE, last_sample_size))
    
    samples = []
    for i in range(n_samples):
        samples.append(mfcc[:, i*N:(i+1)*N])
    samples.append(mfcc[:, n_samples*N:])
    
    return samples

### Build reference database of audio samples

In [299]:
def build_db(files):
    db = []
    
    for file in files:
        mfcc = extract_mfcc(file)
        samples = split_mfcc(mfcc)

        for sample in samples:
            db.append({ 'file': file, 'mfcc': sample })
    
    print("Added %d samples of %d audiotracks" % (len(db), len(files)))
    return np.array(db, dtype=object)

### Distance measure

In [300]:
def distance(a, b):
    dist, cost, acc_cost, path = dtw(a.T, b.T, dist=lambda x, y: norm(x - y, ord=1))
    return dist

### Find the most similar match

In [301]:
def find_match(mfcc, db):
    start = time.time()
    matches = []
    
    # Loop over all samples in the database and find the best match
    for sample in db:
        dist = None
        
        # Align both samples to the same size
        if mfcc.shape[1] == sample['mfcc'].shape[1]:
            dist = distance(mfcc, sample['mfcc'])
        elif mfcc.shape[1] < sample['mfcc'].shape[1]:
            dist = distance(mfcc, sample['mfcc'][:, mfcc.shape[1]])
        else:
            dist = distance(mfcc[:, sample['mfcc'].shape[1]], sample['mfcc'])
        
        matches.append({ 'file': sample['file'], 'dist': dist })

    matches = sorted(matches, key=lambda x: x['dist'])
    print("Time: %.2f s" % (time.time() - start))
    return matches

### Run

In [307]:
files = ['sample_1.wav', 'dubstep_beat_1.wav', 'dubstep_drum.mp3', 'trance.m4a']
db = build_db(files)



Added 6 samples of 4 audiotracks


In [308]:
mfcc = extract_mfcc('recorded_sample_1.m4a')[:, :N]
find_match(mfcc, db)



Time: 4.62 s


[{'file': 'sample_1.wav', 'dist': 35945.62905883789},
 {'file': 'dubstep_beat_1.wav', 'dist': 86426.36378479004},
 {'file': 'sample_1.wav', 'dist': 119476.18923950195},
 {'file': 'trance.m4a', 'dist': 148126.52734375},
 {'file': 'dubstep_drum.mp3', 'dist': 149414.47550964355},
 {'file': 'dubstep_beat_1.wav', 'dist': 209982.72225952148}]

In [9]:
mfcc = extract_mfcc('guitar.wav')

In [12]:
len(mfcc[0])

2905

In [310]:
mfcc = extract_mfcc('trance_2.m4a')[:, :N]
find_match(mfcc, db)



Time: 4.61 s


[{'file': 'sample_1.wav', 'dist': 37445.249908447266},
 {'file': 'dubstep_beat_1.wav', 'dist': 85355.85360717773},
 {'file': 'trance.m4a', 'dist': 141331.39974975586},
 {'file': 'dubstep_drum.mp3', 'dist': 149616.13061523438},
 {'file': 'sample_1.wav', 'dist': 152272.89219665527},
 {'file': 'dubstep_beat_1.wav', 'dist': 178349.8076171875}]

## Fingerprinting

### Fingeprint audiotrack using Chromaprint library / tool

In [50]:
def parse_chromaprint_output(output):
    output = output.decode('utf-8')
    key = 'FINGERPRINT='
    strip_index = output.find(key) + len(key)
    return list(map(int, output[strip_index:-1].split(',')))

In [3]:
def fingeprint_file(path):
    output = !fpcalc $path  -raw
    return parse_chromaprint_output(output)

In [51]:
import subprocess

def fingeprint_file(path):
        cmd = 'fpcalc %s -raw' % path
        process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
        output, error = process.communicate()

        if error:
            raise Exception(error)
        return parse_chromaprint_output(output)

In [53]:
fp1 = fingeprint_file('data/sample_1.wav')

In [54]:
fp2 = fingeprint_file('data/trance.m4a')

### Compare fingeprints

Source: https://gist.github.com/lalinsky/1132166

Another way with correlation: https://medium.com/@shivama205/audio-signals-comparison-23e431ed2207

In [55]:
popcnt_table_8bit = [
    0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
    1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
    1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
    2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
    1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
    2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
    2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
    3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8,
]

def popcnt(x):
    """
    Count the number of set bits in the given 32-bit integer.
    """
    return (popcnt_table_8bit[(x >>  0) & 0xFF] +
            popcnt_table_8bit[(x >>  8) & 0xFF] +
            popcnt_table_8bit[(x >> 16) & 0xFF] +
            popcnt_table_8bit[(x >> 24) & 0xFF])

In [56]:
def compare_fingerprints(fp1, fp2):
    error = 0
    for x, y in zip(fp1, fp2):
        error += popcnt(x ^ y)
    return 1.0 - error / 32.0 / min(len(fp1), len(fp2))

In [57]:
sim = compare_fingerprints(fp1, fp2)
print("Similarity: %f"% sim)

Similarity: 0.482031
