# Locality Sensitive Hashing for audio identification

# Libraries

In [834]:
import pandas as pd
import numpy as np
import os
import librosa
from tqdm import tqdm,trange
import pickle


# This Function is used to extract MFCC of Audios

In [835]:
def extract_mfcc(path):
    """Load an audio file and return a binarised MFCC summary vector.

    The MFCC matrix returned by librosa is reduced along axis 1 to a mean
    and a standard deviation; the two vectors are concatenated, and every
    entry strictly greater than the median of the concatenated vector
    becomes 1 while all other entries become 0.

    Parameters
    ----------
    path : path of the audio file, passed straight to ``librosa.load``.

    Returns
    -------
    numpy.ndarray of ints (0/1).
    """
    samples, sample_rate = librosa.load(path)
    coefficients = librosa.feature.mfcc(y=samples, sr=sample_rate)
    summary = np.concatenate(
        [np.mean(coefficients, axis=1), np.std(coefficients, axis=1)]
    )
    # Median thresholding turns the real-valued summary into a bit vector.
    cutoff = np.median(summary)
    return (summary > cutoff).astype(int)

# The random permutations are generated by this function

In [836]:
def genrate_random_perm(mfcc_length,number_permutation):
    """Return ``number_permutation`` distinct random permutations.

    Each permutation is a NumPy array containing the integers
    ``0 .. mfcc_length - 1`` in random order, drawn with
    ``np.random.permutation`` (so the global NumPy random state is used).

    Parameters
    ----------
    mfcc_length : length of every permutation (length of the binary vector
        that will later be hashed).
    number_permutation : how many distinct permutations to produce.

    Returns
    -------
    list of numpy.ndarray, all pairwise different.

    Raises
    ------
    ValueError
        If more distinct permutations are requested than exist for
        ``mfcc_length`` (the loop could otherwise never finish).
    """
    import math  # local import: only needed for the feasibility check

    if number_permutation > math.factorial(mfcc_length):
        raise ValueError(
            "cannot draw %d distinct permutations of %d elements"
            % (number_permutation, mfcc_length)
        )

    random_permutation = []
    seen = set()  # tuples of permutations already accepted, for O(1) duplicate checks
    while len(random_permutation) < number_permutation:
        candidate = np.random.permutation(mfcc_length)
        fingerprint = tuple(candidate.tolist())
        if fingerprint in seen:
            # Same ordering drawn twice: discard it and draw again.
            continue
        seen.add(fingerprint)
        random_permutation.append(candidate)
    return random_permutation
        

# genrate_hash takes the binary MFCC vector and the permutations and returns, for each permutation, the minimum permuted value among the positions where the vector is 1

In [914]:
def genrate_hash(mfcc,random_permutation,number_permutation):
    """Compute a min-hash signature for a binary vector.

    For each of the first ``number_permutation`` permutations, the values
    of the permutation at the positions where ``mfcc`` equals 1 are
    collected and the smallest one is appended to the signature.

    Parameters
    ----------
    mfcc : one-dimensional 0/1 vector (NumPy array, pandas Series or plain
        sequence); positions are taken positionally, not by label.
    random_permutation : sequence of permutations, indexable by ``int``.
    number_permutation : number of permutations to use.

    Returns
    -------
    list with one minimum value per permutation.

    Raises
    ------
    ValueError
        If ``mfcc`` has no entry equal to 1, because no minimum exists.
    """
    if number_permutation <= 0:
        return []

    # The positions of the ones do not depend on the permutation, so they
    # are located once instead of once per permutation.
    active_positions = np.flatnonzero(np.asarray(mfcc) == 1)
    if active_positions.size == 0:
        raise ValueError(
            "mfcc contains no entries equal to 1; cannot compute a hash signature"
        )

    return [
        np.min(np.take(random_permutation[i], active_positions))
        for i in range(number_permutation)
    ]

# The buckets function divides the hash signature into bands and stores the file name under each band's key in LSH_table

In [915]:
def buckets(LSH_table,hash_feature,fileName,n_rows=4,n_bands=None):
    """Insert ``fileName`` into ``LSH_table`` under every band key of its signature.

    The signature is cut into consecutive bands of ``n_rows`` values; the
    key of a band is the sum of its values. File names that share a key are
    stored as one comma-separated string, which is the format ``query``
    splits on.

    Parameters
    ----------
    LSH_table : dict mapping band key -> comma-separated file names;
        modified in place.
    hash_feature : hash signature (sequence of numbers) of the file.
    fileName : name stored in the table (a string, since names are joined
        with ",").
    n_rows : number of signature values per band. The default of 4 is the
        band width that ``query`` uses when it rebuilds the keys.
    n_bands : optional cap on the number of bands that are inserted;
        ``None`` means every band of the signature.

    Returns
    -------
    None
    """
    band_starts = range(0, len(hash_feature), n_rows)
    if n_bands is not None:
        band_starts = band_starts[:n_bands]

    for start in band_starts:
        # Sum the next n_rows values of the signature to form the band key.
        band_key = sum(hash_feature[start:start + n_rows])
        existing = LSH_table.get(band_key)
        if existing is None:
            # First file seen for this key.
            LSH_table[band_key] = fileName
        else:
            # Key already present: append to the comma-separated list.
            LSH_table[band_key] = existing + "," + fileName


# The query function takes an audio file and a threshold value and compares the audio against the values stored in LSH_table

In [916]:
def query(audio,threshold,random_permutation,number_permutation,LSH_table,hashing_dict):
    """Find indexed audio files similar to ``audio`` and return a text report.

    Steps:
      1. hash the query audio with ``extract_mfcc`` and ``genrate_hash``;
      2. rebuild the band keys (sum of every 4 consecutive signature values,
         the same scheme ``buckets`` uses by default) and collect all file
         names stored under those keys in ``LSH_table``;
      3. for every candidate, compute the fraction of signature positions
         that agree with the query signature;
      4. keep candidates whose fraction is at least ``threshold`` and format
         them with ``printAnswer``.

    Parameters
    ----------
    audio : path of the query audio file.
    threshold : minimum similarity (0..1) for a candidate to be reported.
    random_permutation : permutations used to hash the indexed files.
    number_permutation : number of permutations to use for the signature.
    LSH_table : dict mapping band key -> comma-separated file names.
    hashing_dict : dict mapping file name -> stored hash signature.

    Returns
    -------
    list produced by ``printAnswer``; when nothing reaches the threshold it
    contains the "no match" message instead of raising.
    """
    band_size = 4  # must equal the band width used when the table was built

    mfcc = extract_mfcc(audio)  # binary MFCC vector of the query
    queryHash_feature = genrate_hash(mfcc, random_permutation, number_permutation)
    signature_length = len(queryHash_feature)

    # One key per band of the query signature.
    keys = [
        sum(queryHash_feature[start:start + band_size])
        for start in range(0, signature_length, band_size)
    ]

    # Union of all files that share at least one band key with the query.
    candidates = set()
    for key in keys:
        stored_names = LSH_table.get(key)
        if stored_names is None:
            continue
        candidates.update(stored_names.split(","))

    similarites = {}
    # Sorted so that ties between equally similar files resolve the same way
    # on every run (set iteration order is not stable across processes).
    for name in sorted(candidates):
        dataBaseAudio_Hash = hashing_dict.get(name)
        if dataBaseAudio_Hash is None:
            # Listed in the table but without a stored signature: cannot score it.
            continue
        matches = sum(
            1
            for stored_value, query_value in zip(dataBaseAudio_Hash, queryHash_feature)
            if stored_value == query_value
        )
        # Normalise by the actual signature length rather than a fixed 20.
        jaccardSimilarity = matches / signature_length
        if jaccardSimilarity >= threshold:
            similarites[name] = jaccardSimilarity

    # printAnswer copes with an empty dict, so no max() over the matches is
    # taken here (it raised ValueError whenever nothing matched).
    return printAnswer(similarites)
    

In [917]:
def printAnswer(dic):
    """Format similarity results as a list of message strings.

    Parameters
    ----------
    dic : dict mapping file name -> similarity in the range 0..1.

    Returns
    -------
    list of str
        * empty ``dic``: a single "no match" message;
        * otherwise: the best match with its similarity as a percentage, a
          separator line, and one recommendation line that names the second
          and third best matches when they exist (or a fixed fallback
          suggestion when there is only one match).
    """
    if not dic:
        return ['There is no Audio Matched Try changing the Threshold Value']

    # Highest similarity first; sorted() is stable, so ties keep dict order.
    ranked = sorted(dic.items(), key=lambda x: x[1], reverse=True)
    names = [name for name, _ in ranked]
    best_name, best_score = ranked[0]

    final_output = [
        "Nearest Audio could be "+str(best_name)+" with Jaccard Similarity "+ str(best_score*100)+"%",
        '------------------------------------------------------------------',
    ]

    if len(names) >= 3:
        # Two runner-ups are available only when there are at least three matches.
        final_output.append(
            "You Will Also Like these Audios: " + str(names[1]) + " and " + str(names[2])
        )
    elif len(names) == 2:
        final_output.append("You Will Also Like these Audio: " + str(names[1]))
    else:
        final_output.append('You Have Unique Taste Try this: 002096.mp3')
    return final_output
    
        
        
        
    

# Int Main

In [899]:
# Parameters and containers shared by the indexing and query cells
mfcc_length, number_permutation = 40, 20  # vector length (1-dimensional); number of permutation columns
LSH_table = dict()           # band key -> comma-separated file names
hashing_dict = dict()        # file name -> hash signature
random_permutation = list()  # will hold the random permutations

# Generate the random_permutation list once and pickle it (uncomment for the first run, before random_perm.pickle exists)
# random_permutation=genrate_random_perm(mfcc_length,number_permutation)
# # Open a file for writing in binary mode
# with open('random_perm.pickle', 'wb') as file:
#     pickle.dump(random_permutation, file)


In [900]:
import pickle

# Open the file for reading in binary mode and restore the list of random
# permutations into `random_permutation`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load random_perm.pickle when it comes from a trusted source.
with open('random_perm.pickle', 'rb') as file:
    random_permutation = pickle.load(file)



# This cell adds all audios to the LSH table

Before running the cell below, make sure the audioMFCC.csv file is in the same folder

In [902]:
import pandas as pd
# Column 0 is used as the file name; the remaining mfcc_length columns are
# the feature values of that file.
columns=np.arange(0,mfcc_length+1)
df=pd.read_csv("audioMFCC.csv",names=columns)
for i in range(0,len(df.index)):
    file_name=df.iloc[i,0]
    audios=df.iloc[i][1:]
    # Binarise the row exactly like extract_mfcc does: 1 where the value is
    # strictly above the row median, 0 elsewhere.
    threshold = np.median(audios)
    bin_mfcc = (audios > threshold).astype(int)
    binary_audios=bin_mfcc
    hash_feature=genrate_hash(binary_audios,random_permutation,number_permutation)
    # Keep the full signature for the similarity computation in query().
    hashing_dict[file_name]=hash_feature
    # buckets() is called with its three required arguments; the previous
    # call passed two extra names (n_rows, n_bands) that are never defined.
    buckets(LSH_table,hash_feature,file_name)

In [903]:
# Persist both lookup structures, each to its own pickle file (binary mode)
for pickle_path, payload in (
    ('LSH_table.pickle', LSH_table),
    ('hashing_dict.pickle', hashing_dict),
):
    with open(pickle_path, 'wb') as file:
        pickle.dump(payload, file)

In [904]:
# Open the files for reading in binary mode and restore the bucket table
# (LSH_table) and the per-file hash signatures (hashing_dict).
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load these files when they come from a trusted source.
with open('LSH_table.pickle', 'rb') as file:
    LSH_table = pickle.load(file)
with open('hashing_dict.pickle', 'rb') as file:
    hashing_dict = pickle.load(file)
    

# This is the query step, where you only have to provide the path of the test audio

In [905]:
threshold=0.9  # Jaccard similarity threshold used to compare the test audio with the indexed audios
# NOTE(review): `audio_files` is not defined anywhere in the visible file;
# presumably it is a list of audio paths created elsewhere. Confirm it
# exists before running this cell.
# `q` receives the list of message strings returned by query().
q=query(audio_files[2],threshold,random_permutation,number_permutation,LSH_table,hashing_dict)