In [1]:
import librosa
import numpy as np

# Valor arbitrario. Tiene que ser chico, se puede probar con otros a ver como da
DELTA_SPECTRUM_DELTA = 0.05

# Valor arbitrario del paper, se pueden probar otros
SPECTRAL_ROLLOFF_THRESHOLD = 0.85

def sign(val):
  if val >= 0:
    return 1
  else:
    return -1

# TODO: Implementacion ad_hoc, ver como se calcula la resolution real
def DFT_resolution(signal, sampling_rate):
  return int(
    np.ceil(
      sampling_rate / len(signal)
    )
  )

def sample_spectrum(frecuency, i, t, delta):
  return np.log(
    frecuency[i, t]
  ) + delta

# drecado, tambien estaba en librosa
def zero_crossing_rate(signal):
  rate = 0

  for i in range(1, len(signal)):
    rate = rate + np.absolute(
      sign( signal[i] ) 
      - 
      sign( signal[i - 1] ) 
    )

  return rate / len(signal - 1)

# TODO: Terminar, como se calcula K? (resolution of the DFT)
def delta_spectrum(signal, sampling_rate, t_frame):
  res = 0
  DFT_res   = DFT_resolution(signal, sampling_rate)
  frecuency = np.abs(librosa.stft(signal))

  for i in range(0, DFT_res):
    res = res + np.square(
      sample_spectrum(frecuency, i, t_frame, DELTA_SPECTRUM_DELTA)
      -
      sample_spectrum(frecuency, i, t_frame-1, DELTA_SPECTRUM_DELTA)
    )
  
  return res / (DFT_res - 1)

# TODO: terminar, como se calcula K? (resolution of the DFT)
def spectral_rolloff(signal, sampling_rate, t_frame):
  res   = 0
  upper_bound = 0
  DFT_res = DFT_resolution(signal, sampling_rate)
  frecuency = np.abs(librosa.stft(signal))

  for i in range(0, DFT_res):
    upper_bound = upper_bound + frecuency[i, t_frame]

  upper_bound = upper_bound * SPECTRAL_ROLLOFF_THRESHOLD

  for i in range(0, DFT_res):
    proximo_val = res + frecuency[i, t_frame]
    
    if proximo_val >= upper_bound:
      return res
    else:
      res = proximo_val

def generate_delta_spectrum_feature_vector(signal, sampling_rate):
  frecuency  = librosa.stft(signal)
  _, columns = frecuency.shape 
  feature_vector = np.array([])

  for i in range(columns):
    feature_vector = np.append(
      feature_vector,
      delta_spectrum(
        signal, 
        sampling_rate,
        i
      )
    )

  return feature_vector

# @deprecated(version='1.2.1', reason="Habia una hecha en librosa")
def generate_spectral_rolloff_feature_vector(signal, sampling_rate):
  frecuency  = librosa.stft(signal)
  _, columns = frecuency.shape 
  feature_vector = np.array([], dtype=float)

  for i in range(columns):
    feature_vector = np.append(
      feature_vector,
      spectral_rolloff(
        signal, 
        sampling_rate, 
        i
      )
    )

  return feature_vector


# Nota: transponemos los features para tenerlos como columnas (ver paper)
def generate_audio_features(signal, sampling_rate):
  return np.array([
    # Primeros features
    librosa.feature.zero_crossing_rate(signal)[0],
    # np.array(generate_delta_spectrum_feature_vector(signal, sampling_rate)),
    librosa.feature.spectral_rolloff(signal, sampling_rate)[0],

    # Segundos features
    librosa.feature.spectral_centroid(y=signal, sr=sampling_rate)[0],
    librosa.feature.spectral_flatness(y=signal)[0]
#     librosa.feature.chroma_cqt(y=y, sr=sr)[0]
  ])

In [15]:
import sklearn.decomposition
from  sklearn.decomposition import NMF
import librosa

from os import listdir, mkdir
from os.path import isfile, join, splitext, getsize

from numpy import linalg

def concatenate(a, b):
    if a.size == 0: return b
    return np.concatenate((a, b))

def containsAll(substrings, string):
    res = True
    for s in substrings:
        res = res and (s in string)
    return res


def containsAny(substrings, string):
    res = False
    for s in substrings:
        res = res or (s in string)
    return res

def getConcat(arr):
    res = ""
    for a in arr:
        res = res + "_" + a
    return res

def generate_stacked_new_row(a, b):
    if a.size == 0: return b # Caso especial si a no tiene elems todavia
    return np.vstack( (a, b) )


def filter_valid_names(names, substrings, not_substrings):
	res = []

	for name in names:
		if containsAll(substrings, name) and not containsAny(not_substrings, name):
			res.append(name)

	return res

def getFiles(directory, substrings, not_substrings):
    # Pruebo levantar por size
    all_paths = [
        file_path 
            for file_path 
            in listdir(directory) 
                if isfile(join(directory, file_path))
    ]

    all_paths = filter_valid_names(all_paths, substrings, not_substrings)

    return all_paths

def get_audio_features(file_path):
    y, sr = librosa.load(file_path)
    return generate_audio_features(y, sr)

def generate_instrument_dataset(directory, substrings, not_substrings, instrument):
    instrument_dataset = np.array([])
    files = getFiles(directory, substrings, not_substrings)

    # Generamos el instrument_dataset
    for file in files:
        file_path = join(directory, file)

        instrument_dataset = generate_stacked_new_row(
            instrument_dataset, 
            get_audio_features(file_path)
        )
            
    return instrument_dataset


def generate_dataset(instruments):
    dataset = np.array([])
    labels  = [] # El label de la i-esima columna

    for directory, substrings, not_substrings, instrument in instruments:
        instrument_dataset = generate_instrument_dataset(
                directory, 
                substrings, 
                not_substrings, 
                instrument
            )
        
        # armo el dataset nuevo
        dataset = concatenate(
            dataset, 
            generate_instrument_dataset(
                directory, 
                substrings, 
                not_substrings, 
                instrument
            )
        )
        
        # armo los labels, las cols representan los vectores de features
        for i in range(instrument_dataset.shape[0]):
            labels.append(instrument)

    # Para el dataset uso los vectores de features como columnas
    dataset = dataset.T
    
	# Opcional
    T = sklearn.decomposition.MiniBatchDictionaryLearning(n_components=5)
# 	 scomps, sacts = librosa.decompose.decompose(dataset, transformer=T, sort=True)    

    # Generamos la descomposicion de Non-negative matrix
    # dataset = W * H
    W, H = librosa.decompose.decompose(dataset, transformer=T)
#     model = NMF(n_components=len(instruments), init='random', random_state=0)
#     W = model.fit_transform(dataset.T)
#     H = model.components_
    
    
    # Calculamos la inversa de Moore-Penrose
    # Asi queda dataset * W^(-1) = H
    W_inv = linalg.pinv(W)
    
    print("shape de dataset")
    print(dataset.shape)
    print("shape de W")
    print(W.shape)
    print(W_inv.shape)
    print(H.shape)

    return np.array(W_inv), np.array(H), labels

In [13]:
from sklearn.metrics   import pairwise 
from sklearn.neighbors import KDTree

import operator
import numpy as np
from numpy import linalg

VIOLIN   = ("../audios/violin/retocadas_MIS", [], ["trill"], "violin")
FLAUTA   = ("../audios/flauta/retocadas_MIS", [], ["trill"], "flauta")
GUITARRA = ("../audios/guitar/retocadas", [], ["trill"], "guitarra")

def cosine_similarity(a, b):
    numerador   = a.T.dot(b)
    denominador = linalg.norm(a) * linalg.norm(b)
    
    return numerador / denominador

def calcular_distancias(H_t, predicted):
    distancias = []
    
    for i in range(H_t.shape[0]):
        distancias.append(
            cosine_similarity(
                predicted, 
                H_t[i]
            )
        )
        
    return distancias


def get_key_from_max_value(dic):
    return max(dic.items(), key=operator.itemgetter(1))[0]

def count_frecuencies(labels_ordenados):
    frecuencias = dict()
    
    for i in range(len(labels_ordenados)):
        val = labels_ordenados[i]
        if val in frecuencias.keys():
            frecuencias[val] = frecuencias[val] + 1
        else:
            frecuencias[val] = 1
            
    return frecuencias

# Usamos Cosine Similarity Measure para knn, probar si anda bien
# predicted tiene shape: #Instr * 1
# H tiene shape: #Instr * #Audios
# h_i columna tiene shape 1 * # instrumentso <- estas comparamos 
def k_near_neighbors(predicted, acts, labels, k):
    max_label     = 0
    max_label_val = -1
    
    H_t = acts.T
    
    # Calculamos las distancias a todo el resto de los audios
    distancias = calcular_distancias(H_t, predicted)
    
    print(np.array(distancias).shape)
    
    # Ordenamos los labels en base a cuales estan mas cerca
    labels_ordenados_por_distancia = [
        label 
        for _,label 
        in sorted(
            zip(distancias,labels),
            reverse=True
        )
    ]
    
    print(labels_ordenados_por_distancia)
    
    # Calculamos las frecuencias de los primeros k labels
    frecuencias = count_frecuencies(
        labels_ordenados_por_distancia[:k]
    )
    
    # Predecimos el label que mas veces aparecio
    predicted_label = get_key_from_max_value(frecuencias)
    return predicted_label
    
def predict(dataset, k, test_file_path):
    comps_inv, acts, labels = dataset
    test = get_audio_features(test_file_path)

    # Generamos la prediccion de test 
    W = comps_inv
    test_vect = test

    # Para sacar el nuevo vector de activaciones calculamos
    # Componentes^(-1) * test_vect_features = activacion_test
    predicted = W.dot(test_vect.T)

    # Calculamos el instrumento mas cercano al de test
    pred = k_near_neighbors(predicted, acts, labels, k)
    return pred


In [5]:
# Generamos el modelo para predecir
instruments = [VIOLIN, FLAUTA]
W_inv, H, labels = generate_dataset(instruments=instruments)

shape de dataset
(72, 624)
shape de W
(72, 10)
(10, 72)
(10, 624)


In [14]:
# Probamos predecir
predict((W_inv, H, labels), 10, "../audios/violin/retocadas_MIS/test/Violin.arco.ff.sulG.B3.stereo.wav")
# print(len(labels))

(624, 4)


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()