In [350]:
import librosa
import numpy as np

# Valor arbitrario. Tiene que ser chico, se puede probar con otros a ver como da
DELTA_SPECTRUM_DELTA = 0.05

# Valor arbitrario del paper, se pueden probar otros
SPECTRAL_ROLLOFF_THRESHOLD = 0.85

def sign(val):
  if val >= 0:
    return 1
  else:
    return -1

# TODO: Implementacion ad_hoc, ver como se calcula la resolution real
def DFT_resolution(signal, sampling_rate):
  return int(
    np.ceil(
      sampling_rate / len(signal)
    )
  )

def sample_spectrum(frecuency, i, t, delta):
  return np.log(
    frecuency[i, t]
  ) + delta

# drecado, tambien estaba en librosa
def zero_crossing_rate(signal):
  rate = 0

  for i in range(1, len(signal)):
    rate = rate + np.absolute(
      sign( signal[i] ) 
      - 
      sign( signal[i - 1] ) 
    )

  return rate / len(signal - 1)

# TODO: Terminar, como se calcula K? (resolution of the DFT)
def delta_spectrum(signal, sampling_rate, t_frame):
  res = 0
  DFT_res   = DFT_resolution(signal, sampling_rate)
  frecuency = np.abs(librosa.stft(signal))

  for i in range(0, DFT_res):
    res = res + np.square(
      sample_spectrum(frecuency, i, t_frame, DELTA_SPECTRUM_DELTA)
      -
      sample_spectrum(frecuency, i, t_frame-1, DELTA_SPECTRUM_DELTA)
    )
  
  return res / (DFT_res - 1)

# TODO: terminar, como se calcula K? (resolution of the DFT)
def spectral_rolloff(signal, sampling_rate, t_frame):
  res   = 0
  upper_bound = 0
  DFT_res = DFT_resolution(signal, sampling_rate)
  frecuency = np.abs(librosa.stft(signal))

  for i in range(0, DFT_res):
    upper_bound = upper_bound + frecuency[i, t_frame]

  upper_bound = upper_bound * SPECTRAL_ROLLOFF_THRESHOLD

  for i in range(0, DFT_res):
    proximo_val = res + frecuency[i, t_frame]
    
    if proximo_val >= upper_bound:
      return res
    else:
      res = proximo_val

def generate_delta_spectrum_feature_vector(signal, sampling_rate):
  frecuency  = librosa.stft(signal)
  _, columns = frecuency.shape 
  feature_vector = np.array([])

  for i in range(columns):
    feature_vector = np.append(
      feature_vector,
      delta_spectrum(
        signal, 
        sampling_rate,
        i
      )
    )

  return feature_vector

# @deprecated(version='1.2.1', reason="Habia una hecha en librosa")
def generate_spectral_rolloff_feature_vector(signal, sampling_rate):
  frecuency  = librosa.stft(signal)
  _, columns = frecuency.shape 
  feature_vector = np.array([], dtype=float)

  for i in range(columns):
    feature_vector = np.append(
      feature_vector,
      spectral_rolloff(
        signal, 
        sampling_rate, 
        i
      )
    )

  return feature_vector


def generate_audio_features(signal, sampling_rate):
  # spectral_rolloff = librosa.feature.spectral_rolloff(signal, sampling_rate)
  # spectral_rolloff = np.reshape(
  #   spectral_rolloff,
  #   spectral_rolloff.size, 
  #   order='F'
  # )

  return np.array([
    # Primeros features
    np.array(librosa.feature.zero_crossing_rate(signal)[0]),
    # np.array(generate_delta_spectrum_feature_vector(signal, sampling_rate)),
    np.array(librosa.feature.spectral_rolloff(signal, sampling_rate)[0]),

    # Segundos features
    np.array(librosa.feature.spectral_centroid(y=signal, sr=sampling_rate)[0]),
    np.array(librosa.feature.spectral_flatness(y=signal)[0]),
  ]).T

In [428]:
import sklearn.decomposition
import librosa

from os import listdir, mkdir
from os.path import isfile, join, splitext, getsize

from numpy import linalg

def count(files):
    count = dict()

    for file in files:
        file_size = file[1]

        if file_size in count.keys():
            count[file_size] = count[file_size] + 1
        else:
            count[file_size] = 1

    return count

def first_elements(lst):
    check = lst[0][1]
    elems = []

    for elem in lst:
        if elem[1] == check:
            elems.append(elem[0])
        else:
            break

    return elems

def filter_valid_names(names, substrings, not_substrings):
	res = []

	for name in names:
		if containsAll(substrings, name) and not containsAny(not_substrings, name):
			res.append(name)

	return res

def getFiles(directory, substrings, not_substrings):
    # Pruebo levantar por size
    all_paths = [
        file_path 
            for file_path 
            in listdir(directory) 
                if isfile(join(directory, file_path))
    ]

    all_paths = filter_valid_names(all_paths, substrings, not_substrings)

    # filepaths = []

    # for i in range(len(all_paths)):
    #     filepaths.append(
    #     	(
    #     		all_paths[i], 
    #     		getsize(
    #     			join(
    #     				directory, 
    #     				all_paths[i]
    #     				)
    #     			)
    #     		)
    #     	)

    # counts = count(filepaths)
    # filepaths = sorted(filepaths, key=lambda x: counts[x[1]], reverse=True)

    # Me quedo con los que mas aparecen
    # return first_elements(filepaths)
    return all_paths

def containsAll(substrings, string):
    res = True
    for s in substrings:
        res = res and (s in string)
    return res


def containsAny(substrings, string):
    res = False
    for s in substrings:
        res = res or (s in string)
    return res

def getConcat(arr):
    res = ""
    for a in arr:
        res = res + "_" + a
    return res

def get_audio_features(file_path):
    y, sr = librosa.load(file_path)
    return generate_audio_features(y, sr)


def generate_stacked_new_row(a, b):
    if a.size == 0: return b # Caso especial si a no tiene elems todavia
    return np.vstack( (a, b) )

def generate_instrument_dataset(directory, substrings, not_substrings, instrument):
    instrument_dataset = np.array([])
    files = getFiles(directory, substrings, not_substrings)

    # Generamos el instrument_dataset
    for file in files:
        file_path = join(directory, file)

        instrument_dataset = generate_stacked_new_row(
            instrument_dataset, 
            get_audio_features(file_path)
        )
            
    return instrument_dataset

def concatenate(a, b):
    if a.size == 0: return b
    return np.concatenate((a, b))

def generate_dataset(instruments):
    dataset = np.array([])

    for directory, substrings, not_substrings, instrument in instruments:
        dataset = concatenate(
            dataset, 
            generate_instrument_dataset(
                directory, 
                substrings, 
                not_substrings, 
                instrument
            )
        )

	# Opcional
	# T = sklearn.decomposition.MiniBatchDictionaryLearning(n_components=1)
	# scomps, sacts = librosa.decompose.decompose(dataset, transformer=T, sort=True)    

    # Generamos la descomposicion de Non-negative matrix
    # Nota: transponemos el dataset para tener los features como filas (ver paper)
    # dataset = Componentes * Activaciones
    comps, acts = librosa.decompose.decompose(dataset.T, n_components=len(instruments))
    
    # Calculamos la inversa de Moore-Penrose para sacar la Activacion de los samples de prueba
    # Asi queda dataset * Componentes^(-1) = Activaviones
    comps_inv = linalg.pinv(comps)
    
    print("shape de dataset")
    print(dataset.shape)
    print("shape de W")
    print(comps.shape)
    print(comps_inv.shape)
    print(acts.shape)
    

    return np.array(comps_inv), np.array(acts)

    # [[7.82712577e-03 1.62039602e+02 8.89513943e+01 5.19766760e-03]] - Violin

In [429]:
def cosine_similarity(a, b):
    print("cosine")
    print(a.shape)
    print(b.shape)
    
    arriba = a.T.dot(b)
    abajo  = linalg.norm(a) * linalg.norm(b)

    return arriba / abajo  

In [432]:
from sklearn.metrics    import pairwise 
from sklearn.neighbors import KDTree

import numpy as np
from numpy import linalg

VIOLIN   = ("../audios/violin/retocadas", [], ["trill"], "violin")
FLAUTA   = ("../audios/flauta/retocadas", [], ["trill"], "flauta")
GUITARRA = ("../audios/guitar/retocadas", [], ["trill"], "guitarra")

# Usamos Cosine Similarity Measure para knn
def k_near_neighbors(predicted, acts):
    max_label     = 0
    max_label_val = -1

    for i in range(acts.shape[0]):
        current = cosine_similarity(predicted, acts[i])

        if current > max_label_val:
           max_label     = i
           max_label_val = current

    return max_label
    # Pred -> 33, 6237
    # acts -> 3, 6237

def predict(dataset, test_file_path):
    comps_inv, acts = dataset
    test = get_audio_features(
        "../../audios/violin/violin_As3_05_mezzo-piano_arco-normal.mp3"
    )

    # Generamos la prediccion de test 
    W = comps_inv
    test_vect = test

    # Para sacar el nuevo vector de activaciones calculamos
    # Componentes^(-1) * test_vect_features = activacion_test
    predicted = W.dot(test_vect)

    # Calculamos el instrumento mas cercano al de test
    pred = k_near_neighbors(predicted, acts)
    print(pred)
    return pred

instruments = [VIOLIN, FLAUTA, GUITARRA]
comps_inv, acts = generate_dataset(instruments=instruments)
# generate_dataset("audios/violin", ["arco", "normal", "pianissimo"], ["trill"], "violin")
# generate_dataset("audios/clash cymbals", [], ["trill"], "clas_symbals")3
# generate_dataset("audios/flute", ["fortissimo", "normal"], ["trill"], "flute")

shape de dataset
(6237, 4)
shape de W
(4, 3)
(3, 4)
(3, 6237)


In [433]:
# print(acts[0].shape)

predict((comps_inv, acts), "../audios/flauta/retocadas/test/flute_As5_1_mezzo-forte_normal.wav")
# predict((comps_inv, acts), "../audios/guitar/retocadas/test/guitar_A2_very-long_forte_normal.wav")
# predict((comps_inv, acts), "../audios/violin/retocadas/test/violin_B3_phrase_forte_arco-spiccato.wav")



ValueError: shapes (3,4) and (42,4) not aligned: 4 (dim 1) != 42 (dim 0)