In [1]:
import os
import librosa
import librosa.display
import IPython.display as ipd

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, MaxPooling2D, Activation, Flatten, Dropout, BatchNormalization
from tensorflow.keras import models, layers
from tensorflow.keras import backend as K

from tensorflow.keras.optimizers import Adam

from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay

In [5]:
fluteC4_file = "./Instruments/flute-C4.wav"

In [6]:
pianoG6_file = "./Instruments/piano-G6.wav"

In [7]:
trumpetG4_file = "./Instruments/trumpet-G4.wav"

In [8]:
violinC4_file = "./Instruments/violin-C4.wav"

In [9]:
ipd.Audio(fluteC4_file)

ValueError: rate must be specified when data is a numpy array or list of audio samples.

In [None]:
ipd.Audio(pianoG6_file)

In [None]:
ipd.Audio(trumpetG4_file )

In [None]:
ipd.Audio(violinC4_file)

In [10]:
# load audio files with librosa
fluteC4, sr1 = librosa.load(fluteC4_file)
pianoG6, sr2 = librosa.load(pianoG6_file)
trumpetG4, sr3 = librosa.load(trumpetG4_file)
violinC4, sr4 = librosa.load(violinC4_file)

In [20]:
sr1

22050

#### Extracting Short-Time Fourier Transform

In [14]:
FRAME_SIZE = 2048
HOP_SIZE = 512

In [15]:
type(fluteC4)

numpy.ndarray

In [16]:
S_fluteC4 = librosa.stft(fluteC4, n_fft=FRAME_SIZE, hop_length=HOP_SIZE)
S_fluteC4.shape

(1025, 142)

In [17]:
# frequency bin x # frames
# frequency bins = framesize/2 + 1; 2048/2+1=1025
# frames = (samples - framesize)/hopsize + 1; 74240

In [21]:
S_trumpetG4 = librosa.stft(trumpetG4, n_fft=FRAME_SIZE, hop_length=HOP_SIZE)
S_trumpetG4.shape

(1025, 313)

In [23]:
S_pianoG6 = librosa.stft(pianoG6, n_fft=FRAME_SIZE, hop_length=HOP_SIZE)
S_pianoG6.shape

(1025, 36)

In [22]:
S_violinC4 = librosa.stft(violinC4, n_fft=FRAME_SIZE, hop_length=HOP_SIZE)
S_violinC4.shape

(1025, 151)

In [None]:
type(S_fluteC4[0][0])

#### Calculating the Spectrogram

In [None]:
Y_fluteC4 = np.abs(S_fluteC4) ** 2 

In [None]:
type(Y_fluteC4[0][0])

Spectrogram

In [None]:
def plot_spectrogram(Y, sr, hop_length, y_axis='linear'):
    plt.figure(figsize=(15,8))
    librosa.display.specshow(Y,
                             sr=sr,
                             hop_length=hop_length,
                             x_axis='time',
                             y_axis=y_axis)
    plt.colorbar(format="%+2.f")

In [None]:
plot_spectrogram(Y_fluteC4, sr1, HOP_SIZE)

#### Log-Amplitude Spectrogram

In [None]:
Y_log_fluteC4 = librosa.power_to_db(Y_fluteC4)
plot_spectrogram(Y_log_fluteC4, sr1, HOP_SIZE)

In [None]:
plot_spectrogram(Y_log_fluteC4, sr1, HOP_SIZE, y_axis='log')

#### Melt filter banks

In [None]:
sr1

In [None]:
filter_banks = librosa.filters.mel(n_fft=FRAME_SIZE, sr=22050,n_mels=10)

In [None]:
filter_banks.shape

In [None]:
# n_mels, 2048/2+1 = 1025, FRAME_SIZE=2048, F_S/2=Nyqist f 

In [None]:
plt.figure(figsize=(15,8))
librosa.display.specshow(filter_banks,
                         sr=sr,
                         x_axis='linear')
plt.colorbar(format="%+2.f")

#### Extracting Mel Spectrogram

In [None]:
mel_spectrogram = librosa.feature.melspectrogram(fluteC4, sr1, n_fft=2048, hop_length=512, n_mels=90)

In [None]:
mel_spectrogram.shape

In [None]:
# n_mels x, 142 appears before, seems to be right

In [None]:
log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)

In [None]:
log_mel_spectrogram.shape

In [None]:
plt.figure(figsize=(15,8))
librosa.display.specshow(log_mel_spectrogram,
                         x_axis='time',
                         y_axis='mel')
plt.colorbar(format="%+2.f")

##### The Medley Instruments Data Set

In [24]:
df = pd.read_csv('~/documents/data/audio/medley/csv/Medley-solos-DB_metadata.csv')

In [25]:
df_clarinet = df.loc[df['instrument_id']==0]

In [26]:
df

Unnamed: 0,subset,instrument,instrument_id,song_id,uuid4
0,test,clarinet,0,0,0e4371ac-1c6a-51ab-fdb7-f8abd5fbf1a3
1,test,clarinet,0,0,33383119-fd64-59c1-f596-d1a23e8a0eff
2,test,clarinet,0,0,b2b7a288-e169-5642-fced-b509c06b11fc
3,test,clarinet,0,0,151b6ee4-313a-58d9-fbcb-bab73e0d426b
4,test,clarinet,0,0,b43999d1-9b5e-557f-f9bc-1b3759659858
...,...,...,...,...,...
21566,validation,violin,7,226,fe4e8e98-6e0f-5a31-f446-99c10e0ac485
21567,validation,violin,7,226,aa606c78-9ee5-507f-f7e9-67c3530faf0f
21568,validation,violin,7,226,05e15c0a-d530-5f3e-fa82-58c55fa44993
21569,validation,violin,7,226,2dd485de-471d-5d8b-fe92-ef957dac021c


In [27]:
df.nunique()

subset               3
instrument           8
instrument_id        8
song_id            227
uuid4            21571
dtype: int64

In [28]:
df['instrument'].unique()

array(['clarinet', 'distorted electric guitar', 'female singer', 'flute',
       'piano', 'tenor saxophone', 'trumpet', 'violin'], dtype=object)

In [29]:
filepath = "../../../../../documents/data/audio/medley/"

In [30]:
filename1 = "Medley-solos-DB_test-0"

In [74]:
for i in range(10):
    print(i)
    filename2 = df['uuid4'][i]
    ipd.Audio(filepath+filename1+'_'+filename2+'.wav')
    array1, sr1 = librosa.load(filepath+filename1+'_'+filename2+'.wav')
    array1_S = librosa.stft(array1, n_fft=FRAME_SIZE, hop_length=HOP_SIZE)
    print(array1_S.shape)

0
(1025, 129)
1
(1025, 129)
2
(1025, 129)
3
(1025, 129)
4
(1025, 129)
5
(1025, 129)
6
(1025, 129)
7
(1025, 129)
8
(1025, 129)
9
(1025, 129)


In [80]:
filename2 = df['uuid4'][13]
ipd.Audio(filepath+filename1+'_'+filename2+'.wav')

In [81]:
filename2 = df['uuid4'][14]
ipd.Audio(filepath+filename1+'_'+filename2+'.wav')

In [37]:
array1, sr1 = librosa.load(filepath+filename1+'_'+filename2+'.wav')

In [38]:
array1_S = librosa.stft(array1, n_fft=FRAME_SIZE, hop_length=HOP_SIZE)
array1_S.shape

(1025, 129)

In [17]:
# frequency bin x # frames
# frequency bins = framesize/2 + 1; 2048/2+1=1025
# frames = (samples - framesize)/hopsize + 1; (66150 - 2048)/512 + 1 = x

In [39]:
FRAME_SIZE

2048

In [40]:
HOP_SIZE

512

In [None]:
# samples: 3s * 22050 = 66150

In [42]:
sr1

22050

In [3]:
# preprocess the data
# X_train
# X_validation
# X_test

In [None]:
# X_train shpae (number wav files, f domain, t domain, 1)

In [None]:
# model

In [None]:
# compile model

In [None]:
# fit the model

In [None]:
# examine the model

In [None]:
# confusion matrix