In [1]:
import time
import os
import librosa

import IPython.display as ipd
import numpy as np
import pandas as pd

from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split

In [2]:
import keras
from keras.layers import Conv1D, MaxPool1D, GlobalAvgPool1D
from keras.models import Sequential, Model
from keras.layers import Input, Dense, TimeDistributed, LSTM, Dropout, Activation
from keras.layers import Convolution2D, MaxPooling2D, Flatten, Reshape
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import ELU
from keras.callbacks import ModelCheckpoint
from keras.regularizers import l2
from keras import backend
from keras.utils import np_utils
from keras.models import load_model

Using TensorFlow backend.


In [3]:
import fma_utils
import time
from tqdm import tqdm
import random
import pickle

# 데이터 확인

### Load FMA metadata (FMA util 사용)

In [4]:
# Load the FMA metadata tables through the project's fma_utils helper.
tracks = fma_utils.load('data/fma_metadata/tracks.csv')  # contains track and label data
features = fma_utils.load('data/fma_metadata/features.csv')  # contains statistics of each track's features
# Fail fast if the two tables do not share the same index.
np.testing.assert_array_equal(features.index, tracks.index)

print(tracks.shape, features.shape)

  'category', categories=SUBSETS, ordered=True)


(106574, 52) (106574, 518)


### Check Track data details

In [5]:
tracks.head()

Unnamed: 0_level_0,album,album,album,album,album,album,album,album,album,album,...,track,track,track,track,track,track,track,track,track,track
Unnamed: 0_level_1,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,tags,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
3,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
5,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
10,0,2008-11-26 01:45:08,2008-02-06,,4,6,,47632,,[],...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
20,0,2008-11-26 01:45:05,2009-01-06,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,[],...,,978,en,Attribution-NonCommercial-NoDerivatives (aka M...,361,,3,,[],Spiritual Level


In [6]:
tracks.columns # of these columns, genre_top is the one that will be used.

MultiIndex(levels=[['album', 'artist', 'set', 'track'], ['active_year_begin', 'active_year_end', 'associated_labels', 'bio', 'bit_rate', 'comments', 'composer', 'date_created', 'date_recorded', 'date_released', 'duration', 'engineer', 'favorites', 'genre_top', 'genres', 'genres_all', 'id', 'information', 'interest', 'language_code', 'latitude', 'license', 'listens', 'location', 'longitude', 'lyricist', 'members', 'name', 'number', 'producer', 'publisher', 'related_projects', 'split', 'subset', 'tags', 'title', 'tracks', 'type', 'website', 'wikipedia_page']],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], [5, 7, 9, 11, 12, 16, 17, 22, 29, 34, 35, 36, 37, 0, 1, 2, 3, 5, 7, 12, 16, 20, 23, 24, 26, 27, 31, 34, 38, 39, 32, 33, 4, 5, 6, 7, 8, 10, 12, 13, 14, 15, 17, 18, 19, 21, 22, 25, 28, 30, 34, 35]])

### Check precomputed feature data details

In [7]:
features.head()

feature,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
number,01,02,03,04,05,06,07,08,09,10,...,04,05,06,01,01,01,01,01,01,01
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2,7.180653,5.230309,0.249321,1.34762,1.482478,0.531371,1.481593,2.691455,0.866868,1.341231,...,0.054125,0.012226,0.012111,5.75889,0.459473,0.085629,0.071289,0.0,2.089872,0.061448
3,1.888963,0.760539,0.345297,2.295201,1.654031,0.067592,1.366848,1.054094,0.108103,0.619185,...,0.063831,0.014212,0.01774,2.824694,0.466309,0.084578,0.063965,0.0,1.716724,0.06933
5,0.527563,-0.077654,-0.27961,0.685883,1.93757,0.880839,-0.923192,-0.927232,0.666617,1.038546,...,0.04073,0.012691,0.014759,6.808415,0.375,0.053114,0.041504,0.0,2.193303,0.044861
10,3.702245,-0.291193,2.196742,-0.234449,1.367364,0.998411,1.770694,1.604566,0.521217,1.982386,...,0.074358,0.017952,0.013921,21.434212,0.452148,0.077515,0.071777,0.0,3.542325,0.0408
20,-0.193837,-0.198527,0.201546,0.258556,0.775204,0.084794,-0.289294,-0.81641,0.043851,-0.804761,...,0.095003,0.022492,0.021355,16.669037,0.469727,0.047225,0.040039,0.000977,3.189831,0.030993


In [8]:
features.columns

MultiIndex(levels=[['chroma_cens', 'chroma_cqt', 'chroma_stft', 'mfcc', 'rmse', 'spectral_bandwidth', 'spectral_centroid', 'spectral_contrast', 'spectral_rolloff', 'tonnetz', 'zcr'], ['kurtosis', 'max', 'mean', 'median', 'min', 'skew', 'std'], ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20']],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2

### Distribute version (FMA small subset) 확인

In [9]:
# Index of the tracks whose ('set', 'subset') value compares <= 'small'.
# NOTE(review): this presumably relies on the column being an ordered categorical — confirm in fma_utils.load.
subset = tracks.index[tracks['set', 'subset'] <= 'small']

In [10]:
# Restrict the track metadata and the precomputed features to the selected subset.
tracks = tracks.loc[subset]
features_all = features.loc[subset]
print(tracks.shape, features_all.shape)

(8000, 52) (8000, 518)


### Train / valid / test set 의 index를 로드 

In [11]:
# Index of each predefined split, selected by the value of the ('set', 'split') column.
org_train_indices, org_val_indices, org_test_indices = (
    tracks.index[tracks['set', 'split'] == split_name]
    for split_name in ('training', 'validation', 'test')
)
print('{} training examples, {} validation examples, {} testing examples'.format(
    len(org_train_indices), len(org_val_indices), len(org_test_indices)))

6400 training examples, 800 validation examples, 800 testing examples


In [12]:
tracks['track', 'genre_top'].head()

track_id
2      Hip-Hop
5      Hip-Hop
10         Pop
140       Folk
141       Folk
Name: (track, genre_top), dtype: category
Categories (16, object): [Blues, Classical, Country, Easy Listening, ..., Pop, Rock, Soul-RnB, Spoken]

### 트랙들의 'genre_top' 레이블을 준비

In [13]:
# The ('track', 'genre_top') column of each track, kept as the label series.
labels = tracks['track', 'genre_top']

# 실험 준비 

- 실습의 편의를 위해 800곡만을 실험에 사용. -> 800곡을 train / test 셋으로 나누어서 사용.

- FMA 기존 데이터셋에서 제공하는 feature가 아니라 mel-spectrogram 데이터를 사용해서 CNN을 학습시킬 예정.

### Tiny subset 
Distribute된 데이터셋에서 original test set 800 곡만 사용하여 train / test set을 분리하겠습니다.

In [15]:
X.shape, y.shape

((800,), (800, 8))

### stratify 옵션을 사용해서 train, test 셋 분리

In [17]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((640,), (640, 8), (160,), (160, 8))

### * Original small set으로 사용시 *

In [56]:
# Alternative setup (disabled): use the original small-set train/test split.
# NOTE(review): if this is enabled, the last line calls fit_transform on the test labels,
# which presumably re-fits enc_onehot; enc_onehot.transform(...) would reuse the
# encoding fitted on the training labels — confirm enc_onehot's type before enabling.
# X_train = np.array(org_train_indices)
# X_test = np.array(org_test_indices)
# y_train = enc_onehot.fit_transform(labels[org_train_indices])
# y_test = enc_onehot.fit_transform(labels[org_test_indices])

In [57]:
# X holds the track ids.
X_train[:10]

array([  2,   5,  10, 140, 141, 190, 193, 194, 197, 200])

In [58]:
# Look up the actual class names behind the one-hot columns so they can be checked later.
genre_labels_in_order = list(enc_onehot.classes_)
print(genre_labels_in_order)

['Electronic', 'Experimental', 'Folk', 'Hip-Hop', 'Instrumental', 'International', 'Pop', 'Rock']


In [59]:
# Inspect the one-hot vectors
y_train[:3]

array([[0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0]])

## Prepare melspectrogram feature

In [1]:
# 오디오 파일 하나에서 확인


In [61]:
tmp.shape

(479818,)

In [62]:
# Choose the hyperparameters (TODO: the values are blank here and must be filled in before running)
SR = 
N_FFT = 
HOP_LENGTH = 
N_MELS = 

In [63]:
# Melspectrogram 구하는 함수를 정의


In [None]:
# Training 셋의 모든 오디오에 대해서 melspectrogram 인풋을 준비


In [2]:
# Test 셋의 모든 오디오에 대해서 melspectrogram 인풋을 준비


In [30]:
X_train_mel.shape

(640, 1875, 96)

In [31]:
# Precompute the mean and std values used for data standardization.
# TODO: both right-hand sides are blank here and must be filled in before running.
MEL_MEAN = 
MEL_STD = 

In [32]:
BATCH_SIZE = 

In [35]:
X_train_mel.shape, y_train.shape, X_test_mel.shape, y_test[:5].shape

((640, 1875, 96), (640, 8))

## 1D CNN Genre Classification Model

#### 1) 먼저 Keras 라이브러리의 Data feeding 구조에 맞추기 위한 data generator를 정의.

In [36]:
# Batch generator built on keras.utils.Sequence.
# NOTE: this is an incomplete template. Several statements below have an empty
# right-hand side or an empty body (marked TODO) and must be completed before
# this cell can run.
class FMADataGenerator(keras.utils.Sequence):
    def __init__(self, X_list, y_list, X_mean, X_std, batch_size=4, shuffle=True):
        'Initialization'
        # Store the inputs, the labels and the batching options on the instance.
        self.batch_size = batch_size
        self.X_list = X_list
        self.y_list = y_list
        self.shuffle = shuffle
        # on_epoch_end() runs once at construction time. It is called before
        # X_mean / X_std are assigned below, so it must not read them.
        self.on_epoch_end()
        self.X_mean = X_mean
        self.X_std = X_std

    def __len__(self):
        # Return the total number of batches
        # TODO: the return value is blank.
        return 

    def __getitem__(self, idx):
        # Return a single batch
        # TODO: `curr_indices`, used by the loop below, is expected to be defined here.
        
        
        # Randomly crop only a 3-second (188-frame) melspectrogram excerpt from each audio clip.
        X = []
        for _i in curr_indices:
            # TODO: the crop expression is blank.
            curr_mel = 
            X.append(curr_mel)
        # TODO: the label expression is blank.
        y = 

        return np.array(X), np.array(y)

    def on_epoch_end(self):
        # Runs at the end of every epoch
        # TODO: the index expression is blank.
        self.indices = 
        if self.shuffle == True:
            # TODO: the body of this branch is blank.
            


In [37]:
train_generator = 

#### 2) 모델 정의

In [38]:
# Clear the Keras backend session before the model is defined.
keras.backend.clear_session()




In [121]:
# Drop a model left over from a previous run of the notebook.
# On a fresh kernel `model` does not exist yet at this point, so a bare
# `del model` would raise NameError and stop "Restart & Run All".
try:
    del model
except NameError:
    pass

In [39]:
# Assemble the functional model from the `model_input` and `output` tensors.
model = Model(inputs=model_input, outputs=output)

In [40]:
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 188, 96)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 188, 64)           24640     
_________________________________________________________________
activation_1 (Activation)    (None, 188, 64)           0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 47, 64)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 47, 128)           32896     
_________________________________________________________________
activation_2 (Activation)    (None, 47, 128)           0         
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 11, 128)           0   

In [41]:
# Optimization algorithm and its parameters.
optimizer = keras.optimizers.SGD(
    lr=0.001,
    momentum=0.9,
    nesterov=True,
)
# Loss function and evaluation metric.
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

#### 3) 모델 학습

In [None]:
# Train the model from the batch generator.
model.fit_generator(
    generator=train_generator,
    steps_per_epoch=int(len(X_train_mel) / BATCH_SIZE),  # how many batches to train on per epoch
    epochs=1000,                # number of training iterations (epochs)
    verbose=1,
    max_queue_size=10,          # number of samples to pre-cache in the queue during data generation
    workers=4,                  # number of threads to use during data generation
    use_multiprocessing=True,   # whether to use process-based threading
)

#### 4) Evaluation

In [259]:
X_test_mel.shape

(5, 1875, 96)

In [270]:
# Clip-level evaluation: standardize each test melspectrogram, cut it into
# consecutive non-overlapping 188-frame segments, average the model's
# per-segment predictions, and compare the arg-max class with the label.
correct = 0
for idx in range(X_test_mel.shape[0]):
    # Build a NEW standardized array instead of using `-=` / `/=`.
    # Indexing a NumPy array with an integer returns a view, so the in-place
    # operators rewrote X_test_mel itself and every re-run of this cell
    # standardized the already-standardized data again.
    _mel = (X_test_mel[idx] - MEL_MEAN) / MEL_STD

    curr_len = _mel.shape[0]
    # Trailing frames that do not fill a whole 188-frame segment are dropped.
    num_segs = curr_len // 188
    _segs = np.array([_mel[seg_idx * 188 : (seg_idx + 1) * 188]
                      for seg_idx in range(num_segs)])

    # Average the per-segment predictions to get one prediction per clip.
    preds = model.predict(_segs)
    preds = np.mean(preds, axis=0)
    predicted_label = np.argmax(preds)
    label = np.argmax(y_test[idx])
    # print('predicted_label', predicted_label, '/ label', label)
    if predicted_label == label:
        correct += 1

print('acc:', correct /X_test_mel.shape[0])
    

predicted_label 4 / label 3
predicted_label 4 / label 0
predicted_label 4 / label 7
predicted_label 4 / label 1
predicted_label 4 / label 5
acc: 0.0


In [None]:
# Save the model
_save_path = './curr_model.h5'
model.save(_save_path)

In [None]:
# Load the model back from the path that the save cell writes to.
# The previous path, 'my_model.h5', is never written anywhere in this notebook.
model = load_model('./curr_model.h5')