In [1]:
import os
import numpy as np
import random
import librosa
import librosa.display
import pandas as pd
import matplotlib.pyplot as plt

import multiprocessing
from functools import partial

from keras.utils import np_utils

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

import scipy.stats as stats

%matplotlib inline

Using TensorFlow backend.


In [2]:
DATA_DIR = '/Users/james/Documents/Data/genre classification/' 
AUDIO_DIR = DATA_DIR + 'Audio' 
NPZ_DIR = DATA_DIR + 'npz'

In [3]:
SAMPLE_RATE = 22050
N_MFCC = 20
POOL_SIZE = 8

In [4]:
def get_downloaded_track_ids(directory):
    """
    Collect the track_ids from the tracks that have downloaded in the specific directory
    :return: ist of track ids
    """    
    track_ids = []
    for _, dirnames, files in os.walk(directory):
        if dirnames == []:
            track_ids.extend(str(file[:-4]) for file in files) # 4 spots before the '.mp3'
    return track_ids

In [5]:
def get_audio_path(directory, track_id):
    """
    Return the path to the mp3 given the directory where the audio is stored
    and the track ID.
    """
    return os.path.join(directory, track_id + '.mp3')

In [6]:
# get the successfully downloaded track id's  
track_ids = get_downloaded_track_ids(AUDIO_DIR)

In [7]:
def create_mfcc(track_id):
    filename = get_audio_path(AUDIO_DIR, track_id)
    y, sr = librosa.load(filename)
    spect = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC)
    return spect.T  

#### Create the data

In [8]:
# load the meta data from data transformer step
meta_data_df = pd.read_pickle(DATA_DIR + '/meta_data.pkl')

genre_target_list = meta_data_df['genre_target'].unique()
genre_target_dict = {genre_target_list[i] : i  for i in range(0,len(genre_target_list))}
print(genre_target_dict)

{'club': 0, 'bass-other': 1}


In [9]:
def create_features_label_array(df):
    genres = []
    X_spect = np.empty((0, 640, N_MFCC))
    count = 0
    #Code skips records in case of errors
    for index, row in df.iterrows():
        try:
            count += 1
            track_id = row['guid']
            genre = str(row['genre_target'])
            spect = create_mfcc(track_id)

            # Normalize for small shape differences
            spect = spect[:640, :]
            X_spect = np.append(X_spect, [spect], axis=0)
            genres.append(genre_target_dict[genre])
            if count % 100 == 0:
                print("Currently processing: ", count)
        except:
            print("Couldn't process: ", count)
            continue
    y = np.array(genres)
    return X_spect , y

In [10]:
def parallel_feature_label_creation(df, pool_size):
    """
    process the 'create_features_..' function in pools across multiple cores 
    """
    df_split = np.array_split(df, pool_size)
    pool = multiprocessing.Pool(processes=pool_size)
    results = pool.map(create_features_label_array, df_split)
    pool.close()
    pool.join()
    
    X_spect = [result[0] for result in results]
    y = [result[1] for result in results]
    
    X_spect = np.vstack(X_spect)
    y = np.concatenate(y)
    
    return X_spect, y

In [11]:
# create
df_train = meta_data_df[meta_data_df['split'] == 'train']
df_valid = meta_data_df[meta_data_df['split'] == 'valid']
df_test  = meta_data_df[meta_data_df['split'] == 'test']

print(df_train.shape, df_valid.shape, df_test.shape)


(2024, 4) (579, 4) (289, 4)


In [14]:
X_train, y_train = parallel_feature_label_creation(df_train, POOL_SIZE)
print(X_train.shape, y_train.shape)
np.savez(NPZ_DIR + '/MFC_train_array', X_train, y_train)

Currently processing:  100
Currently processing:  100
Currently processing:  100
Currently processing:  100
Currently processing:  100
Currently processing:  100
Currently processing:  100
Currently processing:  100
Currently processing:  200
Currently processing:  200
Currently processing:  200
Currently processing:  200
Currently processing:  200
Currently processing:  200
Currently processing:  200
Currently processing:  200
(2024, 640, 20) (2024,)


In [15]:
X_valid, y_valid = create_features_label_array(df_valid)
print(X_valid.shape, y_valid.shape)
np.savez(NPZ_DIR + '/MFC_valid_array', X_valid, y_valid)

TypeError: create_features_label_array() takes 1 positional argument but 2 were given

In [13]:
X_test, y_test = create_features_label_array(df_test)
print(X_test.shape, y_test.shape)
np.savez(NPZ_DIR + '/MFC_test_array', X_test, y_test)

Currently processing:  100
Currently processing:  200
(289, 640, 20) (289,)


### MFCC Summary Stats (each mfcc band averaged over time)

In [None]:
# mean, standard dev, skew, kurtosis, median, min, max
def mfcc_summary(X_array):
    stats.kurtosis(X_array, axis=1)
    
    return X_array