In [55]:
import os
import numpy as np
import random
import librosa
import librosa.display
import pandas as pd
import matplotlib.pyplot as plt

import multiprocessing
from functools import partial

from keras.utils import np_utils

import sklearn as skl
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import f1_score
from time import time

import scipy.stats as stats

%matplotlib inline

In [4]:
# Root folder holding the audio files, the cached feature arrays and the
# metadata pickle. Set the GENRE_DATA_DIR environment variable to run the
# notebook on another machine; the default is the original local path.
DATA_DIR = os.environ.get('GENRE_DATA_DIR', '/Users/james/Documents/Data/genre classification/')
# Sub-folders are built with os.path.join so they are correct whether or not
# DATA_DIR ends with a path separator.
AUDIO_DIR = os.path.join(DATA_DIR, 'Audio')
NPZ_DIR = os.path.join(DATA_DIR, 'npz')

In [5]:
# Audio sample rate in Hz (presumably the rate tracks are decoded at -- confirm against create_mfcc)
SAMPLE_RATE = 22050
# Number of MFCC coefficients requested from librosa.feature.mfcc
N_MFCC = 20
# Number of worker processes used for parallel feature extraction
POOL_SIZE = 8

In [4]:
def get_downloaded_track_ids(directory):
    """
    Collect the track ids of the mp3 files that have been downloaded.

    Walks ``directory`` and, in every folder that has no sub-folders, takes
    each ``.mp3`` file name minus its extension as a track id. Files with any
    other extension (e.g. ``.DS_Store``) are ignored instead of producing a
    bogus id from a blind four-character chop.

    :param directory: root folder to search
    :return: list of track ids (strings)
    """
    track_ids = []
    for _, dirnames, files in os.walk(directory):
        # Only leaf folders are expected to hold audio files
        if dirnames:
            continue
        for file_name in files:
            stem, extension = os.path.splitext(file_name)
            if extension == '.mp3':
                track_ids.append(stem)
    return track_ids

In [5]:
def get_audio_path(directory, track_id):
    """
    Build the location of a track's mp3 file.

    The file name is the track id followed by '.mp3', joined onto the
    directory in which the audio is stored.
    """
    mp3_name = track_id + '.mp3'
    return os.path.join(directory, mp3_name)

In [6]:
# Track ids of every mp3 that has already been downloaded into AUDIO_DIR
track_ids = get_downloaded_track_ids(AUDIO_DIR)

In [7]:
def create_mfcc(track_id):
    """
    Compute the MFCC matrix for one downloaded track.

    The mp3 is located with get_audio_path(AUDIO_DIR, track_id) and decoded at
    the notebook's configured SAMPLE_RATE, so the sample rate is stated
    explicitly instead of depending on librosa's default.

    :param track_id: id of the track (file name without '.mp3')
    :return: the transpose of librosa.feature.mfcc's output, i.e. one row per
             frame and N_MFCC columns
    """
    filename = get_audio_path(AUDIO_DIR, track_id)
    y, sr = librosa.load(filename, sr=SAMPLE_RATE)
    spect = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC)
    return spect.T

#### Create the data

In [8]:
# Metadata table written by the data transformer step
meta_data_df = pd.read_pickle(DATA_DIR + '/meta_data.pkl')

# Give every distinct genre an integer code, numbered by position in the unique list
genre_target_list = meta_data_df['genre_target'].unique()
genre_target_dict = {genre: code for code, genre in enumerate(genre_target_list)}
print(genre_target_dict)

{'club': 0, 'bass-other': 1}


In [9]:
def create_features_label_array(df, n_frames=640):
    """
    Build the MFCC feature tensor and integer label vector for the tracks in ``df``.

    Each row supplies a track id ('guid') and a genre ('genre_target'). The
    track's MFCC matrix is truncated to its first ``n_frames`` frames. A track
    that cannot be processed (unreadable audio, fewer than ``n_frames`` frames,
    genre missing from genre_target_dict, ...) is reported and skipped.

    The label is resolved and the feature shape is checked before anything is
    stored, so features and labels always stay aligned row-for-row. Per-track
    matrices are gathered in a list and combined once, instead of re-copying
    the whole array for every track.

    :param df: DataFrame with 'guid' and 'genre_target' columns
    :param n_frames: number of leading MFCC frames kept per track
    :return: (X_spect, y) -- float64 array of shape (n_tracks, n_frames, N_MFCC)
             and the array of genre codes looked up in genre_target_dict
    """
    features = []
    genres = []
    expected_shape = (n_frames, N_MFCC)

    # count is the 1-based position of the row within df; it is only used for logging
    for count, (_, row) in enumerate(df.iterrows(), start=1):
        try:
            track_id = row['guid']
            label = genre_target_dict[str(row['genre_target'])]

            # Normalize for small shape differences
            spect = create_mfcc(track_id)[:n_frames, :]
            if spect.shape != expected_shape:
                raise ValueError(
                    "expected MFCC shape {}, got {}".format(expected_shape, spect.shape))
        except Exception as error:
            # Best effort: skip the record, but say why. Exception (not a bare
            # except) lets KeyboardInterrupt / SystemExit propagate.
            print("Couldn't process: ", count, repr(error))
            continue

        features.append(spect)
        genres.append(label)
        if count % 100 == 0:
            print("Currently processing: ", count)

    if features:
        X_spect = np.array(features, dtype=np.float64)
    else:
        X_spect = np.empty((0, n_frames, N_MFCC))
    y = np.array(genres)
    return X_spect, y

In [10]:
def parallel_feature_label_creation(df, pool_size):
    """
    Run create_features_label_array over ``df`` in ``pool_size`` worker processes.

    The rows of ``df`` are divided into ``pool_size`` contiguous chunks, each
    chunk is processed in its own worker, and the per-chunk results are
    stitched back together in the original chunk order.

    :param df: DataFrame accepted by create_features_label_array
    :param pool_size: number of worker processes (and of chunks)
    :return: (X_spect, y) -- the stacked feature arrays and concatenated labels
    """
    # Split by row position and slice with iloc, rather than asking NumPy to
    # split the DataFrame object itself.
    row_chunks = np.array_split(np.arange(len(df)), pool_size)
    df_split = [df.iloc[rows] for rows in row_chunks]

    # The context manager tears the pool down even when a worker raises, so no
    # worker processes are left behind; map() has returned every result by the
    # time the block exits.
    with multiprocessing.Pool(processes=pool_size) as pool:
        results = pool.map(create_features_label_array, df_split)

    X_spect = np.vstack([chunk_features for chunk_features, _ in results])
    y = np.concatenate([chunk_labels for _, chunk_labels in results])

    return X_spect, y

In [11]:
# Partition the metadata into the three predefined subsets named in its 'split' column
df_train, df_valid, df_test = (
    meta_data_df[meta_data_df['split'] == split_name]
    for split_name in ('train', 'valid', 'test')
)

print(df_train.shape, df_valid.shape, df_test.shape)


(2024, 4) (579, 4) (289, 4)


In [14]:
# Extract MFCC features for the training split using POOL_SIZE worker processes
X_train, y_train = parallel_feature_label_creation(df_train, POOL_SIZE)
print(X_train.shape, y_train.shape)
# Arrays are saved positionally, so the archive stores them as 'arr_0' (features) and 'arr_1' (labels)
np.savez(NPZ_DIR + '/MFC_train_array', X_train, y_train)

Currently processing:  100
Currently processing:  100
Currently processing:  100
Currently processing:  100
Currently processing:  100
Currently processing:  100
Currently processing:  100
Currently processing:  100
Currently processing:  200
Currently processing:  200
Currently processing:  200
Currently processing:  200
Currently processing:  200
Currently processing:  200
Currently processing:  200
Currently processing:  200
(2024, 640, 20) (2024,)


In [16]:
# Extract MFCC features for the validation split in this process (no worker pool)
X_valid, y_valid = create_features_label_array(df_valid)
print(X_valid.shape, y_valid.shape)
# Arrays are saved positionally, so the archive stores them as 'arr_0' (features) and 'arr_1' (labels)
np.savez(NPZ_DIR + '/MFC_valid_array', X_valid, y_valid)

Currently processing:  100
Currently processing:  200
Currently processing:  300
Currently processing:  400
Currently processing:  500
(579, 640, 20) (579,)


In [13]:
# Extract MFCC features for the test split in this process (no worker pool)
X_test, y_test = create_features_label_array(df_test)
print(X_test.shape, y_test.shape)
# Arrays are saved positionally, so the archive stores them as 'arr_0' (features) and 'arr_1' (labels)
np.savez(NPZ_DIR + '/MFC_test_array', X_test, y_test)

Currently processing:  100
Currently processing:  200
(289, 640, 20) (289,)


#### Load

In [60]:
# Reload the cached training arrays. The archive is opened as a context manager
# so its file handle is closed as soon as both arrays have been read.
with np.load(NPZ_DIR + '/MFC_train_array.npz') as npzfile:
    print(npzfile.files)
    X_train = npzfile['arr_0']  # MFCC features
    y_train = npzfile['arr_1']  # genre codes
print(X_train.shape, y_train.shape)

['arr_0', 'arr_1']
(2024, 640, 20) (2024,)


In [61]:
# Reload the cached validation arrays. The archive is opened as a context manager
# so its file handle is closed as soon as both arrays have been read.
with np.load(NPZ_DIR + '/MFC_valid_array.npz') as npzfile:
    print(npzfile.files)
    X_valid = npzfile['arr_0']  # MFCC features
    y_valid = npzfile['arr_1']  # genre codes
print(X_valid.shape, y_valid.shape)

['arr_0', 'arr_1']
(579, 640, 20) (579,)


### MFCC Summary Stats (each MFCC band summarized over time)

In [62]:
# mean, standard dev, skew, kurtosis, median, min, max
def mfcc_summary(X_array, time_axis=1):
    """
    Collapse the time axis of an MFCC array into seven summary statistics.

    For every band the mean, standard deviation, skewness, kurtosis (both via
    scipy.stats with default settings), median, minimum and maximum over
    ``time_axis`` are computed. The seven result blocks are concatenated along
    axis 1 in that order, so an input of shape (n_tracks, n_frames, n_bands)
    with ``time_axis=1`` yields shape (n_tracks, 7 * n_bands).

    The input is only read, never modified, so no defensive copy is made; any
    array-like (ndarray, nested list or tuple) is accepted.

    :param X_array: array-like of MFCC values
    :param time_axis: axis over which the statistics are taken
    :return: 2-D array of concatenated summary statistics
    """
    X = np.asarray(X_array)
    statistics = (np.mean, np.std, stats.skew, stats.kurtosis, np.median, np.min, np.max)
    summaries = [statistic(X, axis=time_axis) for statistic in statistics]
    return np.concatenate(summaries, axis=1)

In [63]:
# Summarize the training MFCCs over axis 1 and display the resulting shape
X_train_stats_array = mfcc_summary(X_train, 1)
X_train_stats_array.shape

(2024, 140)

In [64]:
# Summarize the validation MFCCs over axis 1 and display the resulting shape
X_valid_stats_array = mfcc_summary(X_valid, 1)
X_valid_stats_array.shape

(579, 140)

### Standardize Features and Encode Labels

In [68]:
# Shuffle the training features and labels together (fixed seed for reproducibility)
X_train_stats_array, y_train = skl.utils.shuffle(X_train_stats_array, y_train, random_state=17)

# Standardize features by removing the mean and scaling to unit variance. The
# scaler is fit on the training set only and then applied to the validation set.
# It works on copies (the default), so the *_stats_array inputs keep their raw
# values and re-running this cell never scales already-scaled data.
scaler = skl.preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train_stats_array)
X_valid = scaler.transform(X_valid_stats_array)

In [69]:
# Fit the encoder on the training labels only and reuse that mapping for the
# validation labels. Refitting on the validation set could assign different
# codes if its set of classes differed from the training set.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_valid = le.transform(y_valid)
le.classes_

array([0, 1])

In [72]:
# Sanity check: shape of the standardized validation feature matrix
X_valid.shape

(579, 140)

## Models

In [73]:
def train_classifier(clf, X_train, y_train):
    ''' Fit ``clf`` on the training data and print how long the fit took. '''

    # Timestamps are taken immediately around the fit call
    started_at = time()
    clf.fit(X_train, y_train)
    elapsed = time() - started_at

    # Report the elapsed time
    message = "Trained model in {:.4f} seconds".format(elapsed)
    print(message)

    
def predict_labels(clf, features, target):
    ''' Predict with a fitted classifier and return its micro-averaged F1 score against ``target``. '''

    # Only the predict call itself is timed
    started_at = time()
    predictions = clf.predict(features)
    elapsed = time() - started_at

    # Report the timing, then score the predictions
    print("Made predictions in {:.4f} seconds.".format(elapsed))
    score = f1_score(target, predictions, average='micro', pos_label = 1)
    return score


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train a classifier and report its F1 score on the training and held-out data.

    clf              -- classifier exposing fit / predict
    X_train, y_train -- data the classifier is fit on and first scored against
    X_test, y_test   -- held-out data scored after training (reported as "val set")
    '''

    # Indicate the classifier and the training set size
    print("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))

    # Train the classifier
    train_classifier(clf, X_train, y_train)

    # Print the results of prediction for both training and held-out data.
    # The held-out score uses the arrays passed in by the caller, not the
    # notebook-level X_valid / y_valid globals.
    print("F1 score for training set: {:.4f}.".format(predict_labels(clf, X_train, y_train)))
    print("F1 score for val set: {:.4f}.".format(predict_labels(clf, X_test, y_test)))

In [74]:
# Candidate models: a depth-limited decision tree, an SVM and a logistic
# regression with default settings, and a class-balanced random forest
clf_A = DecisionTreeClassifier(max_depth=4, random_state=10)
clf_B = SVC()
clf_C = LogisticRegression()
clf_D = RandomForestClassifier(
    n_estimators=300,
    max_depth=30,
    min_samples_leaf=6,
    min_impurity_decrease=0.0002,
    class_weight='balanced',
    random_state=10,
)

# Fit each model in turn and print its training / validation F1 scores
for clf in (clf_A, clf_B, clf_C, clf_D):
    print("\n{}: \n".format(type(clf).__name__))
    train_predict(clf, X_train, y_train, X_valid, y_valid)


DecisionTreeClassifier: 

Training a DecisionTreeClassifier using a training set size of 2024. . .
Trained model in 0.0980 seconds
Made predictions in 0.0005 seconds.
F1 score for training set: 0.6838.
Made predictions in 0.0003 seconds.
F1 score for val set: 0.5889.

SVC: 

Training a SVC using a training set size of 2024. . .
Trained model in 0.5364 seconds
Made predictions in 0.4361 seconds.
F1 score for training set: 0.8394.
Made predictions in 0.1304 seconds.
F1 score for val set: 0.6615.

LogisticRegression: 

Training a LogisticRegression using a training set size of 2024. . .




Trained model in 0.0857 seconds
Made predictions in 0.0019 seconds.
F1 score for training set: 0.7016.
Made predictions in 0.0002 seconds.
F1 score for val set: 0.6131.

RandomForestClassifier: 

Training a RandomForestClassifier using a training set size of 2024. . .
Trained model in 2.2633 seconds
Made predictions in 0.0566 seconds.
F1 score for training set: 0.9886.
Made predictions in 0.0280 seconds.
F1 score for val set: 0.6356.
