In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import sklearn as skl
import librosa
import librosa.display
import ast
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from sklearn import svm

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
        #print(os.path.join(dirname, filename))
        
#code taken from MDEFF-FMA GitHub page for fetching metadata and audio files for FMA dataset.
# (link: https://github.com/mdeff/fma/blob/master/utils.py)        
def load(filepath):
    """Read one of the FMA metadata CSV files into a DataFrame.

    The kind of file is recognised from its base name, tested in this order:
    ``features``, ``echonest``, ``genres``, ``tracks``.

    * ``features`` / ``echonest``: first column as index, three header rows.
    * ``genres``: first column as index, single header row.
    * ``tracks``: first column as index, two header rows; list-like columns
      are parsed with ``ast.literal_eval``, date columns are converted with
      ``pd.to_datetime``, ``('set', 'subset')`` becomes an ordered categorical
      (small < medium < large) and a few text columns become categoricals.

    Returns ``None`` when the base name matches none of the four kinds.
    """
    name = os.path.basename(filepath)

    # features.csv and echonest.csv are read exactly the same way.
    if 'features' in name or 'echonest' in name:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if 'genres' in name:
        return pd.read_csv(filepath, index_col=0)

    if 'tracks' not in name:
        return None

    tracks = pd.read_csv(filepath, index_col=0, header=[0, 1])

    # Columns stored as Python literals (e.g. "[21]") -> real Python objects.
    literal_columns = (
        ('track', 'tags'), ('album', 'tags'), ('artist', 'tags'),
        ('track', 'genres'), ('track', 'genres_all'),
    )
    for key in literal_columns:
        tracks[key] = tracks[key].map(ast.literal_eval)

    # Columns holding timestamps -> datetime dtype.
    date_columns = (
        ('track', 'date_created'), ('track', 'date_recorded'),
        ('album', 'date_created'), ('album', 'date_released'),
        ('artist', 'date_created'), ('artist', 'active_year_begin'),
        ('artist', 'active_year_end'),
    )
    for key in date_columns:
        tracks[key] = pd.to_datetime(tracks[key])

    # Ordered categorical for the dataset subset.
    subset_key = ('set', 'subset')
    subset_order = ('small', 'medium', 'large')
    try:
        tracks[subset_key] = tracks[subset_key].astype(
            'category', categories=subset_order, ordered=True)
    except (ValueError, TypeError):
        # the categories and ordered arguments were removed in pandas 0.25
        subset_dtype = pd.CategoricalDtype(categories=subset_order, ordered=True)
        tracks[subset_key] = tracks[subset_key].astype(subset_dtype)

    # Remaining text columns that are stored as plain categoricals.
    category_columns = (
        ('track', 'genre_top'), ('track', 'license'),
        ('album', 'type'), ('album', 'information'),
        ('artist', 'bio'),
    )
    for key in category_columns:
        tracks[key] = tracks[key].astype('category')

    return tracks


#code taken from MDEFF-FMA GitHub page for fetching metadata and audio files for FMA dataset.
# (link: https://github.com/mdeff/fma/blob/master/utils.py)       
def get_audio_path(audio_dir, track_id):
    """
    Build the path of a track's mp3 file inside ``audio_dir``.

    The track ID is zero-padded to six digits; the first three digits name
    the sub-directory and the full six digits name the file.

    Examples
    --------
    >>> import utils
    >>> AUDIO_DIR = os.environ.get('AUDIO_DIR')
    >>> utils.get_audio_path(AUDIO_DIR, 2)
    '../data/fma_small/000/000002.mp3'

    """
    padded_id = f'{track_id:06d}'
    subdir = padded_id[:3]
    return os.path.join(audio_dir, subdir, padded_id + '.mp3')


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the FMA track metadata. load() picks its parsing branch from the file
# name, so "tracks.csv" is read with a two-row header and has its list, date
# and categorical columns converted.
tracks = load("/kaggle/input/fma-metadata/fma_metadata/tracks.csv")


In [None]:
# Print the ('track', 'genre_top') column of the metadata for a quick look.
print(tracks['track', 'genre_top'])

In [None]:
# Drop the tracks flagged as inconsistent.
# errors="ignore" keeps this cell idempotent: without it, running the cell a
# second time raises KeyError because the ids have already been removed.
tracks = tracks.drop(index=[98565, 98567, 98569, 99134, 108925, 133297], errors="ignore")

In [None]:
# Genre name -> unique integer id.
label_dict = {
    "Hip-Hop": 0,
    "Rock": 1,
    "Electronic": 2,
    "Folk": 3,
    "Instrumental": 4,
    "Pop": 5,
    "International": 6,
    "Experimental": 7,
}

# Keep only the rows whose ('set', 'subset') is 'small'.
small = tracks.loc[tracks['set', 'subset'] == 'small']

# Partition the small subset by its ('set', 'split') value.
small_train = small.loc[small['set', 'split'] == 'training']
small_val = small.loc[small['set', 'split'] == "validation"]
small_test = small.loc[small['set', 'split'] == 'test']

# Draw fixed-size random samples from each partition (random_state=1 makes
# the draw repeatable): 4000 training, 500 validation and 500 test tracks.
new_train = small_train.sample(n=4000, random_state=1)
new_val = small_val.sample(n=500, random_state=1)
new_test = small_test.sample(n=500, random_state=1)

In [None]:
# Collect the distinct ('track', 'genre_top') values found in the small training split.
# NOTE(review): a set has no defined order and string hashing can vary between
# interpreter sessions, so the positions in label_list are not guaranteed to be
# the same from one kernel restart to the next; anything that relies on those
# positions should be double-checked.
labels = set(small_train['track', 'genre_top'])
label_list = list(labels)

In [None]:
#Note that we are only using 5000 of the 8000 tracks in the dataset, due to Kaggle memory limitations.

In [None]:
# function that encodes top genres into number-labels
def label_encode(label):
    """Return the integer class id for a genre name.

    The id is looked up in the notebook-level ``label_dict`` so that the
    genre -> id mapping is fixed and identical on every run (a mapping built
    from ``list(set(...))`` can change order between kernel sessions).

    Parameters
    ----------
    label : hashable
        Genre name, e.g. ``"Rock"``.

    Returns
    -------
    int
        The id stored in ``label_dict``, or ``-1`` when the label is unknown.
    """
    return label_dict.get(label, -1)

In [None]:
# Replace the genre names in ('track', 'genre_top') with their integer labels,
# in place, for the training, testing and validation samples.
# Caution: this overwrites its own input, so running the cell a second time
# feeds the already-encoded integers back into label_encode, which maps every
# one of them to -1.
for subset_df in (new_train, new_test, new_val):
    subset_df['track', 'genre_top'] = subset_df['track', 'genre_top'].apply(label_encode)


#print(new_train['track','genre_top'])

In [None]:
# Pull the encoded genre labels out of each sampled subset as NumPy arrays.
train_labels = np.array(new_train['track', 'genre_top'])
test_labels = np.array(new_test['track', 'genre_top'])
val_labels = np.array(new_val['track', 'genre_top'])

# Show the genre column of the full small training split as the cell output.
# (The closing quote was missing here, which made the whole cell a SyntaxError.)
small_train['track', 'genre_top']

In [None]:
# Print the training label array as a sanity check of the encoding.
print(train_labels)

In [None]:
# Track ids are the DataFrame index of each sampled subset; copy them into
# NumPy arrays (training, validation, test).
train_track_ids, val_track_ids, test_track_ids = (
    np.array(subset.index) for subset in (new_train, new_val, new_test)
)

In [None]:
#omitted (removed) STFT feature extraction

In [None]:
# One uninitialised (1, 12, 1293) placeholder per subset; the extracted
# chromagrams are appended after it along axis 0.
chrom_train, chrom_val, chrom_test = (np.empty([1, 12, 1293]) for _ in range(3))

In [None]:
#Chromagram feature extraction
def chroma_extraction(X, track_ids, audio_dir='/kaggle/input/fma-audio/fma_small', n_frames=1293):
    """Append one chromagram per track to ``X`` and return the result.

    Parameters
    ----------
    X : np.ndarray
        Existing feature array; its trailing dimensions must match one
        chromagram of ``n_frames`` frames. New features are appended along
        axis 0.
    track_ids : iterable of int
        Track ids whose audio files are loaded via ``get_audio_path``.
    audio_dir : str, optional
        Root directory of the audio files (defaults to the path that was
        previously hard-coded).
    n_frames : int, optional
        Number of frames every chromagram is forced to (default 1293).

    Returns
    -------
    np.ndarray
        ``X`` followed by one chromagram per track; ``X`` itself when
        ``track_ids`` is empty.
    """
    feats = []
    for track_id in track_ids:
        filepath = get_audio_path(audio_dir, track_id)
        x, sr = librosa.load(filepath)
        chrom = librosa.feature.chroma_stft(y=x, sr=sr)
        frames = chrom.shape[1]
        if frames < n_frames:
            # Too short: edge-pad both sides; an odd shortfall puts the extra
            # frame on the right.
            left = (n_frames - frames) // 2
            right = (n_frames - frames) - left
            chrom = np.pad(chrom, ((0, 0), (left, right)), 'edge')
        elif frames > n_frames:
            # Too long: keep the leading frames so every feature has the same
            # width (previously this case crashed the append with a shape
            # mismatch).
            chrom = chrom[:, :n_frames]
        feats.append(chrom)

    if not feats:
        return X
    # Join once at the end; appending to X inside the loop re-copied the whole
    # array for every track.
    return np.concatenate([X, np.stack(feats)], axis=0)


# Run chromagram extraction for the training, validation and test track ids.
# Each call returns its input array with the new chromagrams appended, and the
# result overwrites the input variable, so re-running this cell appends the
# same features a second time.
chrom_train = chroma_extraction(chrom_train,train_track_ids)
chrom_val = chroma_extraction(chrom_val,val_track_ids)
chrom_test = chroma_extraction(chrom_test,test_track_ids)

In [None]:
# Entry 0 of each array is the uninitialised placeholder the array was seeded
# with, not a real chromagram, so it is removed here.
# Caution: running this cell twice would also remove the first real feature.
chrom_train = np.delete(chrom_train, obj=0, axis=0)
chrom_val = np.delete(chrom_val, obj=0, axis=0)
chrom_test = np.delete(chrom_test, obj=0, axis=0)


In [None]:
# Display the chromagram training array as the cell output.

chrom_train

In [None]:
# Additionally, we have to remove all possible nan values in the dataset
#chrom_train = np.nan_to_num(chrom_train)
#chrom_val = np.nan_to_num(chrom_val)
#chrom_test = np.nan_to_num(chrom_test)

In [None]:
#set-up a CNN model
#data_augment = tf.keras.Sequential([
#  tf.keras.layers.RandomFlip('horizontal', input_shape=(12,1293, 1)),
#  tf.keras.layers.RandomRotation(0.3),
#])

# CNN for the chromagram features, input shape (12, 1293, 1).
# Three stages of two 7x7 'same'-padded ReLU convolutions; the first two
# stages end in max pooling (the first also in dropout), then a dense head.
chrom_model = models.Sequential([
    # Stage 1
    layers.Conv2D(30, kernel_size=(7, 7), activation='relu', input_shape=(12, 1293, 1), padding='same'),
    # (disabled) data_augment
    layers.Conv2D(60, kernel_size=(7, 7), activation='relu', padding='same'),
    # (disabled) layers.BatchNormalization()
    layers.MaxPooling2D(),
    layers.Dropout(0.4),

    # Stage 2
    layers.Conv2D(60, kernel_size=(7, 7), activation='relu', padding='same'),
    layers.Conv2D(60, kernel_size=(7, 7), activation='relu', padding='same'),
    layers.MaxPooling2D(),

    # Stage 3
    layers.Conv2D(60, kernel_size=(7, 7), activation='relu', padding='same'),
    layers.Conv2D(60, kernel_size=(7, 7), activation='relu', padding='same'),
    # (disabled) layers.MaxPooling2D()

    # Classifier head: the final Dense layer has no activation and emits
    # 8 raw scores.
    layers.Flatten(),
    layers.Dense(60, activation='relu'),
    # (disabled) layers.Dropout(0.4)
    layers.Dense(8),
])
# chrom_model.summary()


In [None]:
# Compile with Adam (learning rate 5e-5) and sparse categorical cross-entropy
# on logits, then train for 60 epochs in batches of 50 while tracking the
# validation subset.
opt = tf.keras.optimizers.Adam(learning_rate=0.00005)
chrom_model.compile(
    optimizer=opt,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

chrom_hist = chrom_model.fit(
    x=chrom_train,
    y=train_labels,
    batch_size=50,
    epochs=60,
    validation_data=(chrom_val, val_labels),
)

In [None]:
# Plot training vs validation accuracy and loss for the chromagram CNN.
acc = chrom_hist.history['accuracy']
val_acc = chrom_hist.history['val_accuracy']

loss = chrom_hist.history['loss']
val_loss = chrom_hist.history['val_loss']

# Take the epoch count from the recorded history instead of repeating the
# number passed to fit(), so the x axis always matches the curves.
epochs = len(acc)
epoch_axis = range(epochs)

fig, (ax_acc, ax_loss) = plt.subplots(1, 2)

ax_acc.plot(epoch_axis, acc, label="Training Accuracy")
ax_acc.plot(epoch_axis, val_acc, label="Validation Accuracy")
ax_acc.legend()
ax_acc.set_title('Training vs Validation Accuracy')
ax_acc.set_xlabel("Epochs")
ax_acc.set_ylabel("Accuracy")

ax_loss.plot(epoch_axis, loss, label="Training Loss")
ax_loss.plot(epoch_axis, val_loss, label="Validation Loss")
ax_loss.legend()
ax_loss.set_title('Training vs Validation Loss')
ax_loss.set_xlabel("Epochs")
ax_loss.set_ylabel("Loss")

# Keep the two panels' titles and axis labels from overlapping.
fig.tight_layout()
plt.show()

In [None]:
#Trial 2: MFCC spectrogram feature extraction
# One uninitialised (1, 20, 1293) placeholder per subset; the extracted MFCC
# arrays are appended after it along axis 0.
mfcc_train, mfcc_val, mfcc_test = (np.empty([1, 20, 1293]) for _ in range(3))

In [None]:
def mfcc_extraction(X, track_ids, audio_dir='/kaggle/input/fma-audio/fma_small', n_frames=1293):
    """Append one MFCC array per track to ``X`` and return the result.

    Parameters
    ----------
    X : np.ndarray
        Existing feature array; its trailing dimensions must match one MFCC
        array of ``n_frames`` frames. New features are appended along axis 0.
    track_ids : iterable of int
        Track ids whose audio files are loaded via ``get_audio_path``.
    audio_dir : str, optional
        Root directory of the audio files (defaults to the path that was
        previously hard-coded).
    n_frames : int, optional
        Number of frames every MFCC array is forced to (default 1293).

    Returns
    -------
    np.ndarray
        ``X`` followed by one MFCC array per track; ``X`` itself when
        ``track_ids`` is empty.
    """
    feats = []
    for track_id in track_ids:
        filepath = get_audio_path(audio_dir, track_id)
        x, sr = librosa.load(filepath)
        mfcc = librosa.feature.mfcc(y=x, sr=sr)
        frames = mfcc.shape[1]
        if frames < n_frames:
            # Too short: edge-pad both sides; an odd shortfall puts the extra
            # frame on the right.
            left = (n_frames - frames) // 2
            right = (n_frames - frames) - left
            mfcc = np.pad(mfcc, ((0, 0), (left, right)), 'edge')
        elif frames > n_frames:
            # Too long: keep the leading frames so every feature has the same
            # width (previously this case crashed the append with a shape
            # mismatch).
            mfcc = mfcc[:, :n_frames]
        feats.append(mfcc)

    if not feats:
        return X
    # Join once at the end; appending to X inside the loop re-copied the whole
    # array for every track.
    return np.concatenate([X, np.stack(feats)], axis=0)


# Run MFCC extraction for the training, validation and test track ids.
# Each call returns its input array with the new MFCC arrays appended, and the
# result overwrites the input variable, so re-running this cell appends the
# same features a second time.
mfcc_train = mfcc_extraction(mfcc_train,train_track_ids)
mfcc_val = mfcc_extraction(mfcc_val,val_track_ids)
mfcc_test = mfcc_extraction(mfcc_test,test_track_ids)

In [None]:
# Entry 0 of each array is the uninitialised placeholder the array was seeded
# with, not a real MFCC array, so it is removed here.
# Caution: running this cell twice would also remove the first real feature.
mfcc_train = np.delete(mfcc_train, obj=0, axis=0)
mfcc_val = np.delete(mfcc_val, obj=0, axis=0)
mfcc_test = np.delete(mfcc_test, obj=0, axis=0)

In [None]:
def normalize(x):
    """Scale ``x`` by its largest element.

    Every value is divided by the single maximum taken over the whole input,
    so the largest element of the result is 1.
    """
    peak = np.max(x)
    scaled = x / peak
    return scaled
# Scale each MFCC subset by its own global maximum.
# NOTE(review): the validation and test arrays are divided by their own maxima
# rather than by the training maximum, so the three subsets are not on exactly
# the same scale - confirm this is intended.
mfcc_train = normalize(mfcc_train)
mfcc_val = normalize(mfcc_val)
mfcc_test = normalize(mfcc_test)

In [None]:
#CNN model for MFCCs
#data_augment = tf.keras.Sequential([
 # tf.keras.layers.RandomFlip('horizontal', input_shape=(20,1293, 1)),
  #tf.keras.layers.RandomRotation(0.3),
#])
# CNN for the MFCC features, input shape (20, 1293, 1).
# Three stages of two 7x7 'same'-padded ReLU convolutions; the first two
# stages end in max pooling (the first also in dropout), then a dense head.
mfcc_model = models.Sequential([
    # Stage 1
    layers.Conv2D(30, kernel_size=(7, 7), activation='relu', input_shape=(20, 1293, 1), padding='same'),
    # (disabled) data_augment
    layers.Conv2D(60, kernel_size=(7, 7), activation='relu', padding='same'),
    # (disabled) an extra Conv2D(60, 7x7) and layers.BatchNormalization()
    layers.MaxPooling2D(),
    layers.Dropout(0.4),

    # Stage 2
    layers.Conv2D(60, kernel_size=(7, 7), activation='relu', padding='same'),
    layers.Conv2D(60, kernel_size=(7, 7), activation='relu', padding='same'),
    layers.MaxPooling2D(),

    # Stage 3
    layers.Conv2D(60, kernel_size=(7, 7), activation='relu', padding='same'),
    layers.Conv2D(60, kernel_size=(7, 7), activation='relu', padding='same'),

    # Classifier head: the final Dense layer has no activation and emits
    # 8 raw scores.
    layers.Flatten(),
    layers.Dense(60, activation='relu'),
    layers.Dense(8),
])
# mfcc_model.summary()


In [None]:
# Compile with Adam (learning rate 5e-5) and sparse categorical cross-entropy
# on logits, then train for 50 epochs in batches of 30 while tracking the
# validation subset.
opt = tf.keras.optimizers.Adam(learning_rate=0.00005)
mfcc_model.compile(
    optimizer=opt,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

mfcc_hist = mfcc_model.fit(
    x=mfcc_train,
    y=train_labels,
    batch_size=30,
    epochs=50,
    validation_data=(mfcc_val, val_labels),
)

In [None]:
# Plot training vs validation accuracy and loss for the MFCC CNN.
acc = mfcc_hist.history['accuracy']
val_acc = mfcc_hist.history['val_accuracy']

loss = mfcc_hist.history['loss']
val_loss = mfcc_hist.history['val_loss']

# Take the epoch count from the recorded history instead of repeating the
# number passed to fit(), so the x axis always matches the curves.
epochs = len(acc)
epoch_axis = range(epochs)

fig, (ax_acc, ax_loss) = plt.subplots(1, 2)

ax_acc.plot(epoch_axis, acc, label="Training Accuracy")
ax_acc.plot(epoch_axis, val_acc, label="Validation Accuracy")
ax_acc.legend()
ax_acc.set_title('Training vs Validation Accuracy')
ax_acc.set_xlabel("Epochs")
ax_acc.set_ylabel("Accuracy")

ax_loss.plot(epoch_axis, loss, label="Training Loss")
ax_loss.plot(epoch_axis, val_loss, label="Validation Loss")
ax_loss.legend()
ax_loss.set_title('Training vs Validation Loss')
ax_loss.set_xlabel("Epochs")
ax_loss.set_ylabel("Loss")

# Keep the two panels' titles and axis labels from overlapping.
fig.tight_layout()
plt.show()

In [None]:
#Baseline SVM implementation (note: svm.SVC is used below with its default kernel, which is RBF rather than linear)

#First, we extract spectral flatness features from music files
# One uninitialised (1, 1293) placeholder row per subset; the extracted
# spectral-flatness rows are appended below it.
spec_flat_train, spec_flat_val, spec_flat_test = (np.empty([1, 1293]) for _ in range(3))

def flat_extraction(X, track_ids, audio_dir='/kaggle/input/fma-audio/fma_small', n_frames=1293):
    """Append one spectral-flatness row per track to ``X`` and return the result.

    Parameters
    ----------
    X : np.ndarray
        Existing 2-D feature array with ``n_frames`` columns. Each track's
        feature is added as one more row.
    track_ids : iterable of int
        Track ids whose audio files are loaded via ``get_audio_path``.
    audio_dir : str, optional
        Root directory of the audio files (defaults to the path that was
        previously hard-coded).
    n_frames : int, optional
        Number of frames every feature row is forced to (default 1293).

    Returns
    -------
    np.ndarray
        ``X`` followed by one row per track; ``X`` itself when ``track_ids``
        is empty.
    """
    feats = []
    for track_id in track_ids:
        filepath = get_audio_path(audio_dir, track_id)
        x, _ = librosa.load(filepath)
        flat = librosa.feature.spectral_flatness(y=x)
        frames = flat.shape[1]
        if frames < n_frames:
            # Too short: edge-pad both sides; an odd shortfall puts the extra
            # frame on the right.
            left = (n_frames - frames) // 2
            right = (n_frames - frames) - left
            flat = np.pad(flat, ((0, 0), (left, right)), 'edge')
        elif frames > n_frames:
            # Too long: keep the leading frames so every row has the same
            # width (previously this case crashed the append with a shape
            # mismatch).
            flat = flat[:, :n_frames]
        feats.append(flat)

    if not feats:
        return X
    # Join once at the end; appending to X inside the loop re-copied the whole
    # array for every track.
    return np.concatenate([X] + feats, axis=0)

# Run spectral-flatness extraction for the training, validation and test track ids.
# Each call returns its input array with the new rows appended, and the result
# overwrites the input variable, so re-running this cell appends the same
# features a second time.
spec_flat_train = flat_extraction(spec_flat_train,train_track_ids)
spec_flat_val = flat_extraction(spec_flat_val,val_track_ids)
spec_flat_test = flat_extraction(spec_flat_test,test_track_ids)

In [None]:
# Row 0 of each array is the uninitialised placeholder the array was seeded
# with, not a real feature row, so it is removed here.
# Caution: running this cell twice would also remove the first real row.
spec_flat_train = np.delete(spec_flat_train, obj=0, axis=0)
spec_flat_val = np.delete(spec_flat_val, obj=0, axis=0)
spec_flat_test = np.delete(spec_flat_test, obj=0, axis=0)

In [None]:
# Fit the baseline SVM on the spectral-flatness features.
# NOTE(review): svm.SVC uses an RBF kernel unless kernel= is passed, so as
# configured this is not a linear SVM - pass kernel='linear' if that is what
# the baseline is meant to be.
classifier = svm.SVC(decision_function_shape='ovo')
classifier.fit(spec_flat_train, train_labels)

# Score the fitted classifier on the test and validation subsets, then report both.
test_accuracy = classifier.score(spec_flat_test, test_labels)
val_accuracy = classifier.score(spec_flat_val, val_labels)
print("Baseline SVM Test Accuracy: " + str(test_accuracy))
print("Baseline SVM Validation Accuracy: "+ str(val_accuracy))

In [None]:
# testing models
#t_score, acc = mfcc_model.evaluate(mfcc_test, test_labels)
#print('MFCC Test accuracy:', acc)

#t_score, acc = chrom_model.evaluate(chrom_test, test_labels)
#print('Chroma Test accuracy:', acc)