# CSE 802 Project: Genre recognition using FMA
Kevin McMahon

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the three FMA metadata tables with pandas' default CSV settings.
# Later cells strip the leading header rows that come in as ordinary data.
tracks, genres, echonest = (
    pd.read_csv(csv_path)
    for csv_path in (
        'fma_metadata/tracks.csv',
        'fma_metadata/genres.csv',
        'fma_metadata/echonest.csv',
    )
)

In [None]:
# Display the (rows, columns) shape of each table as loaded.
echonest.shape, genres.shape, tracks.shape

# Genres
Create DataFrame consisting of #tracks and genre_id for the 8 most popular top level genres

In [None]:
# Preview the first rows of the genre table.
genres.head()

In [None]:
# Keep only top-level genres (rows whose 'parent' is 0) and label the first
# column 'Genre id'.
is_top_level = genres['parent'] == 0
genres = genres.loc[is_top_level].rename(columns={genres.columns[0]: 'Genre id'})

# Index by genre id; the 'Genre id' column itself stays on `genres`.
genres.index = genres['Genre id']

# The 8 top-level genres with the most tracks, largest first.
base_genres = (
    genres.drop(columns=['Genre id', 'parent', 'top_level'])
    .sort_values('#tracks', ascending=False)
    .head(8)
)


# def f(dat, c='red'):
#     return [f'background-color: {c}' for i in dat]

# df.style.apply(f, axis=0, subset=['X'])

# One display color per selected genre, listed in the same order as the rows
# of base_genres (sorted by '#tracks', descending).
colors_list = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'white']

# The list needs one entry per row of base_genres for this assignment to work.
base_genres['Color'] = colors_list

def color_cols(s):
    """Styler callback that paints the color column with its own colors.

    ``s`` is one column of the styled frame. When its values are exactly
    ``colors_list``, each cell gets a matching ``background-color`` CSS rule;
    any other column gets empty styles. The result always has one entry per
    element of ``colors_list``.
    """
    if list(s.values) != colors_list:
        return ['' for _ in colors_list]
    return [f'background-color: {name}' for name in colors_list]

# Heading for the styled table rendered below.
print(f'{8} Most popular top-level genres')
# Styler.apply passes each column to color_cols (column-wise by default), so
# only the 'Color' column ends up with background colors.
base_genres.style.apply(color_cols)

# Tracks
1. Remove first 2 rows of header info
2. Change 'track.8' genre tags from string to list
3. Index the resulting dataframe by track number

In [None]:
# Drop the two leftover header rows. Take an explicit copy so the column
# assignment below writes to an independent frame instead of a slice of the
# original (which triggers pandas' SettingWithCopyWarning).
tracks = tracks[2:].copy()


def _parse_genre_tags(raw):
    """Turn a tag string such as '[21, 38]' into ['21', '38'].

    An empty tag string ('[]') yields an empty list rather than [''], so
    consumers never see an empty-string genre id.
    """
    tokens = raw.strip('[]').replace(' ', '').split(',')
    # ''.split(',') returns [''], so filter out empty tokens.
    return [token for token in tokens if token]


# Change the 'track.8' genre tags from a string to a list of id strings.
tracks['track.8'] = tracks['track.8'].transform(_parse_genre_tags)

3. Make dataframe index the track number

In [None]:
# Label the first column 'Track number' and promote it to the row index,
# which also removes it from the columns.
tracks = (
    tracks.rename(columns={tracks.columns[0]: 'Track number'})
    .set_index('Track number')
)
tracks.head()

# Echonest
1. Eliminate first 3 rows of headers
2. Remove features from echonest that are NaN or string (artist, release date, etc)
3. Index the resulting dataframe by track number

In [None]:
# Discard the three leftover header rows.
echonest = echonest[3:]

# Columns at positions 9-20 hold the NaN/string metadata (artist, release
# date, etc.) that cannot be used as numeric features.
non_numeric_columns = echonest.columns[9:21]
first_column = echonest.columns[0]

# Drop the metadata, then index the frame by track number.
echonest = (
    echonest.drop(columns=non_numeric_columns)
    .rename(columns={first_column: 'Track number'})
    .set_index('Track number')
)
echonest.head()

In [None]:
# Shape of the echonest table after header rows and metadata columns were removed.
echonest.shape

# Get samples from 8 base genres
- Additionally, create a dict that maps a track_id to its top genre

In [None]:
# Upper limit on how many tracks the sampling loop collects before it stops.
samples_to_obtain = 10000

In [None]:
# Maps track_id -> the first of its genres that is one of the base genres.
genre_of_track = dict()

genre_map = tracks['track.8']
# A set gives constant-time membership tests inside the loop.
top_level_genres = set(base_genres.index.tolist())
num_samples = 0

# Matching feature rows are gathered here and turned into a DataFrame once at
# the end; growing a DataFrame one row at a time copies it on every insert.
selected_rows = []

# Get samples whose genre is one of the 8 base genres
for track_id, features in echonest.iterrows():
    if num_samples >= samples_to_obtain:
        break

    try:
        track_genres = genre_map.loc[track_id]
        # First genre tag of this track that is a base genre, if any.
        matched_genre = next(
            (int(gen) for gen in track_genres if int(gen) in top_level_genres),
            None,
        )
    except (KeyError, ValueError, TypeError):
        # KeyError: the track has no entry in genre_map.
        # ValueError: a genre tag is not an integer string (e.g. empty).
        # TypeError: the genre cell is not iterable.
        # Such tracks are skipped; any other error is allowed to surface.
        continue

    if matched_genre is None:
        continue

    selected_rows.append(features)
    genre_of_track[track_id] = matched_genre
    num_samples += 1

# Each collected row is a Series named after its track_id, so the track ids
# become the index of the resulting frame.
feature_space = pd.DataFrame(selected_rows, columns=echonest.columns)

# Dictionary, maps track_id to genre_id
genre_of_track

# DataFrame of features for each track
feature_space.shape

# Feature Selection: Variance Threshold
- Remove features below a specified variance
- Observe relationship between features retained and threshold

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Sweep 300 variance thresholds between 0 and 200 and record how many
# features survive each one.
var_range = np.linspace(0, 200, 300)
threshold = list(var_range)
num_features = [
    VarianceThreshold(cutoff).fit_transform(feature_space).shape[1]
    for cutoff in var_range
]

plt.plot(threshold, num_features)
plt.title(f'Number of features vs. Variance Threshold, $n = {len(feature_space)}$')
plt.xlabel('Variance Threshold')
plt.ylabel('# features selected');

# Feature Extraction: PCA
- Use PCA to project features to 3D scatter plot

In [None]:
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import normalize

In [None]:
# Project the normalized feature space onto its first three principal
# components.
pca = PCA(n_components=3)
pca_data = pca.fit_transform(normalize(feature_space))

fig = plt.figure()
ax = plt.axes(projection='3d')

# Matplotlib color code for each base genre id.
genre_color_map = {
    38: 'b',
    15: 'g',
    12: 'r',
    1235: 'c',
    10: 'm',
    17: 'y',
    21: 'k',
    2: 'w'
}

# One coordinate list per principal component, and one color per sample
# looked up through the sample's genre.
X, Y, Z = (list(pca_data[:, axis]) for axis in range(3))
colors = [
    genre_color_map[genre_of_track[track_id]]
    for track_id in feature_space.index.values
]

ax.scatter(X, Y, Z, c=colors, alpha=0.5)

# Optionally project plot onto each axis, only useful for n < 10
# ax.scatter(X, Z, marker='>', c=colors, zdir='y')
# ax.scatter(Y, Z, marker='^', c=colors, zdir='x')
# ax.scatter(X, Y, marker='<', c=colors, zdir='z')

ax.set_title(f'PCA for feature space, $n = {len(feature_space)}$')
ax.set_xlabel('$x$')
ax.set_ylabel('$y$')
ax.set_zlabel('$z$');

# Feature Extraction: LDA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [None]:
# Class/genre labels for each sample
track_genres = []
for track_id in list(feature_space.index.values):
    track_genres.append(genre_of_track[track_id])

lda = LDA(n_components=3)
lda_data = lda.fit_transform(feature_space, track_genres)

fig = plt.figure()
ax = plt.axes(projection='3d')

X=[]
Y=[]
Z=[]
colors=[]
for index, sample in enumerate(lda_data):
    # Add info to appropriate lists
    X.append(sample[0])
    Y.append(sample[1])
    Z.append(sample[2])
    gen = genre_of_track[feature_space.index.values[index]]
    colors.append(genre_color_map[gen])

ax.scatter(X,Y,Z, c=colors, alpha=0.5)
ax.set_title(f'LDA for feature space, $n = {len(feature_space)}$')
ax.set_xlabel('$x$')
ax.set_ylabel('$y$')
ax.set_zlabel('$z$');

# Preprocessing: LDA with SelectKBest
- Select the k 'best' features from the feature space
- Scale the feature space so that no feature value is negative (the chi-squared score used by SelectKBest requires non-negative inputs)
- Perform LDA and plot results, compare to LDA without selection

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler

Specify value of $d$ (dimensions) and number of features to select before performing LDA

In [None]:
# Number of features kept by SelectKBest.
# Must be greater than dimensions
features_to_select = 150

# Number of dimensions LDA reduces the selected features to.
# NOTE(review): the original note gave a maximum of 237, but scikit-learn's
# LDA allows at most min(n_features, n_classes - 1) components, i.e. 7 when
# all 8 base genres are present. Confirm before raising this value.
dimensions = 5

In [None]:
# Class/genre labels for each sample
track_genres = []
for track_id in list(feature_space.index.values):
    track_genres.append(genre_of_track[track_id])
    
# Scale so inputs to .fit() functions are not negative
min_max_scaler = MinMaxScaler(feature_range=(0,5))
scaled_feature_space = min_max_scaler.fit_transform(feature_space)

# K Best feature selection
kbest_features = SelectKBest(chi2, k=features_to_select).fit_transform(scaled_feature_space, track_genres)

# Now use LDA to show effect of selection
# Include kbest_features to apply selection to LDA
lda_plot = LDA(n_components=3)
lda_data_plot = lda_plot.fit_transform(kbest_features, track_genres)

fig = plt.figure()
ax = plt.axes(projection='3d')

X=[]
Y=[]
Z=[]
colors=[]
for index, sample in enumerate(lda_data_plot):
    # Add info to appropriate lists
    X.append(sample[0])
    Y.append(sample[1])
    Z.append(sample[2])
    gen = genre_of_track[feature_space.index.values[index]]
    colors.append(genre_color_map[gen])

ax.scatter(X,Y,Z, c=colors, alpha=0.5)
ax.set_title(f'LDA on {dimensions} Best Features from KBest, $n = {len(feature_space)}$')
ax.set_xlabel('$x$')
ax.set_ylabel('$y$')
ax.set_zlabel('$z$')

# Now do LDA for the correct number of dimensions
lda = LDA(n_components=dimensions)
lda_data = lda.fit_transform(kbest_features, track_genres)

# Use for this and subsequent classifiers to group samples with their class
samples_with_labels = [(lda_data[i], track_genres[i]) for i in range(len(track_genres))]

lda_data.shape

# Bayesian Classification: MLE

In [None]:
from scipy.stats import multivariate_normal as mvn
from sklearn.model_selection import train_test_split, KFold
from sklearn.covariance import EmpiricalCovariance

In [None]:
def compute_mean(data, num_features):
    """Return the element-wise mean of the sample vectors in ``data``.

    data: sequence of feature vectors, one per sample.
    num_features: length of each vector; sizes the zero vector that the
        running sum starts from.
    """
    # sum() adds the samples onto the zero start vector one by one, in order.
    summed = sum(data, np.zeros(num_features, dtype=int))
    return summed / len(data)

In [None]:
def compute_cov(data, num_features, mean):
    """Return the sample covariance matrix of ``data`` around ``mean``.

    data: sequence of feature vectors, one per sample.
    num_features: length of each vector; the result is
        (num_features, num_features).
    mean: vector subtracted from every sample.

    The summed outer products of the deviations are divided by
    ``len(data) - 1``.
    """
    scatter = np.zeros((num_features, num_features), dtype=int)
    for observation in data:
        deviation = np.subtract(observation, mean)
        # Outer product of the deviation with itself: entry (i, j) is
        # deviation[i] * deviation[j].
        scatter = scatter + np.outer(deviation, deviation)
    return scatter / (len(data) - 1)

In [None]:
# 10-fold cross-validation of a Gaussian classifier whose per-class mean and
# covariance are estimated from the training folds.
kf = KFold(n_splits=10, shuffle=True)
folds = kf.split(samples_with_labels)
error_rates = []

# Three-letter genre abbreviations label the confusion-matrix axes.
gen_list = [title[:3] for title in base_genres['title']]
short_name = {genre_id: title[:3] for genre_id, title in base_genres['title'].items()}

for train, test in folds:
    train_features = [samples_with_labels[i][0] for i in train]
    train_classes = [samples_with_labels[i][1] for i in train]
    test_features = [samples_with_labels[i][0] for i in test]
    test_classes = [samples_with_labels[i][1] for i in test]

    #
    # Train
    #

    # Group the training samples by class.
    train_dict = dict()
    for features, sample_class in zip(train_features, train_classes):
        train_dict.setdefault(sample_class, []).append(features)

    # Estimate one Gaussian per class. The frozen densities are built once
    # per fold instead of once per test sample.
    class_ids = list(train_dict)
    mu = []
    sigma = []
    densities = []
    for samples in train_dict.values():
        mu_i = compute_mean(samples, dimensions)
        sigma_i = compute_cov(samples, dimensions, mu_i)
        mu.append(mu_i)
        sigma.append(sigma_i)
        densities.append(mvn(mean=mu_i, cov=sigma_i, allow_singular=True))

    #
    # Test
    #

    # Initialize confusion matrix
    confusion = pd.DataFrame(0, index=gen_list, columns=gen_list)
    confusion = confusion.rename_axis('Actual')
    confusion = confusion.rename_axis('Predicted', axis='columns')

    # Iterate over the testing set
    for features, true_class in zip(test_features, test_classes):
        # Class-conditional density of this sample under every class
        class_cond = [rv.pdf(features) for rv in densities]

        # Predict the class with the highest density
        pred_class = class_ids[np.argmax(class_cond)]

        # A single .loc[row, col] updates the frame itself; the chained form
        # .loc[row].loc[col] may modify a temporary copy instead.
        confusion.loc[short_name[true_class], short_name[pred_class]] += 1

    # Error rate of this fold, in percent
    correct = sum(confusion.loc[i, i] for i in gen_list)
    num_samples = len(test_features)
    err = (num_samples-correct)/num_samples*100
    error_rates.append(err)

print(f'Average error rate: {np.mean(error_rates)}%')
print(f'Variance: {np.var(error_rates)}')
print(f'Standard deviation: {np.std(error_rates)}')

# K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Sweep k for k-NN, scoring each k by its mean error over 10-fold
# cross-validation, and keep the statistics of the best k.
k_range = range(2,200,2)
err = []
min_confusion_matrix = []
min_error = float('inf')
best_fold_errors = []

# A fixed label order gives every fold's confusion matrix the same shape, so
# the folds of one k can be summed.
genre_labels = sorted(set(track_genres))

for k in k_range:
    kf = KFold(n_splits=10, shuffle=True)
    folds = kf.split(samples_with_labels)

    knn = KNeighborsClassifier(n_neighbors=k)
    error_rate = []
    cm_total = np.zeros((len(genre_labels), len(genre_labels)), dtype=int)
    for train, test in folds:
        train_features = [samples_with_labels[i][0] for i in train]
        train_classes = [samples_with_labels[i][1] for i in train]
        test_features = [samples_with_labels[i][0] for i in test]
        test_classes = [samples_with_labels[i][1] for i in test]

        knn.fit(train_features, train_classes)

        predicted = knn.predict(test_features)

        cm_total += confusion_matrix(test_classes, predicted, labels=genre_labels)
        error_rate.append((1-accuracy_score(test_classes, predicted))*100)

    error = np.mean(error_rate)
    err.append(error)

    # Compare cross-validated means: a single lucky fold must not decide
    # which k is reported as best.
    if error < min_error:
        min_error = error
        min_confusion_matrix = [cm_total, k]
        best_fold_errors = error_rate

plt.plot(k_range, err)
plt.title(f'k-NN error rate vs. k, $n = {len(track_genres)}$')
plt.xlabel('k')
plt.ylabel('Error rate, %');

# All statistics below describe the best k, not the last k tried.
print(f'Minimum classification error occured when k = {min_confusion_matrix[1]}\n')
print(f'Error rate: {min_error}%\n')
print(f'Confusion Matrix:\n{min_confusion_matrix[0]}\n')
print(f'Variance: {np.var(best_fold_errors)}')
print(f'Standard deviation: {np.std(best_fold_errors)}')

# Multi Layer Perceptron

In [None]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.neural_network import MLPClassifier
import scipy.stats

In [None]:
# 10-fold cross-validation of a multi-layer perceptron on the labelled samples.
kf = KFold(n_splits=10, shuffle=True)
folds = kf.split(samples_with_labels)

error_rate = []

for train, test in folds:
    # Split the (features, label) pairs of each fold into parallel lists.
    train_features, train_classes = map(list, zip(*(samples_with_labels[i] for i in train)))
    test_features, test_classes = map(list, zip(*(samples_with_labels[i] for i in test)))

    # set 15% of training data for validation
    mlp = MLPClassifier(max_iter=500, early_stopping=True, validation_fraction=0.15)
    mlp.fit(train_features, train_classes)

    predicted = mlp.predict(test_features)

    cm = confusion_matrix(test_classes, predicted)
    # Misclassification rate of this fold, in percent
    fold_accuracy = accuracy_score(test_classes, predicted)
    error = (1-fold_accuracy)*100
    error_rate.append(error)

print(f'Average error rate: {np.mean(error_rate)}%')
print(f'Variance: {np.var(error_rate)}')
print(f'Standard deviation: {np.std(error_rate)}')

# Support Vector Machine

In [None]:
from sklearn.svm import SVC

In [None]:
# 10-fold cross-validation of a support vector classifier on the labelled samples.
kf = KFold(n_splits=10, shuffle=True)
folds = kf.split(samples_with_labels)

error_rate = []

for train, test in folds:
    train_features = [samples_with_labels[i][0] for i in train]
    train_classes = [samples_with_labels[i][1] for i in train]
    test_features = [samples_with_labels[i][0] for i in test]
    test_classes = [samples_with_labels[i][1] for i in test]

    # Fit and evaluate the SVM itself. The previous code created the SVC but
    # then trained and scored the `mlp` model left over from an earlier cell.
    clf = SVC()
    clf.fit(train_features, train_classes)

    predicted = clf.predict(test_features)

    cm = confusion_matrix(test_classes, predicted)
    # Misclassification rate of this fold, in percent
    error = (1-accuracy_score(test_classes, predicted))*100
    error_rate.append(error)

print(f'Average error rate: {np.mean(error_rate)}%')
print(f'Variance: {np.var(error_rate)}')
print(f'Standard deviation: {np.std(error_rate)}')