In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy as sp
from sklearn import linear_model
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import decomposition

# Data selection
The dataset is filtered by the classes *metal*$ \cup$*punk* and *dance and electronica*, assigning 1 and 0 as their respective labels.

In [2]:
data = pd.read_csv('msd_genre_dataset.txt',header=9)
dataf = data[(data['%genre']=='metal') | (data['%genre']=='punk') | (data['%genre']=='dance and electronica')]
dataf = dataf.replace(to_replace='punk', value=True)
dataf = dataf.replace(to_replace='metal', value=True)
dataf = dataf.replace(to_replace='dance and electronica', value=False)
data.head()

Unnamed: 0,%genre,track_id,artist_name,title,loudness,tempo,time_signature,key,mode,duration,...,var_timbre3,var_timbre4,var_timbre5,var_timbre6,var_timbre7,var_timbre8,var_timbre9,var_timbre10,var_timbre11,var_timbre12
0,classic pop and rock,TRFCOOU128F427AEC0,Blue Oyster Cult,Mes Dames Sarat,-8.697,155.007,1,9,1,246.33424,...,1255.514569,580.030472,598.485223,575.337671,322.068603,321.726029,232.700609,186.805303,181.938688,151.508011
1,classic pop and rock,TRNJTPB128F427AE9F,Blue Oyster Cult,Screams,-10.659,148.462,1,4,0,189.80526,...,2007.65307,1043.474073,585.694981,564.013736,510.177022,400.200186,365.119588,238.099708,197.933757,251.577525
2,classic pop and rock,TRLFJHA128F427AEEA,Blue Oyster Cult,Dance The Night Away,-13.494,112.909,1,10,0,158.1971,...,1204.856777,2736.520024,730.233239,665.203452,535.775111,439.335059,486.82297,265.33386,447.097987,251.880724
3,classic pop and rock,TRCQZAG128F427DB97,Blue Oyster Cult,Debbie Denise,-12.786,117.429,4,7,1,250.22649,...,809.755802,563.90807,492.803819,378.382799,372.875044,231.941957,246.313305,168.400152,85.282462,339.897173
4,classic pop and rock,TRNXMNM128F427DB8C,Blue Oyster Cult,(Don't Fear) The Reaper,-14.093,141.536,4,9,0,307.06893,...,1093.684935,343.556047,889.163314,218.111796,304.862864,178.352161,440.478867,142.669283,81.061326,208.355152


# Features
It is easy to see that there are 30 quantitative features to in the dataset, so these are the ones used for the analysis.

In [3]:
X = dataf[dataf.keys()[4:]]
Y = dataf['%genre']

# First approach
The first approach is to use the whole raw dataset to try and make predictions about the genres with a linear classifier. For this purpose, 20% of the data are reserved for validation, keeping in mind that approximately 10000 observations will leave enough data for the training and tessting sets.

In the linear classifier approach the average stochastic gradient solver will be used, in accordance with the methodology of stochastic gradient descent learned in class.

In [4]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.2)
linc = linear_model.LogisticRegression(solver='sag', max_iter=1000)
linc.fit(X_train, Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)

# Goodness of classification
The goodness of the classification is evaluated by measuring the mean accuracy of the classifier in the test samples, and comparing it to the training samples to annalize the error and the overfitting of the model.

In [5]:
print('The mean accuracy on the training samples is: %f' %(linc.score(X_train, Y_train)))
print('The mean accuracy on the testing samples is: %f' %(linc.score(X_test, Y_test)))

The mean accuracy on the training samples is: 0.842369
The mean accuracy on the testing samples is: 0.841797


The accuracy on the classifier is good, as it achieves a reasonably good fit considering the used data, and the fact that there is no big difference between the accuracies on the testing and training datasets. However a better classifier in respect to accuracy may be found if previous treatment of data is performed.

# Standardisation
One of the simplest operations that can be performed with each of the features is to normalise it with respect ot their means and variances, which will allow for equal assignment of the "weights", as they will have the same scale.

After doing this, the whole fitting process is repeated.

In [6]:
scaler = preprocessing.StandardScaler().fit(X_train)
X_trains = scaler.transform(X_train)
X_tests = scaler.transform(X_test)

In [7]:
lincs = linear_model.LogisticRegression(solver='sag', max_iter=1000)
lincs.fit(X_trains, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)

# Goodness
The goodness of this classifier is:

In [8]:
print('The mean accuracy on the training samples is: %f' %(lincs.score(X_trains, Y_train)))
print('The mean accuracy on the testing samples is: %f' %(lincs.score(X_tests, Y_test)))

The mean accuracy on the training samples is: 0.860440
The mean accuracy on the testing samples is: 0.856445


The accuracy of this classifier is classifier is slightly better, however there is reason to think there may be better classifiers as not all features may be as important to the analysis.

In order to solve this problem a principal component analysis is performed, and the fitting proceduere is repeated with this transformation.

# Principal Component Analysis
In this part two fits will be performed, one with all components of the analysis are kept, and one where only the first components that explain at least 95% of the variance will be kept.

In [9]:
PCA_scaler = decomposition.PCA().fit(X_train)
X_traint = PCA_scaler.fit_transform(X_train)
X_testt = PCA_scaler.fit_transform(X_test)

In [10]:
lincp = linear_model.LogisticRegression(solver='sag', max_iter=1000)
lincp.fit(X_traint, Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)

# Goodness

In [11]:
print('The mean accuracy on the training samples is: %f' %(lincp.score(X_traint, Y_train)))
print('The mean accuracy on the testing samples is: %f' %(lincp.score(X_testt, Y_test)))

The mean accuracy on the training samples is: 0.838828
The mean accuracy on the testing samples is: 0.778320


In this case there is not a significant improvement on the goodness of the classifier and there is evidence of overfitting, as there is a bigger difference between the accuracy on the training samples and the accuracy on the testing samples. Given the amount of samples used for testing, this is a reasonable assumption.

# Dimensionality reduction
As discussed previously, now only the most important components will be taken into account. These components are the first 4 as shown.

In [12]:
PCA_scaler.explained_variance_ratio_

array([  7.10040861e-01,   1.53387214e-01,   8.58806375e-02,
         1.87632621e-02,   1.23080747e-02,   6.74138947e-03,
         4.63492592e-03,   2.75615131e-03,   1.66082743e-03,
         1.26293832e-03,   1.21468149e-03,   6.42956978e-04,
         2.99442612e-04,   1.27186165e-04,   1.18095839e-04,
         4.88292686e-05,   3.37638836e-05,   2.65223552e-05,
         1.37429358e-05,   1.16913945e-05,   7.80895987e-06,
         6.81759286e-06,   5.41041403e-06,   3.04847733e-06,
         1.65771562e-06,   1.27503827e-06,   5.74825622e-07,
         1.49417766e-07,   4.22143311e-08,   2.04898759e-08])

In [13]:
lincpd = linear_model.LogisticRegression(solver='sag')
lincpd.fit(X_traint[:,:4], Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)

# Goodness

In [14]:
print('The mean accuracy on the training samples is: %f' %(lincpd.score(X_traint[:,:4], Y_train)))
print('The mean accuracy on the testing samples is: %f' %(lincpd.score(X_testt[:,:4], Y_test)))

The mean accuracy on the training samples is: 0.776679
The mean accuracy on the testing samples is: 0.763184


By using only the most important components, the overfitting problem is solved, however the accuracy is signifficantly worse.

# The last attempt
It can be seen that only the cases where standardisation or dimensionality reduction were used presented convergence, this is due to the big difference in variances among the features, when standardisation was not performed, which caused undesirable oscillations in the gradient, which delayed the convergence.

There are two possible solutions for this, one is to perform standardisation on the PCA transformation (which has the advantage of giving uncorrelated features) and the other one is to use a different sover that allows for faster convergence. Both approaches will be shown next.

# Standardisation of PCA

In [15]:
PCA_rescaler = preprocessing.StandardScaler().fit(X_traint)
X_traintre = PCA_rescaler.transform(X_traint)
X_testtre = PCA_rescaler.transform(X_testt)

In [16]:
lincpr = linear_model.LogisticRegression(solver='sag', max_iter=1000)
lincpr.fit(X_traintre, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)

# Goodness

In [17]:
print('The mean accuracy on the training samples is: %f' %(lincpr.score(X_traintre, Y_train)))
print('The mean accuracy on the testing samples is: %f' %(lincpr.score(X_testtre, Y_test)))

The mean accuracy on the training samples is: 0.860440
The mean accuracy on the testing samples is: 0.750488


There is a slight improvement with respect to the PCA decomposition without additional treatment, however there is evidence of overfitting which gisves a significantly worse performance of this approach with respect to only doing standardisation. This may be improved with dimensionality reduction.

In [18]:
lincprd = linear_model.LogisticRegression(solver='sag', max_iter=1000)
lincprd.fit(X_traintre[:,:4], Y_train)
print('The mean accuracy on the training samples is: %f' %(lincprd.score(X_traintre[:,:4], Y_train)))
print('The mean accuracy on the testing samples is: %f' %(lincprd.score(X_testtre[:,:4], Y_test)))

The mean accuracy on the training samples is: 0.782051
The mean accuracy on the testing samples is: 0.771484


Again, the overfitting problem is corrected, but the performance is still signifficantly worse. In this case it is not be worth all the trouble.

# The recommended solver
In this case the expected results will be similar to the standardisation cases, as the same minimum will be reached even though the adjusted parameters will be in a different scale.

The default solver for the function LogisticRegression is a coordinate descent algorithm, not seen in class.

In [19]:
lincN = linear_model.LogisticRegression(max_iter=1000)
lincN.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Goodness

In [20]:
print('The mean accuracy on the training samples is: %f' %(lincN.score(X_train, Y_train)))
print('The mean accuracy on the testing samples is: %f' %(lincN.score(X_test, Y_test)))

The mean accuracy on the training samples is: 0.859951
The mean accuracy on the testing samples is: 0.856445


As predicted, the accuracy is very similar to the one found through standardisation.

# Conclusions
The best accuracy is given by standardising the features and using the whole dataset, as it yields the best accuracy of the different approaches without the overfitting.

Given the amount of samples reserved for testing, there is high probability that the estimated accuracy of the algorithm is close to the real accuracy.

Standardisation of the samples helps with the convergence of the algorithms by eliminating undesirable oscillations due to high eccentricities given by the feature scales.

Use of transformations like PCA should be handled with care as the use of a number of component equal to the number of features leads to overfitting.

In this case, dimensionality reduction is not really necessary as there is a small number of features with respect to the number of samples and the computational power available is more than enough to handle the problem.