In [24]:
from sklearn.svm import SVC
import numpy as np
from scipy.io import loadmat
import matplotlib.pyplot as plt
import matplotlib.axes as ax
import pandas as pd

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score

In [3]:
def classification_accuracy(predicted, actual):
    """Return the share of matching predictions as a percentage string.

    Parameters
    ----------
    predicted, actual : indexable sequences of equal length
        Compared position by position with ``==``.

    Returns
    -------
    str
        ``matches / total * 100`` followed by ``"%"``, e.g. ``'50.0%'``.

    Raises
    ------
    ValueError
        If the two sequences differ in length.
    """
    total = len(predicted)
    if total != len(actual):
        raise ValueError('length of predicted not equal to length of actual.')
    # Positional indexing (rather than zip) keeps the element lookup identical
    # for any indexable container passed in.
    hits = sum(1 for idx in range(total) if predicted[idx] == actual[idx])
    return str((hits / total) * 100) + "%"

In [4]:
def outputLabels(y_data):
    """Copy the items of ``y_data`` into a new list, preserving iteration order."""
    return list(y_data)
#y = outputLabels(y_data)

In [5]:
def make_meshgrid(x, y, h=.02):
    """Build a 2-D coordinate mesh that covers ``x`` and ``y`` with a margin of 1.

    Parameters
    ----------
    x, y : array-like exposing ``.min()`` and ``.max()``
        Values the mesh's horizontal and vertical extents are based on.
    h : float, optional
        Step between neighbouring mesh points (default 0.02).

    Returns
    -------
    tuple
        ``(xx, yy)`` as produced by ``np.meshgrid``.
    """
    horizontal = np.arange(x.min() - 1, x.max() + 1, h)
    vertical = np.arange(y.min() - 1, y.max() + 1, h)
    grid_x, grid_y = np.meshgrid(horizontal, vertical)
    return grid_x, grid_y

In [6]:
def plot_contours(ax, clf, xx, yy, **params):
    """Draw a classifier's predictions over a mesh as filled contours.

    Parameters
    ----------
    ax : object exposing ``contourf``
        Axes to draw on.
    clf : object exposing ``predict``
        Called once with an ``(n_points, 2)`` array of mesh coordinates.
    xx, yy : ndarray
        Mesh coordinate arrays of identical shape.
    **params
        Forwarded unchanged to ``ax.contourf``.

    Returns
    -------
    Whatever ``ax.contourf`` returns.
    """
    # Flatten the mesh into one (x, y) row per grid point for prediction.
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    # Fold the flat predictions back into the mesh's shape for contouring.
    surface = clf.predict(grid_points).reshape(xx.shape)
    return ax.contourf(xx, yy, surface, **params)

In [7]:
def separate_axes(X0, X1, pred):
    """Split paired coordinates into two groups according to ``pred``.

    Points whose ``pred`` entry equals 0 go into the first group; every other
    point goes into the second.

    Parameters
    ----------
    X0, X1 : indexable sequences
        First and second coordinate of each point.
    pred : indexable sequence
        One label per point.

    Returns
    -------
    tuple of four lists
        ``(class0_x, class0_y, class1_x, class1_y)``.

    Raises
    ------
    ValueError
        If the three inputs do not all have the same length.
    """
    if not (len(X0) == len(X1) == len(pred)):
        raise ValueError('unequal feature-label lengths')

    zero_group = ([], [])
    other_group = ([], [])
    for idx in range(len(pred)):
        xs, ys = zero_group if pred[idx] == 0 else other_group
        xs.append(X0[idx])
        ys.append(X1[idx])
    return zero_group[0], zero_group[1], other_group[0], other_group[1]

In [8]:
def generate_svm_plot(X, y, pred, classifier, name):
    """Show a decision-surface figure for a classifier over the first two columns of ``X``.

    The classifier's predictions over a mesh spanning ``X[:, 0]`` and
    ``X[:, 1]`` are drawn as filled contours; the samples are scattered on
    top, grouped by whether their entry in ``y`` equals 0.

    Parameters
    ----------
    X : 2-D array supporting ``X[:, 0]`` / ``X[:, 1]`` indexing
    y : sequence of labels, one per row of ``X``
    pred : accepted for interface compatibility; not read by this function
    classifier : object whose ``predict`` accepts two-column input
    name : str
        Inserted into the plot title.
    """
    # Only the axes are needed; the figure handle is not used.
    _, axes = plt.subplots()

    plot_title = ('Decision surface of ' + name + ' SVC ')

    # Mesh over the two plotted features, then the predicted surface on it.
    first_feature = X[:, 0]
    second_feature = X[:, 1]
    grid_x, grid_y = make_meshgrid(first_feature, second_feature)
    plot_contours(axes, classifier, grid_x, grid_y, cmap=plt.cm.coolwarm, alpha=0.8)

    # Overlay the samples, split by label value.
    zero_x, zero_y, rest_x, rest_y = separate_axes(first_feature, second_feature, y)
    axes.scatter(zero_x, zero_y, c='b', marker="o", label='class 0', s=20)
    axes.scatter(rest_x, rest_y, c='r', marker="s", label='class 1', s=20)

    axes.set_ylabel('feature 2')
    axes.set_xlabel('feature 1')
    axes.set_xticks(())
    axes.set_yticks(())
    axes.set_title(plot_title)
    axes.legend()
    plt.show()

In [9]:
# Kernel names accepted by svm().
valid_kernel_strs = {'linear', 'rbf', 'poly'}
def svm(x_train, y_arr, x_test, kernel_str, deg=0):
    """Fit an SVC on the training data and predict on both train and test inputs.

    Parameters
    ----------
    x_train : array-like
        Training features.
    y_arr : array-like
        Training labels.
    x_test : array-like
        Features to predict after fitting.
    kernel_str : str
        One of ``'linear'``, ``'rbf'`` or ``'poly'``.
    deg : int, optional
        Passed to ``SVC(degree=...)``; must be non-zero when ``kernel_str`` is
        ``'poly'``.

    Returns
    -------
    tuple
        ``(weights, bias, train_pred, test_pred, classifier)`` where
        ``weights`` is ``dual_coef_ @ support_vectors_`` and ``bias`` is
        ``intercept_`` of the fitted classifier.

    Raises
    ------
    ValueError
        For an unknown kernel name, or a ``'poly'`` kernel with ``deg == 0``.
    """
    if kernel_str not in valid_kernel_strs:
        raise ValueError('Invalid kernel string provided')
    if kernel_str == 'poly' and deg == 0:
        raise ValueError('Need to provide the degree for a polynomial kernel')

    model = SVC(kernel=kernel_str, degree=deg)
    model.fit(x_train, y_arr)

    # NOTE(review): this product is presumably meant as the primal weight
    # vector, which is only meaningful for the linear kernel — confirm usage.
    weight_matrix = np.matmul(model.dual_coef_, model.support_vectors_)
    intercept = model.intercept_

    fitted_on_train = model.predict(x_train)
    fitted_on_test = model.predict(x_test)
    return weight_matrix, intercept, fitted_on_train, fitted_on_test, model

In [10]:
# Load the Spotify track-feature table from a CSV file in the working directory.
data = pd.read_csv('SpotifyFeatures.csv')
# Bare last expression: the notebook renders the frame as this cell's output.
data

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.61100,0.389,99373,0.9100,0.000000,C#,0.3460,-1.828,Major,0.0525,166.969,4/4,0.8140
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.24600,0.590,137373,0.7370,0.000000,F#,0.1510,-5.559,Minor,0.0868,174.003,4/4,0.8160
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.95200,0.663,170267,0.1310,0.000000,C,0.1030,-13.879,Minor,0.0362,99.488,5/4,0.3680
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.70300,0.240,152427,0.3260,0.000000,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.2270
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95000,0.331,82625,0.2250,0.123000,F,0.2020,-21.150,Major,0.0456,140.576,4/4,0.3900
5,Movie,Henri Salvador,Le petit souper aux chandelles,0Mf1jKa8eNAf1a4PwTbizj,0,0.74900,0.578,160627,0.0948,0.000000,C#,0.1070,-14.970,Major,0.1430,87.479,4/4,0.3580
6,Movie,Martin & les fées,"Premières recherches (par Paul Ventimila, Lori...",0NUiKYRd6jt1LKMYGkUdnZ,2,0.34400,0.703,212293,0.2700,0.000000,C#,0.1050,-12.675,Major,0.9530,82.873,4/4,0.5330
7,Movie,Laura Mayne,Let Me Let Go,0PbIF9YVD505GutwotpB5C,15,0.93900,0.416,240067,0.2690,0.000000,F#,0.1130,-8.949,Major,0.0286,96.827,4/4,0.2740
8,Movie,Chorus,Helka,0ST6uPfvaPpJLtQwhE6KfC,0,0.00104,0.734,226200,0.4810,0.000860,C,0.0765,-7.725,Major,0.0460,125.080,4/4,0.7650
9,Movie,Le Club des Juniors,Les bisous des bisounours,0VSqZ3KStsjcfERGdcWpFO,10,0.31900,0.598,152694,0.7050,0.001250,G,0.3490,-7.790,Major,0.0281,137.496,4/4,0.7180


In [11]:
# Remove the columns that are not fed to the model: the text/identifier
# columns, the string-valued key/mode/time_signature columns, and popularity.
# A single drop() call removes them all at once, so a missing column raises
# before `data` is rebound instead of leaving it partially modified.
data = data.drop(
    columns=[
        'artist_name',
        'track_id',
        'track_name',
        'key',
        'mode',
        'time_signature',
        'popularity',
    ]
)
data

Unnamed: 0,genre,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
0,Movie,0.61100,0.389,99373,0.9100,0.000000,0.3460,-1.828,0.0525,166.969,0.8140
1,Movie,0.24600,0.590,137373,0.7370,0.000000,0.1510,-5.559,0.0868,174.003,0.8160
2,Movie,0.95200,0.663,170267,0.1310,0.000000,0.1030,-13.879,0.0362,99.488,0.3680
3,Movie,0.70300,0.240,152427,0.3260,0.000000,0.0985,-12.178,0.0395,171.758,0.2270
4,Movie,0.95000,0.331,82625,0.2250,0.123000,0.2020,-21.150,0.0456,140.576,0.3900
5,Movie,0.74900,0.578,160627,0.0948,0.000000,0.1070,-14.970,0.1430,87.479,0.3580
6,Movie,0.34400,0.703,212293,0.2700,0.000000,0.1050,-12.675,0.9530,82.873,0.5330
7,Movie,0.93900,0.416,240067,0.2690,0.000000,0.1130,-8.949,0.0286,96.827,0.2740
8,Movie,0.00104,0.734,226200,0.4810,0.000860,0.0765,-7.725,0.0460,125.080,0.7650
9,Movie,0.31900,0.598,152694,0.7050,0.001250,0.3490,-7.790,0.0281,137.496,0.7180


In [12]:
def pluckRows(dataframe, numrows, genres=None):
    """Return the first ``numrows`` rows of each selected genre, stacked in genre order.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``'genre'`` column.
    numrows : int
        Maximum number of rows kept per genre (passed to ``DataFrame.head``).
    genres : iterable of str, optional
        Genres to keep, in output order. Defaults to
        ``['Hip-Hop', 'Soul', 'Classical', 'Blues', 'Rock']``.

    Returns
    -------
    pandas.DataFrame
        Same columns as ``dataframe`` with a fresh ``0..n-1`` index. Empty
        (but with the input's columns) when no row matches any genre.
    """
    if genres is None:
        genres = ['Hip-Hop', 'Soul', 'Classical', 'Blues', 'Rock']

    # Collect the per-genre slices and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame inside a loop is quadratic.
    pieces = []
    for genre in genres:
        subset = dataframe.loc[dataframe['genre'] == genre].head(numrows)
        # Skip genres with no rows so empty slices never take part in the concat.
        if not subset.empty:
            pieces.append(subset)

    if not pieces:
        # Nothing matched: return an empty frame that keeps the input's columns.
        return dataframe.iloc[0:0].reset_index(drop=True)
    return pd.concat(pieces, ignore_index=True)
# Keep at most 1000 rows for each genre selected by pluckRows; `data` is
# rebound to the reduced frame.
data = pluckRows(data, 1000)

In [13]:
# movies = data.loc[data['genre'] == 'Movie'].head(20)
# soul = data.loc[data['genre'] == 'Soul'].head(20)
# movies = movies.append(soul)
# movies.reset_index(drop=True)
# #genreNames = dataframe['genre'].unique()
# cols = list(data.columns.values)
# df = pd.DataFrame(columns=cols)
# df
# Candidate genre names. A comma was missing between 'Pop' and 'Jazz', which
# made Python concatenate the adjacent literals into a single 'PopJazz' entry.
genreNames = ['R&B', 'Alternative', 'Country', 'Electronic', 'Blues', 'Rap', 'Indie', 'Classical', 'Pop', 'Jazz', 'Rock', 'Hip-Hop', 'Folk', 'Soul']

In [14]:
# Extract the genre column; later cells derive the classification target `y` from it.
labels = data['genre']

In [15]:
# Encode the genre names with a LabelEncoder.
encoder = LabelEncoder()
y = encoder.fit_transform(labels)
# NOTE(review): the next line rebinds y to the raw genre strings, discarding
# the encoded array produced above, so later cells train and report on genre
# names rather than integer codes. Confirm which of the two is intended.
y = labels

In [16]:
# Copy the labels into a plain Python list.
y = outputLabels(y)

In [17]:
# Standardise every column after the first one (presumably the genre column —
# verify against the frame's column order) as floats.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling; consider fitting it on the
# training split only.
scaler = StandardScaler()
X = scaler.fit_transform(np.array(data.iloc[:, 1:], dtype = float))
X

array([[-0.69332528,  1.48486765, -0.21459139, ...,  1.85637726,
         0.22993335,  0.02002281],
       [-0.39491592,  1.75698584, -0.52001505, ..., -0.01088529,
        -1.02406058,  0.34402427],
       [-0.99037823,  1.48486765,  0.47831915, ..., -0.05618276,
         0.95589683, -0.7359806 ],
       ...,
       [-0.57396153, -0.13214237,  0.60704739, ..., -0.59773923,
        -1.26184433, -0.61198004],
       [ 1.16766399,  0.286501  , -1.08140834, ..., -0.60679872,
        -0.11545067,  0.23602378],
       [-1.09387202, -0.18447279, -0.30577889, ...,  0.65851072,
         1.4849508 , -0.89998134]])

In [18]:
# Hold out 20% of the samples for testing. A fixed random_state makes the
# shuffle, and therefore every downstream metric, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Fit an SVM with the 'rbf' kernel. The trailing 0 is the degree argument,
# which svm() only validates for the 'poly' kernel.
weights, bias, train_pred, test_pred, classifier = svm(X_train, y_train, X_test,'rbf', 0)



In [20]:
# Classification accuracy on the training data, as a percentage string.
classification_accuracy(train_pred, outputLabels(y_train))

'69.72500000000001%'

In [21]:
# Classification accuracy on the held-out test data, as a percentage string.
classification_accuracy(test_pred, outputLabels(y_test))

'63.800000000000004%'

In [22]:
# Per-class report for the test split (true labels first, predictions second).
print(classification_report(y_test, test_pred))

              precision    recall  f1-score   support

       Blues       0.51      0.47      0.49       201
   Classical       0.95      0.98      0.97       185
     Hip-Hop       0.80      0.74      0.77       202
        Rock       0.54      0.50      0.52       214
        Soul       0.44      0.53      0.48       198

   micro avg       0.64      0.64      0.64      1000
   macro avg       0.65      0.64      0.65      1000
weighted avg       0.64      0.64      0.64      1000



In [25]:
# Overall test accuracy via scikit-learn; normalize=True reports it as a
# fraction rather than a count.
accuracy_score(y_test, test_pred, normalize=True, sample_weight=None)

0.638

In [27]:
# NOTE(review): despite their names, train_error/test_error hold the values
# returned by classifier.score, which the prints below label as accuracy —
# they are not error rates.
train_error = classifier.score(X_train, y_train)
test_error = classifier.score(X_test, y_test)

print('\tTrain accuracy: %s' % str(train_error))
print('\tTest accuracy: %s' % str(test_error))

	Train accuracy: 0.69725
	Test accuracy: 0.638
