In [1]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold

#  Saving Features as pandas dataframe

In [2]:
df = pd.read_csv('extracted_feature_values_midi.csv')

In [3]:
cluster_mood_filename = df.cluster_mood_filename
temp = []
cluster = []
mood = []
filename = []

for i in cluster_mood_filename:
    temp.append(i)
temp = [i.split('\\') for i in temp]

for path in temp:
    cluster.append(path[-3])
    mood.append(path[-2])
    filename.append(path[-1].split('.')[0])

df['cluster'] = cluster
df['mood'] = mood
df['filename'] = filename

cols = [col for col in df.columns]
xcols = cols[1:-3]
ycol = cols[-3]
print(ycol)

cluster


#  Applying label encoder and splitting the dataset into train and test datasets

In [4]:
le = preprocessing.LabelEncoder()
X = df[xcols]
X = X.fillna(0)
Y = df[ycol]
Y = pd.DataFrame(Y)
Y = Y.apply(lambda col: le.fit_transform(col.astype(str)), axis=0, result_type='expand')
files = df['filename']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
# print(type(df))
# print(type(X_train))
# print(type(Y_train))
# print(type(xcols))
# print(type(ycol))
# print(X.isnull().sum())
# print(Y.isnull().sum())
# print(X.shape)
# X_nonan = X.fillna(0)
# print(X_nonan.isnull().values.any())
# print(X_nonan.shape)
print(X_train.shape)
print(Y_train.shape)
X_train.head()


(156, 1495)
(156, 1)


Unnamed: 0,Basic_Pitch_Histogram_0,Basic_Pitch_Histogram_1,Basic_Pitch_Histogram_2,Basic_Pitch_Histogram_3,Basic_Pitch_Histogram_4,Basic_Pitch_Histogram_5,Basic_Pitch_Histogram_6,Basic_Pitch_Histogram_7,Basic_Pitch_Histogram_8,Basic_Pitch_Histogram_9,...,Parallel_Motion,Similar_Motion,Contrary_Motion,Oblique_Motion,Parallel_Fifths,Parallel_Octaves,Dynamic_Range,Variation_of_Dynamics,Variation_of_Dynamics_In_Each_Voice,Average_Note_to_Note_Change_in_Dynamics
127,0.0,0,0,0,0,0,0,0,0,0,...,0.03535,0.3178,0.03098,0.6159,0.00656,0.01166,62,20.9,1.954,1.086
120,0.0,0,0,0,0,0,0,0,0,0,...,0.03118,0.3344,0.06274,0.5717,0.000779,0.01208,120,21.41,10.51,12.07
98,0.0,0,0,0,0,0,0,0,0,0,...,0.03947,0.2562,0.04064,0.6637,0.003491,0.01067,117,12.69,10.87,5.132
173,0.0,0,0,0,0,0,0,0,0,0,...,0.2202,0.1361,0.01376,0.63,0.04893,0.09939,57,17.01,8.72,6.425
152,0.0,0,0,0,0,0,0,0,0,0,...,0.08721,0.4893,0.09908,0.3244,0.003054,0.0436,115,20.59,7.99,10.84


#  Normalizing the dataset for improved svm performance

In [5]:
#Normalizing using standard scalor 
normalized_X = X.values
std_scalor = preprocessing.StandardScaler()
X_scaled = std_scalor.fit_transform(normalized_X)
normalized_X = pd.DataFrame(X_scaled, columns=xcols)
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(normalized_X, Y, test_size=0.2)
normalized_X.head()

Unnamed: 0,Basic_Pitch_Histogram_0,Basic_Pitch_Histogram_1,Basic_Pitch_Histogram_2,Basic_Pitch_Histogram_3,Basic_Pitch_Histogram_4,Basic_Pitch_Histogram_5,Basic_Pitch_Histogram_6,Basic_Pitch_Histogram_7,Basic_Pitch_Histogram_8,Basic_Pitch_Histogram_9,...,Parallel_Motion,Similar_Motion,Contrary_Motion,Oblique_Motion,Parallel_Fifths,Parallel_Octaves,Dynamic_Range,Variation_of_Dynamics,Variation_of_Dynamics_In_Each_Voice,Average_Note_to_Note_Change_in_Dynamics
0,-0.071611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.029958,-0.288967,0.402916,-0.8707,0.234466,1.133501,0.43265,-0.153625,-0.31814,-0.272365
1,-0.071611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.322842,-0.966963,-1.480627,1.640473,0.192266,-0.43225,-0.930685,1.625977,-1.484887,-1.25805
2,-0.071611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.763202,-0.735132,7.522922,-1.031701,-0.492572,-0.698894,3.295654,1.931535,3.223318,0.950938
3,-0.071611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.420798,-0.860234,0.228022,1.095548,-0.463812,-0.598059,-0.555768,0.57572,-0.35529,-0.26476
4,-0.071611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.18734,-0.771875,-0.317818,0.968059,-0.413584,0.121219,0.3304,-2.150035,-1.363933,-1.262548


# SVM classification on original dataset without feature selection

In [None]:
%%time
#clf = LinearSVC(max_iter = 10000000, verbose = 1)
clf = svm.SVC(C=1, kernel='linear', verbose = True)
clf.fit(X_train, Y_train.values.ravel())
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=20)
#cv = StratifiedKFold(5)
#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=20, random_state=1)
train_accuracy_svm = cross_val_score(clf, X_train, Y_train.values.ravel(), cv=cv, n_jobs=15).mean()
test_accuracy_svm = clf.score(X_test, Y_test.values.ravel())

print("Training accuracy " + str(train_accuracy_svm))
print("Test accuracy " + str(test_accuracy_svm))

# SVM classification on normalized dataset without feature selection

In [6]:
%%time
clf = svm.SVC(C=1, kernel='linear', verbose = True)
clf.fit(X_train3, Y_train3.values.ravel())
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=20)
#cv = StratifiedKFold(5)
#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=20, random_state=1)
train_accuracy_svm_norm = cross_val_score(clf, X_train3, Y_train3.values.ravel(), cv=cv, n_jobs=15).mean()
test_accuracy_svm_norm = clf.score(X_test3, Y_test3.values.ravel())

print("Training accuracy " + str(train_accuracy_svm_norm))
print("Test accuracy " + str(test_accuracy_svm_norm))

[LibSVM]Training accuracy 0.3516666666666666
Test accuracy 0.35
Wall time: 4.7 s


#  Feature Selection on Midi features

Methods to employ -<br> 1) Tree based feature selection(Random Forests) <br>
                    2) Recursive feature elimination with cross validation using SVM estimators <br>
                    3) Anova F-measures <br>
                    4) Chi-squared <br>
                    5) Principal component analysis<br>
We will apply each of them individually, evaluate performance, and choose a combination
Principal component analysis should be performed as the dimensions of the feature vectore(1495) is more than the number of samples(196) <br>

**study showed svm classification based on midi features had a test accuracy of 35%**

https://www.kaggle.com/arrohit/feature-selection-and-hypertuning-svm  **tree based feature selection**

In [7]:
%%time
rf_clf = RandomForestClassifier()
rf_clf = rf_clf.fit(X_train3, Y_train3.values.ravel())
model = SelectFromModel(rf_clf, prefit=True)

rf_X_train = X_train3.loc[:, model.get_support()]
rf_X_test = X_test3.loc[:, model.get_support()]
print(type(rf_X_train))
print(rf_X_train.shape)
rf_X_train.head()

<class 'pandas.core.frame.DataFrame'>
(156, 529)
Wall time: 193 ms


Unnamed: 0,Basic_Pitch_Histogram_27,Basic_Pitch_Histogram_30,Basic_Pitch_Histogram_31,Basic_Pitch_Histogram_32,Basic_Pitch_Histogram_33,Basic_Pitch_Histogram_34,Basic_Pitch_Histogram_35,Basic_Pitch_Histogram_36,Basic_Pitch_Histogram_38,Basic_Pitch_Histogram_40,...,Voice_Overlap,Variability_of_Voice_Separation,Parallel_Motion,Similar_Motion,Contrary_Motion,Parallel_Fifths,Parallel_Octaves,Dynamic_Range,Variation_of_Dynamics,Variation_of_Dynamics_In_Each_Voice
64,-0.17125,0.564609,-0.440096,0.187697,0.871298,-0.411287,-0.247255,-0.664195,-0.503504,-0.104193,...,0.194041,0.86588,-0.280059,0.862315,-0.021151,-0.326064,-0.371885,-0.214934,-0.067639,-0.179504
38,-0.17125,-0.406188,-0.440096,-0.309868,-0.616224,-0.411287,-0.512604,-0.690407,1.018151,1.220252,...,-0.164122,-0.492608,0.307088,-0.461309,0.479221,-0.129466,0.243557,0.500817,0.058269,-0.0583
156,-0.17125,-0.406188,-0.440096,3.155013,-0.616224,2.128462,-0.512604,1.681784,-0.63899,-0.591828,...,0.565545,-1.094971,3.77465,-1.451622,-1.480627,3.363256,2.031839,0.773484,0.115081,-1.071302
84,-0.17125,-0.231277,0.147956,-0.309868,-0.594883,0.541184,0.670071,1.228198,0.080004,-0.13716,...,0.656881,0.152808,-0.290017,0.930552,0.016438,-0.221802,-0.445824,-0.283101,-0.159767,0.198011
46,-0.17125,-0.406188,-0.077672,-0.309868,0.047181,-0.411287,-0.052905,0.174378,1.276888,1.711734,...,0.548098,0.797071,-0.516689,1.416959,0.226446,-0.463078,-0.365991,0.500817,-1.451092,-0.527755


In [8]:
%%time
clf = svm.SVC(C=1, kernel='linear', verbose = True)
clf.fit(rf_X_train, Y_train3.values.ravel())
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=20)
#cv = StratifiedKFold(5)
#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=20, random_state=1)
train_accuracy_svm_rf = cross_val_score(clf, rf_X_train, Y_train3.values.ravel(), cv=cv, n_jobs=15).mean()
test_accuracy_svm_rf = clf.score(rf_X_test, Y_test3.values.ravel())

print("Training accuracy " + str(train_accuracy_svm_rf))
print("Test accuracy " + str(test_accuracy_svm_rf))

[LibSVM]Training accuracy 0.41010416666666666
Test accuracy 0.3
Wall time: 758 ms


https://machinelearningmastery.com/rfe-feature-selection-in-python/ **RFECV**

In [9]:
%%time
clf = svm.SVC(C=1, kernel='linear', verbose = True)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=20)
#cv = StratifiedKFold(5)
rfecv = RFECV(estimator = clf, step = 1, cv = cv, scoring='accuracy', verbose=1, n_jobs=15 )
rfetrain=rfecv.fit(X_train3, Y_train3.values.ravel())
print('Optimal number of features :', rfecv.n_features_)

Fitting estimator with 1495 features.
[LibSVM]Fitting estimator with 1494 features.
[LibSVM]Fitting estimator with 1493 features.
[LibSVM]Fitting estimator with 1492 features.
[LibSVM]Fitting estimator with 1491 features.
[LibSVM]Fitting estimator with 1490 features.
[LibSVM]Fitting estimator with 1489 features.
[LibSVM]Fitting estimator with 1488 features.
[LibSVM]Fitting estimator with 1487 features.
[LibSVM]Fitting estimator with 1486 features.
[LibSVM]Fitting estimator with 1485 features.
[LibSVM]Fitting estimator with 1484 features.
[LibSVM]Fitting estimator with 1483 features.
[LibSVM]Fitting estimator with 1482 features.
[LibSVM]Fitting estimator with 1481 features.
[LibSVM]Fitting estimator with 1480 features.
[LibSVM]Fitting estimator with 1479 features.
[LibSVM]Fitting estimator with 1478 features.
[LibSVM]Fitting estimator with 1477 features.
[LibSVM]Fitting estimator with 1476 features.
[LibSVM]Fitting estimator with 1475 features.
[LibSVM]Fitting estimator with 1474 featur

In [10]:
%%time
rfe = RFE(estimator=clf, n_features_to_select=rfecv.n_features_, step=1, verbose=1)
rfe = rfe.fit(X_train3, Y_train3.values.ravel())

rfe_X_train = X_train3.loc[:, rfe.get_support()]
rfe_X_test = X_test3.loc[:, rfe.get_support()]

Fitting estimator with 1495 features.
[LibSVM]Fitting estimator with 1494 features.
[LibSVM]Fitting estimator with 1493 features.
[LibSVM]Fitting estimator with 1492 features.
[LibSVM]Fitting estimator with 1491 features.
[LibSVM]Fitting estimator with 1490 features.
[LibSVM]Fitting estimator with 1489 features.
[LibSVM]Fitting estimator with 1488 features.
[LibSVM]Fitting estimator with 1487 features.
[LibSVM]Fitting estimator with 1486 features.
[LibSVM]Fitting estimator with 1485 features.
[LibSVM]Fitting estimator with 1484 features.
[LibSVM]Fitting estimator with 1483 features.
[LibSVM]Fitting estimator with 1482 features.
[LibSVM]Fitting estimator with 1481 features.
[LibSVM]Fitting estimator with 1480 features.
[LibSVM]Fitting estimator with 1479 features.
[LibSVM]Fitting estimator with 1478 features.
[LibSVM]Fitting estimator with 1477 features.
[LibSVM]Fitting estimator with 1476 features.
[LibSVM]Fitting estimator with 1475 features.
[LibSVM]Fitting estimator with 1474 featur

In [11]:
%%time
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=20)
#cv = StratifiedKFold(5)
train_accuracy_svm_rfe = cross_val_score(estimator=clf,X=rfe_X_train,y=Y_train3.values.ravel(), cv=cv, n_jobs=15, verbose=1).mean()
print("Training accuracy " + str(train_accuracy_svm_rfe))
clf = clf.fit(rfe_X_train, Y_train3.values.ravel())
test_accuracy_svm_rfe = clf.score(rfe_X_test, Y_test3.values.ravel())
print("Test accuracy " + str(test_accuracy_svm_rfe))

[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:    0.1s


Training accuracy 0.8264583333333334
[LibSVM]Test accuracy 0.375
Wall time: 832 ms


[Parallel(n_jobs=15)]: Done 200 out of 200 | elapsed:    0.7s finished


https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/ **ANOVA F measure**