In [15]:
import numpy as np
import pandas as pd

# for PCA
from sklearn.decomposition import PCA

# for SVM
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.multioutput import ClassifierChain, MultiOutputClassifier
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.metrics import average_precision_score#, average_recall_score, average_f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer

# save models
import joblib
from joblib import dump
import shutil

In [2]:
# define path where the training, validation and test data is
root_path = ""

In [3]:
# load relevant training data
X_train = np.load("X_train2.npy")
y_train = np.load("y_train2.npy")

# load relevant training data
X_val = np.load("X_val2.npy")
y_val = np.load("y_val2.npy")

# load relevant test data
X_test = np.load("X_test2.npy")
y_test = np.load("y_test2.npy")

In [4]:
# check shape of X
print("Shape of X_train: ", X_train.shape, "\n")
print("Shape of X_val: ", X_val.shape, "\n")
print("Shape of X_test: ", X_test.shape, "\n")

# check shape of Y
print("Shape of y_train: ", y_train.shape, "\n")
print("Shape of y_val: ", y_val.shape, "\n")
print("Shape of y_test: ", y_test.shape, "\n")

Shape of X_train:  (4239, 1292, 20, 1) 

Shape of X_val:  (1413, 1292, 20, 1) 

Shape of X_test:  (1413, 1292, 20, 1) 

Shape of y_train:  (4239, 10) 

Shape of y_val:  (1413, 10) 

Shape of y_test:  (1413, 10) 



### Reshape features according to the model

In [5]:
# append validation data to training data because grid-search with cv is used
X_train = np.append(X_train, X_val, axis=0)
y_train = np.append(y_train, y_val, axis=0)

In [6]:
# check that append worked
print(X_train.shape)
print(y_train.shape)

(5652, 1292, 20, 1)
(5652, 10)


In [7]:
# remove channel dimension of MFCCs by reshaping features
X_train = X_train.reshape((X_train.shape[0], -1))
X_test = X_test.reshape((X_test.shape[0], -1))

In [8]:
# check that removal of channel dimension worked
print(X_train.shape)
print(X_test.shape)

(5652, 25840)
(1413, 25840)


### PCA to reduce number of features

In [9]:
# perform PCA on X to reduce amount of features
# reduce to dimensionality so that 70% of variance is kept
pca = PCA(n_components=0.7, random_state=42)
X_pca_train = pca.fit_transform(X_train)

In [10]:
# check how many principal components are needed
pca.n_components_

305

In [11]:
# apply the trained PCA model on the test set as well
X_pca_test = pca.transform(X_test)

In [12]:
# change to original variable convention
X_train = X_pca_train
X_test = X_pca_test

In [13]:
# check that PCA worked
print(X_train.shape)
print(X_test.shape)

(5652, 305)
(1413, 305)


Performance of linear SVM

In [16]:
# define the linear SVM model
svm = SVC(kernel='linear', class_weight='balanced', random_state=42)
multi_svm = ClassifierChain(svm)

# define the hyperparameter grid
param_grid = {
    'base_estimator__C':[2**-5, 2**-3, 2**-1, 2],
}

# define the scoring metric
scorer = make_scorer(roc_auc_score, multi_class='ovr')

# define the grid search
grid = GridSearchCV(
    multi_svm,
    param_grid,
    # cv=5,
    cv=ShuffleSplit(test_size=0.25, n_splits=1),
    scoring=scorer,
    refit=True,
    return_train_score=True,
    verbose=3)

In [17]:
# train the SVMs
grid.fit(X_train, y_train)

Fitting 1 folds for each of 4 candidates, totalling 4 fits
[CV 1/1] END base_estimator__C=0.03125;, score=(train=0.650, test=0.561) total time=  20.5s
[CV 1/1] END base_estimator__C=0.125;, score=(train=0.637, test=0.554) total time=  23.4s
[CV 1/1] END base_estimator__C=0.5;, score=(train=0.640, test=0.550) total time=  42.6s
[CV 1/1] END base_estimator__C=2;, score=(train=0.646, test=0.550) total time= 2.1min


In [19]:
# check performance of the models
pd.concat([pd.DataFrame(grid.cv_results_["params"]),pd.DataFrame(grid.cv_results_["mean_train_score"], columns=["Training Mean AUC"]),pd.DataFrame(grid.cv_results_["mean_test_score"], columns=["Validation Mean AUC"])], axis=1)

Unnamed: 0,base_estimator__C,Training Mean AUC,Validation Mean AUC
0,0.03125,0.649891,0.560581
1,0.125,0.636601,0.55402
2,0.5,0.640247,0.549721
3,2.0,0.645924,0.549617


In [20]:
# print the best hyperparameters
print('Best Hyperparameters: ', grid.best_params_)

Best Hyperparameters:  {'base_estimator__C': 0.03125}


In [21]:
# save the best model
# dump(grid, 'svm_linear.joblib') # without zip
# shutil.make_archive('svm_linear', 'zip', '.', 'svm_linear.joblib') # with zip

In [22]:
# load the model if already trained
# grid = joblib.load('svm_linear.joblib')

In [23]:
# make predictions on the test set
y_pred = grid.predict(X_test)

In [24]:
# check predictions 
y_pred[:5]

array([[0., 0., 0., 1., 1., 0., 1., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 1., 0., 0., 0., 1., 0., 0.]])

In [25]:
# get results from the test set
auc_score = roc_auc_score(y_test, y_pred)

In [26]:
# print the results
print("AUC: :", auc_score)

AUC: : 0.5565569688041891


In [28]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.13      0.59      0.21        96
           1       0.33      0.68      0.45       259
           2       0.29      0.25      0.27       334
           3       0.03      0.17      0.05        40
           4       0.18      0.35      0.23       179
           5       0.13      0.05      0.07       193
           6       0.11      0.31      0.16       132
           7       0.30      0.37      0.33       268
           8       0.29      0.05      0.08       150
           9       0.10      0.31      0.16       132

   micro avg       0.19      0.33      0.24      1783
   macro avg       0.19      0.31      0.20      1783
weighted avg       0.23      0.33      0.24      1783
 samples avg       0.22      0.33      0.24      1783



### Performance of RBF kernel

In [31]:
# define the SVM model with RBF kernel
svm = SVC(kernel='rbf', class_weight='balanced', random_state=42)
multi_svm = ClassifierChain(svm)

# define the hyperparameter grid
param_grid = {
    # 'base_estimator__C':[2**-5, 2**-3, 2**-1, 2],
    'base_estimator__C':[2**-5, 2**-3],
    'base_estimator__gamma':[2**-15,2**-9,2**-3,2**-1]
}

# define the scoring metric
scorer = make_scorer(roc_auc_score, multi_class='ovr')

# define the grid search
grid = GridSearchCV(
    multi_svm,
    param_grid,
    # cv=5,
    cv=ShuffleSplit(test_size=0.25, n_splits=1),
    scoring=scorer,
    refit=True,
    return_train_score=True,
    verbose=3)

In [None]:
# train the SVMs
grid.fit(X_train, y_train)

Fitting 1 folds for each of 16 candidates, totalling 16 fits
[CV 1/1] END base_estimator__C=0.03125, base_estimator__gamma=3.0517578125e-05;, score=(train=0.500, test=0.500) total time=  41.5s
[CV 1/1] END base_estimator__C=0.03125, base_estimator__gamma=0.001953125;, score=(train=0.618, test=0.625) total time=  37.7s
[CV 1/1] END base_estimator__C=0.03125, base_estimator__gamma=0.125;, score=(train=0.500, test=0.500) total time=  35.6s
[CV 1/1] END base_estimator__C=0.03125, base_estimator__gamma=0.5;, score=(train=0.500, test=0.500) total time=  34.8s
[CV 1/1] END base_estimator__C=0.125, base_estimator__gamma=3.0517578125e-05;, score=(train=0.527, test=0.527) total time=  36.6s


In [None]:
# check the models
pd.concat([pd.DataFrame(grid.cv_results_["params"]),pd.DataFrame(grid.cv_results_["mean_train_score"], columns=["Training Mean AUC"]),pd.DataFrame(grid.cv_results_["mean_test_score"], columns=["Validation Mean AUC"])], axis=1)

In [None]:
# print the best hyperparameters
print('Best Hyperparameters: ', grid.best_params_)

In [None]:
# save the best model
# dump(grid, 'svm_linear.joblib') # without zip
# shutil.make_archive('svm_linear', 'zip', '.', 'svm_linear.joblib') # with zip

In [16]:
# load the model if already trained
# grid = joblib.load('svm_linear.joblib')

In [76]:
# make predictions on the test set
y_pred = grid.predict(X_test)

In [82]:
# check predictions 
y_pred[:5]

array([4, 9, 7, ..., 4, 1, 5])

In [79]:
# get results from the test set
auc_score = roc_auc_score(y_test, y_pred)

AxisError: axis 1 is out of bounds for array of dimension 1

In [74]:
# print the results
print("AUC: :", auc_score)

AUC: : 0.6182190633495556


In [36]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.11      0.53      0.19       361
           1       0.34      0.77      0.47      1088
           2       0.30      0.75      0.43      1314
           3       0.06      0.30      0.09       176
           4       0.19      0.43      0.26       692
           5       0.18      0.43      0.26       716
           6       0.11      0.82      0.20       497
           7       0.34      0.68      0.45      1081
           8       0.18      0.58      0.28       599
           9       0.13      0.68      0.22       561

   micro avg       0.20      0.64      0.31      7085
   macro avg       0.19      0.60      0.29      7085
weighted avg       0.24      0.64      0.34      7085
 samples avg       0.21      0.64      0.30      7085



Appendix

In [None]:
# compute confusion matrix for each label
# cm = multilabel_confusion_matrix(y_test, y_pred)
# print("These are the confusion matrices for all labels: \n")

# for i in range(cm.shape[0]):
#     print(f'Confusion matrix for label {i+1}: \n', cm[i])