In [None]:
# Colab-only: mount Google Drive at /content/gdrive; the feature and label
# pickles loaded below are read from underneath this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier


# Locations of the pickled MFCC features and of their labels on the mounted Drive.
path_data = r'/content/gdrive/MyDrive/Uni/ML-Project/Data/mfcc/mfccaudio_mfcc_features_40_all_20Sec_cleaned.pkl'
path_label = r'/content/gdrive/MyDrive/Uni/ML-Project/Data/mfcc/mfccaudio_mfcc_features_40_all_20Sec_labels_cleaned.pkl'
# NOTE(review): read_pickle unpickles arbitrary Python objects; only load
# files that come from a trusted source.
data = pd.read_pickle(path_data)

# Data extraction
def data_extract(d):
    """Summarise every sample by its column-wise mean and variance.

    Each element of ``d`` is reduced along axis 0 twice (mean, then
    variance) and the two results are flattened and joined into a single
    1-D vector, means first.

    Parameters
    ----------
    d : indexable collection supporting ``.copy()``, ``len()`` and
        integer item access/assignment (presumably a list of 2-D MFCC
        arrays -- TODO confirm against the pickle contents).

    Returns
    -------
    numpy.ndarray built from the per-sample summary vectors. ``d`` itself
    is not modified; only a copy of the container is written to.
    """
    features = d.copy()
    for idx in range(len(features)):
        frames = features[idx]
        stats = (np.mean(frames, axis=0), np.var(frames, axis=0))
        # np.append flattens both pieces: [means..., variances...]
        features[idx] = np.append(*stats)
    return np.array(features)
    
# Reduce every sample to one fixed-length vector of per-column means and variances.
X = data_extract(data)
# Labels come from the companion pickle; presumably one label per row of X -- TODO confirm.
y = pd.read_pickle(path_label)

Feature selection (forward / backward) and dimensionality reduction (PCA / LDA)

In [None]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SequentialFeatureSelector

# 1-NN is the estimator scored by both sequential feature selectors.
knn = KNeighborsClassifier(n_neighbors=1)

# Forward sequential selection (the selector's default direction).
fs = SequentialFeatureSelector(knn).fit(X, y)
X_fs = fs.transform(X)

# Backward sequential selection.
bs = SequentialFeatureSelector(knn,direction='backward').fit(X, y)
# Fix: project with the backward selector. The original called
# fs.transform here, which made X_bs an exact duplicate of X_fs.
X_bs = bs.transform(X)

# PCA with a fractional n_components (share of variance to retain).
pca = PCA(n_components = 0.99).fit(X,y)
X_pca = pca.transform(X)
# Supervised projection with linear discriminant analysis.
lda = LinearDiscriminantAnalysis().fit(X,y)
X_lda = lda.transform(X)

# Report the dimensionality of every feature representation.
print('X shape:    ',X.shape)
print('X_fs shape:',X_fs.shape)
print('X_bs shape:',X_bs.shape)
print('X_PCA shape:',X_pca.shape)
print('X_LDA shape:',X_lda.shape)



X shape:     (9538, 80)
X_PCA shape: (9538, 3)
X_LDA shape: (9538, 6)


Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV
#SVM
def svm_hypertunning (X , y, C_range = list(np.linspace(0.1,1.7,5))):
    """Choose the SVC parameter ``C`` by 4-fold grid search.

    Parameters
    ----------
    X, y : training features and labels handed to ``GridSearchCV.fit``.
    C_range : candidate values for ``C``.

    Returns
    -------
    The ``C`` value reported in ``best_params_`` by the search.
    """
    grid = GridSearchCV(svm.SVC(), {'C': C_range}, cv=4, n_jobs=-1)
    grid.fit(X, y)
    return grid.best_params_['C']

# KNN
def knn_hypertunning(X , y,k_range = [1,50,100,150,200,250,300]):
    """Choose ``n_neighbors`` for a k-NN classifier by 4-fold grid search.

    Parameters
    ----------
    X, y : training features and labels handed to ``GridSearchCV.fit``.
    k_range : candidate neighbour counts.

    Returns
    -------
    The ``n_neighbors`` value reported in ``best_params_`` by the search.
    """
    grid = GridSearchCV(KNeighborsClassifier(), {'n_neighbors': k_range},
                        cv=4, n_jobs=-1)
    grid.fit(X, y)
    return grid.best_params_['n_neighbors']

# MLP
def mlp_hypertunning(X , y, hidden_layer_range = [(7,7),(7,8),(8,7),(8,8)]):
    """Choose ``hidden_layer_sizes`` for an MLP by 4-fold grid search.

    The searched estimator is ``MLPClassifier(max_iter=500)``.

    Parameters
    ----------
    X, y : training features and labels handed to ``GridSearchCV.fit``.
    hidden_layer_range : candidate layer-size tuples.

    Returns
    -------
    The ``hidden_layer_sizes`` tuple reported in ``best_params_``.
    """
    grid = GridSearchCV(MLPClassifier(max_iter=500),
                        {'hidden_layer_sizes': hidden_layer_range},
                        cv=4, n_jobs=-1)
    grid.fit(X, y)
    return grid.best_params_['hidden_layer_sizes']

# Gradient boosting (scikit-learn GradientBoostingClassifier, not the xgboost library)
def xgb_hypertunning (X , y, alpha_range = list(np.linspace(0.1,1.7,5))):
    """Choose ``learning_rate`` for gradient boosting by 4-fold grid search.

    The searched estimator is scikit-learn's ``GradientBoostingClassifier``
    with otherwise default settings.

    Parameters
    ----------
    X, y : training features and labels handed to ``GridSearchCV.fit``.
    alpha_range : candidate learning rates.
        NOTE(review): the default grid equals the SVM ``C`` grid and reaches
        1.7 -- looks copy-pasted; confirm it is intended for a learning rate.

    Returns
    -------
    The ``learning_rate`` value reported in ``best_params_``.
    """
    grid = GridSearchCV(GradientBoostingClassifier(),
                        {'learning_rate': alpha_range},
                        cv=4, n_jobs=-1)
    grid.fit(X, y)
    return grid.best_params_['learning_rate']

Evaluation 

In [None]:
from sklearn.model_selection import cross_validate
def print_scores(clf, X, y):
    """Print 4-fold cross-validated accuracy for ``clf`` on ``(X, y)``.

    Two lines are printed: mean test accuracy (3 decimals) followed by its
    standard deviation across folds (4 decimals), then mean train accuracy
    (3 decimals).

    Parameters
    ----------
    clf : estimator passed unchanged to ``cross_validate``.
    X, y : features and labels to cross-validate on.

    Returns
    -------
    None
    """
    cv_results = cross_validate(clf, X, y, cv=4,
                                scoring='accuracy',
                                # Must be a real boolean: the original passed
                                # the string 'true', which is only truthy by
                                # accident and is rejected by scikit-learn
                                # versions that validate parameter types.
                                return_train_score=True,
                                n_jobs=-1)

    train_scores = np.asarray(cv_results['train_score'])
    test_scores = np.asarray(cv_results['test_score'])

    mean_train = train_scores.mean().round(3)
    mean_test = test_scores.mean().round(3)
    # Population standard deviation over the folds (same value as sqrt(var())).
    sd_test = test_scores.std().round(4)

    print('accuracy score on test data: ',mean_test,"\u00B1",sd_test)
    print('accuracy score on train data:',mean_train)
    
# Compare k-NN across the feature representations: for each one, tune k by
# grid search and then report cross-validated accuracy with that k.
titles = ('Primary Features','PCA','LDA','FS','BS')
xs = (X, X_pca, X_lda, X_fs, X_bs)
for title, x in zip(titles, xs):
    k = knn_hypertunning(x, y)
    print(f'\n{title}:')
    print('k=', k)
    knn = KNeighborsClassifier(n_neighbors=k)
    print_scores(knn, x, y)


Primary Features:
k= 1
accuracy score on test data:  0.195 ± 0.029
accuracy score on train data: 0.997

PCA:
k= 1
accuracy score on test data:  0.18 ± 0.0129
accuracy score on train data: 0.997

LDA:
k= 200
accuracy score on test data:  0.404 ± 0.0402
accuracy score on train data: 0.468


In [None]:
# Manual hyperparameter sweeps on the LDA-projected features: for every
# candidate value, print the estimator followed by its CV accuracy.

# SVC with an RBF kernel, sweeping C.
C_range = list(np.linspace(0.1,1.7,5))
for c in C_range:
    svc = svm.SVC(kernel="rbf",C = c)
    print(f'\n{svc}:')
    print_scores(svc, X_lda,y)

# k-NN, sweeping the number of neighbours.
k_range = [1,50,100,150,200,250,300]
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    print(f'\n{knn}:')
    print_scores(knn, X_lda,y)

# MLP, sweeping the hidden-layer layout.
hidden_layer_range = [(7,7),(7,8),(8,7),(8,8)]
for hl in hidden_layer_range:
    mlp = MLPClassifier(hidden_layer_sizes=hl, max_iter=500,learning_rate_init=0.01)
    print(f'\n{mlp}:')
    print_scores(mlp, X_lda,y)

# Gradient boosting, sweeping the learning rate.
# Fix: this assignment was indented into the MLP loop body above, so it was
# re-executed on every iteration and would be undefined if that loop were empty.
alpha_range = (0.01,0.05,0.1,0.5,1)
for a in alpha_range:
    xgb = GradientBoostingClassifier(learning_rate=a)
    print(f'\n{xgb}:')
    print_scores(xgb, X_lda,y)


SVC(C=0.1):
accuracy score on test data:  0.404 ± 0.0486
accuracy score on train data: 0.474

SVC(C=0.5):
accuracy score on test data:  0.392 ± 0.0382
accuracy score on train data: 0.502

SVC(C=0.8999999999999999):
accuracy score on test data:  0.386 ± 0.0369
accuracy score on train data: 0.515

SVC(C=1.3):
accuracy score on test data:  0.383 ± 0.035
accuracy score on train data: 0.523

SVC(C=1.7):
accuracy score on test data:  0.382 ± 0.0354
accuracy score on train data: 0.532

KNeighborsClassifier(n_neighbors=1):
accuracy score on test data:  0.293 ± 0.0314
accuracy score on train data: 0.997

KNeighborsClassifier(n_neighbors=50):
accuracy score on test data:  0.392 ± 0.0388
accuracy score on train data: 0.516

KNeighborsClassifier(n_neighbors=100):
accuracy score on test data:  0.402 ± 0.0405
accuracy score on train data: 0.49

KNeighborsClassifier(n_neighbors=150):
accuracy score on test data:  0.403 ± 0.04
accuracy score on train data: 0.475

KNeighborsClassifier(n_neighbors=200)

**Ensemble Learning**

In [None]:
# Build one estimator per model family on the LDA features, each configured
# with the value selected by its grid-search helper. The names svc / knn /
# mlp / xgb are reused by the voting and bagging cells further down.

# RBF-kernel SVC with the selected C.
c = svm_hypertunning(X_lda,y)
svc = svm.SVC(kernel="rbf",C = c)

# k-NN with the selected neighbour count.
k = knn_hypertunning(X_lda,y)
knn = KNeighborsClassifier(n_neighbors=k)
                           
# MLP with the selected hidden-layer layout. The search helper uses the
# default learning_rate_init, whereas this estimator sets it to 0.01.
hl = mlp_hypertunning(X_lda,y)                       
mlp = MLPClassifier(hidden_layer_sizes=hl, max_iter=500,learning_rate_init=0.01)

# Gradient boosting with the selected learning rate.
alpha = xgb_hypertunning(X_lda,y) 
xgb = GradientBoostingClassifier(learning_rate=alpha)

In [None]:
# Show the configuration of each tuned estimator, one per line.
print(svc, knn, mlp, xgb, sep='\n')

SVC(C=0.1)
KNeighborsClassifier(n_neighbors=200)
MLPClassifier(hidden_layer_sizes=(8, 8), learning_rate_init=0.01, max_iter=500)
GradientBoostingClassifier(learning_rate=0.05)


Majority Voting

In [None]:

from sklearn.ensemble import VotingClassifier, BaggingClassifier
# Hard (majority-rule) voting over the four tuned estimators.
mvt = VotingClassifier(
    estimators=[
        ('svc', svc),
        ('knn', knn),
        ('mlp', mlp),
        ('xgb', xgb),
    ],
    voting='hard',  # the library default, spelled out
)
print('\nMajority Voting:')
print_scores(mvt, X_lda, y)


Majority Voting:
accuracy score on test data:  0.409 ± 0.046
accuracy score on train data: 0.483


Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier
# Number of bootstrap replicas in every bagging ensemble.
n = 10
models = (svc,knn,mlp,xgb)
for clf in models:
    # The wrapped estimator is passed positionally: the keyword was renamed
    # from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
    # name was later removed, while the first positional slot is the wrapped
    # estimator in every version.
    bagg_clf = BaggingClassifier(clf, n_estimators=n, random_state=5)
    print(f'\n{clf} Bagging model:')
    print_scores(bagg_clf, X_lda, y)


In [None]:
    # Bagging for the gradient-boosting model only (same settings as the loop
    # in the previous cell; relies on `n` defined there). The wrapped
    # estimator is passed positionally because the `base_estimator` keyword
    # was renamed to `estimator` in scikit-learn 1.2 and later removed.
    # NOTE(review): the cell keeps its original leading indentation, which
    # IPython strips; consider dedenting it.
    bagg_clf = BaggingClassifier(xgb, n_estimators=n, random_state=5)
    print(f'\n{xgb} Bagging model:')
    print_scores(bagg_clf, X_lda, y)


GradientBoostingClassifier(learning_rate=0.05) Bagging model:
accuracy score on test data:  0.402 ± 0.0477
accuracy score on train data: 0.521
