# Preliminary operations

In [None]:
# import main libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Condensed Nearest Neighbour
from imblearn.under_sampling import CondensedNearestNeighbour

# collections
from collections import Counter
from collections import defaultdict

# Dummy clf
from sklearn.dummy import DummyClassifier
# metrics
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)

# repeated stratified kfold
from sklearn.model_selection import RepeatedStratifiedKFold

# random search cv
from sklearn.model_selection import RandomizedSearchCV

# KNN
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# mount Google Drive so the datasets stored under MyDrive are reachable at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder (relative to the notebook's working directory) that holds the
# outlier-free datasets; kept in one place so the path is not repeated per file.
DATA_DIR = "drive/MyDrive/Progetto Data Mining 2/CODICE PROGETTO/Outliers/DATASET NO OUTLIERS"


def load_split(file_name):
    """Read one Excel file from DATA_DIR, using its "Unnamed: 0" column as the index.

    Parameters
    ----------
    file_name : str
        Name of the .xlsx file inside DATA_DIR.

    Returns
    -------
    pandas.DataFrame
        The sheet content indexed by the "Unnamed: 0" column.
    """
    return pd.read_excel(f"{DATA_DIR}/{file_name}", index_col="Unnamed: 0")


# import training data (features and the companion info table holding the labels)
df_data_train = load_split("df_prep_TRAIN_no_outliers.xlsx")
df_info_train = load_split("df_prep_info_TRAIN_no_outliers.xlsx")

X_train = df_data_train.values
y_train = df_info_train['vocal_channel']

# import test data (same layout as the training files)
df_data_test = load_split("df_prep_TEST_no_outliers.xlsx")
df_info_test = load_split("df_prep_info_TEST_no_outliers.xlsx")

X_test = df_data_test.values
y_test = df_info_test['vocal_channel']

In [None]:
# preview the training feature matrix (pandas abbreviates the middle rows/columns in the display)
df_data_train

Unnamed: 0,sum,std,q25,kur,skew,lag1_sum,lag1_kur,lag1_skew,zc_sum,mfcc_sum,...,mfcc_q05_w4,mfcc_q95_w4,mfcc_q99_w4,mfcc_kur_w4,sc_std_w4,sc_kur_w4,sc_skew_w4,stft_sum_w4,stft_mean_w4,stft_kur_w4
0,0.716365,-1.159462,1.215433,0.859870,1.608196,0.032995,0.662262,0.805546,-1.500797,0.399234,...,0.914663,-0.670034,-1.541948,-0.202342,-0.425145,3.147197,-2.200824,0.228251,1.542380,-0.159600
1,0.669143,-1.108453,1.215433,1.070644,1.658463,0.032995,0.275624,1.241144,-1.321006,0.408776,...,0.465963,-0.925068,-0.828006,0.275867,-0.427448,-0.229617,-0.575310,-0.231871,1.094347,0.111549
2,0.723227,-0.975264,2.047744,1.146381,1.781550,-1.036084,0.763118,-0.128356,-1.750464,0.490061,...,0.637260,-0.595362,-1.139805,-0.014546,-0.246588,2.742928,-1.781613,-0.390360,1.161032,-0.289829
3,0.708504,-1.006749,1.215433,1.749201,1.981405,-0.815534,0.707610,1.132806,-1.416084,0.750825,...,1.016104,-0.098577,-0.717890,-0.392469,-0.555570,-0.462721,-0.367839,-0.824689,1.017073,0.412375
4,0.705644,-1.371989,1.215433,0.581488,1.436854,-0.815534,0.370417,-0.060381,-1.569077,-0.145204,...,0.736785,-0.583115,-0.797341,0.154515,-0.249430,0.605878,-0.549321,0.241436,1.091298,-0.159600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1773,-0.533551,-0.926692,2.047744,-0.775907,-0.974597,0.032995,0.234861,-0.828073,-1.373922,-0.346083,...,0.664345,-1.699436,-1.259249,0.894243,1.572822,1.191457,-0.871113,-0.858915,0.261546,-0.157863
1774,-0.512702,-0.494895,1.215433,-0.050756,-0.482154,0.032995,1.144182,-1.138132,-0.648233,0.011200,...,0.339675,-2.119495,-0.552764,1.389867,1.466784,1.206284,-0.742265,-1.242542,-0.273897,-0.282172
1775,-0.602680,-0.374843,2.047744,-0.945497,-0.285060,0.032995,0.858489,-0.565557,-1.728149,-0.008234,...,-0.089512,-1.531537,-0.442736,1.290250,0.875711,1.581088,-0.871506,-1.276408,-0.053277,0.231495
1776,-0.421299,-0.160226,1.215433,-1.368889,-0.548836,0.032995,-0.013708,-2.119959,0.007696,-0.385782,...,-0.377597,-1.926863,-1.238323,1.709477,1.727957,-0.044882,-0.627522,-0.587311,-0.180552,0.713603


In [None]:
# class labels of the full training set and how many samples each one has
train_labels = df_info_train['vocal_channel']
np.unique(train_labels, return_counts=True)

(array(['song', 'speech'], dtype=object), array([ 732, 1046]))

In [None]:
# Build an artificially unbalanced training set by keeping only N_SONG_KEPT "song" rows.
# - The draw uses a seeded generator so a fresh kernel removes the same rows every time.
# - The number of rows to drop is derived from the data instead of a hard-coded class count.
N_SONG_KEPT = 55
song_index = df_info_train[df_info_train['vocal_channel'] == "song"].index
rng = np.random.default_rng(0)
rows2remove = rng.choice(
    song_index.to_numpy(),
    # never request a negative sample size if fewer than N_SONG_KEPT songs exist
    size=max(len(song_index) - N_SONG_KEPT, 0),
    replace=False,
)

In [None]:
# derive the unbalanced training frames by discarding the sampled "song" rows
# from both the feature table and the info table (same index labels in each)
df2_data_train, df2_info_train = (
    frame.drop(index=rows2remove) for frame in (df_data_train, df_info_train)
)

In [None]:
# sanity check: class distribution after the "song" rows were removed
unbalanced_labels = df2_info_train['vocal_channel']
np.unique(unbalanced_labels, return_counts=True)

(array(['song', 'speech'], dtype=object), array([  55, 1046]))

In [None]:
# rebuild the training inputs from the unbalanced frames
# (these replace the X_train / y_train created when the data was loaded)
y_train = df2_info_train['vocal_channel']
X_train = df2_data_train.values

# Condensed Nearest Neighbor

In [None]:
# under-sampler based on Condensed Nearest Neighbour (fixed seed)
cnn = CondensedNearestNeighbour(random_state=1)

# resample the unbalanced training set and report the new class counts
X_res, y_res = cnn.fit_resample(X_train, y_train)
resampled_counts = Counter(y_res)
print('Resampled dataset shape %s' % resampled_counts)

Resampled dataset shape Counter({'speech': 78, 'song': 55})


# Hyperparameter tuning

In [None]:
# baseline: a dummy classifier using the "stratified" strategy, fitted on the
# resampled training set
dummy_clf = DummyClassifier(strategy="stratified", random_state=0)
dummy_clf.fit(X_res, y_res)

# baseline accuracy on the test labels; X is passed as None, as the dummy
# model's predictions do not depend on the feature values
print(dummy_clf.score(X=None, y=y_test))

0.5398671096345515


In [None]:
# Search space for the KNN hyper-parameters. Every entry is a finite list/range,
# so the space has a fixed number of combinations.
param_grid = {
    "n_neighbors": range(1, X_res.shape[0]//2),
    "metric": ["cityblock", "euclidean"],
    "weights": ["uniform", "distance"]
}

# number of distinct combinations in the search space
n_candidates = int(np.prod([len(options) for options in param_grid.values()]))

# base estimator to tune
KNC = KNeighborsClassifier(
        n_jobs = -1
)

# cross-validation scheme: 20 stratified folds repeated 3 times (60 fits per candidate)
RSKF = RepeatedStratifiedKFold(
        n_splits = 20,
        n_repeats = 3,
        random_state = 0
)

# Randomized search over the space above. n_iter is capped at the size of the
# space: asking for more iterations than there are combinations cannot add
# candidates (the recorded run evaluated 260 of the requested 1500).
grid = RandomizedSearchCV(
    KNC,
    n_iter=min(1500, n_candidates),
    param_distributions = param_grid,
    cv=RSKF,
    n_jobs = -1,
    refit = True,
    verbose=1,
    random_state=0
)

# fit the randomized search (the variable keeps the name `grid` used by later cells)
grid.fit(X_res, y_res)

Fitting 60 folds for each of 260 candidates, totalling 15600 fits




In [None]:
# report the winning hyper-parameters and their mean cross-validated score
best_params, best_score = grid.best_params_, grid.best_score_
print(best_params)
print(best_score)

{'weights': 'distance', 'n_neighbors': 35, 'metric': 'euclidean'}
0.9166666666666666


In [None]:
# save the complete cross-validation table of the search to an Excel file
grid_res = pd.DataFrame(data=grid.cv_results_)
grid_res.to_excel(excel_writer="knn_grid_res.xlsx")

# Tuned model

In [None]:
# Build the final KNN from the hyper-parameters selected by the search, instead of
# copying them by hand from a previous run's printout: the model then always matches
# the current search result (n_neighbors, metric and weights are the tuned keys).
KNC = KNeighborsClassifier(**grid.best_params_)

In [None]:
# shape of the resampled training set: (n_samples, n_features)
X_res.shape

(133, 96)

In [None]:
# shape of the test set: (n_samples, n_features); the feature count should match X_res
X_test.shape

(602, 96)

In [None]:
# train the tuned KNN on the resampled training set (fit returns the estimator itself)
KNC = KNC.fit(X_res, y_res)

# class probabilities and hard predictions for the test set
y_test_pred_proba = KNC.predict_proba(X_test)
y_pred = KNC.predict(X_test)

# per-class precision / recall / F1 on the test set
test_report = classification_report(y_test, y_pred)
print(test_report)

              precision    recall  f1-score   support

        song       0.97      0.95      0.96       264
      speech       0.96      0.98      0.97       338

    accuracy                           0.97       602
   macro avg       0.97      0.97      0.97       602
weighted avg       0.97      0.97      0.97       602

