In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os

os.chdir('/content/drive/MyDrive/Data Science Padova/Semester 3/Human Data Analytics/Project')

In [5]:
import tqdm
from parameters import *
import librosa
from utils import *

In [6]:
import pandas as pd
import numpy as np
from scipy.stats import skew
from sklearn.utils import shuffle

# Scikit learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.utils import class_weight

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# sklearn models

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# xgb

from xgboost import XGBClassifier

# **Feature Extraction**

In [7]:
audio_list = []
target_list = []

METADATA_FILE_PATH = '/content/drive/MyDrive/Data Science Padova/Semester 3/Human Data Analytics/Project/Data/ESC-50-master/ESC-50-master/meta/esc50.csv'
AUDIO_FOLDER = '/content/drive/MyDrive/Data Science Padova/Semester 3/Human Data Analytics/Project/Data/ESC-50-master/ESC-50-master/audio/'

df = import_dataframe(METADATA_FILE_PATH, which_dataset='esc50')

data = []

for filename,target in tqdm(list(zip(df['filename'], df['target'])), desc = 'Importing dataset'):
        y, fs = librosa.load(AUDIO_FOLDER + filename, sr=SR)
        data.append((y,target))

audio_data = []
labels=[]

for i,j in data:
    audio_data.append(i)
    labels.append(j)

audio_data=np.array(audio_data) # array with size (n_data, n_samples). Each row contains an audio
labels=np.array(labels) # array with size (n_data), contains the indexes of the labels

import keras
ylabels=keras.utils.to_categorical(labels, num_classes=OUTPUT_CLASSES, dtype='float32')

from skimage import util

sub_sequence= SR*1.25    #  1.25 seconds of signal !
st=400 #  samples for sliding the window ith overlap
audio_data_red = []

for i in tqdm(range(0,len(audio_data)), desc = 'data reduction'):
    frames = util.view_as_windows(audio_data[i], window_shape=(sub_sequence,), step=st)
    frame_intensity = []
    for frame in frames:
        frame_intensity.append(frame @ frame)
    optim_frame_index = np.array(frame_intensity).argmax()
    audio_data_red.append(frames[optim_frame_index]/np.max(frames[optim_frame_index]))

Importing dataset: 100%|██████████| 2000/2000 [13:09<00:00,  2.53it/s]
data reduction: 100%|██████████| 2000/2000 [00:03<00:00, 605.79it/s]


In [8]:
def extract_featuresAvg(y, sr):

    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)

    n_fft = int(sr * 0.02)   # window length: 0.02 s
    hop_length = n_fft // 2  # usually one specifies the hop length as a fraction of the window length
    mfccs = librosa.feature.mfcc(y, sr=sr, n_mfcc=13, hop_length=hop_length, n_fft=n_fft)

    vector = np.empty
    vector = np.append(vector, np.mean(chroma_stft))
    vector = np.append(vector, np.mean(spec_cent))
    vector = np.append(vector,np.mean(spec_bw))
    vector = np.append(vector,np.mean(rolloff))
    vector = np.append(vector,np.mean(zcr))
    for e in mfccs:
            vector = np.append(vector,np.mean(e))
    vector = np.delete(vector,0,0)


    return vector

#Extract MFCC from spectogram
def extract_featuresMCC(y, sr):
    S = librosa.feature.melspectrogram(y, sr=sr, n_fft=2048,
                                       hop_length=512,
                                       n_mels=128)
    mfccs = librosa.feature.mfcc(S=librosa.power_to_db(S), n_mfcc=40)

    return mfccs


def get_mfcc(data, SAMPLE_RATE):
    try:
      # data = librosa.to_mono(data)

      ft1 = librosa.feature.mfcc(y=data, sr = SAMPLE_RATE, n_mfcc=30)
      ft2 = librosa.feature.zero_crossing_rate(y=data)[0]
      ft3 = librosa.feature.spectral_rolloff(y=data)[0]
      ft4 = librosa.feature.spectral_centroid(y=data)[0]
      ft5 = librosa.feature.spectral_contrast(y=data)[0]
      ft6 = librosa.feature.spectral_bandwidth(y=data)[0]
      ft1_trunc = np.hstack((np.mean(ft1, axis=1), np.std(ft1, axis=1), skew(ft1, axis = 1), np.max(ft1, axis = 1), np.median(ft1, axis = 1), np.min(ft1, axis = 1)))
      ft2_trunc = np.hstack((np.mean(ft2), np.std(ft2), skew(ft2), np.max(ft2), np.median(ft2), np.min(ft2)))
      ft3_trunc = np.hstack((np.mean(ft3), np.std(ft3), skew(ft3), np.max(ft3), np.median(ft3), np.min(ft3)))
      ft4_trunc = np.hstack((np.mean(ft4), np.std(ft4), skew(ft4), np.max(ft4), np.median(ft4), np.min(ft4)))
      ft5_trunc = np.hstack((np.mean(ft5), np.std(ft5), skew(ft5), np.max(ft5), np.median(ft5), np.min(ft5)))
      ft6_trunc = np.hstack((np.mean(ft6), np.std(ft6), skew(ft6), np.max(ft6), np.median(ft6), np.max(ft6)))
      return np.hstack((ft1_trunc, ft2_trunc, ft3_trunc, ft4_trunc, ft5_trunc, ft6_trunc))
    except Exception as e:
        print('naughty example')
        return pd.Series([0]*210)

In [9]:
labels[0]

0

In [10]:
X = []
dataset_dict = {}

print("Getting features ..")
for i,y in enumerate(audio_data_red):
    vector = get_mfcc(y, SR)
    X.append(vector)
    dataset_dict[i] = {'features' : vector,
                        'label' : labels[i]}


Getting features ..


In [11]:
dataset = pd.DataFrame.from_dict(dataset_dict, orient='index')
dataset = shuffle(dataset, random_state=42)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 1860 to 1126
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   features  2000 non-null   object
 1   label     2000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 46.9+ KB


In [12]:
dataset.head()

Unnamed: 0,features,label
1860,"[-314.47210693359375, 104.05805969238281, 9.81...",22
353,"[-86.4356460571289, -23.950395584106445, -36.5...",17
1333,"[-18.113712310791016, 128.5638427734375, -9.61...",45
905,"[63.74094009399414, 21.263023376464844, -44.89...",22
1289,"[-311.519775390625, 58.64865493774414, 4.65748...",21


In [13]:
from sklearn.model_selection import train_test_split

X = np.array(dataset['features'].tolist())
y = dataset['label']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# **SVM**

In [14]:
## Support Vector Classifier

svm_model = SVC()

svm_pipe = Pipeline([
       ('scale', StandardScaler()),
      ('reduce_dims', PCA()),
        ('model', svm_model)])

In [15]:
svm_pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'scale', 'reduce_dims', 'model', 'scale__copy', 'scale__with_mean', 'scale__with_std', 'reduce_dims__copy', 'reduce_dims__iterated_power', 'reduce_dims__n_components', 'reduce_dims__n_oversamples', 'reduce_dims__power_iteration_normalizer', 'reduce_dims__random_state', 'reduce_dims__svd_solver', 'reduce_dims__tol', 'reduce_dims__whiten', 'model__C', 'model__break_ties', 'model__cache_size', 'model__class_weight', 'model__coef0', 'model__decision_function_shape', 'model__degree', 'model__gamma', 'model__kernel', 'model__max_iter', 'model__probability', 'model__random_state', 'model__shrinking', 'model__tol', 'model__verbose'])

In [16]:
param_grid = dict(reduce_dims__n_components=[100,150,210],
                  model__kernel = ['linear', 'rbf', 'poly'],
                  model__gamma = [0.01, 0.1, 0.5, 1],
                  model__degree = [1,2,3,4])

In [17]:
svm_grid = GridSearchCV(svm_pipe, n_jobs=1, param_grid=param_grid, cv=3, verbose=2,
                    return_train_score=True,
                    scoring= 'accuracy')

In [18]:
svm_grid.fit(X_train, y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV] END model__degree=1, model__gamma=0.01, model__kernel=linear, reduce_dims__n_components=100; total time=   0.5s
[CV] END model__degree=1, model__gamma=0.01, model__kernel=linear, reduce_dims__n_components=100; total time=   0.4s
[CV] END model__degree=1, model__gamma=0.01, model__kernel=linear, reduce_dims__n_components=100; total time=   0.5s
[CV] END model__degree=1, model__gamma=0.01, model__kernel=linear, reduce_dims__n_components=150; total time=   0.5s
[CV] END model__degree=1, model__gamma=0.01, model__kernel=linear, reduce_dims__n_components=150; total time=   0.5s
[CV] END model__degree=1, model__gamma=0.01, model__kernel=linear, reduce_dims__n_components=150; total time=   0.5s
[CV] END model__degree=1, model__gamma=0.01, model__kernel=linear, reduce_dims__n_components=210; total time=   0.4s
[CV] END model__degree=1, model__gamma=0.01, model__kernel=linear, reduce_dims__n_components=210; total time=   0.4s
[

In [23]:
print(f'Best parameters: {svm_grid.best_params_}')

Best parameters: {'model__degree': 1, 'model__gamma': 0.01, 'model__kernel': 'poly', 'reduce_dims__n_components': 150}


In [24]:
svm_results = svm_grid.best_estimator_.predict(X_test)

In [25]:
print(classification_report(y_test, svm_results))

              precision    recall  f1-score   support

           0       0.50      0.57      0.53         7
           1       0.90      0.82      0.86        11
           2       0.58      0.88      0.70         8
           3       0.46      0.60      0.52        10
           4       0.90      0.90      0.90        10
           5       0.77      0.83      0.80        12
           6       0.40      0.25      0.31         8
           7       0.40      0.40      0.40         5
           8       0.80      0.67      0.73         6
           9       0.73      0.89      0.80         9
          10       0.54      0.50      0.52        14
          11       0.44      0.88      0.58         8
          12       0.42      0.83      0.56         6
          13       0.67      0.18      0.29        11
          14       0.78      0.78      0.78         9
          15       0.71      0.50      0.59        10
          16       0.57      0.44      0.50         9
          17       0.70    

In [22]:
svm_grid.best_score_

0.5499879372173151

# **Bayes Classifier**

In [26]:
bayes_model = GaussianNB()
bayes_pipe = Pipeline([
       ('scale', StandardScaler()),
      ('reduce_dims', PCA()),
        ('model', bayes_model)])

In [27]:
param_grid = dict(reduce_dims__n_components=[100,150,210])

In [28]:
bayes_grid = GridSearchCV(bayes_pipe, n_jobs=1, param_grid=param_grid, cv=3, verbose=2,
                    return_train_score=True,
                    scoring= 'accuracy')

In [29]:
bayes_grid.fit(X_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] END ......................reduce_dims__n_components=100; total time=   0.2s
[CV] END ......................reduce_dims__n_components=100; total time=   0.2s
[CV] END ......................reduce_dims__n_components=100; total time=   0.1s
[CV] END ......................reduce_dims__n_components=150; total time=   0.2s
[CV] END ......................reduce_dims__n_components=150; total time=   0.2s
[CV] END ......................reduce_dims__n_components=150; total time=   0.3s
[CV] END ......................reduce_dims__n_components=210; total time=   0.2s
[CV] END ......................reduce_dims__n_components=210; total time=   0.1s
[CV] END ......................reduce_dims__n_components=210; total time=   0.1s


In [30]:
print(f'Best parameters: {bayes_grid.best_params_}')

Best parameters: {'reduce_dims__n_components': 100}


In [31]:
bayes_results = bayes_grid.best_estimator_.predict(X_test)

In [32]:
print(classification_report(y_test, bayes_results))

              precision    recall  f1-score   support

           0       0.62      0.71      0.67         7
           1       0.70      0.64      0.67        11
           2       0.46      0.75      0.57         8
           3       0.42      0.50      0.45        10
           4       0.62      0.50      0.56        10
           5       0.57      0.67      0.62        12
           6       0.50      0.12      0.20         8
           7       0.29      0.40      0.33         5
           8       0.50      0.33      0.40         6
           9       1.00      0.78      0.88         9
          10       1.00      0.57      0.73        14
          11       0.33      0.75      0.46         8
          12       0.67      0.33      0.44         6
          13       0.60      0.27      0.37        11
          14       0.43      0.33      0.38         9
          15       1.00      0.30      0.46        10
          16       0.57      0.44      0.50         9
          17       0.64    

# **Random Forests**

In [40]:
rf_model = RandomForestClassifier()

rf_pipe = Pipeline([
       ('scale', StandardScaler()),
      ('reduce_dims', PCA()),
        ('model', rf_model)])

In [41]:
param_grid = dict(reduce_dims__n_components=[90,100],
                  model__max_depth= [50, 70],
                  model__max_features= ['auto', 'sqrt'],
                  model__min_samples_leaf= [4, 5],
                  model__min_samples_split= [5, 10],
                  model__n_estimators= [100,200])

In [42]:
rf_grid = GridSearchCV(rf_pipe, n_jobs=1, param_grid=param_grid, cv=3, verbose=2,
                    return_train_score=True,
                    scoring= 'accuracy')

In [43]:
rf_grid.fit(X_train, y_train)

Fitting 3 folds for each of 64 candidates, totalling 192 fits


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.7s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.7s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.7s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=100; total time=   2.6s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=100; total time=   2.2s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=100; total time=   1.8s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=90; total time=   3.3s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=90; total time=   3.3s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=90; total time=   4.2s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=100; total time=   3.5s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=100; total time=   3.5s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=100; total time=   3.9s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=90; total time=   2.3s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.7s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.6s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=100; total time=   1.8s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=100; total time=   1.8s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=100; total time=   1.7s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=90; total time=   4.2s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=90; total time=   3.2s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=90; total time=   3.1s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=100; total time=   3.5s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=100; total time=   4.6s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=100; total time=   3.4s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.6s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.6s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.6s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=100; total time=   2.2s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=100; total time=   2.4s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=100; total time=   1.7s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=90; total time=   3.0s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=90; total time=   3.0s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=90; total time=   4.1s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=100; total time=   3.8s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=100; total time=   3.3s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=100; total time=   3.2s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=90; total time=   2.0s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=90; total time=   2.4s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.6s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=100; total time=   1.8s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=100; total time=   1.8s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=100; total time=   1.7s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=90; total time=   3.2s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=90; total time=   4.1s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=90; total time=   3.0s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=100; total time=   3.4s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=100; total time=   3.6s


  warn(


[CV] END model__max_depth=50, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=100; total time=   4.1s
[CV] END model__max_depth=50, model__max_features=sqrt, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.7s
[CV] END model__max_depth=50, model__max_features=sqrt, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.7s
[CV] END model__max_depth=50, model__max_features=sqrt, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.7s
[CV] END model__max_depth=50, model__max_features=sqrt, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=100; total time=   2.1s
[CV] END model__max_depth=50, model__max_features=sqrt, model__min_sampl

  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=90; total time=   2.3s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.9s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.6s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=100; total time=   1.8s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=100; total time=   1.8s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=100; total time=   1.8s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=90; total time=   3.8s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=90; total time=   3.5s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=90; total time=   3.1s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=100; total time=   3.5s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=100; total time=   4.6s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=100; total time=   3.5s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.7s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.7s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.7s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=100; total time=   2.7s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=100; total time=   3.1s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=100; total time=   2.1s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=90; total time=   3.1s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=90; total time=   3.2s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=90; total time=   3.5s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=100; total time=   4.2s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=100; total time=   3.4s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=100; total time=   3.4s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.8s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=90; total time=   2.3s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.7s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=100; total time=   1.8s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=100; total time=   1.8s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=100; total time=   1.7s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=90; total time=   3.0s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=90; total time=   4.0s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=90; total time=   3.0s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=100; total time=   3.3s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=100; total time=   3.4s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=5, model__n_estimators=200, reduce_dims__n_components=100; total time=   4.1s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.6s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.6s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.5s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=100; total time=   1.8s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=100; total time=   1.7s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=100, reduce_dims__n_components=100; total time=   2.3s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=90; total time=   3.6s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=90; total time=   3.0s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=90; total time=   2.9s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=100; total time=   3.9s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=100; total time=   3.9s


  warn(


[CV] END model__max_depth=70, model__max_features=auto, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=200, reduce_dims__n_components=100; total time=   3.3s
[CV] END model__max_depth=70, model__max_features=sqrt, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.7s
[CV] END model__max_depth=70, model__max_features=sqrt, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.6s
[CV] END model__max_depth=70, model__max_features=sqrt, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=90; total time=   1.8s
[CV] END model__max_depth=70, model__max_features=sqrt, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=100, reduce_dims__n_components=100; total time=   2.5s
[CV] END model__max_depth=70, model__max_features=sqrt, model__min_sampl

In [44]:
print(f'Best parameters: {rf_grid.best_params_}')

Best parameters: {'model__max_depth': 70, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 5, 'model__min_samples_split': 5, 'model__n_estimators': 200, 'reduce_dims__n_components': 100}


In [45]:
rf_results = rf_grid.best_estimator_.predict(X_test)

In [46]:
print(classification_report(y_test, rf_results))

              precision    recall  f1-score   support

           0       0.71      0.71      0.71         7
           1       0.78      0.64      0.70        11
           2       0.57      0.50      0.53         8
           3       0.46      0.60      0.52        10
           4       0.75      0.60      0.67        10
           5       0.78      0.58      0.67        12
           6       1.00      0.62      0.77         8
           7       0.50      0.20      0.29         5
           8       0.50      0.67      0.57         6
           9       0.53      0.89      0.67         9
          10       0.75      0.43      0.55        14
          11       0.35      0.88      0.50         8
          12       0.45      0.83      0.59         6
          13       1.00      0.27      0.43        11
          14       0.62      0.56      0.59         9
          15       0.50      0.10      0.17        10
          16       0.71      0.56      0.63         9
          17       0.67    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# **K-NN**

In [33]:
knn_model = KNeighborsClassifier()

knn_pipe = Pipeline([
       ('scale', StandardScaler()),
      ('reduce_dims', PCA()),
        ('model', knn_model)])

In [34]:
param_grid = dict(reduce_dims__n_components=[100,150,210],
                  model__weights = ['uniform', 'distance'],
                  model__metric = ['euclidean', 'manhattan'],
                  model__n_neighbors = [3, 5, 10, 15])

In [35]:
knn_grid = GridSearchCV(knn_pipe, n_jobs=1, param_grid=param_grid, cv=3, verbose=2,
                    return_train_score=True,
                    scoring= 'accuracy')

In [36]:
knn_grid.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] END model__metric=euclidean, model__n_neighbors=3, model__weights=uniform, reduce_dims__n_components=100; total time=   0.3s
[CV] END model__metric=euclidean, model__n_neighbors=3, model__weights=uniform, reduce_dims__n_components=100; total time=   0.3s
[CV] END model__metric=euclidean, model__n_neighbors=3, model__weights=uniform, reduce_dims__n_components=100; total time=   0.3s
[CV] END model__metric=euclidean, model__n_neighbors=3, model__weights=uniform, reduce_dims__n_components=150; total time=   0.2s
[CV] END model__metric=euclidean, model__n_neighbors=3, model__weights=uniform, reduce_dims__n_components=150; total time=   0.3s
[CV] END model__metric=euclidean, model__n_neighbors=3, model__weights=uniform, reduce_dims__n_components=150; total time=   0.3s
[CV] END model__metric=euclidean, model__n_neighbors=3, model__weights=uniform, reduce_dims__n_components=210; total time=   0.2s
[CV] END model__metric=eucli

In [37]:
print(f'Best parameters: {knn_grid.best_params_}')

Best parameters: {'model__metric': 'euclidean', 'model__n_neighbors': 3, 'model__weights': 'distance', 'reduce_dims__n_components': 150}


In [38]:
knn_results = knn_grid.best_estimator_.predict(X_test)

In [39]:
print(classification_report(y_test, knn_results))

              precision    recall  f1-score   support

           0       0.67      0.29      0.40         7
           1       0.75      0.55      0.63        11
           2       0.43      0.75      0.55         8
           3       0.83      0.50      0.62        10
           4       1.00      0.60      0.75        10
           5       0.67      0.67      0.67        12
           6       0.67      0.25      0.36         8
           7       0.50      0.40      0.44         5
           8       0.50      0.17      0.25         6
           9       0.75      0.67      0.71         9
          10       0.38      0.43      0.40        14
          11       0.21      0.88      0.33         8
          12       0.44      0.67      0.53         6
          13       0.75      0.27      0.40        11
          14       0.71      0.56      0.63         9
          15       0.50      0.30      0.37        10
          16       0.44      0.78      0.56         9
          17       0.67    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# **XGBoost**

In [47]:
xgb_model = XGBClassifier()

xgb_pipe = Pipeline([
       ('scale', StandardScaler()),
      ('reduce_dims', PCA()),
        ('model', xgb_model)])

In [48]:
param_grid = dict(reduce_dims__n_components=[90, 100],
                  model__learning_rate = [0.05, 0.1],
                  model__max_depth= [10, 20],
                  model__n_estimators = [100, 200],
                  model__subsample = [0.7, 0.8])

In [49]:
xgb_grid = GridSearchCV(xgb_pipe, n_jobs=1, param_grid=param_grid, cv=3, verbose=2,
                    return_train_score=True,
                    scoring= 'accuracy')

In [None]:
xgb_grid.fit(X_train, y_train)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV] END model__learning_rate=0.05, model__max_depth=10, model__n_estimators=100, model__subsample=0.7, reduce_dims__n_components=90; total time=  28.8s
[CV] END model__learning_rate=0.05, model__max_depth=10, model__n_estimators=100, model__subsample=0.7, reduce_dims__n_components=90; total time=  31.3s
[CV] END model__learning_rate=0.05, model__max_depth=10, model__n_estimators=100, model__subsample=0.7, reduce_dims__n_components=90; total time=  28.5s
[CV] END model__learning_rate=0.05, model__max_depth=10, model__n_estimators=100, model__subsample=0.7, reduce_dims__n_components=100; total time=  31.9s
[CV] END model__learning_rate=0.05, model__max_depth=10, model__n_estimators=100, model__subsample=0.7, reduce_dims__n_components=100; total time=  33.7s
[CV] END model__learning_rate=0.05, model__max_depth=10, model__n_estimators=100, model__subsample=0.7, reduce_dims__n_components=100; total time=  31.5s
[CV] END model__le

In [None]:
print(f'Best parameters: {xgb_grid.best_params_}')

Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 20, 'model__n_estimators': 200, 'model__subsample': 0.7, 'reduce_dims__n_components': 100}


In [None]:
xgb_results = xgb_grid.best_estimator_.predict(X_test)

In [None]:
print(classification_report(y_test, xgb_results))

              precision    recall  f1-score   support

           0       0.62      0.71      0.67         7
           1       0.80      0.73      0.76        11
           2       0.30      0.38      0.33         8
           3       0.55      0.60      0.57        10
           4       1.00      0.60      0.75        10
           5       0.50      0.58      0.54        12
           6       0.50      0.38      0.43         8
           7       0.17      0.20      0.18         5
           8       0.43      0.50      0.46         6
           9       0.50      0.89      0.64         9
          10       0.50      0.21      0.30        14
          11       0.36      0.62      0.45         8
          12       0.25      0.17      0.20         6
          13       0.40      0.18      0.25        11
          14       0.25      0.22      0.24         9
          15       0.62      0.50      0.56        10
          16       0.50      0.44      0.47         9
          17       0.75    