In [103]:
import librosa
import numpy as np
import pandas as pd
import os, re, sys, pprint, string, math, time, copy
from os.path import isfile, join
from os import listdir

In [85]:
def stft_spectrogram(waveform):
    """Return the magnitude spectrogram of ``waveform``.

    The short-time Fourier transform is computed with librosa's default
    parameters and reduced to its absolute value.
    """
    complex_stft = librosa.stft(waveform)
    return np.abs(complex_stft)

def feature_chromagram(waveform):
    """Return the chromagram of ``waveform`` averaged over its frames.

    The chroma features are computed from the magnitude spectrogram produced
    by ``stft_spectrogram`` and then averaged along the second axis, giving
    one value per chroma bin.
    """
    magnitude = stft_spectrogram(waveform)
    chroma_frames = librosa.feature.chroma_stft(S=magnitude)
    return np.mean(chroma_frames.T, axis=0)

def feature_melspectrogram(waveform):
    """Return the mel spectrogram of ``waveform`` averaged over its frames.

    Uses 128 mel bands with an upper frequency bound of 8000; the result
    holds one averaged value per mel band.
    """
    mel_frames = librosa.feature.melspectrogram(y=waveform, n_mels=128, fmax=8000)
    return np.mean(mel_frames.T, axis=0)

def feature_mfcc(waveform):
    """Return 40 MFCCs of ``waveform``, each averaged over the frames."""
    mfcc_frames = librosa.feature.mfcc(y=waveform, n_mfcc=40)
    return np.mean(mfcc_frames.T, axis=0)
def feature_rms_time_series(waveform):
    """Return the mean root-mean-square energy computed from the raw waveform.

    The waveform is passed as ``y=`` because ``librosa.feature.rms`` only
    accepts keyword arguments in librosa >= 0.10; a positional call raises
    ``TypeError`` there.
    """
    rmse = np.mean(librosa.feature.rms(y=waveform))
    return rmse
def feature_rms_spectrogram(waveform):
    """Return the mean root-mean-square energy computed from the spectrogram.

    The magnitude spectrogram comes from ``stft_spectrogram`` and is handed to
    ``librosa.feature.rms`` through its ``S`` argument.
    """
    magnitude = stft_spectrogram(waveform)
    rms_frames = librosa.feature.rms(S=magnitude)
    return np.mean(rms_frames)
def feature_chroma_cens(waveform):
    """Return the mean of the CENS chroma features of ``waveform``.

    The mean is taken over every value (all chroma bins and frames), so the
    result is a single scalar. The waveform is passed as ``y=`` because
    ``librosa.feature.chroma_cens`` is keyword-only in librosa >= 0.10.
    """
    chroma_cens = np.mean(librosa.feature.chroma_cens(y=waveform))
    return chroma_cens
def feature_cens(waveform):
    """Return the mean spectral centroid of ``waveform``.

    Despite the name, this calls ``librosa.feature.spectral_centroid`` (not a
    CENS feature). The waveform is passed as ``y=`` because that function is
    keyword-only in librosa >= 0.10.
    """
    cens = np.mean(librosa.feature.spectral_centroid(y=waveform))
    return cens
def feature_spec_bw(waveform):
    """Return the mean spectral bandwidth of ``waveform``.

    The waveform is passed as ``y=`` because
    ``librosa.feature.spectral_bandwidth`` is keyword-only in librosa >= 0.10.
    """
    spec_bw = np.mean(librosa.feature.spectral_bandwidth(y=waveform))
    return spec_bw
def feature_rolloff(waveform):
    """Return the mean spectral roll-off frequency of ``waveform``.

    The waveform is passed as ``y=`` because
    ``librosa.feature.spectral_rolloff`` is keyword-only in librosa >= 0.10.
    """
    rolloff = np.mean(librosa.feature.spectral_rolloff(y=waveform))
    return rolloff

def feature_zcr(waveform):
    """Return the mean zero-crossing rate of ``waveform``."""
    zcr_frames = librosa.feature.zero_crossing_rate(waveform)
    return np.mean(zcr_frames)
def feature_harmonic(waveform):
    """Return the mean of the harmonic component extracted from ``waveform``.

    NOTE(review): this averages the harmonic signal's samples themselves, so
    the value is expected to sit very close to zero -- confirm this is the
    intended feature.
    """
    harmonic_part = librosa.effects.harmonic(waveform)
    return np.mean(harmonic_part)

In [1]:
def get_features(path):
    """Load one audio file and return its flat feature vector.

    Parameters
    ----------
    path : str
        Path of the sound file, passed to ``librosa.load``.

    Returns
    -------
    numpy.ndarray
        1-D array built by stacking, in this order: chromagram, mel
        spectrogram, MFCCs, then the scalar features rms_time_series,
        rms_spectrogram, chroma_cens, cens, spec_bw, rolloff, zcr, harmonic.
    """
    # load an individual soundfile (the returned sample rate is not used)
    waveform, _sample_rate = librosa.load(path)
    # Report the bare file name whatever directory the file lives in
    # (a fixed-width slice of the path only works for one directory layout).
    print(os.path.basename(path) + " is loaded")

    # compute features of soundfile
    chromagram = feature_chromagram(waveform)
    melspectrogram = feature_melspectrogram(waveform)
    mfc_coefficients = feature_mfcc(waveform)
    rms_time_series = feature_rms_time_series(waveform)
    rms_spectrogram = feature_rms_spectrogram(waveform)
    chroma_cens = feature_chroma_cens(waveform)
    cens = feature_cens(waveform)
    spec_bw = feature_spec_bw(waveform)
    rolloff = feature_rolloff(waveform)
    zcr = feature_zcr(waveform)
    harmonic = feature_harmonic(waveform)

    # use np.hstack to stack our feature arrays horizontally to create a feature matrix
    feature_matrix = np.hstack((chromagram, melspectrogram, mfc_coefficients,
                                rms_time_series, rms_spectrogram, chroma_cens,
                                cens, spec_bw, rolloff, zcr, harmonic))
    return feature_matrix

In [2]:
def load_data(path):
    """Extract features and labels for every file found under ``path``.

    Parameters
    ----------
    path : str
        Root directory; it is walked recursively with ``os.walk``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` holds one feature vector per file (from ``get_features``) and
        ``y`` the matching labels. A file's label is the last character of
        the path of the directory that contains it.
    """
    X, y = [], []
    for root, dirs, files in os.walk(path):
        # Sort in place so os.walk descends in a deterministic order; without
        # this the sample order depends on the filesystem, which in turn
        # changes any seeded train/test split made from the result.
        dirs.sort()
        for filename in sorted(files):
            file_path = os.path.join(root, filename)
            label = root[-1:]
            features = get_features(file_path)
            X.append(features)
            y.append(label)
            print(filename + ' is processed')
    # Return arrays to plug into sklearn's cross-validation algorithms
    return np.array(X), np.array(y)

In [100]:
# Walk the 'chunks/' tree and build the feature matrix plus the label vector.
# The spelling `lables` is kept on purpose: later cells refer to that name.
features, lables = load_data('chunks/')

AA713_Q12.wav is loaded
AA713_Q12.wavis processed
AA713_Q24.wav is loaded
AA713_Q24.wavis processed
AM058_Q16._0.wav is loaded
AM058_Q16._0.wavis processed
AM058_Q16._1.wav is loaded
AM058_Q16._1.wavis processed
AM058_Q16._10.wav is loaded
AM058_Q16._10.wavis processed
AM058_Q16._11.wav is loaded
AM058_Q16._11.wavis processed
AM058_Q16._12.wav is loaded
AM058_Q16._12.wavis processed
AM058_Q16._13.wav is loaded
AM058_Q16._13.wavis processed
AM058_Q16._14.wav is loaded
AM058_Q16._14.wavis processed
AM058_Q16._15.wav is loaded
AM058_Q16._15.wavis processed
AM058_Q16._16.wav is loaded
AM058_Q16._16.wavis processed
AM058_Q16._2.wav is loaded
AM058_Q16._2.wavis processed
AM058_Q16._3.wav is loaded
AM058_Q16._3.wavis processed
AM058_Q16._4.wav is loaded
AM058_Q16._4.wavis processed
AM058_Q16._5.wav is loaded
AM058_Q16._5.wavis processed
AM058_Q16._6.wav is loaded
AM058_Q16._6.wavis processed
AM058_Q16._7.wav is loaded
AM058_Q16._7.wavis processed
AM058_Q16._8.wav is loaded
AM058_Q16._8.wavis 

MS090_Q16._7.wavis processed
MS090_Q16._8.wav is loaded
MS090_Q16._8.wavis processed
MS090_Q17._0.wav is loaded
MS090_Q17._0.wavis processed
MS090_Q17._1.wav is loaded
MS090_Q17._1.wavis processed
MS090_Q17._2.wav is loaded
MS090_Q17._2.wavis processed
MS090_Q17._3.wav is loaded
MS090_Q17._3.wavis processed
MS090_Q17._4.wav is loaded
MS090_Q17._4.wavis processed
MS090_Q18._0.wav is loaded
MS090_Q18._0.wavis processed
MS090_Q18._1.wav is loaded
MS090_Q18._1.wavis processed
MS090_Q18._2.wav is loaded
MS090_Q18._2.wavis processed
MS090_Q18._3.wav is loaded
MS090_Q18._3.wavis processed
MS090_Q18._4.wav is loaded
MS090_Q18._4.wavis processed
MS090_Q18._5.wav is loaded
MS090_Q18._5.wavis processed
MS090_Q18._6.wav is loaded
MS090_Q18._6.wavis processed
MS090_Q18._7.wav is loaded
MS090_Q18._7.wavis processed
MS090_Q18._8.wav is loaded
MS090_Q18._8.wavis processed
MS090_Q18._9.wav is loaded
MS090_Q18._9.wavis processed
MS090_Q21._0.wav is loaded
MS090_Q21._0.wavis processed
MS090_Q21._1.wav is

CJ032_Q18._6.wavis processed
CJ032_Q2._0.wav is loaded
CJ032_Q2._0.wavis processed
GL603_Q16._0.wav is loaded
GL603_Q16._0.wavis processed
GL603_Q16._1.wav is loaded
GL603_Q16._1.wavis processed
GL603_Q16._10.wav is loaded
GL603_Q16._10.wavis processed
GL603_Q16._2.wav is loaded
GL603_Q16._2.wavis processed
GL603_Q16._3.wav is loaded
GL603_Q16._3.wavis processed
GL603_Q16._4.wav is loaded
GL603_Q16._4.wavis processed
GL603_Q16._5.wav is loaded
GL603_Q16._5.wavis processed
GL603_Q16._6.wav is loaded
GL603_Q16._6.wavis processed
GL603_Q16._7.wav is loaded
GL603_Q16._7.wavis processed
GL603_Q16._8.wav is loaded
GL603_Q16._8.wavis processed
GL603_Q16._9.wav is loaded
GL603_Q16._9.wavis processed
GL603_Q17._0.wav is loaded
GL603_Q17._0.wavis processed
GL603_Q17._1.wav is loaded
GL603_Q17._1.wavis processed
GL603_Q17._2.wav is loaded
GL603_Q17._2.wavis processed
GL603_Q17._3.wav is loaded
GL603_Q17._3.wavis processed
GL603_Q18._0.wav is loaded
GL603_Q18._0.wavis processed
GL603_Q18._1.wav is

AA713_Q8._5.wavis processed
AA713_Q8._6.wav is loaded
AA713_Q8._6.wavis processed
AI128_Q14._0.wav is loaded
AI128_Q14._0.wavis processed
AI128_Q14._1.wav is loaded
AI128_Q14._1.wavis processed
AI128_Q14._2.wav is loaded
AI128_Q14._2.wavis processed
AI128_Q14._3.wav is loaded
AI128_Q14._3.wavis processed
AI128_Q14._4.wav is loaded
AI128_Q14._4.wavis processed
AI128_Q14._5.wav is loaded
AI128_Q14._5.wavis processed
AI128_Q14._6.wav is loaded
AI128_Q14._6.wavis processed
AI128_Q14._7.wav is loaded
AI128_Q14._7.wavis processed
AI128_Q18._0.wav is loaded
AI128_Q18._0.wavis processed
AI128_Q18._1.wav is loaded
AI128_Q18._1.wavis processed
AI128_Q18._10.wav is loaded
AI128_Q18._10.wavis processed
AI128_Q18._11.wav is loaded
AI128_Q18._11.wavis processed
AI128_Q18._12.wav is loaded
AI128_Q18._12.wavis processed
AI128_Q18._13.wav is loaded
AI128_Q18._13.wavis processed
AI128_Q18._14.wav is loaded
AI128_Q18._14.wavis processed
AI128_Q18._15.wav is loaded
AI128_Q18._15.wavis processed
AI128_Q18.

JS827_Q16._6.wavis processed
JS827_Q16._7.wav is loaded
JS827_Q16._7.wavis processed
JS827_Q16._8.wav is loaded
JS827_Q16._8.wavis processed
JS827_Q16._9.wav is loaded
JS827_Q16._9.wavis processed
JS827_Q6._0.wav is loaded
JS827_Q6._0.wavis processed
JS827_Q6._1.wav is loaded
JS827_Q6._1.wavis processed
JS827_Q7._0.wav is loaded
JS827_Q7._0.wavis processed
JS827_Q7._1.wav is loaded
JS827_Q7._1.wavis processed
JS827_Q7._10.wav is loaded
JS827_Q7._10.wavis processed
JS827_Q7._11.wav is loaded
JS827_Q7._11.wavis processed
JS827_Q7._12.wav is loaded
JS827_Q7._12.wavis processed
JS827_Q7._13.wav is loaded
JS827_Q7._13.wavis processed
JS827_Q7._14.wav is loaded
JS827_Q7._14.wavis processed
JS827_Q7._15.wav is loaded
JS827_Q7._15.wavis processed
JS827_Q7._16.wav is loaded
JS827_Q7._16.wavis processed
JS827_Q7._17.wav is loaded
JS827_Q7._17.wavis processed
JS827_Q7._18.wav is loaded
JS827_Q7._18.wavis processed
JS827_Q7._19.wav is loaded
JS827_Q7._19.wavis processed
JS827_Q7._2.wav is loaded
J

KN241_Q11._1.wavis processed
KN241_Q13._0.wav is loaded
KN241_Q13._0.wavis processed
KN241_Q13._1.wav is loaded
KN241_Q13._1.wavis processed
KN241_Q13._2.wav is loaded
KN241_Q13._2.wavis processed
KN241_Q13._3.wav is loaded
KN241_Q13._3.wavis processed
KN241_Q15.wav is loaded
KN241_Q15.wavis processed
KN241_Q17.wav is loaded
KN241_Q17.wavis processed
KN241_Q21._0.wav is loaded
KN241_Q21._0.wavis processed
KN241_Q21._1.wav is loaded
KN241_Q21._1.wavis processed
KN241_Q21._2.wav is loaded
KN241_Q21._2.wavis processed
KN241_Q22._0.wav is loaded
KN241_Q22._0.wavis processed
KN241_Q22._1.wav is loaded
KN241_Q22._1.wavis processed
KN241_Q22._2.wav is loaded
KN241_Q22._2.wavis processed
KN241_Q23.wav is loaded
KN241_Q23.wavis processed
KN241_Q5._0.wav is loaded
KN241_Q5._0.wavis processed
KN241_Q5._1.wav is loaded
KN241_Q5._1.wavis processed
KN241_Q6._0.wav is loaded
KN241_Q6._0.wavis processed
KN241_Q6._1.wav is loaded
KN241_Q6._1.wavis processed
KN241_Q7._0.wav is loaded
KN241_Q7._0.wavis p

In [182]:
# Report the size of the feature matrix: one row per audio sample,
# one column per extracted feature.
print(f'\nAudio samples represented: {len(features)}')
print(f'Numerical features extracted per sample: {features.shape[1]}')

# Wrap the matrix in a DataFrame so the notebook renders it as a table.
features_df = pd.DataFrame(features)
features_df



Audio samples represented: 782
Numerical features extracted per sample: 188


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,178,179,180,181,182,183,184,185,186,187
0,0.126453,0.118079,0.113394,0.115093,0.124821,0.122381,0.116222,0.122192,0.123309,0.126666,...,-0.290414,0.280361,0.002241,0.001344,0.190926,426.136171,385.827784,841.206935,0.023586,-8.226017e-08
1,0.154620,0.158742,0.163335,0.137386,0.112005,0.089871,0.103420,0.143814,0.142978,0.168386,...,-1.348519,-0.379349,0.008281,0.004937,0.198939,615.777755,577.368030,1162.005936,0.033746,-2.866544e-07
2,0.305739,0.319390,0.306852,0.317104,0.329285,0.269795,0.243396,0.264277,0.258224,0.315227,...,-1.796512,-1.331023,0.010460,0.006169,0.193145,1104.354646,1171.883084,2209.185427,0.052985,-5.341833e-07
3,0.573555,0.610279,0.557567,0.651021,0.659120,0.474614,0.410231,0.501500,0.613317,0.717284,...,-3.154679,-1.549339,0.014623,0.008693,0.240065,2198.293349,2353.166539,4453.451437,0.106871,-9.203883e-07
4,0.557184,0.574471,0.567658,0.661689,0.662666,0.504073,0.450017,0.533253,0.599260,0.684993,...,-4.470907,-3.034260,0.018639,0.011101,0.231484,1966.494926,2237.760256,3977.011492,0.087127,-2.299355e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
777,0.700555,0.722834,0.707970,0.650610,0.559781,0.516229,0.524056,0.600751,0.640134,0.707656,...,-4.783976,-0.876855,0.016461,0.009703,0.260588,2310.266293,2361.472218,4646.216386,0.119739,-6.802977e-07
778,0.363815,0.381756,0.384329,0.402426,0.357173,0.299630,0.263445,0.279243,0.302423,0.347530,...,-3.424054,-0.352039,0.013362,0.008016,0.216051,1239.142099,1299.258253,2417.458561,0.061106,1.795992e-07
779,0.718319,0.667220,0.687375,0.648520,0.574079,0.497618,0.516569,0.563480,0.605375,0.677905,...,-5.108752,-0.811181,0.022097,0.013043,0.248820,2222.412464,2315.320949,4396.338537,0.111737,2.804956e-07
780,0.661095,0.691035,0.714920,0.727481,0.631913,0.534066,0.484650,0.549375,0.621661,0.662648,...,-5.035075,-1.087654,0.018225,0.010766,0.257744,2315.396766,2309.419515,4559.584460,0.122551,-6.101220e-07


In [179]:
# Column names, listed in the same order get_features stacks the features:
# 12 chromagram bins, 128 mel bands, 40 MFCCs, then 8 scalar features.
labels_name = (
    [f'chromagram_{i}' for i in range(12)]
    + [f'mel_spectrogram_{i}' for i in range(128)]
    + [f'mfcc_{i}' for i in range(40)]
    + ['rms_time_series', 'rms_spectrogram', 'chroma_cens', 'cens',
       'spec_bw', 'rolloff', 'zcr', 'harmonic']
)

f = features_df
# Take an explicit copy: pd.DataFrame(<DataFrame>) does not copy by default,
# so renaming the columns or adding 'label' on the result could also change
# features_df, which other cells index by integer column position.
dataset1 = features_df.copy()
dataset1.columns = labels_name

# Features only (both file names are written with identical content).
dataset1.to_csv('data_withoutindex.csv', index=False)
dataset1.to_csv('data.csv', index=False)

# Features plus the class label of every sample.
dataset1['label'] = lables
dataset1.to_csv('data_withlabels.csv', index=False)


In [281]:
# Persist the feature table to 'features.csv' without the row index.
features_df.to_csv('features.csv', index=False)

In [254]:
# We would usually use df.describe(), but it provides a bit of a mess of information we don't need at the moment.
def _group_stats(block):
    """Return (min, max, mean, std) computed over every value in ``block``."""
    # stack all features into a single series so we don't get a mean of means or stdev of stdevs
    stacked = block.stack()
    return block.min().min(), block.max().max(), stacked.mean(), stacked.std()


def print_features(df,labels_name):
    """Print min / max / mean / standard deviation for each feature group.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table whose columns are laid out by position as
        0-11 chromagram, 12-139 mel spectrogram, 140-179 MFCC and
        180-187 the individual scalar features.
    labels_name : sequence of str
        Column names; entries 180-187 are used as headings for the scalar
        features.

    Columns are selected by position (``iloc``), so the report works whether
    the columns carry integer labels or descriptive names.
    """
    # Check chromagram feature values
    chroma_min, chroma_max, chroma_mean, chroma_stdev = _group_stats(df.iloc[:, 0:12])
    print(f'12 Chromagram features:       \
    min = {chroma_min:.3f}, \
    max = {chroma_max:.3f}, \
    mean = {chroma_mean:.3f}, \
    deviation = {chroma_stdev:.3f}')

    # Check mel spectrogram feature values
    mel_min, mel_max, mel_mean, mel_stdev = _group_stats(df.iloc[:, 12:140])
    print(f'\n128 Mel Spectrogram features: \
    min = {mel_min:.3f}, \
    max = {mel_max:.3f}, \
    mean = {mel_mean:.3f}, \
    deviation = {mel_stdev:.3f}')

    # Check MFCC feature values
    mfcc_min, mfcc_max, mfcc_mean, mfcc_stdev = _group_stats(df.iloc[:, 140:180])
    print(f'\n40 MFCC features:             \
    min = {mfcc_min:.3f},\
    max = {mfcc_max:.3f},\
    mean = {mfcc_mean:.3f},\
    deviation = {mfcc_stdev:.3f}')

    #  check rms_time_series,rms_spectrogram,chroma_cens,cens,spec_bw,rolloff,zcr,harmonic
    for x in range(180, 188):
        name = labels_name[x]
        # each of these features is a single column, so plain Series statistics apply
        feature = df.iloc[:, x]
        feature_min = feature.min()
        feature_max = feature.max()
        feature_mean = feature.mean()
        feature_stdev = feature.std()
        print(f'\n{name}:  \
            min ={feature_min:.3f},\
            max ={feature_max:.3f},\
            mean = {feature_mean:.3f},\
            deviation ={feature_stdev:.3f}')
    
    
# Summarise the value ranges of the raw (unscaled) features.
print_features(features_df,labels_name)

12 Chromagram features:           min = 0.061,     max = 0.956,     mean = 0.559,     deviation = 0.156

128 Mel Spectrogram features:     min = 0.000,     max = 31.088,     mean = 0.099,     deviation = 0.491

40 MFCC features:                 min = -786.967,    max = 119.352,    mean = -9.737,    deviation = 63.753

rms_time_series:              min =0.001,            max =0.056,            mean = 0.013,            deviation =0.008

rms_spectrogram:              min =0.001,            max =0.033,            mean = 0.007,            deviation =0.005

chroma_cens:              min =0.155,            max =0.284,            mean = 0.246,            deviation =0.023

cens:              min =238.352,            max =3360.299,            mean = 2317.469,            deviation =614.456

spec_bw:              min =238.810,            max =2826.659,            mean = 2204.609,            deviation =559.709

rolloff:              min =485.180,            max =6891.338,            mean = 4697.551

In [263]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# NOTE(review): both scalers are fitted on the full feature matrix, i.e. before
# the train/test split made in a later cell -- confirm this is acceptable.

# Standard scaling; the result is bound to a new name so the unscaled
# `features` stay available in case we need to process them alternatively.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Min-max scaling, again from the unscaled features.
# `scaler` is rebound here and refers to the MinMaxScaler afterwards.
scaler = MinMaxScaler()
features_minmax = scaler.fit_transform(features)

In [264]:
# Same per-group report as for the raw features, once per scaling method.
# The \033[1m ... \033[0m escape codes wrap each heading.
print('\033[1mStandard Scaling:\n\033[0m')
features_scaled_df = pd.DataFrame(features_scaled)
print_features(features_scaled_df, labels_name)

print('\n\n\033[1mMinMax Scaling:\n\033[0m')
features_minmax_df = pd.DataFrame(features_minmax)
print_features(features_minmax_df, labels_name)

[1mStandard Scaling:
[0m
12 Chromagram features:           min = -3.402,     max = 2.117,     mean = -0.000,     deviation = 1.000

128 Mel Spectrogram features:     min = -0.885,     max = 25.657,     mean = 0.000,     deviation = 1.000

40 MFCC features:                 min = -4.755,    max = 5.483,    mean = 0.000,    deviation = 1.000

rms_time_series:              min =-1.415,            max =5.306,            mean = -0.000,            deviation =1.001

rms_spectrogram:              min =-1.415,            max =5.335,            mean = 0.000,            deviation =1.001

chroma_cens:              min =-4.043,            max =1.672,            mean = -0.000,            deviation =1.001

cens:              min =-3.386,            max =1.698,            mean = 0.000,            deviation =1.001

spec_bw:              min =-3.514,            max =1.112,            mean = -0.000,            deviation =1.001

rolloff:              min =-3.278,            max =1.707,            mean = 

In [266]:
from sklearn.model_selection import train_test_split

# Every split below uses the same test_size and random_state, so the three
# feature variants are partitioned with identical settings and the labels
# from the first split (y_train, y_test) are reused for the other two.

############# Unscaled test/train set #############
X_train, X_test, y_train, y_test = train_test_split(
    features,
    lables,
    test_size=0.2,
    random_state=69
)

############ Standard Scaled test/train set ###########
# The labels/classes (y_train, y_test) never change, keep old values
X_train_scaled, X_test_scaled, _, _ = train_test_split(
    features_scaled,
    lables,
    test_size=0.2,
    random_state=69
)

############# MinMax Scaled test/train set ###############
# The labels/classes (y_train, y_test) never change, keep old values
# Split the min-max scaled matrix here (not the standard-scaled one), so that
# X_train_minmax / X_test_minmax really contain min-max scaled features.
X_train_minmax, X_test_minmax, _, _ = train_test_split(
    features_minmax,
    lables,
    test_size=0.2,
    random_state=69
)

In [275]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Off-the-shelf classifiers compared on the standard-scaled features.
# The randomised models get the same seed used elsewhere in this notebook so
# that re-running the cell reproduces the same ranking.
classification_models = [
    KNeighborsClassifier(),#(3),
    SVC(kernel='linear'),#, C=0.025),
    SVC(kernel='rbf'),
    DecisionTreeClassifier(random_state=69),#max_depth=5),
    RandomForestClassifier(random_state=69),#max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(random_state=69),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

scores = []
for model in classification_models:
    model.fit(X_train_scaled, y_train)
    score = model.score(X_test_scaled, y_test)
    model_name = type(model).__name__
    # both SVC variants share a class name; tag the RBF one so the rows differ
    if model_name=='SVC' and model.kernel=='rbf': model_name+=' RBF kernel'
    scores.append((model_name,(f'{100*score:.2f}%')))
# Make it pretty
scores_df = pd.DataFrame(scores,columns=['Classifier','Accuracy Score'])
# The accuracy column holds strings such as '64.97%'. Sorting those directly is
# lexicographic (e.g. '100.00%' would rank below '31.85%'), so rank on the
# numeric value and use that order to pick the rows.
ranking = (
    scores_df['Accuracy Score']
    .str.rstrip('%')
    .astype(float)
    .sort_values(ascending=False)
    .index
)
scores_df.loc[ranking]



Unnamed: 0,Classifier,Accuracy Score
0,KNeighborsClassifier,64.97%
2,SVC RBF kernel,64.33%
4,RandomForestClassifier,63.06%
1,SVC,60.51%
5,AdaBoostClassifier,56.05%
3,DecisionTreeClassifier,47.13%
7,QuadraticDiscriminantAnalysis,45.86%
6,GaussianNB,31.85%


In [268]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier, fitted on the unscaled split.
model = SVC(C=10, gamma='auto', kernel='rbf', random_state=69)
model.fit(X_train, y_train)

# Compare accuracy on the data the model was fitted on with the held-out data.
print(f"SVC Model's accuracy on training set is {100*model.score(X_train, y_train):.2f}%")
print(f"SVC Model's accuracy on test set is {100*model.score(X_test, y_test):.2f}%")

SVC Model's accuracy on training set is 100.00%
SVC Model's accuracy on test set is 46.50%


In [269]:
from sklearn.neighbors import KNeighborsClassifier

####### Default kNN  ########
model = KNeighborsClassifier(
)

model.fit(X_train, y_train)

print(f'Default kNN Model\'s accuracy on training set is {100*model.score(X_train, y_train):.2f}%')
print(f'Default kNN Model\'s accuracy on test set is {100*model.score(X_test, y_test):.2f}%\n')

##### (hastily) tuned kNN ######
model = KNeighborsClassifier(
    n_neighbors = 5,
    weights = 'distance',
    algorithm = 'brute',
    # must be an integer: scikit-learn versions that validate constructor
    # parameters reject the string '30'
    leaf_size = 30,
    n_jobs=4
)

model.fit(X_train, y_train)

print(f'kNN Model\'s accuracy on training set is {100*model.score(X_train, y_train):.2f}%')
print(f'kNN Model\'s accuracy on test set is {100*model.score(X_test, y_test):.2f}%')

Default kNN Model's accuracy on training set is 63.68%
Default kNN Model's accuracy on test set is 42.04%

kNN Model's accuracy on training set is 100.00%
kNN Model's accuracy on test set is 47.77%


In [271]:
from sklearn.ensemble import RandomForestClassifier

####### Default Random Forest ########
model = RandomForestClassifier(
    random_state=69
)

model.fit(X_train, y_train)

print(f'Default Random Forest Model\'s accuracy on training set is {100*model.score(X_train, y_train):.2f}%')
print(f'Default Random Forest Model\'s accuracy on test set is {100*model.score(X_test, y_test):.2f}%\n')


########## Tuned Random Forest #######
model = RandomForestClassifier(
    n_estimators = 500,
    criterion ='entropy',
    warm_start = True,
    max_features = 'sqrt',
    # a real boolean, not the string 'True': scikit-learn versions that
    # validate constructor parameters reject a string here
    oob_score = True, # more on this below
    random_state=69
)

model.fit(X_train, y_train)

print(f'Random Forest Model\'s accuracy on training set is {100*model.score(X_train, y_train):.2f}%')
print(f'Random Forest Model\'s accuracy on test set is {100*model.score(X_test, y_test):.2f}%')

Default Random Forest Model's accuracy on training set is 100.00%
Default Random Forest Model's accuracy on test set is 63.69%

Random Forest Model's accuracy on training set is 100.00%
Random Forest Model's accuracy on test set is 63.06%


In [272]:
from sklearn.neural_network import MLPClassifier

#################### unscaled features #####################
# Rebuild the unscaled split with the same settings used earlier.
X_train, X_test, y_train, y_test = train_test_split(
    features, lables, test_size=0.2, random_state=69
)

# Default 'off-the-shelf' MLP from sklearn
model = MLPClassifier(random_state=69)
model.fit(X_train, y_train)

print(f'Possible emotions predicted by model:{model.classes_}')
print(f"Unscaled MLP Model's accuracy on training set is {100*model.score(X_train, y_train):.2f}%")
print(f"Unscaled MLP Model's accuracy on test set is {100*model.score(X_test, y_test):.2f}%")

Possible emotions predicted by model:['1' '2' '7']
Unscaled MLP Model's accuracy on training set is 53.92%
Unscaled MLP Model's accuracy on test set is 54.14%


In [273]:
#################### minmax scaled #######################
# Rebuild the training set from the minmax scaled features
# The labels/classes (y_train, y_test) never change, keep old values
X_train_minmax, X_test_minmax, _, _ = train_test_split(
    features_minmax,
    lables,
    test_size=0.2,
    random_state=69
)

# Fit the model again on the minmax scaled features
model.fit(X_train_minmax, y_train)

print(f'MinMax scaled MLP Model\'s accuracy on training set is {100*model.score(X_train_minmax, y_train):.2f}%')
# label spelling corrected ('scaled') so both MinMax lines read consistently
print(f'MinMax scaled MLP Model\'s accuracy on test set is {100*model.score(X_test_minmax, y_test):.2f}%\n')

#################### standard scaled #######################
# Rebuild the training set from the standard scaled features
# The labels/classes (y_train, y_test) never change, keep old values
X_train_scaled, X_test_scaled, _, _ = train_test_split(
    features_scaled,
    lables,
    test_size=0.2,
    random_state=69
)
# Fit the model again on the scaled features
model.fit(X_train_scaled, y_train)

print(f'Standard scaled MLP Model\'s accuracy on training set is {100*model.score(X_train_scaled, y_train):.2f}%')
print(f'Standard scaled MLP Model\'s accuracy on test set is {100*model.score(X_test_scaled, y_test):.2f}%')



MinMax scaled MLP Model's accuracy on training set is 84.48%
MinMax sacled MLP Model's accuracy on test set is 67.52%

Standard scaled MLP Model's accuracy on training set is 99.84%
Standard scaled MLP Model's accuracy on test set is 70.06%


