## Task 2: Multi-class classification of ECG signals

In [2]:
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow import keras
from sklearn.model_selection import train_test_split
import neurokit2 as nk
import warnings
import biosppy.signals.ecg as ecg
import biosppy
warnings.filterwarnings('ignore')
from datetime import date
from sklearn.preprocessing import StandardScaler
import numpy.ma as ma
from numpy import ptp, zeros, mean

import statsmodels.api as sm
from statsmodels.graphics import tsaplots
import matplotlib.pyplot as plt
from scipy.stats import norm, kurtosis
from scipy.signal import argrelextrema

import neurokit2 as nk
import statsmodels.api as sm
from statsmodels.graphics import tsaplots
import matplotlib.pyplot as plt
from scipy.stats import norm, kurtosis
import biosignalsnotebooks as bsnb
from math import log10
import statsmodels.api as sm
from statsmodels.graphics import tsaplots
import matplotlib.pyplot as plt
from scipy.stats import norm, kurtosis, pearsonr
from scipy.signal import argrelextrema, find_peaks,correlate
import heartpy as hp

import os.path
from tqdm.notebook import tqdm
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from IPython.display import Audio
sound_file = 'notification.mp3'
import pygame

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

pygame 2.0.2 (SDL 2.0.16, Python 3.9.7)
Hello from the pygame community. https://www.pygame.org/contribute.html


### Functions preprocessing and features extraction

In [3]:

def loadData(f):
    A = pd.read_csv(f).drop(columns = ['id'])
    return A

def processSignal(data):
    #input: orignal matrix data
    
    
    data_filtered = pd.DataFrame()

    for (i, sample) in enumerate(tqdm(data.iterrows(), total=len(data))):
        
        ECG = sample[1].dropna().to_numpy(dtype='float32')
        cleansed = nk.ecg_clean(ECG, sampling_rate=300, method='neurokit')       
        # Plot the processed dataframe, normalizing all variables for viewing purpose
        data_filtered = data_filtered.append(pd.Series(cleansed, name = i))
        
    return data_filtered


def featuresExtraction(data):
    
    # input: complete matrix of signals with NaN values
    # output: matrix of computed features for each ECG signal
    column_names = []
    features_total = pd.DataFrame([], columns=column_names, dtype=np.float32)
    for (i, sample) in enumerate(tqdm(data[:].iterrows(), total = len(data))):
    
            
        ECG = sample[1].dropna().to_numpy(dtype='float32')
        sample_rate = 300


        try:
            wd, m = hp.process(ECG, 300)
            

            features = pd.DataFrame()
            features = features.append(pd.Series([m[key] for key in m.keys()]), ignore_index=True)
            

        except Exception as e:

            features = pd.DataFrame()
            features = features.append(pd.Series(np.zeros((13))),ignore_index=True)
        

        '''
        hrv_time
        '''
        
        try:
            peaks1, info = nk.ecg_peaks(ECG, sampling_rate=sample_rate) # 0 or 1 for each position (if 1 then it is RR)
            peaks2 = ecg.engzee_segmenter(ECG, 300)['rpeaks']

            if (peaks1.ECG_R_Peaks.sum()<5): # main method fails
                # 2nd method: detection of the TA from AML
                if (peaks2.shape[0]<5): #2nd method fails
                    # if 2nd method fails: use third one
                    peaks3 = ecg.hamilton_segmenter(signal=ECG, sampling_rate=sample_rate)                
                    peaks3 = np.array(peaks3)[0]
                    peaks = peaks3
                else:
                    peaks = peaks2
            else:
                peaks = peaks1


            hrv_time = nk.hrv_time(peaks, sampling_rate=sample_rate, show=False)
            hrv_freq = nk.hrv_frequency(peaks, sampling_rate=sample_rate, show=False, normalize=True)
            hrv_non = nk.hrv_nonlinear(peaks, sampling_rate=100, show=False)


            #hrv_non = nk.hrv_nonlinear(peaks, sampling_rate=300, show=False)

        except: #AttributeError:
            peaks = ecg.engzee_segmenter(ECG, 300)['rpeaks']
            hrv_time = nk.hrv_time(peaks, sampling_rate=sample_rate, show=False)
            hrv_freq = nk.hrv_frequency(peaks, sampling_rate=sample_rate, show=False, normalize=True)
            hrv_non = nk.hrv_nonlinear(peaks, sampling_rate=100, show=False)

        #hrv_time = hrv_time.drop(['HRV_SDANN1', 'HRV_SDNNI1', 'HRV_SDANN2', 'HRV_SDNNI2', 'HRV_SDANN5', 'HRV_SDNNI5'], axis=1)

        features = pd.concat([features,hrv_time], axis = 1)      

        features = pd.concat([features,hrv_freq], axis = 1)      
        features = pd.concat([features,hrv_non], axis = 1)
        
        
        indicesZeroCrossing = nk.signal_zerocrossings(ECG, direction='both')
        zeroCross = len(indicesZeroCrossing)/len(ECG)
        features['zeroCrossing'] = zeroCross
        kurtosisValue = kurtosis(ECG)
        features['kurtosis'] = kurtosisValue
        


        
        psdQRS = nk.signal_psd(ECG, sampling_rate=300, show=False, min_frequency=5, max_frequency=25)
        powerQRS = psdQRS.sum().loc['Power']
        features['powerQRS'] = powerQRS


        psd30 = nk.signal_psd(ECG, sampling_rate=300, show=False, min_frequency=30, max_frequency=np.inf)
        power30 = psd30.sum().loc['Power']
        psdTOTAL = nk.signal_psd(ECG, sampling_rate=300, show=False, min_frequency=0, max_frequency=np.inf)
        powerTOTAL = psdTOTAL.sum().loc['Power']
        feature2 = power30/powerTOTAL
        features['power2'] = feature2
        
        
        # Average beat: take features from it!
        if (peaks2.shape[0]<5):
            peaks3 = ecg.hamilton_segmenter(signal=ECG, sampling_rate=sample_rate)                
            peaks3 = np.array(peaks3)[0]
            beats = ecg.extract_heartbeats(ECG, peaks3, 300)['templates']
            #snr = computeSNR(ECG, peaks3)
            #features['snr'] = snr
        else:
            beats = ecg.extract_heartbeats(ECG, peaks2, 300)['templates']
            #snr = computeSNR(ECG, peaks2)
            #features['snr'] = snr

        mu = np.mean(beats, axis=0) 
        var = np.std(beats, axis=0)
        md = np.median(beats, axis=0)
        maxCorrMuMd = max(correlate(mu, md, mode='full', method='auto')) #feature!!!
        features['maxCorrMuMd'] = maxCorrMuMd
        varAverageBeat = abs(max(var-mu)) #feature!!!
        features['varAverageBeat'] = varAverageBeat
        pearson = pearsonr(mu,md)[0] #feature!!!
        features['pearson'] = pearson

        
        # ACF
        acf = sm.tsa.acf(np.array(ECG), nlags=ECG.shape[0]-1,fft=False)
        acfPeaks = find_peaks(acf, height = 0.1, distance=1)[0]
        if (acfPeaks.size==0):
            features['stdDiffLocs'] = np.nan
        else:
            acfLocs = np.append(np.array([1]),acfPeaks)
            diffLocs = np.diff(acfLocs)
            stdDiffLocs = np.std(diffLocs) #feature!!!
            features['stdDiffLocs'] = stdDiffLocs
       
        try:
            quality = nk.ecg_quality(ECG, rpeaks=None, sampling_rate=300, method='zhao2018', approach=None)
            quality_out = ["Unacceptable", "Barely Acceptable", "Excellent"]
            features['quality'] = quality_out.index(quality)
        except Exception as e:
            features['quality'] = 0

        
        features_total = pd.concat([features_total,features], axis = 0)
        try:
            ECG = nk.ecg_clean(ECG, sampling_rate=sampling_rate, method="neurokit")
            _, rpeaks = nk.ecg_peaks(ECG, sampling_rate=300)    
            signals, waves = nk.ecg_delineate(ECG, rpeaks, sampling_rate=300)
            print(waves)

            #Feature: Relative amount of peaks
            features['Relative_Amount_r'] = pd.Series((len(rpeaks['ECG_R_Peaks']) / len(ECG)))
            features['Relative_Amount_p'] = pd.Series(np.argwhere(np.isnan(waves["ECG_P_Peaks"])).shape[0] / len(ECG))
            features['Relative_Amount_t'] = pd.Series(np.argwhere(np.isnan(waves["ECG_T_Peaks"])).shape[0] / len(ECG))
            features['Relative_Amount_q'] = pd.Series(np.argwhere(np.isnan(waves["ECG_Q_Peaks"])).shape[0] / len(ECG))
            features['Relative_Amount_s'] = pd.Series(np.argwhere(np.isnan(waves["ECG_S_Peaks"])).shape[0] / len(ECG))

            features['Amplitude_r'] = pd.Series(np.mean(ECG[rpeaks['ECG_R_Peaks']]))
            features['Amplitude_p'] = pd.Series(np.mean(ECG[(np.array(waves['ECG_P_Peaks'])[~np.isnan(np.array(waves['ECG_P_Peaks']))]).astype(int)]))
            features['Amplitude_t'] = pd.Series(np.mean(ECG[(np.array(waves['ECG_T_Peaks'])[~np.isnan(np.array(waves['ECG_T_Peaks']))]).astype(int)]))
            features['Amplitude_q'] = pd.Series(np.mean(ECG[(np.array(waves['ECG_Q_Peaks'])[~np.isnan(np.array(waves['ECG_Q_Peaks']))]).astype(int)]))
            features['Amplitude_s'] = pd.Series(np.mean(ECG[(np.array(waves['ECG_S_Peaks'])[~np.isnan(np.array(waves['ECG_S_Peaks']))]).astype(int)]))
            features['Amplitude_r_std'] = pd.Series(np.std(ECG[rpeaks['ECG_R_Peaks']]))
            features['Amplitude_p_std'] = pd.Series(np.std(ECG[(np.array(waves['ECG_P_Peaks'])[~np.isnan(np.array(waves['ECG_P_Peaks']))]).astype(int)]))
            features['Amplitude_t_std'] = pd.Series(np.std(ECG[(np.array(waves['ECG_T_Peaks'])[~np.isnan(np.array(waves['ECG_T_Peaks']))]).astype(int)]))
            features['Amplitude_q_std'] = pd.Series(np.std(ECG[(np.array(waves['ECG_Q_Peaks'])[~np.isnan(np.array(waves['ECG_Q_Peaks']))]).astype(int)]))
            features['Amplitude_s_std'] = pd.Series(np.std(ECG[(np.array(waves['ECG_S_Peaks'])[~np.isnan(np.array(waves['ECG_S_Peaks']))]).astype(int)]))
            
            features_total = features_total.append(features)

        except:
            features['Relative_Amount_r'] = pd.Series(np.nan)
            features['Relative_Amount_p'] = pd.Series(np.nan)
            features['Relative_Amount_t'] = pd.Series(np.nan)
            features['Relative_Amount_q'] = pd.Series(np.nan)
            features['Relative_Amount_s'] = pd.Series(np.nan)

            features['Amplitude_r'] = pd.Series(np.nan)
            features['Amplitude_p'] = pd.Series(np.nan)
            features['Amplitude_t'] = pd.Series(np.nan)
            features['Amplitude_q'] = pd.Series(np.nan)
            features['Amplitude_s'] = pd.Series(np.nan)

            features['Amplitude_r_std'] = pd.Series(np.nan)
            features['Amplitude_p_std'] = pd.Series(np.nan)
            features['Amplitude_t_std'] = pd.Series(np.nan)
            features['Amplitude_q_std'] = pd.Series(np.nan)
            features['Amplitude_s_std'] = pd.Series(np.nan)
            
            features_total = features_total.append(features)




    return features_total

def featuresExtraction_add(data):
    
    # input: complete matrix of signals with NaN values
    # output: matrix of computed features for each ECG signal
    column_names = []
    features = pd.DataFrame()

    features_total = pd.DataFrame([], columns=column_names, dtype=np.float32)
    for (i, sample) in enumerate(tqdm(data[:].iterrows(), total = len(data))):
    
            
        ECG = sample[1].dropna().to_numpy(dtype='float32')
        sampling_rate = 300

        try:

            time_r_peaks, amplitude_r_peaks = bsnb.detect_r_peaks(ECG, 300, time_units=True, plot_result= False)

            # Finding the maximum and minimum values of the ECG signal
            max_ecg = max(ECG)
            min_ecg = min(ECG)

            # Calculating the amplitude of the signal
            vpp_signal_ecg = max_ecg - min_ecg

            # Notice that this procedure is condensed in a single function in the numpy Python package:
            vpp_signal_ecg = ptp(ECG)
            vpp_noise_ecg = []

            # For this task, we will follow the same procedure as shown before, but store the values in a list, so that we can then calculate the mean value.
            for t in time_r_peaks:
                start = int((t + 0.5) * 300) # 0.5 - time between a peak and a flat 
                end = int((t + 0.65)* 300) # 0.65 time between a peak and the end of the flat
                interval = ECG[start:end]
                vpp = ptp(interval)
                vpp_noise_ecg.append(vpp)
                
            vpp_noise_ecg = mean(vpp_noise_ecg)

            snr_ecg = vpp_signal_ecg/vpp_noise_ecg

            # The multiplication by 20 is because the signals are in the unit of (micro)Siemes
            snr_ecg_db = 20 * log10(snr_ecg)

            features['SNR'] = pd.Series(snr_ecg)

            

        except:
            features['SNR'] = pd.Series(np.NaN)
        
        
        features_total = features_total.append(features)


        try:
            ECG = nk.ecg_clean(ECG, sampling_rate=sampling_rate, method="neurokit")
            _, rpeaks = nk.ecg_peaks(ECG, sampling_rate=300)    
            signals, waves = nk.ecg_delineate(ECG, rpeaks, sampling_rate=300)

            #Feature: Relative amount of peaks
            features['r_diff_mean'] = pd.Series(np.mean(np.diff(rpeaks)))
            features['r_diff_std'] = pd.Series(np.std(np.diff(rpeaks)))
            features['r_ampl_std'] = pd.Series(np.std(ECG[rpeaks['ECG_R_Peaks']]))
            

            features['p_diff_mean'] = pd.Series(np.mean(np.diff(waves["ECG_P_Peaks"][np.argwhere(np.isnan(waves["ECG_P_Peaks"]))])))
            features['p_diff_mean'] = pd.Series(np.std(np.diff(waves["ECG_P_Peaks"][np.argwhere(np.isnan(waves["ECG_P_Peaks"]))])))
            features['p_ampl_std'] = pd.Series(np.std(ECG[(np.array(waves['ECG_P_Peaks'])[~np.isnan(np.array(waves['ECG_P_Peaks']))]).astype(int)]))

            features['t_diff_mean'] = pd.Series(np.mean(np.diff(waves["ECG_T_Peaks"][np.argwhere(np.isnan(waves["ECG_T_Peaks"]))])))
            features['t_diff_mean'] = pd.Series(np.std(np.diff(waves["ECG_T_Peaks"][np.argwhere(np.isnan(waves["ECG_T_Peaks"]))])))
            features['t_ampl_std'] = pd.Series(np.std(ECG[(np.array(waves['ECG_T_Peaks'])[~np.isnan(np.array(waves['ECG_T_Peaks']))]).astype(int)]))

            features['q_diff_mean'] = pd.Series(np.mean(np.diff(waves["ECG_Q_Peaks"][np.argwhere(np.isnan(waves["ECG_Q_Peaks"]))])))
            features['q_diff_mean'] = pd.Series(np.std(np.diff(waves["ECG_Q_Peaks"][np.argwhere(np.isnan(waves["ECG_Q_Peaks"]))])))
            features['q_ampl_std'] = pd.Series(np.std(ECG[(np.array(waves['ECG_Q_Peaks'])[~np.isnan(np.array(waves['ECG_Q_Peaks']))]).astype(int)]))


            features['s_diff_mean'] = pd.Series(np.mean(np.diff(waves["ECG_S_Peaks"][np.argwhere(np.isnan(waves["ECG_S_Peaks"]))])))
            features['s_diff_mean'] = pd.Series(np.std(np.diff(waves["ECG_S_Peaks"][np.argwhere(np.isnan(waves["ECG_S_Peaks"]))])))
            features['s_ampl_std'] = pd.Series(np.std(ECG[(np.array(waves['ECG_S_Peaks'])[~np.isnan(np.array(waves['ECG_S_Peaks']))]).astype(int)]))
            
            features_total = features_total.append(features)

        except:
            features['r_diff_mean'] = pd.Series(np.nan)
            features['r_diff_std'] = pd.Series(np.nan)
            features['r_ampl_std'] = pd.Series(np.nan)

            features['p_diff_mean'] = pd.Series(np.nan)
            features['p_diff_mean'] = pd.Series(np.nan)
            features['p_ampl_std'] = pd.Series(np.nan)

            features['t_diff_mean'] = pd.Series(np.nan)
            features['t_diff_mean'] = pd.Series(np.nan)
            features['t_ampl_std'] = pd.Series(np.nan)

            features['q_diff_mean'] = pd.Series(np.nan)
            features['q_diff_mean'] = pd.Series(np.nan)
            features['q_ampl_std'] = pd.Series(np.nan)

            features['s_diff_mean'] = pd.Series(np.nan)
            features['s_diff_mean'] = pd.Series(np.nan)
            features['s_ampl_std'] = pd.Series(np.nan)
            
            features_total = features_total.append(features)
            print(features)
            break
        




    return features_total    





def fillingNaN(features, option):
    # fillingNaN:
    # 1.- iterative imputer
    # 2.- nan replaced by median + 3xstd

    if (option==1):
        imp = IterativeImputer(max_iter=10, random_state=1, n_nearest_features = 28, verbose = 0)
        featuresFinal = imp.fit_transform(np.array(features))
        
    else:
        #np.where(np.isnan(features), ma.array(features, mask=np.isnan(features)).np.nanmean(axis=0), features)    
        col_med = np.nanmedian(features, axis=0)
        col_std = np.nanstd(features, axis=0)
        replace = col_med + col_std
        featuresFinal= np.where(np.isnan(features),replace,features)
        #inds = np.where(np.isnan(features))
        #features[inds] = np.take(col_med+3*col_std, inds[1])

    return featuresFinal

def scaler(X_train):
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    return X_train


def feature_selection(x,y,k):
    selector = SelectKBest(f_classif, k=k)
    selector.fit(x, y)
    cols = selector.get_support(indices=True)
    return pd.DataFrame(x).iloc[:,cols], cols
 

### Import data and cleaning

In [4]:
#import data
'''
if not('x_train' in locals()):
    x_train = loadData('X_train.csv')
if not('x_test' in locals()):
    x_test = loadData('X_test.csv')
'''    
if not('y_train' in locals()):
    y_train = loadData('y_train.csv')

In [5]:
'''

#clean data
if not('x_train_p' in locals()):
    if os.path.isfile('X_train_clean.csv'):
        x_train_p = pd.read_csv('X_train_clean.csv')
    else: 
        x_train_p = processSignal(x_train)
        x_train_p.to_csv('X_train_clean.csv')

if not('x_test_p' in locals()):
    if os.path.isfile('X_test_clean.csv'):
        x_test_p = pd.read_csv('X_test_clean.csv')
    else: 
        x_test_p = processSignal(x_test)
        x_test_p.to_csv('X_test_clean.csv')
'''

"\n\n#clean data\nif not('x_train_p' in locals()):\n    if os.path.isfile('X_train_clean.csv'):\n        x_train_p = pd.read_csv('X_train_clean.csv')\n    else: \n        x_train_p = processSignal(x_train)\n        x_train_p.to_csv('X_train_clean.csv')\n\nif not('x_test_p' in locals()):\n    if os.path.isfile('X_test_clean.csv'):\n        x_test_p = pd.read_csv('X_test_clean.csv')\n    else: \n        x_test_p = processSignal(x_test)\n        x_test_p.to_csv('X_test_clean.csv')\n"

### Extraction of features

In [7]:
#extract features
#if not('x_train_f' in locals()):
if os.path.isfile('X_train_features_add.csv'):
    x_train_f_add = pd.read_csv('X_train_features_add.csv')
else: 
    x_train_f_add = featuresExtraction_add(x_train_clean)
    x_train_f_add.to_csv('X_train_features_add.csv')

#if not('x_test_f' in locals()):
if os.path.isfile('X_test_features_add.csv'):
    x_test_f_add = pd.read_csv('X_test_features_add.csv')
else: 
    x_test_f_add = featuresExtraction_add(x_test_clean)
    x_test_f_add.to_csv('X_test_features_add.csv')


  0%|          | 0/5117 [00:00<?, ?it/s]

         SNR  r_diff_mean  r_diff_std  r_ampl_std  p_diff_mean  p_ampl_std  \
0  15.149641          NaN         NaN         NaN          NaN         NaN   

   t_diff_mean  t_ampl_std  q_diff_mean  q_ampl_std  s_diff_mean  s_ampl_std  
0          NaN         NaN          NaN         NaN          NaN         NaN  
         SNR  r_diff_mean  r_diff_std  r_ampl_std  p_diff_mean  p_ampl_std  \
0  10.859864          NaN         NaN         NaN          NaN         NaN   

   t_diff_mean  t_ampl_std  q_diff_mean  q_ampl_std  s_diff_mean  s_ampl_std  
0          NaN         NaN          NaN         NaN          NaN         NaN  
   SNR  r_diff_mean  r_diff_std  r_ampl_std  p_diff_mean  p_ampl_std  \
0  NaN          NaN         NaN         NaN          NaN         NaN   

   t_diff_mean  t_ampl_std  q_diff_mean  q_ampl_std  s_diff_mean  s_ampl_std  
0          NaN         NaN          NaN         NaN          NaN         NaN  
        SNR  r_diff_mean  r_diff_std  r_ampl_std  p_diff_mean  p_am

  0%|          | 0/3411 [00:00<?, ?it/s]

         SNR  r_diff_mean  r_diff_std  r_ampl_std  p_diff_mean  p_ampl_std  \
0  11.895144          NaN         NaN         NaN          NaN         NaN   

   t_diff_mean  t_ampl_std  q_diff_mean  q_ampl_std  s_diff_mean  s_ampl_std  
0          NaN         NaN          NaN         NaN          NaN         NaN  
         SNR  r_diff_mean  r_diff_std  r_ampl_std  p_diff_mean  p_ampl_std  \
0  43.075626          NaN         NaN         NaN          NaN         NaN   

   t_diff_mean  t_ampl_std  q_diff_mean  q_ampl_std  s_diff_mean  s_ampl_std  
0          NaN         NaN          NaN         NaN          NaN         NaN  
        SNR  r_diff_mean  r_diff_std  r_ampl_std  p_diff_mean  p_ampl_std  \
0  6.205052          NaN         NaN         NaN          NaN         NaN   

   t_diff_mean  t_ampl_std  q_diff_mean  q_ampl_std  s_diff_mean  s_ampl_std  
0          NaN         NaN          NaN         NaN          NaN         NaN  
         SNR  r_diff_mean  r_diff_std  r_ampl_std  p_diff

### Dealing with NaN columns

In [None]:
# First: drop columns with NAN
x_train_final = x_train_f.dropna(axis = 1,how = 'all')
x_test_final = x_test_f.dropna(axis = 1,how = 'all')
#x_train_final = pd.read_csv('X_train_features_unclean.csv').dropna(axis = 1,how = 'all')
#x_test_final = pd.read_csv('X_test_features_unclean.csv').dropna(axis = 1,how = 'all')
x_train_2 = pd.read_csv('features_matlab/feats_X_train_Prep.csv', header = None).dropna(axis = 1,how = 'all')
x_test_2 = pd.read_csv('features_matlab/feats_X_test_Prep.csv', header = None).dropna(axis = 1,how = 'all')

x_train_add1 = pd.read_csv('features_add/X_train_features_add1.csv').dropna(axis = 1,how = 'all')
x_test_add1 = pd.read_csv('features_add/X_test_features_add1.csv').dropna(axis = 1,how = 'all')

x_train_add2 = pd.read_csv('features_add/X_train_features_add.csv').dropna(axis = 1,how = 'all')
x_test_add2 = pd.read_csv('features_add/X_test_features_add.csv').dropna(axis = 1,how = 'all')

x_train_add3 = pd.read_csv('features_add/X_test_features_add3.csv').dropna(axis = 1,how = 'all')
x_test_add3 = pd.read_csv('features_add/X_test_features_add3.csv').dropna(axis = 1,how = 'all')


x_train_final = pd.concat([pd.DataFrame(x_train_final).reset_index(), \
                           pd.DataFrame(x_train_2), \
                           #pd.Series(x_train_f_add['SNR']).reset_index(), \
                           pd.DataFrame(x_train_add1),
                           pd.DataFrame(x_train_add2),
                           pd.DataFrame(x_train_add3)], axis = 1)

x_test_final = pd.concat([pd.DataFrame(x_test_final).reset_index(), \
                          pd.DataFrame(x_test_2), \
                          #pd.Series(x_test_f_add['SNR']).reset_index(), \
                          pd.DataFrame(x_test_add1),
                          pd.DataFrame(x_test_add2),
                          pd.DataFrame(x_test_add3)], axis = 1)



#print([type for type in x_test_final.dtypes])
#print(x_train_final[x_train_final.columns[5]])

# Second: filling missing values (I chose 2 methods: iterative imputer or with median + 3·std)
x_train_final = x_train_final.drop(columns = ['3'])
x_test_final = x_test_final.drop(columns = ['3'])
#x_train_final = x_train_final.drop(columns = ['quality'])


x_train_final = x_train_final.astype(float)
x_test_final = x_test_final.astype(float)

x_train_final = x_train_final.replace([np.inf, -np.inf], np.nan)
x_test_final = x_test_final.replace([np.inf, -np.inf], np.nan)


x_train_final = fillingNaN(x_train_final, option=1)
x_test_final = fillingNaN(x_test_final, option=1)
print(pd.DataFrame(x_train_final))
#feature selection
x_train_final, cols = feature_selection(x_train_final, y_train,80)
x_test_final = pd.DataFrame(x_test_final).iloc[:,cols]

# Third: scaler
x_train_final = scaler(x_train_final)
x_test_final = scaler(x_test_final)

#rest against 3
y_train_1_3 = np.where(y_train['y'] < 3, 0,1)

#01 against 2
y_train_1_2 = y_train[y_train['y'] != 3]

x_train_1_2 = pd.DataFrame(x_train_final).loc[y_train_1_2.index]


#0 against 1
#y_train_0_1 = pd.DataFrame(y_train_1_2[y_train_1_2 < 1])
#x_train_0_1 = pd.DataFrame(x_train_1_2).loc[y_train_0_1.index]
print(x_train_final.shape)

         0    1           2           3           4           5         6    \
0        0.0  0.0  151.359103  396.408269  138.807281  263.414987  1.000000   
1        1.0  0.0  159.364359  376.495726  176.564151  298.659040  0.964286   
2        2.0  0.0   66.615620  900.689655   34.942529   26.208868  0.500000   
3        3.0  0.0  152.678571  392.982456   69.035924   93.747912  0.588235   
4        4.0  0.0  166.281755  360.833333  145.960791  237.174389  1.000000   
...      ...  ...         ...         ...         ...         ...       ...   
5112  5112.0  0.0  205.011390  292.666667  220.078101  195.941460  0.875000   
5113  5113.0  0.0   69.397590  864.583333   13.863171   10.142356  0.032258   
5114  5114.0  0.0   76.187875  787.526882  106.787458   68.422286  0.379310   
5115  5115.0  0.0   70.726916  848.333333   27.600825   34.888455  0.451613   
5116  5116.0  0.0  181.910056  329.833333  196.202772  241.779667  0.833333   

           7           8           9    ...        

### Classifiers 

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from imblearn.ensemble import RUSBoostClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
#classification
#clf_1_3 = GradientBoostingClassifier(random_state=0, n_estimators=300, max_depth = 4)
#clf_1_2 = GradientBoostingClassifier(random_state=0, n_estimators=300, max_depth = 4)

clf = GradientBoostingClassifier(random_state=0, n_estimators=100, max_depth = 4)

#clf_1_3 = GradientBoostingClassifier(random_state=0, n_estimators=100, max_depth = 3, learning_rate =0.01)
#clf_0_1 = GradientBoostingClassifier(random_state=0, n_estimators=100, max_depth = 3, learning_rate =0.01)

#clf = GaussianNB()
#clf = MLPClassifier(hidden_layer_sizes=(128,256,500,560,256,128), early_stopping=True, max_iter=1000)
#clf = RUSBoostClassifier(n_estimators=500)
#scores1 = cross_val_score(clf_1_3, x_train_final, y_train_1_3, cv=5, scoring='f1_micro')
#scores2 = cross_val_score(clf_1_2, x_train_1_2, y_train_1_2, cv=5, scoring='f1_micro')
scores = cross_val_score(clf, x_train_final, y_train, cv=5, scoring='f1_micro')

#scores1 = cross_val_score(clf_1_3, x_train_final, y_train_1_3, cv=5, scoring='f1_micro')
#scores2 = cross_val_score(clf_1_2, x_train_1_2, y_train_1_2, cv=5, scoring='f1_micro')
#scores3 = cross_val_score(clf_0_1, x_train_0_1, y_train_0_1, cv=5, scoring='f1_micro')


print(scores)
#print(scores1,scores2, scores3)
print('mean score', np.mean(scores))

pygame.mixer.init()
pygame.mixer.music.load(sound_file)
pygame.mixer.music.play()


[0.78613281 0.78417969 0.79276637 0.80938416 0.79765396]
mean score 0.7940233993157382


In [None]:
clf_1_3.fit(x_train_final, y_train_1_3)
clf_1_2.fit(x_train_1_2, y_train_1_2)
#clf_0_1.fit(x_train_0_1, y_train_0_1)

#clf.fit(x_train_final, y_train)



GradientBoostingClassifier(max_depth=4, n_estimators=300, random_state=0)

### Prediction in test

In [None]:
pred_1_3 = clf_1_3.predict(x_test_final)
x_test_1_2 = pd.DataFrame(x_test_final[np.where(pred_1_3 == 0)])
print(x_test_1_2.shape)

pred_1_2 = clf_1_2.predict(x_test_1_2)



mask_pred = pred_1_3 == 1


#pred_1_2 = clf_1_2.predict(x_test_final)
#pred_0_1 = clf_0_1.predict(x_test_final)

#mask_pred = pred_1_3 == 1
#pred_0_1_2 = clf_0_1_2.predict(x_test_final[mask_pred])
#pred = clf.predict(x_test_final)
pred_test = pd.DataFrame()
pred_test['id'] = np.arange(len(pred_1_3))
pred_test['y'] = pred_1_3+2
print(len(pred_test['y'][y_train['y'] != 3]), len(pred_1_2))
pred_test['y'][pred_test['y'] == 2] = pred_1_2
#pred_test['y'] = pred
pred_test.to_csv('results.csv' ,index=False)
pred_test

(3341, 50)
3304 3341


Unnamed: 0,id,y
0,0,0
1,1,2
2,2,2
3,3,0
4,4,0
...,...,...
3406,3406,0
3407,3407,0
3408,3408,0
3409,3409,0


### Checking the amount of each class

In [None]:
print(set(pred_test['y']))
pred_test.y.value_counts()

{0, 1, 2, 3}


0    2216
2     836
1     289
3      70
Name: y, dtype: int64