In [2]:
import shutil
import warnings
from glob import glob
import os
import numpy as np
import librosa
from librosa import feature
import re
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix, plot_confusion_matrix, recall_score, f1_score
import wave
import contextlib
import csv
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from imblearn.over_sampling import SMOTE, RandomOverSampler
import pickle
warnings.filterwarnings('ignore')
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier


### STEP 1: Data Collections

In [12]:
dataset = 'D:\\COVID_19_dataset\\final\\'
files = []
# r=root, d=directories, f = files
for r, d, f in os.walk(dataset):    
    for file in f:
        if '.wav' in file:
            files.append(os.path.join(r,file))

In [14]:
empty_list = []
for fname in files:
    with contextlib.closing(wave.open(fname,'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / float(rate)
#         print(fname + " " +str(duration))
        if(duration == 0):
            empty_list.append(fname)
empty_list

Error: unknown format: 3

In [13]:
files

['F:\\dataset\\Respiratory_Sound_Database\\Respiratory_Sound_Database\\clean_files\\101_1b1_Al_sc_Meditron.wav',
 'F:\\dataset\\Respiratory_Sound_Database\\Respiratory_Sound_Database\\clean_files\\101_1b1_Pr_sc_Meditron.wav',
 'F:\\dataset\\Respiratory_Sound_Database\\Respiratory_Sound_Database\\clean_files\\102_1b1_Ar_sc_Meditron.wav',
 'F:\\dataset\\Respiratory_Sound_Database\\Respiratory_Sound_Database\\clean_files\\103_2b2_Ar_mc_LittC2SE.wav',
 'F:\\dataset\\Respiratory_Sound_Database\\Respiratory_Sound_Database\\clean_files\\104_1b1_Al_sc_Litt3200.wav',
 'F:\\dataset\\Respiratory_Sound_Database\\Respiratory_Sound_Database\\clean_files\\104_1b1_Ar_sc_Litt3200.wav',
 'F:\\dataset\\Respiratory_Sound_Database\\Respiratory_Sound_Database\\clean_files\\104_1b1_Ll_sc_Litt3200.wav',
 'F:\\dataset\\Respiratory_Sound_Database\\Respiratory_Sound_Database\\clean_files\\104_1b1_Lr_sc_Litt3200.wav',
 'F:\\dataset\\Respiratory_Sound_Database\\Respiratory_Sound_Database\\clean_files\\104_1b1_Pl_s

### STEP 2: Feature Extraction from signal

In [5]:
fn_list_i = [
 feature.chroma_stft,
 feature.spectral_centroid,
 feature.spectral_bandwidth,
 feature.spectral_rolloff]
 
fn_list_ii = [
 feature.zero_crossing_rate]


def get_feature_vector(file,y,sr): 
    id1 = []
    id1.append(file.split("\\")[3])
    feat_vect_i = [np.mean(funct(y,sr,pad_mode='constant')) for funct in fn_list_i]
    feat_vect_ii = [np.mean(funct(y,frame_length=4096)) for funct in fn_list_ii] 
    feature_vector = id1 + feat_vect_i + feat_vect_ii 
    return feature_vector


In [6]:
def extract_audio_feaures(breathing_deep, breathing_shallow, cough_heavy, cough_shallow, counting_fast, counting_normal, vowel_a, vowel_e, vowel_o, files):
    for f in files:
        file_name = f.split("\\")[-1]
        y,sr = librosa.load(f,sr=48000,res_type='kaiser_fast')
        feature_vector = get_feature_vector(f, y, sr)

        if file_name == 'breathing-deep.wav':
            breathing_deep.append(feature_vector)

        elif file_name == 'breathing-shallow.wav':
            breathing_shallow.append(feature_vector)

        elif file_name == 'cough-heavy.wav':
            cough_heavy.append(feature_vector)

        elif file_name == 'cough-shallow.wav':
            cough_shallow.append(feature_vector)

        elif file_name == 'counting-fast.wav':
            counting_fast.append(feature_vector)

        elif file_name == 'counting-normal.wav':
            counting_normal.append(feature_vector)

        elif file_name == 'vowel-a.wav':
            vowel_a.append(feature_vector)

        elif file_name == 'vowel-e.wav':
            vowel_e.append(feature_vector)

        elif file_name == 'vowel-o.wav':
            vowel_o.append(feature_vector)


In [7]:
def extract_mfcc_features(breathing_deep, breathing_shallow, cough_heavy, cough_shallow, counting_fast, 
                          counting_normal, vowel_a, vowel_e, vowel_o, files):
    for f in files:
        file_name = f.split("\\")[-1]
        id1 = []
        id1.append(f.split("\\")[3])
        y,sr = librosa.load(f,sr=None)    
        mfcc_feature = librosa.feature.mfcc(y,sr,n_mfcc=13,hop_length=512,n_fft=2048)
        mfcc = mfcc_feature.T.tolist()
        mfcc_1 = np.mean(mfcc, axis=0).tolist()

        if file_name == 'breathing-deep.wav':
            breathing_deep.append(id1 + mfcc_1)

        elif file_name == 'breathing-shallow.wav':
            breathing_shallow.append(id1 + mfcc_1)

        elif file_name == 'cough-heavy.wav':
            cough_heavy.append(id1 + mfcc_1)

        elif file_name == 'cough-shallow.wav':
            cough_shallow.append(id1 + mfcc_1)

        elif file_name == 'counting-fast.wav':
            counting_fast.append(id1 + mfcc_1)

        elif file_name == 'counting-normal.wav':
            counting_normal.append(id1 + mfcc_1)

        elif file_name == 'vowel-a.wav':
            vowel_a.append(id1 + mfcc_1)

        elif file_name == 'vowel-e.wav':
            vowel_e.append(id1 + mfcc_1)

        elif file_name == 'vowel-o.wav':
            vowel_o.append(id1 + mfcc_1)


In [8]:
def write_audio_features(path, list_name,header_name,mfcc=False):
    
    if mfcc == False:
        header = ['id', header_name + '_chroma_stft', header_name + '_spectral_centroid', header_name + '_spectral_bandwidth', header_name +'_spectral_rolloff',header_name + '_zero_crossing_rate']
        with open (path,'w') as f:
            csv_writer = csv.writer(f, delimiter = ',')
            csv_writer.writerow(header)
            csv_writer.writerows(list_name)
    else:
        header = ['id']
        for i in range(1,14):
            header.append(f' {header_name}_mfcc_{i}')
        with open (path,'w') as f:
            csv_writer = csv.writer(f, delimiter = ',')
            csv_writer.writerow(header)
            csv_writer.writerows(list_name)    

In [9]:
#extract audio features
# breathing_deep = []
# breathing_shallow = []
# cough_heavy = []
# cough_shallow = []
# counting_fast = []
# counting_normal = []
# vowel_a =[]
# vowel_e = []
# vowel_o = []

# extract_audio_feaures(breathing_deep, breathing_shallow, cough_heavy, cough_shallow, counting_fast, 
#                       counting_normal, vowel_a, vowel_e, vowel_o, files)

In [10]:
#extract mfcc features
# mfcc_breathing_deep = []
# mfcc_breathing_shallow = []
# mfcc_cough_heavy = []
# mfcc_cough_shallow = []
# mfcc_counting_fast = []
# mfcc_counting_normal = []
# mfcc_vowel_a =[]
# mfcc_vowel_e = []
# mfcc_vowel_o = []

# extract_mfcc_features(mfcc_breathing_deep, mfcc_breathing_shallow, mfcc_cough_heavy, mfcc_cough_shallow, mfcc_counting_fast, 
#                      mfcc_counting_normal, mfcc_vowel_a, mfcc_vowel_e, mfcc_vowel_o, files)

In [11]:
# #write audio features
# path1 = "D:\\COVID_19_dataset\\csv_files\\breathing_deep.csv"
# path2 = "D:\\COVID_19_dataset\\csv_files\\breathing_shallow.csv"
# path3 = "D:\\COVID_19_dataset\\csv_files\\cough_heavy.csv"
# path4 = "D:\\COVID_19_dataset\\csv_files\\cough_shallow.csv"
# path5 = "D:\\COVID_19_dataset\\csv_files\\counting_fast.csv"
# path6 = "D:\\COVID_19_dataset\\csv_files\\counting_normal.csv"
# path7 = "D:\\COVID_19_dataset\\csv_files\\vowel_a.csv"
# path8 = "D:\\COVID_19_dataset\\csv_files\\vowel_e.csv"
# path9 = "D:\\COVID_19_dataset\\csv_files\\vowel_o.csv"

# write_audio_features(path1,breathing_deep,'breathing_deep')
# write_audio_features(path2,breathing_shallow,'breathing_shallow')
# write_audio_features(path3,cough_heavy,'cough_heavy')
# write_audio_features(path4,cough_shallow,'cough_shallow')
# write_audio_features(path5,counting_fast,'counting_fast')
# write_audio_features(path6,counting_normal,'counting_normal')
# write_audio_features(path7,vowel_a,'vowel_a')
# write_audio_features(path8,vowel_e,'vowel_e')
# write_audio_features(path9,vowel_o,'vowel_o')

In [12]:
# #write mfcc features
# path1 = "D:\\COVID_19_dataset\\csv_files\\mfcc_breathing_deep.csv"
# path2 = "D:\\COVID_19_dataset\\csv_files\\mfcc_breathing_shallow.csv"
# path3 = "D:\\COVID_19_dataset\\csv_files\\mfcc_cough_heavy.csv"
# path4 = "D:\\COVID_19_dataset\\csv_files\\mfcc_cough_shallow.csv"
# path5 = "D:\\COVID_19_dataset\\csv_files\\mfcc_counting_fast.csv"
# path6 = "D:\\COVID_19_dataset\\csv_files\\mfcc_counting_normal.csv"
# path7 = "D:\\COVID_19_dataset\\csv_files\\mfcc_vowel_a.csv"
# path8 = "D:\\COVID_19_dataset\\csv_files\\mfcc_vowel_e.csv"
# path9 = "D:\\COVID_19_dataset\\csv_files\\mfcc_vowel_o.csv"

# write_audio_features(path1,mfcc_breathing_deep,'mfcc_breathing_deep',mfcc=True)
# write_audio_features(path2,mfcc_breathing_shallow,'mfcc_breathing_shallow',mfcc=True)
# write_audio_features(path3,mfcc_cough_heavy,'mfcc_cough_heavy',mfcc=True)
# write_audio_features(path4,mfcc_cough_shallow,'mfcc_cough_shallow',mfcc=True)
# write_audio_features(path5,mfcc_counting_fast,'mfcc_counting_fast',mfcc=True)
# write_audio_features(path6,mfcc_counting_normal,'mfcc_counting_normal',mfcc=True)
# write_audio_features(path7,mfcc_vowel_a,'mfcc_vowel_a',mfcc=True)
# write_audio_features(path8,mfcc_vowel_e,'mfcc_vowel_e',mfcc=True)
# write_audio_features(path9,mfcc_vowel_o,'mfcc_vowel_o',mfcc=True)

In [13]:
#merge audio features files:
brething_deep_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\breathing_deep.csv")
brething_shallow_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\breathing_shallow.csv")
cough_heavy_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\cough_heavy.csv")
cough_shallow_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\cough_shallow.csv")
counting_fast_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\counting_fast.csv")
counting_normal_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\counting_normal.csv")
vowel_a_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\vowel_a.csv")
vowel_e_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\vowel_e.csv")
vowel_o_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\vowel_o.csv")

main_df = brething_deep_df.merge(brething_shallow_df,on='id')
main_df = main_df.merge(cough_heavy_df, on='id')
main_df = main_df.merge(cough_shallow_df, on='id')
main_df = main_df.merge(counting_fast_df, on = 'id')
main_df = main_df.merge(counting_normal_df, on = 'id')
main_df = main_df.merge(vowel_a_df, on = 'id')
main_df = main_df.merge(vowel_e_df, on = 'id')
main_df = main_df.merge(vowel_o_df, on = 'id')
print(main_df.shape)
main_df.head()

(369, 46)


Unnamed: 0,id,breathing_deep_chroma_stft,breathing_deep_spectral_centroid,breathing_deep_spectral_bandwidth,breathing_deep_spectral_rolloff,breathing_deep_zero_crossing_rate,breathing_shallow_chroma_stft,breathing_shallow_spectral_centroid,breathing_shallow_spectral_bandwidth,breathing_shallow_spectral_rolloff,...,vowel_e_chroma_stft,vowel_e_spectral_centroid,vowel_e_spectral_bandwidth,vowel_e_spectral_rolloff,vowel_e_zero_crossing_rate,vowel_o_chroma_stft,vowel_o_spectral_centroid,vowel_o_spectral_bandwidth,vowel_o_spectral_rolloff,vowel_o_zero_crossing_rate
0,05acPS4aRGfvuOfku11Za8zve8i2,0.370991,4872.314658,2813.369571,7299.173664,0.146697,0.484864,4641.002418,2990.027194,7196.914469,...,0.261112,3455.609673,4942.510562,7909.334126,0.015361,0.290769,2475.8836,4120.041054,4941.354626,0.012596
1,05bieNLXPuaIWEVaX81EkbbjVrh1,0.417714,3328.707906,2531.343171,5218.079853,0.110853,0.437503,3563.407487,3611.686411,5846.661814,...,0.323032,2986.198844,3507.974527,4808.049502,0.058355,0.262647,2259.500763,2875.645959,4155.639368,0.043458
2,0Ha52POVIxTKEPqI1eGpIoMHUd52,0.584902,1765.715346,2416.375776,3588.786005,0.027665,0.614353,1779.703303,2896.422839,3287.599382,...,0.291197,1891.393146,2196.733192,3819.276588,0.033747,0.21291,916.035094,1554.157594,1231.829882,0.020061
3,0HIgO2EhOOW1msCbEw1kC8Qsx6D3,0.525224,6244.869374,4921.769406,12287.285573,0.368375,0.430968,4835.217501,3419.488527,8368.049173,...,0.32155,3157.840176,3200.177043,6368.613233,0.079969,0.323083,2392.917866,3053.350074,4158.491592,0.062361
4,0KSi2atlmsXNcGMfpDNzIjJSvC23,0.370318,3789.491902,3184.693337,5963.528678,0.12414,0.423118,4855.678501,4743.753196,8779.11555,...,0.491816,4014.935925,3762.780926,8157.996894,0.037217,0.41755,2450.144085,2835.315044,4861.228814,0.040874


In [14]:
#merge mfcc feature files
mfcc_brething_deep_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\mfcc_breathing_deep.csv")
mfcc_brething_shallow_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\mfcc_breathing_shallow.csv")
mfcc_cough_heavy_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\mfcc_cough_heavy.csv")
mfcc_cough_shallow_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\mfcc_cough_shallow.csv")
mfcc_counting_fast_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\mfcc_counting_fast.csv")
mfcc_counting_normal_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\mfcc_counting_normal.csv")
mfcc_vowel_a_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\mfcc_vowel_a.csv")
mfcc_vowel_e_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\mfcc_vowel_e.csv")
mfcc_vowel_o_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\mfcc_vowel_o.csv")

mfcc_main_df = mfcc_brething_deep_df.merge(mfcc_brething_shallow_df,on='id')
mfcc_main_df = mfcc_main_df.merge(mfcc_cough_heavy_df, on='id')
mfcc_main_df = mfcc_main_df.merge(mfcc_cough_shallow_df, on='id')
mfcc_main_df = mfcc_main_df.merge(mfcc_counting_fast_df, on = 'id')
mfcc_main_df = mfcc_main_df.merge(mfcc_cough_shallow_df, on = 'id')
mfcc_main_df = mfcc_main_df.merge(mfcc_vowel_a_df, on = 'id')
mfcc_main_df = mfcc_main_df.merge(mfcc_vowel_e_df, on = 'id')
mfcc_main_df = mfcc_main_df.merge(mfcc_vowel_o_df, on = 'id')
mfcc_main_df.head()

Unnamed: 0,id,mfcc_breathing_deep_mfcc_1,mfcc_breathing_deep_mfcc_2,mfcc_breathing_deep_mfcc_3,mfcc_breathing_deep_mfcc_4,mfcc_breathing_deep_mfcc_5,mfcc_breathing_deep_mfcc_6,mfcc_breathing_deep_mfcc_7,mfcc_breathing_deep_mfcc_8,mfcc_breathing_deep_mfcc_9,...,mfcc_vowel_o_mfcc_4,mfcc_vowel_o_mfcc_5,mfcc_vowel_o_mfcc_6,mfcc_vowel_o_mfcc_7,mfcc_vowel_o_mfcc_8,mfcc_vowel_o_mfcc_9,mfcc_vowel_o_mfcc_10,mfcc_vowel_o_mfcc_11,mfcc_vowel_o_mfcc_12,mfcc_vowel_o_mfcc_13
0,05acPS4aRGfvuOfku11Za8zve8i2,-813.089971,38.802417,-77.222864,42.305273,-2.216783,-28.184085,33.021602,-9.333336,-17.026904,...,33.464627,6.39059,-6.487154,-8.96341,-12.9505,-7.994345,-4.596141,-9.091037,-9.831156,-15.061459
1,05bieNLXPuaIWEVaX81EkbbjVrh1,-499.679798,130.247662,-123.528521,16.411893,16.942915,-36.6554,9.356996,-21.894701,-22.049351,...,-2.579661,-4.169795,-10.893087,-3.907399,2.359373,-4.12743,-6.094339,-2.874159,-6.33047,-8.195437
2,0Ha52POVIxTKEPqI1eGpIoMHUd52,-487.244078,115.256379,-1.885961,10.821468,10.630045,-8.334683,5.847647,-0.294407,-8.702686,...,15.242778,1.611547,-9.06024,-10.821952,-12.927832,-16.825695,-13.249317,-9.701611,-10.82719,-7.606832
3,0HIgO2EhOOW1msCbEw1kC8Qsx6D3,-785.560261,42.048461,-27.18348,10.584324,-5.060375,-2.260975,8.698895,-1.705259,-15.020222,...,27.357194,3.286996,-10.075376,-7.557977,-7.845949,-9.116844,-8.071812,-0.675893,-1.728074,-16.463442
4,0KSi2atlmsXNcGMfpDNzIjJSvC23,-691.727598,131.211121,-85.588253,25.894158,33.833338,-22.728714,22.169687,9.447581,-35.685676,...,25.405182,6.799536,-13.153801,-1.434831,-5.809834,-16.252848,-4.144781,-8.174328,-26.829593,-19.518725


In [15]:
final_Df = main_df.merge(mfcc_main_df,on='id')
final_Df.head()

Unnamed: 0,id,breathing_deep_chroma_stft,breathing_deep_spectral_centroid,breathing_deep_spectral_bandwidth,breathing_deep_spectral_rolloff,breathing_deep_zero_crossing_rate,breathing_shallow_chroma_stft,breathing_shallow_spectral_centroid,breathing_shallow_spectral_bandwidth,breathing_shallow_spectral_rolloff,...,mfcc_vowel_o_mfcc_4,mfcc_vowel_o_mfcc_5,mfcc_vowel_o_mfcc_6,mfcc_vowel_o_mfcc_7,mfcc_vowel_o_mfcc_8,mfcc_vowel_o_mfcc_9,mfcc_vowel_o_mfcc_10,mfcc_vowel_o_mfcc_11,mfcc_vowel_o_mfcc_12,mfcc_vowel_o_mfcc_13
0,05acPS4aRGfvuOfku11Za8zve8i2,0.370991,4872.314658,2813.369571,7299.173664,0.146697,0.484864,4641.002418,2990.027194,7196.914469,...,33.464627,6.39059,-6.487154,-8.96341,-12.9505,-7.994345,-4.596141,-9.091037,-9.831156,-15.061459
1,05bieNLXPuaIWEVaX81EkbbjVrh1,0.417714,3328.707906,2531.343171,5218.079853,0.110853,0.437503,3563.407487,3611.686411,5846.661814,...,-2.579661,-4.169795,-10.893087,-3.907399,2.359373,-4.12743,-6.094339,-2.874159,-6.33047,-8.195437
2,0Ha52POVIxTKEPqI1eGpIoMHUd52,0.584902,1765.715346,2416.375776,3588.786005,0.027665,0.614353,1779.703303,2896.422839,3287.599382,...,15.242778,1.611547,-9.06024,-10.821952,-12.927832,-16.825695,-13.249317,-9.701611,-10.82719,-7.606832
3,0HIgO2EhOOW1msCbEw1kC8Qsx6D3,0.525224,6244.869374,4921.769406,12287.285573,0.368375,0.430968,4835.217501,3419.488527,8368.049173,...,27.357194,3.286996,-10.075376,-7.557977,-7.845949,-9.116844,-8.071812,-0.675893,-1.728074,-16.463442
4,0KSi2atlmsXNcGMfpDNzIjJSvC23,0.370318,3789.491902,3184.693337,5963.528678,0.12414,0.423118,4855.678501,4743.753196,8779.11555,...,25.405182,6.799536,-13.153801,-1.434831,-5.809834,-16.252848,-4.144781,-8.174328,-26.829593,-19.518725


In [16]:
col = ['id','a','covid_status','ep','g','l_c','l_l','l_s','rU','asthma','cough','smoker','test','ht','cold','diabetes','um','ihd','bd','st','fever','ftg','mp','loss_of_smell','test_status','diarrhoea','cld','pneumonia']
disease = pd.read_csv("https://raw.githubusercontent.com/iiscleap/Coswara-Data/master/combined_data.csv",names=col,header=None)
disease.head()

Unnamed: 0,id,a,covid_status,ep,g,l_c,l_l,l_s,rU,asthma,...,bd,st,fever,ftg,mp,loss_of_smell,test_status,diarrhoea,cld,pneumonia
0,id,a,covid_status,ep,g,l_c,l_l,l_s,rU,asthma,...,bd,st,fever,ftg,mp,loss_of_smell,test_status,diarrhoea,cld,pneumonia
1,DRBAZX64nuVtqBQf13gH7r36Mh52,26,healthy,y,female,United States,Madison,Wisconsin,n,,...,,,,,,,,,,
2,Jw7YMfwGqMX22UbHh1TTgYMTYWs1,16,healthy,y,female,India,24 pargana,West Bengal,n,,...,,,,,,,,,,
3,xa2v8z3Yzgb9dFrq2gEZz6oS7fh1,26,resp_illness_not_identified,y,male,India,Kolkata,West Bengal,n,True,...,,,,,,,,,,
4,xwHQrG0KwjTLJvBYVtVXfHp4JAd2,32,resp_illness_not_identified,y,male,Sri Lanka,Sri Jatawardanapura,Western Province,n,,...,,,,,,,,,,


In [17]:
final_Df = final_Df.merge(disease,on='id')
final_Df

Unnamed: 0,id,breathing_deep_chroma_stft,breathing_deep_spectral_centroid,breathing_deep_spectral_bandwidth,breathing_deep_spectral_rolloff,breathing_deep_zero_crossing_rate,breathing_shallow_chroma_stft,breathing_shallow_spectral_centroid,breathing_shallow_spectral_bandwidth,breathing_shallow_spectral_rolloff,...,bd,st,fever,ftg,mp,loss_of_smell,test_status,diarrhoea,cld,pneumonia
0,05acPS4aRGfvuOfku11Za8zve8i2,0.370991,4872.314658,2813.369571,7299.173664,0.146697,0.484864,4641.002418,2990.027194,7196.914469,...,,True,,,,,,,,
1,05bieNLXPuaIWEVaX81EkbbjVrh1,0.417714,3328.707906,2531.343171,5218.079853,0.110853,0.437503,3563.407487,3611.686411,5846.661814,...,,,,,,,,,,
2,0Ha52POVIxTKEPqI1eGpIoMHUd52,0.584902,1765.715346,2416.375776,3588.786005,0.027665,0.614353,1779.703303,2896.422839,3287.599382,...,,,,,,,,,,
3,0HIgO2EhOOW1msCbEw1kC8Qsx6D3,0.525224,6244.869374,4921.769406,12287.285573,0.368375,0.430968,4835.217501,3419.488527,8368.049173,...,,,,,,,,,,
4,0KSi2atlmsXNcGMfpDNzIjJSvC23,0.370318,3789.491902,3184.693337,5963.528678,0.124140,0.423118,4855.678501,4743.753196,8779.115550,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364,zPFIJ3JDo7P5qIIdKvELq76kYi02,0.462540,5567.018200,4717.394046,10732.694439,0.091804,0.451270,5523.425044,4777.168402,10867.729351,...,,,,,,,,,,
365,zpjoDhhHvJcfhSHTK9i895kmr2f1,0.223768,2199.879048,1136.912167,3403.075578,0.081062,0.320763,2813.365614,1523.504896,4470.891768,...,,,,,,,,,,
366,zt2ssuTmZESDhYPW40dyo5RW2sT2,0.502874,4396.550991,3921.767396,7950.513296,0.093771,0.554283,4500.249782,5426.979733,9923.877762,...,,,,,,,,,,
367,zUAc2drprdaJAE0h1XpQyk01iE22,0.502238,2629.222459,2791.876372,4927.756998,0.070085,0.523837,2267.298134,2910.548676,4358.297414,...,,,,True,,,,,,


In [18]:
final_Df['covid_status'].value_counts()

healthy                        250
positive_mild                   43
resp_illness_not_identified     26
no_resp_illness_exposed         24
recovered_full                  11
positive_moderate                8
positive_asymp                   7
Name: covid_status, dtype: int64

In [19]:
X = final_Df.iloc[:, 1:163]
y = final_Df.iloc[:,164:165]
X.shape,y.shape

((369, 162), (369, 1))

In [20]:
# ld = LabelEncoder()
# y_encode = ld.fit_transform(y)
# ld.inverse_transform([0,1,2,3,4,5,6])

In [21]:
label_dict ={0: 'healthy',
             1: 'no_resp_illness_exposed',
             2: 'positive_asymp',
             3: 'positive_mild',
             4: 'positive_moderate',
             5: 'recovered_full',
             6: 'resp_illness_not_identified'}

## Random Forest

In [21]:
class_weights = {0: 0.210,
                 1: 2.196,
                 2: 7.530,
                 3: 1.225,
                 4: 6.589,
                 5: 4.792,
                 6: 2.027}

In [22]:
#with no oversample and categorical class label
X = final_Df.iloc[:, 1:163]
y = final_Df.iloc[:,164:165]
y_encode = LabelEncoder().fit_transform(y)
x_train, x_test, y_train, y_test = train_test_split(X, y_encode,test_size=0.20)
cv = StratifiedKFold(n_splits = 5, shuffle=True, random_state = 42)
rdf = RandomForestClassifier(random_state = 42, class_weight= class_weights) 
scoring = {'Recall': make_scorer(recall_score),
           'f1_score': make_scorer(f1_score)
          }

params = {'max_depth': [6, 8, 10, 20], 
              'min_samples_split': [5, 10, 15],
              'min_samples_leaf' : [4, 8, 12],
              'n_estimators' : [300, 400, 500]
             }

grid_clf = GridSearchCV(estimator = rdf, param_grid = params, cv = cv, n_jobs=-1)
grid_clf.fit(x_train, y_train)
train_pred = grid_clf.predict(x_test)
print('1. The F-1 score of the model {}\n'.format(f1_score(y_test, train_pred, average='macro')))
print('2. The recall score of the model {}\n'.format(recall_score(y_test, train_pred, average='macro')))
print('3. Classification report \n {} \n'.format(classification_report(y_test, train_pred)))
print('4. Confusion matrix \n {} \n'.format(confusion_matrix(y_test, train_pred)))
print('5. Accuracy \n {} \n'.format(accuracy_score(y_test, train_pred)))

filename = 'COVID-19 Results\\randomforest_model_without_oversample.pkl'
with open(filename, 'wb') as file:  
    pickle.dump(grid_clf, file)

1. The F-1 score of the model 0.20512820512820512

2. The recall score of the model 0.19505494505494506

3. Classification report 
               precision    recall  f1-score   support

           0       0.69      0.87      0.77        52
           1       0.00      0.00      0.00         5
           2       1.00      0.50      0.67         2
           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         7

    accuracy                           0.62        74
   macro avg       0.24      0.20      0.21        74
weighted avg       0.51      0.62      0.56        74
 

4. Confusion matrix 
 [[45  0  0  7  0  0  0]
 [ 5  0  0  0  0  0  0]
 [ 0  0  1  1  0  0  0]
 [ 5  0  0  0  0  0  0]
 [ 1  0  0  0  0  0  0]
 [ 2  0  0  0  0  0  0]
 [ 7  0  0  0  0  0  0]] 

5. Accuracy 
 0.6216216216216216 



In [24]:
#with oversampling and convert class label into numeric value
X = final_Df.iloc[:, 1:163]
y = final_Df.iloc[:,164:165]

oversample = SMOTE()
X_ROS, y_ROS = oversample.fit_resample(X,y)
y_encode = LabelEncoder().fit_transform(y_ROS)
x_train,x_test,y_train,y_test = train_test_split(X_ROS, y_encode,test_size=0.20,random_state=42)
cv = StratifiedKFold(n_splits = 5, shuffle=True, random_state = 42)

rf_classifier = RandomForestClassifier()
n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10] 
hyperF = dict(n_estimators = n_estimators, max_depth = max_depth,  
              min_samples_split = min_samples_split, 
             min_samples_leaf = min_samples_leaf)

rfgrid_with_over = GridSearchCV(rf_classifier, hyperF, cv = cv, n_jobs = -1)
rfgrid_with_over.fit(x_train, y_train)
rfgrid_with_over_prob = rfgrid_with_over.predict(x_test)
print('1. The F-1 score of the model {}\n'.format(f1_score(y_test, rfgrid_with_over_prob, average='macro')))
print('2. The recall score of the model {}\n'.format(recall_score(y_test, rfgrid_with_over_prob, average='macro')))
print('3. Classification report \n {} \n'.format(classification_report(y_test, rfgrid_with_over_prob)))
print('4. Confusion matrix \n {} \n'.format(confusion_matrix(y_test, rfgrid_with_over_prob)))
print('5. Accuracy \n {} \n'.format(accuracy_score(y_test, rfgrid_with_over_prob)))

filename = 'COVID-19 Results\\randomforest_model_with_oversample.pkl'
with open(filename, 'wb') as file:  
    pickle.dump(rfgrid_with_over, file)

1. The F-1 score of the model 0.9760544667086722

2. The recall score of the model 0.9762755762755762

3. Classification report 
               precision    recall  f1-score   support

           0       0.94      0.89      0.92        55
           1       0.96      0.96      0.96        52
           2       1.00      1.00      1.00        41
           3       0.93      0.98      0.95        54
           4       1.00      1.00      1.00        53
           5       1.00      1.00      1.00        41
           6       1.00      1.00      1.00        54

    accuracy                           0.97       350
   macro avg       0.98      0.98      0.98       350
weighted avg       0.97      0.97      0.97       350
 

4. Confusion matrix 
 [[49  2  0  4  0  0  0]
 [ 2 50  0  0  0  0  0]
 [ 0  0 41  0  0  0  0]
 [ 1  0  0 53  0  0  0]
 [ 0  0  0  0 53  0  0]
 [ 0  0  0  0  0 41  0]
 [ 0  0  0  0  0  0 54]] 

5. Accuracy 
 0.9742857142857143 



In [25]:
#convert label into numeric value  
X = final_Df.iloc[:, 1:163]
y = final_Df.iloc[:,164:165]
oversample = SMOTE()
X_ROS, y_ROS = oversample.fit_resample(X,y)

y_encode = LabelEncoder().fit_transform(y_ROS)
x_train,x_test,y_train,y_test = train_test_split(X_ROS, y_encode,test_size=0.20,random_state=42)
cv = StratifiedKFold(n_splits = 5, shuffle=True, random_state = 42)
rf_classifier = RandomForestClassifier()
n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10] 
hyperF = dict(n_estimators = n_estimators, max_depth = max_depth,  
              min_samples_split = min_samples_split, 
             min_samples_leaf = min_samples_leaf)

rfgrid_without_over_cv10 = GridSearchCV(rf_classifier, hyperF, cv = cv, n_jobs = -1)
rfgrid_without_over_cv10.fit(x_train, y_train)
rfgrid_without_over_prob_cv = rfgrid_without_over_cv10.predict(x_test)
print('1. The F-1 score of the model {}\n'.format(f1_score(y_test, rfgrid_without_over_prob_cv, average='macro')))
print('2. The recall score of the model {}\n'.format(recall_score(y_test, rfgrid_without_over_prob_cv, average='macro')))
print('3. Classification report \n {} \n'.format(classification_report(y_test, rfgrid_without_over_prob_cv)))
print('4. Confusion matrix \n {} \n'.format(confusion_matrix(y_test, rfgrid_without_over_prob_cv)))
print('5. Accuracy \n {} \n'.format(accuracy_score(y_test, rfgrid_without_over_prob_cv)))


filename = 'COVID-19 Results\\randomforest_model_cv10.pkl'
with open(filename, 'wb') as file:  
    pickle.dump(rfgrid_without_over_cv10, file)


1. The F-1 score of the model 0.9865945358602234

2. The recall score of the model 0.987012987012987

3. Classification report 
               precision    recall  f1-score   support

           0       1.00      0.91      0.95        55
           1       0.95      1.00      0.97        52
           2       1.00      1.00      1.00        41
           3       0.96      1.00      0.98        54
           4       1.00      1.00      1.00        53
           5       1.00      1.00      1.00        41
           6       1.00      1.00      1.00        54

    accuracy                           0.99       350
   macro avg       0.99      0.99      0.99       350
weighted avg       0.99      0.99      0.99       350
 

4. Confusion matrix 
 [[50  3  0  2  0  0  0]
 [ 0 52  0  0  0  0  0]
 [ 0  0 41  0  0  0  0]
 [ 0  0  0 54  0  0  0]
 [ 0  0  0  0 53  0  0]
 [ 0  0  0  0  0 41  0]
 [ 0  0  0  0  0  0 54]] 

5. Accuracy 
 0.9857142857142858 



## 2: K Nearest Neighbors Classifier

In [26]:
X = final_Df.iloc[:, 1:163]
y = final_Df.iloc[:,164:165]
oversample = SMOTE()
X_ROS, y_ROS = oversample.fit_resample(X,y)
y_encode = LabelEncoder().fit_transform(y_ROS)
cv = StratifiedKFold(n_splits = 10, shuffle=True, random_state = 42)

x_train, x_test, y_train, y_test = train_test_split(X_ROS, y_encode, test_size=0.20, random_state=42)

knn= KNeighborsClassifier()
leaf_size = list(range(1,70))
n_neighbors = list(range(1,50))
p=[1,2]
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)
knngrid = GridSearchCV(knn, hyperparameters, cv=cv)
knngrid.fit(x_train, y_train)
knn_pred_probs = knngrid.predict(x_test)
print('1. The F-1 score of the model {}\n'.format(f1_score(y_test, knn_pred_probs, average='macro')))
print('2. The recall score of the model {}\n'.format(recall_score(y_test, knn_pred_probs, average='macro')))
print('3. Classification report \n {} \n'.format(classification_report(y_test, knn_pred_probs)))
print('4. Confusion matrix \n {} \n'.format(confusion_matrix(y_test, knn_pred_probs)))
print('5. Accuracy \n {} \n'.format(accuracy_score(y_test, knn_pred_probs)))


filename = 'COVID-19 Results\\knn_model.pkl'
with open(filename, 'wb') as file:  
    pickle.dump(knngrid, file)


1. The F-1 score of the model 0.8815296243052504

2. The recall score of the model 0.9036981536981538

3. Classification report 
               precision    recall  f1-score   support

           0       1.00      0.36      0.53        55
           1       0.86      0.98      0.92        52
           2       0.95      1.00      0.98        41
           3       0.90      0.98      0.94        54
           4       0.90      1.00      0.95        53
           5       0.89      1.00      0.94        41
           6       0.84      1.00      0.92        54

    accuracy                           0.89       350
   macro avg       0.91      0.90      0.88       350
weighted avg       0.91      0.89      0.87       350
 

4. Confusion matrix 
 [[20  8  2  6  6  5  8]
 [ 0 51  0  0  0  0  1]
 [ 0  0 41  0  0  0  0]
 [ 0  0  0 53  0  0  1]
 [ 0  0  0  0 53  0  0]
 [ 0  0  0  0  0 41  0]
 [ 0  0  0  0  0  0 54]] 

5. Accuracy 
 0.8942857142857142 



## XGBoost Classifier

In [20]:
X = final_Df.iloc[:, 1:163]
y = final_Df.iloc[:,164:165]
oversample = SMOTE()
X_ROS, y_ROS = oversample.fit_resample(X,y)
cv = StratifiedKFold(n_splits = 10, shuffle=True, random_state = 42)
y_encode = LabelEncoder().fit_transform(y_ROS)
x_train, x_test, y_train, y_test = train_test_split(X_ROS, y_encode, test_size=0.20)
model = XGBClassifier()

para_grid = {
                  'n_estimators': [2,5,10,15],
                   'max_depth':[1,3,10,20],
                   'learning_rate ':[0.001, 0.010, 0.100, 0.500],
                   'gamma' : [0, 0.5, 2, 10],
                   'subsample ':[1.0,0.9,0.5,0.1],
                   'min_child_weight ':[0.01, 0.05, 0.25, 0.75]

            }


# param_grid = dict(scale_pos_weight=weights)
XGboost = GridSearchCV(estimator=model, param_grid=para_grid, n_jobs=-1, cv=cv)
XGboost.fit(x_train, y_train)
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
train_pred = XGboost.predict(x_test)
print('1. The F-1 score of the model {}\n'.format(f1_score(y_test, train_pred, average='macro')))
print('2. The recall score of the model {}\n'.format(recall_score(y_test, train_pred, average='macro')))
print('3. Classification report \n {} \n'.format(classification_report(y_test, train_pred)))
print('4. Confusion matrix \n {} \n'.format(confusion_matrix(y_test, train_pred)))
print('5. Accuracy \n {} \n'.format(accuracy_score(y_test, train_pred)))

filename = 'COVID-19 Results\\xgboost_model.pkl'
with open(filename, 'wb') as file:  
    pickle.dump(XGboost, file)


Parameters: { learning_rate , min_child_weight , subsample  } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


1. The F-1 score of the model 0.9272927166720374

2. The recall score of the model 0.9324829738853424

3. Classification report 
               precision    recall  f1-score   support

           0       0.90      0.69      0.78        55
           1       0.84      0.98      0.91        49
           2       0.98      1.00      0.99        51
           3       0.94      0.91      0.92        53
           4       0.96      1.00      0.98        51
           5       0.98      1.00      0.99        50
           6       0.89      0.95      0.92        41

    accuracy                           0.93       350
   macro avg       0.93      0.93      0.93       350
weighte

In [30]:
xgboost = open('COVID-19 Results\\knn_model.pkl',"rb")
example_dict = pickle.load(xgboost)

## Random Forest Testing 

In [32]:
audio_path = 'D:\\COVID_19_dataset\\test\\priyanka'
test_files = []
for r, d, f in os.walk(audio_path):    
    for file in f:
        if '.wav' in file:
            test_files.append(os.path.join(r,file))

test_breathing_deep = []
test_breathing_shallow = []
test_cough_heavy = []
test_cough_shallow = []
test_counting_fast = []
test_counting_normal = []
test_vowel_a =[]
test_vowel_e = []
test_vowel_o = []

extract_audio_feaures(test_breathing_deep,test_breathing_shallow, test_cough_heavy, test_cough_shallow,
                     test_counting_fast, test_counting_normal, test_vowel_a, test_vowel_e,test_vowel_o, test_files)


test_mfcc_breathing_deep = []
test_mfcc_breathing_shallow = []
test_mfcc_cough_heavy = []
test_mfcc_cough_shallow = []
test_mfcc_counting_fast = []
test_mfcc_counting_normal = []
test_mfcc_vowel_a =[]
test_mfcc_vowel_e = []
test_mfcc_vowel_o = []

extract_mfcc_features(test_mfcc_breathing_deep,test_mfcc_breathing_shallow, test_mfcc_cough_heavy,test_mfcc_cough_shallow, test_mfcc_counting_fast,
                     test_mfcc_counting_normal, test_mfcc_vowel_a, test_mfcc_vowel_e, test_mfcc_vowel_o, test_files )


path1 = "D:\\COVID_19_dataset\\csv_files\\test_breathing_deep.csv"
path2 = "D:\\COVID_19_dataset\\csv_files\\test_breathing_shallow.csv"
path3 = "D:\\COVID_19_dataset\\csv_files\\test_cough_heavy.csv"
path4 = "D:\\COVID_19_dataset\\csv_files\\test_cough_shallow.csv"
path5 = "D:\\COVID_19_dataset\\csv_files\\test_counting_fast.csv"
path6 = "D:\\COVID_19_dataset\\csv_files\\test_counting_normal.csv"
path7 = "D:\\COVID_19_dataset\\csv_files\\test_vowel_a.csv"
path8 = "D:\\COVID_19_dataset\\csv_files\\test_vowel_e.csv"
path9 = "D:\\COVID_19_dataset\\csv_files\\test_vowel_o.csv"

write_audio_features(path1,test_breathing_deep,'breathing_deep')
write_audio_features(path2,test_breathing_shallow,'breathing_shallow')
write_audio_features(path3,test_cough_heavy,'cough_heavy')
write_audio_features(path4,test_cough_shallow,'cough_shallow')
write_audio_features(path5,test_counting_fast,'counting_fast')
write_audio_features(path6,test_counting_normal,'counting_normal')
write_audio_features(path7,test_vowel_a,'vowel_a')
write_audio_features(path8,test_vowel_e,'vowel_e')
write_audio_features(path9,test_vowel_o,'vowel_o')

#write mfcc features
path1 = "D:\\COVID_19_dataset\\csv_files\\test_mfcc_breathing_deep.csv"
path2 = "D:\\COVID_19_dataset\\csv_files\\test_mfcc_breathing_shallow.csv"
path3 = "D:\\COVID_19_dataset\\csv_files\\test_mfcc_cough_heavy.csv"
path4 = "D:\\COVID_19_dataset\\csv_files\\test_mfcc_cough_shallow.csv"
path5 = "D:\\COVID_19_dataset\\csv_files\\test_mfcc_counting_fast.csv"
path6 = "D:\\COVID_19_dataset\\csv_files\\test_mfcc_counting_normal.csv"
path7 = "D:\\COVID_19_dataset\\csv_files\\test_mfcc_vowel_a.csv"
path8 = "D:\\COVID_19_dataset\\csv_files\\test_mfcc_vowel_e.csv"
path9 = "D:\\COVID_19_dataset\\csv_files\\test_mfcc_vowel_o.csv"

write_audio_features(path1,test_mfcc_breathing_deep,'mfcc_breathing_deep',mfcc=True)
write_audio_features(path2,test_mfcc_breathing_shallow,'mfcc_breathing_shallow',mfcc=True)
write_audio_features(path3,test_mfcc_cough_heavy,'mfcc_cough_heavy',mfcc=True)
write_audio_features(path4,test_mfcc_cough_shallow,'mfcc_cough_shallow',mfcc=True)
write_audio_features(path5,test_mfcc_counting_fast,'mfcc_counting_fast',mfcc=True)
write_audio_features(path6,test_mfcc_counting_normal,'mfcc_counting_normal',mfcc=True)
write_audio_features(path7,test_mfcc_vowel_a,'mfcc_vowel_a',mfcc=True)
write_audio_features(path8,test_mfcc_vowel_e,'mfcc_vowel_e',mfcc=True)
write_audio_features(path9,test_mfcc_vowel_o,'mfcc_vowel_o',mfcc=True)

brething_deep_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_breathing_deep.csv")
brething_shallow_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_breathing_shallow.csv")
cough_heavy_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_cough_heavy.csv")
cough_shallow_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_cough_shallow.csv")
counting_fast_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_counting_fast.csv")
counting_normal_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_counting_normal.csv")
vowel_a_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_vowel_a.csv")
vowel_e_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_vowel_e.csv")
vowel_o_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_vowel_o.csv")

test_main_df = brething_deep_df.merge(brething_shallow_df,on='id')
test_main_df = test_main_df.merge(cough_heavy_df, on='id')
test_main_df = test_main_df.merge(cough_shallow_df, on='id')
test_main_df = test_main_df.merge(counting_fast_df, on = 'id')
test_main_df = test_main_df.merge(counting_normal_df, on = 'id')
test_main_df = test_main_df.merge(vowel_a_df, on = 'id')
test_main_df = test_main_df.merge(vowel_e_df, on = 'id')
test_main_df = test_main_df.merge(vowel_o_df, on = 'id')
test_main_df.head()

#merge mfcc feature files
mfcc_brething_deep_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_mfcc_breathing_deep.csv")
mfcc_brething_shallow_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_mfcc_breathing_shallow.csv")
mfcc_cough_heavy_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_mfcc_cough_heavy.csv")
mfcc_cough_shallow_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_mfcc_cough_shallow.csv")
mfcc_counting_fast_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_mfcc_counting_fast.csv")
mfcc_counting_normal_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_mfcc_counting_normal.csv")
mfcc_vowel_a_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_mfcc_vowel_a.csv")
mfcc_vowel_e_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_mfcc_vowel_e.csv")
mfcc_vowel_o_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_mfcc_vowel_o.csv")

mfcc_main_df = mfcc_brething_deep_df.merge(mfcc_brething_shallow_df,on='id')
mfcc_main_df = mfcc_main_df.merge(mfcc_cough_heavy_df, on='id')
mfcc_main_df = mfcc_main_df.merge(mfcc_cough_shallow_df, on='id')
mfcc_main_df = mfcc_main_df.merge(mfcc_counting_fast_df, on = 'id')
mfcc_main_df = mfcc_main_df.merge(mfcc_cough_shallow_df, on = 'id')
mfcc_main_df = mfcc_main_df.merge(mfcc_vowel_a_df, on = 'id')
mfcc_main_df = mfcc_main_df.merge(mfcc_vowel_e_df, on = 'id')
mfcc_main_df = mfcc_main_df.merge(mfcc_vowel_o_df, on = 'id')

final_Df = test_main_df.merge(mfcc_main_df,on='id')
index = final_Df['id']
# print(index)
label_dict ={0: 'healthy',
             1: 'no_resp_illness_exposed',
             2: 'positive_asymp',
             3: 'positive_mild',
             4: 'positive_moderate',
             5: 'recovered_full',
             6: 'resp_illness_not_identified'}
test_prob = example_dict.predict(final_Df.iloc[:,1:])
print(f'The {index} is:')
print(f'The person is diagonised as {[label_dict[0]]}.')

The 0    priyanka
Name: id, dtype: object is:
The person is diagonised as ['healthy'].


## Kneighbors Classifier testing

In [None]:
label_dict ={0: 'healthy',
             1: 'no_resp_illness_exposed',
             2: 'positive_asymp',
             3: 'positive_mild',
             4: 'positive_moderate',
             5: 'recovered_full',
             6: 'resp_illness_not_identified'}

In [None]:
audio_path = 'D:\\COVID_19_dataset\\final\\3VWhvtBFHKMgEKxlXzMYeg5NeBN2'
test_files = []
for r, d, f in os.walk(audio_path):    
    for file in f:
        if '.wav' in file:
            test_files.append(os.path.join(r,file))

test_breathing_deep = []
test_breathing_shallow = []
test_cough_heavy = []
test_cough_shallow = []
test_counting_fast = []
test_counting_normal = []
test_vowel_a =[]
test_vowel_e = []
test_vowel_o = []

extract_audio_feaures(test_breathing_deep,test_breathing_shallow, test_cough_heavy, test_cough_shallow,
                     test_counting_fast, test_counting_normal, test_vowel_a, test_vowel_e,test_vowel_o, test_files)


test_mfcc_breathing_deep = []
test_mfcc_breathing_shallow = []
test_mfcc_cough_heavy = []
test_mfcc_cough_shallow = []
test_mfcc_counting_fast = []
test_mfcc_counting_normal = []
test_mfcc_vowel_a =[]
test_mfcc_vowel_e = []
test_mfcc_vowel_o = []

extract_mfcc_features(test_mfcc_breathing_deep,test_mfcc_breathing_shallow, test_mfcc_cough_heavy,test_mfcc_cough_shallow, test_mfcc_counting_fast,
                     test_mfcc_counting_normal, test_mfcc_vowel_a, test_mfcc_vowel_e, test_mfcc_vowel_o, test_files )


path1 = "D:\\COVID_19_dataset\\csv_files\\test_breathing_deep.csv"
path2 = "D:\\COVID_19_dataset\\csv_files\\test_breathing_shallow.csv"
path3 = "D:\\COVID_19_dataset\\csv_files\\test_cough_heavy.csv"
path4 = "D:\\COVID_19_dataset\\csv_files\\test_cough_shallow.csv"
path5 = "D:\\COVID_19_dataset\\csv_files\\test_counting_fast.csv"
path6 = "D:\\COVID_19_dataset\\csv_files\\test_counting_normal.csv"
path7 = "D:\\COVID_19_dataset\\csv_files\\test_vowel_a.csv"
path8 = "D:\\COVID_19_dataset\\csv_files\\test_vowel_e.csv"
path9 = "D:\\COVID_19_dataset\\csv_files\\test_vowel_o.csv"

write_audio_features(path1,test_breathing_deep,'breathing_deep')
write_audio_features(path2,test_breathing_shallow,'breathing_shallow')
write_audio_features(path3,test_cough_heavy,'cough_heavy')
write_audio_features(path4,test_cough_shallow,'cough_shallow')
write_audio_features(path5,test_counting_fast,'counting_fast')
write_audio_features(path6,test_counting_normal,'counting_normal')
write_audio_features(path7,test_vowel_a,'vowel_a')
write_audio_features(path8,test_vowel_e,'vowel_e')
write_audio_features(path9,test_vowel_o,'vowel_o')

#write mfcc features
path1 = "D:\\COVID_19_dataset\\csv_files\\test_mfcc_breathing_deep.csv"
path2 = "D:\\COVID_19_dataset\\csv_files\\test_mfcc_breathing_shallow.csv"
path3 = "D:\\COVID_19_dataset\\csv_files\\test_mfcc_cough_heavy.csv"
path4 = "D:\\COVID_19_dataset\\csv_files\\test_mfcc_cough_shallow.csv"
path5 = "D:\\COVID_19_dataset\\csv_files\\test_mfcc_counting_fast.csv"
path6 = "D:\\COVID_19_dataset\\csv_files\\test_mfcc_counting_normal.csv"
path7 = "D:\\COVID_19_dataset\\csv_files\\test_mfcc_vowel_a.csv"
path8 = "D:\\COVID_19_dataset\\csv_files\\test_mfcc_vowel_e.csv"
path9 = "D:\\COVID_19_dataset\\csv_files\\test_mfcc_vowel_o.csv"

write_audio_features(path1,test_mfcc_breathing_deep,'mfcc_breathing_deep',mfcc=True)
write_audio_features(path2,test_mfcc_breathing_shallow,'mfcc_breathing_shallow',mfcc=True)
write_audio_features(path3,test_mfcc_cough_heavy,'mfcc_cough_heavy',mfcc=True)
write_audio_features(path4,test_mfcc_cough_shallow,'mfcc_cough_shallow',mfcc=True)
write_audio_features(path5,test_mfcc_counting_fast,'mfcc_counting_fast',mfcc=True)
write_audio_features(path6,test_mfcc_counting_normal,'mfcc_counting_normal',mfcc=True)
write_audio_features(path7,test_mfcc_vowel_a,'mfcc_vowel_a',mfcc=True)
write_audio_features(path8,test_mfcc_vowel_e,'mfcc_vowel_e',mfcc=True)
write_audio_features(path9,test_mfcc_vowel_o,'mfcc_vowel_o',mfcc=True)

brething_deep_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_breathing_deep.csv")
brething_shallow_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_breathing_shallow.csv")
cough_heavy_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_cough_heavy.csv")
cough_shallow_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_cough_shallow.csv")
counting_fast_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_counting_fast.csv")
counting_normal_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_counting_normal.csv")
vowel_a_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_vowel_a.csv")
vowel_e_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_vowel_e.csv")
vowel_o_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_vowel_o.csv")

test_main_df = brething_deep_df.merge(brething_shallow_df,on='id')
test_main_df = test_main_df.merge(cough_heavy_df, on='id')
test_main_df = test_main_df.merge(cough_shallow_df, on='id')
test_main_df = test_main_df.merge(counting_fast_df, on = 'id')
test_main_df = test_main_df.merge(counting_normal_df, on = 'id')
test_main_df = test_main_df.merge(vowel_a_df, on = 'id')
test_main_df = test_main_df.merge(vowel_e_df, on = 'id')
test_main_df = test_main_df.merge(vowel_o_df, on = 'id')
test_main_df.head()

#merge mfcc feature files
mfcc_brething_deep_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_mfcc_breathing_deep.csv")
mfcc_brething_shallow_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_mfcc_breathing_shallow.csv")
mfcc_cough_heavy_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_mfcc_cough_heavy.csv")
mfcc_cough_shallow_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_mfcc_cough_shallow.csv")
mfcc_counting_fast_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_mfcc_counting_fast.csv")
mfcc_counting_normal_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_mfcc_counting_normal.csv")
mfcc_vowel_a_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_mfcc_vowel_a.csv")
mfcc_vowel_e_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_mfcc_vowel_e.csv")
mfcc_vowel_o_df = pd.read_csv("D:\\COVID_19_dataset\\csv_files\\test_mfcc_vowel_o.csv")

mfcc_main_df = mfcc_brething_deep_df.merge(mfcc_brething_shallow_df,on='id')
mfcc_main_df = mfcc_main_df.merge(mfcc_cough_heavy_df, on='id')
mfcc_main_df = mfcc_main_df.merge(mfcc_cough_shallow_df, on='id')
mfcc_main_df = mfcc_main_df.merge(mfcc_counting_fast_df, on = 'id')
mfcc_main_df = mfcc_main_df.merge(mfcc_cough_shallow_df, on = 'id')
mfcc_main_df = mfcc_main_df.merge(mfcc_vowel_a_df, on = 'id')
mfcc_main_df = mfcc_main_df.merge(mfcc_vowel_e_df, on = 'id')
mfcc_main_df = mfcc_main_df.merge(mfcc_vowel_o_df, on = 'id')

final_Df = test_main_df.merge(mfcc_main_df,on='id')
index = final_Df['id']
# print(index)
test_prob = pickel_knn_model.predict(final_Df.iloc[:,1:])
# print(f 'The {index} is:')
# print(test_prob)
print(f'The person is diagonised as {label_dict[test_prob[0]]}.')