In [21]:
import numpy as np
import pandas as pd
import librosa
from librosa import display
import os
import matplotlib.pyplot as plt

In [86]:
from sklearn.preprocessing import LabelEncoder, scale, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from keras.utils import to_categorical
from keras.models import Sequential, Model, load_model
from keras.layers import BatchNormalization
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D, UpSampling2D, Input 
from tensorflow.python.keras import utils
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras import optimizers
from keras.regularizers import l1
from datetime import datetime
from sklearn import metrics

In [23]:
df=pd.read_csv('../input/covid19-cough-audio-classification/metadata_compiled.csv')
df.head()

Unnamed: 0,uuid,datetime,cough_detected,SNR,latitude,longitude,age,gender,respiratory_condition,fever_muscle_pain,...,quality_4,cough_type_4,dyspnea_4,wheezing_4,stridor_4,choking_4,congestion_4,nothing_4,diagnosis_4,severity_4
0,00014dcc-0f06-4c27-8c7b-737b18a2cf4c,2020-11-25T18:58:50.488301+00:00,0.0155,7.326171,48.9,2.4,,,,,...,,,,,,,,,,
1,00039425-7f3a-42aa-ac13-834aaa2b6b92,2020-04-13T21:30:59.801831+00:00,0.9609,16.151433,31.3,34.8,15.0,male,False,False,...,,,,,,,,,,
2,0007c6f1-5441-40e6-9aaf-a761d8f2da3b,2020-10-18T15:38:38.205870+00:00,0.1643,16.217201,,,46.0,female,False,False,...,,,,,,,,,,
3,0009eb28-d8be-4dc1-92bb-907e53bc5c7a,2020-04-12T04:02:18.159383+00:00,0.9301,20.146058,40.0,-75.1,34.0,male,True,False,...,,,,,,,,,,
4,0012c608-33d0-4ef7-bde3-75a0b1a0024e,2020-04-15T01:03:59.029326+00:00,0.0482,0.0,-16.5,-71.5,,,,,...,,,,,,,,,,


In [24]:
print(df['status'].isnull().value_counts())
print(df['age'].isnull().value_counts())
print(df['respiratory_condition'].isnull().value_counts())
print(df['fever_muscle_pain'].isnull().value_counts())
print(df['gender'].isnull().value_counts())
print(df['status'].unique())

status
False    16224
True     11326
Name: count, dtype: int64
age
False    15218
True     12332
Name: count, dtype: int64
respiratory_condition
False    16224
True     11326
Name: count, dtype: int64
fever_muscle_pain
False    16224
True     11326
Name: count, dtype: int64
gender
False    16224
True     11326
Name: count, dtype: int64
[nan 'healthy' 'COVID-19' 'symptomatic']


In [25]:
# Drop NAs for status and age
print('Entries before: ', df.shape[0])
df = df[~df[['status', 'age', 'respiratory_condition', 'fever_muscle_pain', 'gender']].isnull().any(axis=1)]
print('Entries after: ', df.shape[0])

Entries before:  27550
Entries after:  15218


In [118]:
# Encode for binary traits
df.loc[:, 'fever_muscle_pain'] = df['fever_muscle_pain'].apply(lambda x: -100 if x == False else 100)
df.loc[:, 'gender'] = df['gender'].apply(lambda x: -100 if x == 'female' else 100)
df.loc[:, 'respiratory_condition'] = df['respiratory_condition'].apply(lambda x: -100 if x == False else 100)

In [103]:
healthy_df = df[df.status == 'healthy']
symp_df = df[df.status == 'sympotmatic']
covid_df = df[df.status == 'COVID-19']

In [66]:
def mfcc_extract(file):
    # Extract MFCC from audio file
    audio, sr = librosa.load(file, res_type='soxr_hq')
    mfcc = librosa.feature.mfcc(y=audio,sr=sr,n_mfcc=100)
    scaled_mfcc =np.mean(mfcc.T,axis=0)
    return scaled_mfcc
    

In [67]:

import warnings
warnings.filterwarnings('ignore', '.*PySoundFile failed. Trying audioread instead*.', )
warnings.filterwarnings("ignore", category=FutureWarning)

def extract_cough_features():
    mfccs = []
    for i, row in df.iterrows():
        file = '../input/covid19-cough-audio-classification/'+row['uuid']+ '.webm'
        # Look for .ogg file if .webm isn't present
        if not os.path.exists(file):
            file = '../input/covid19-cough-audio-classification/'+row['uuid']+ '.ogg'
            if not os.path.exists(file):
                print('Skipping ' + row['uuid'] + ' at row ' + str(i))
                continue
        if i % 500 == 0:
            print(str(i) + ' files read')
        data = mfcc_extract(file)
        mfccs.append([row['uuid'], data])
    print('Done! ' + str(i) + ' files read.')
    return mfccs

mfccs = extract_cough_features()
        

500 files read
1000 files read
1500 files read
2000 files read
2500 files read
3000 files read
4000 files read
4500 files read
7000 files read
8500 files read
9000 files read
10000 files read
11500 files read
12000 files read
12500 files read
14000 files read
15000 files read
16000 files read
16500 files read
17000 files read
17500 files read
19500 files read
21000 files read
22500 files read
23000 files read
23500 files read
24500 files read
25000 files read
27500 files read
Done! 27548 files read.


In [104]:
mfcc_df = pd.DataFrame(mfccs, columns = ['uuid','mfcc'])
mfcc_df

Unnamed: 0,uuid,mfcc
0,00039425-7f3a-42aa-ac13-834aaa2b6b92,"[-563.3379, 18.238453, -2.3402798, 5.720763, -..."
1,0007c6f1-5441-40e6-9aaf-a761d8f2da3b,"[-736.4029, 55.818638, 9.053529, 1.899994, -12..."
2,0009eb28-d8be-4dc1-92bb-907e53bc5c7a,"[-278.87955, 100.88816, -25.86599, 15.782069, ..."
3,001328dc-ea5d-4847-9ccf-c5aa2a3f2d0f,"[-519.8622, 26.079721, -28.713432, 7.987691, -..."
4,001e2f19-d81c-4029-b33c-d2db56b23a4a,"[-524.2752, 17.533388, -1.9887484, 6.611181, -..."
...,...,...
15213,ffe5e2a4-ef67-464d-b1cd-b0e321f6a2dd,"[-275.19452, 93.39197, 0.5366596, -16.469807, ..."
15214,ffedc843-bfc2-4ad6-a749-2bc86bdac84a,"[-420.83243, 24.159922, -7.462644, -1.1091201,..."
15215,ffeea120-92a4-40f9-b692-c3865c7a983f,"[-509.1114, 29.669987, -13.902734, 3.884701, -..."
15216,fff13fa2-a725-49ef-812a-39c6cedda33d,"[-542.51886, 25.57454, -14.508806, 1.2672373, ..."


In [119]:
scaler = MinMaxScaler()
mfcc_df['scaled_mfcc'] = mfcc_df['mfcc'].apply(lambda x: scaler.fit_transform(x.reshape(-1, 1)).flatten())
mfcc_df

Unnamed: 0,uuid,mfcc,scaled_mfcc
0,00039425-7f3a-42aa-ac13-834aaa2b6b92,"[-563.3379, 18.238453, -2.3402798, 5.720763, -...","[0.0, 1.0, 0.9646156, 0.9784763, 0.95860326, 0..."
1,0007c6f1-5441-40e6-9aaf-a761d8f2da3b,"[-736.4029, 55.818638, 9.053529, 1.899994, -12...","[0.0, 0.99999994, 0.9409696, 0.9319399, 0.9134..."
2,0009eb28-d8be-4dc1-92bb-907e53bc5c7a,"[-278.87955, 100.88816, -25.86599, 15.782069, ...","[0.0, 1.0, 0.6662324, 0.77589965, 0.67740226, ..."
3,001328dc-ea5d-4847-9ccf-c5aa2a3f2d0f,"[-519.8622, 26.079721, -28.713432, 7.987691, -...","[0.0, 1.0, 0.89963555, 0.9668609, 0.93035656, ..."
4,001e2f19-d81c-4029-b33c-d2db56b23a4a,"[-524.2752, 17.533388, -1.9887484, 6.611181, -...","[0.0, 1.0, 0.9639686, 0.97984123, 0.96617883, ..."
...,...,...,...
15213,ffe5e2a4-ef67-464d-b1cd-b0e321f6a2dd,"[-275.19452, 93.39197, 0.5366596, -16.469807, ...","[0.0, 1.0, 0.74807733, 0.7019376, 0.73368365, ..."
15214,ffedc843-bfc2-4ad6-a749-2bc86bdac84a,"[-420.83243, 24.159922, -7.462644, -1.1091201,...","[0.0, 1.0, 0.92893684, 0.9432147, 0.9293083, 0..."
15215,ffeea120-92a4-40f9-b692-c3865c7a983f,"[-509.1114, 29.669987, -13.902734, 3.884701, -...","[0.0, 1.0, 0.9191273, 0.95214146, 0.9316142, 0..."
15216,fff13fa2-a725-49ef-812a-39c6cedda33d,"[-542.51886, 25.57454, -14.508806, 1.2672373, ...","[0.0, 1.0, 0.92944235, 0.9572125, 0.91856706, ..."


In [69]:
mfcc_df.to_csv('mfcc.csv', index=False)

In [120]:
df2 = df[['uuid','age','respiratory_condition','fever_muscle_pain','gender','status']]
total_df = df2.merge(mfcc_df, how='inner', on='uuid')
total_df

Unnamed: 0,uuid,age,respiratory_condition,fever_muscle_pain,gender,status,mfcc,scaled_mfcc
0,00039425-7f3a-42aa-ac13-834aaa2b6b92,15.0,100,100,100,healthy,"[-563.3379, 18.238453, -2.3402798, 5.720763, -...","[0.0, 1.0, 0.9646156, 0.9784763, 0.95860326, 0..."
1,0007c6f1-5441-40e6-9aaf-a761d8f2da3b,46.0,100,100,100,healthy,"[-736.4029, 55.818638, 9.053529, 1.899994, -12...","[0.0, 0.99999994, 0.9409696, 0.9319399, 0.9134..."
2,0009eb28-d8be-4dc1-92bb-907e53bc5c7a,34.0,100,100,100,healthy,"[-278.87955, 100.88816, -25.86599, 15.782069, ...","[0.0, 1.0, 0.6662324, 0.77589965, 0.67740226, ..."
3,001328dc-ea5d-4847-9ccf-c5aa2a3f2d0f,21.0,100,100,100,healthy,"[-519.8622, 26.079721, -28.713432, 7.987691, -...","[0.0, 1.0, 0.89963555, 0.9668609, 0.93035656, ..."
4,001e2f19-d81c-4029-b33c-d2db56b23a4a,20.0,100,100,100,healthy,"[-524.2752, 17.533388, -1.9887484, 6.611181, -...","[0.0, 1.0, 0.9639686, 0.97984123, 0.96617883, ..."
...,...,...,...,...,...,...,...,...
15213,ffe5e2a4-ef67-464d-b1cd-b0e321f6a2dd,26.0,100,100,100,healthy,"[-275.19452, 93.39197, 0.5366596, -16.469807, ...","[0.0, 1.0, 0.74807733, 0.7019376, 0.73368365, ..."
15214,ffedc843-bfc2-4ad6-a749-2bc86bdac84a,23.0,100,100,100,healthy,"[-420.83243, 24.159922, -7.462644, -1.1091201,...","[0.0, 1.0, 0.92893684, 0.9432147, 0.9293083, 0..."
15215,ffeea120-92a4-40f9-b692-c3865c7a983f,22.0,100,100,100,healthy,"[-509.1114, 29.669987, -13.902734, 3.884701, -...","[0.0, 1.0, 0.9191273, 0.95214146, 0.9316142, 0..."
15216,fff13fa2-a725-49ef-812a-39c6cedda33d,21.0,100,100,100,healthy,"[-542.51886, 25.57454, -14.508806, 1.2672373, ...","[0.0, 1.0, 0.92944235, 0.9572125, 0.91856706, ..."


In [121]:
# Factors
x = np.array(total_df[['age','gender','respiratory_condition','fever_muscle_pain']])
mfcc = np.array(total_df['scaled_mfcc'].tolist())
x_concat = np.concatenate((mfcc, x), axis=1)

# Classification
y = np.array(total_df['status'].tolist())

x_concat

array([[0.0, 1.0, 0.9646155834197998, ..., 100, 100, 100],
       [0.0, 0.9999999403953552, 0.9409695863723755, ..., 100, 100, 100],
       [0.0, 1.0, 0.6662324070930481, ..., 100, 100, 100],
       ...,
       [0.0, 1.0, 0.9191272854804993, ..., 100, 100, 100],
       [0.0, 1.0, 0.9294423460960388, ..., 100, 100, 100],
       [0.0, 1.0, 0.5592305064201355, ..., 100, 100, 100]], dtype=object)

In [122]:
#Label encoding

labelencoder=LabelEncoder()
y=to_categorical(labelencoder.fit_transform([str(t) for t in y]))
print(y.shape)
y

(15218, 3)


array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]], dtype=float32)

In [124]:
# Split train/test

x_train,x_test,y_train,y_test=train_test_split(x_concat,y,test_size=0.2,random_state=0)

In [125]:
x_train=np.array(x_train)
num_labels=y.shape[1]

In [126]:
model=Sequential()
# First layer
model.add(Dense(100,input_shape=(x_concat.shape[1],)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
# Second layer
model.add(Dense(200))
model.add(Activation('relu'))
model.add(Dropout(0.5))
# Third layer
model.add(Dense(100))
model.add(Activation('relu'))
model.add(Dropout(0.5))
# Final layer
model.add(Dense(num_labels))
model.add(Activation('softmax'))

model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_18 (Dense)            (None, 100)               10500     
                                                                 
 activation_16 (Activation)  (None, 100)               0         
                                                                 
 dropout_12 (Dropout)        (None, 100)               0         
                                                                 
 dense_19 (Dense)            (None, 200)               20200     
                                                                 
 activation_17 (Activation)  (None, 200)               0         
                                                                 
 dropout_13 (Dropout)        (None, 200)               0         
                                                                 
 dense_20 (Dense)            (None, 100)              

In [127]:
model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer='adam')

In [128]:
[print(i.shape, i.dtype) for i in model.inputs]
[print(o.shape, o.dtype) for o in model.outputs]
[print(l.name, l.input_shape, l.dtype) for l in model.layers]

x_train = np.asarray(x_train).astype(np.float32)
y_train = np.asarray(y_train).astype(np.float32)

x_test = np.asarray(x_test).astype(np.float32)
y_test = np.asarray(y_test).astype(np.float32)

(None, 104) <dtype: 'float32'>
(None, 3) <dtype: 'float32'>
dense_18 (None, 104) float32
activation_16 (None, 100) float32
dropout_12 (None, 100) float32
dense_19 (None, 100) float32
activation_17 (None, 200) float32
dropout_13 (None, 200) float32
dense_20 (None, 200) float32
activation_18 (None, 100) float32
dropout_14 (None, 100) float32
dense_21 (None, 100) float32
activation_19 (None, 3) float32


In [129]:
# Train model
warnings.filterwarnings("ignore", category=UserWarning)

num_epochs = 25
batch_size = 32
checkpointer = ModelCheckpoint(filepath='saved_models/audio_classification.hdf5', verbose=1, save_best_only=True)
start = datetime.now()
model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, validation_data=(x_test, y_test), callbacks=[checkpointer], verbose=1)
duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/25
Epoch 1: val_loss improved from inf to 0.82811, saving model to saved_models/audio_classification.hdf5
Epoch 2/25
Epoch 2: val_loss improved from 0.82811 to 0.74251, saving model to saved_models/audio_classification.hdf5
Epoch 3/25
Epoch 3: val_loss improved from 0.74251 to 0.73003, saving model to saved_models/audio_classification.hdf5
Epoch 4/25
Epoch 4: val_loss improved from 0.73003 to 0.69453, saving model to saved_models/audio_classification.hdf5
Epoch 5/25
Epoch 5: val_loss improved from 0.69453 to 0.65630, saving model to saved_models/audio_classification.hdf5
Epoch 6/25
Epoch 6: val_loss improved from 0.65630 to 0.65565, saving model to saved_models/audio_classification.hdf5
Epoch 7/25
Epoch 7: val_loss did not improve from 0.65565
Epoch 8/25
Epoch 8: val_loss improved from 0.65565 to 0.65493, saving model to saved_models/audio_classification.hdf5
Epoch 9/25
Epoch 9: val_loss did not improve from 0.65493
Epoch 10/25
Epoch 10: val_loss improved from 0.65493 to 0.6547

In [130]:
accuracy = model.evaluate(x_test, y_test, verbose=1)
print(accuracy[1])

0.7808803915977478


In [54]:
def ANN_covid_predict(file, info):
    # Info is supposed to be an array containing age, gender, respiratory condition, 
    # and fever_muscle_pain from the table in their original order
    mfcc = mfcc_extract(file)
    prediction_data = np.concatenate((mfcc, info), axis=0)
    input_array = np.asarray(prediction_data)
    input_reshaped = input_array.reshape(1,-1)
    input_reshaped.shape
    predictions = model.predict(input_reshaped)
    predict = str(predictions[0])
    predictions = predict.split()
    return('COVID-19: ' + str(predictions[0]) + '\n' + 'Healthy: ' + str(predictions[1]) + '\n' + 'Sympotmatic: ' + str(predictions[2]))