# Import

In [1]:
import random
import pandas as pd
import numpy as np
import os
import librosa

from tqdm.auto import tqdm

# from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings(action='ignore') 

In [2]:
import jinja2

from pycaret.classification import *

# Hyperparameter Setting

In [3]:
# Global settings read by the audio-loading, feature-extraction and seeding cells.
CFG = dict(
    SR=16000,    # sampling rate handed to librosa.load
    N_MFCC=39,   # number of MFCC vectors to extract per file
    SEED=1209,   # value handed to seed_everything
)

# Fixed Random-Seed

In [4]:
def seed_everything(seed):
    """Apply one seed to every random source this notebook controls.

    Args:
        seed: Integer seed. It is stored (as a string) in the
            ``PYTHONHASHSEED`` environment variable and used to seed both
            Python's ``random`` module and NumPy's global RNG.

    Returns:
        None.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

seed_everything(CFG['SEED']) # Fix the random seed using the configured value

# Data Pre-Processing 1

In [5]:
# Load the train/test metadata tables; their 'id' column is later used to
# locate each sample's wav file.
train_df = pd.read_csv('./train_data.csv')
test_df = pd.read_csv('./test_data.csv')

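train data 대부분의 샘플이 8초 이하이므로, 모든 데이터의 길이를 16000(SR) * 8(초) = 128000 샘플로 고정합니다. (주의: 아래 주석에 기록된 샘플 수는 `librosa.get_duration(y)`를 `sr` 인자 없이 호출해 얻은 값으로 보이며, 이 경우 librosa 기본값인 22050 Hz 기준으로 길이가 계산되어 16000 Hz로 로드한 신호의 실제 길이보다 짧게 측정됩니다. 따라서 8초 기준은 재확인이 필요합니다.)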

In [6]:
def timeCheck(df, data_type, root_path):
    """Measure the duration, in seconds, of every wav file listed in ``df``.

    Args:
        df: DataFrame with an ``id`` column; each id is zero-padded to five
            digits to form the file name ``<id>.wav``.
        data_type: Sub-folder of ``root_path`` that holds the files
            (e.g. ``'train'``).
        root_path: Root folder of the wav dataset.

    Returns:
        list[float]: One duration per row of ``df``, in the order of ``df['id']``.
    """
    root_folder = os.path.join(root_path, data_type)
    duration_list = []

    for uid in tqdm(df['id']):
        path = os.path.join(root_folder, str(uid).zfill(5)+'.wav')
        # Load (and resample) the wav file with librosa at the configured rate.
        y, sr = librosa.load(path, sr=CFG['SR'])
        # Pass the actual sampling rate: without `sr`, get_duration falls back
        # to its own default rate and the result is scaled by CFG['SR'] / default.
        # Keyword arguments are also required by recent librosa releases.
        duration_list.append(librosa.get_duration(y=y, sr=sr))

    return duration_list

def printTime(time_ary):
    """Print, for each threshold from 3 to 9 seconds, how many durations are at or below it.

    Args:
        time_ary: Iterable of durations in seconds (as returned by ``timeCheck``).

    Returns:
        None. One line per threshold is written to stdout.
    """
    for limit in range(3, 10):
        # Cumulative count: every duration not exceeding the current threshold.
        count = sum(1 for duration in time_ary if duration <= limit)
        print(f'duration {limit} sec 이하의 샘플 수 => {count}')
        
# Root folder of the wav files; meant to be passed as `root_path` to timeCheck.
path = './wav_dataset'

In [7]:
# time_ary = timeCheck(train_df, 'train', path)
# printTime(time_ary)

#duration 3 sec 이하의 샘플 수 => 317
#duration 4 sec 이하의 샘플 수 => 640
#duration 5 sec 이하의 샘플 수 => 983
#duration 6 sec 이하의 샘플 수 => 1233
#duration 7 sec 이하의 샘플 수 => 1645
#duration 8 sec 이하의 샘플 수 => 3795
#duration 9 sec 이하의 샘플 수 => 3796

In [8]:
def get_mfcc_feature(df, data_type, save_path):
    """Extract per-file mean MFCC features and save them next to the metadata.

    Each wav file is loaded at ``CFG['SR']``, padded or truncated to 8 seconds,
    and converted to ``CFG['N_MFCC']`` MFCC coefficients; the mean of every
    coefficient over time becomes one feature column (``mfcc_1`` ... ``mfcc_N``).

    Args:
        df: Metadata DataFrame with an ``id`` column; each id is zero-padded
            to five digits to form the file name ``<id>.wav``.
        data_type: Sub-folder of ``./wav_dataset`` holding the files
            (e.g. ``'train'`` or ``'test'``).
        save_path: CSV path for the combined table. If it already exists the
            function prints a message and returns without recomputing.

    Returns:
        None. The result is written to ``save_path``.
    """
    # Data folder path
    root_folder = './wav_dataset'
    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        return

    # Loop-invariant values, computed once.
    split_folder = os.path.join(root_folder, data_type)
    target_length = CFG['SR'] * 8  # 8 seconds of samples (128000 at 16 kHz)
    column_names = ['mfcc_'+str(x) for x in range(1, CFG['N_MFCC']+1)]

    features = []
    for uid in tqdm(df['id']):
        path = os.path.join(split_folder, str(uid).zfill(5)+'.wav')

        # Load the wav file with librosa and force a fixed length.
        # `size` is passed by keyword because recent librosa releases no
        # longer accept it positionally.
        y, sr = librosa.load(path, sr=CFG['SR'])
        y = librosa.util.fix_length(y, size=target_length)

        # Extract the MFCCs with librosa.
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])

        # Use the time-average of each MFCC coefficient as a feature.
        features.append([np.mean(coefficient) for coefficient in mfcc])

    # Append the audio features to the self-diagnosis metadata. Reusing
    # df.index keeps rows aligned even when df does not carry a default
    # 0..n-1 index (concat along axis=1 aligns on index labels).
    mfcc_df = pd.DataFrame(features, columns=column_names, index=df.index)
    df = pd.concat([df, mfcc_df], axis=1)
    df.to_csv(save_path, index=False)
    print('Done.')

In [9]:

# Extract MFCC features for both splits, or reuse the CSVs if they already exist.
# The "(16000,39)" suffix mirrors the current CFG['SR'] and CFG['N_MFCC'];
# get_mfcc_feature skips extraction when the file is present, so rename or
# delete these files whenever CFG changes.
get_mfcc_feature(train_df, 'train', './train_mfcc_data(16000,39).csv')
get_mfcc_feature(test_df, 'test', './test_mfcc_data(16000,39).csv')


  0%|          | 0/3805 [00:00<?, ?it/s]

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Done.


  0%|          | 0/5732 [00:00<?, ?it/s]

Done.


In [10]:
# train_mfcc_data = pd.read_csv('./train_mfcc_data.csv')
# test_mfcc_data = pd.read_csv('./test_mfcc_data.csv')
# get_mfcc_feature2('unlabeled', './unlabeled_mfcc_data.csv')

# Data Pre-Processing 2

In [11]:
# Load the training data that combines each wav file's MFCC features with the status information.
train_df = pd.read_csv('./train_mfcc_data(16000,39).csv')

# (disabled) Split the training data into x (model input) and y (label).
# train_x = train_df.drop(columns=['id', 'covid19'])
# train_y = train_df['covid19']

In [12]:
def onehot_encoding(ohe, x):
    """Replace the ``gender`` column of ``x`` with one-hot indicator columns.

    Args:
        ohe: One-hot encoder already fitted on the training data; must expose
            ``transform`` and ``categories_``.
        x: DataFrame containing a ``gender`` column.

    Returns:
        A new DataFrame with ``gender`` removed and one column per category in
        ``ohe.categories_[0]`` appended. ``x`` itself is not modified.
    """
    encoded = ohe.transform(x['gender'].values.reshape(-1,1))
    # Tolerate encoders configured for sparse output.
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()
    # Reuse x's index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign rows (and add NaN rows) whenever x has been
    # filtered, shuffled or otherwise carries a non-default index.
    encoded_df = pd.DataFrame(encoded, columns=ohe.categories_[0], index=x.index)
    return pd.concat([x.drop(columns=['gender']), encoded_df], axis=1)

In [13]:
# The 'gender' column needs extra preprocessing -> apply OneHotEncoder.
# The encoder is fitted on the training data only and reused for the test data.
# NOTE(review): `sparse=False` is the older scikit-learn spelling (newer
# releases call it `sparse_output`) — confirm against the installed version.
ohe = OneHotEncoder(sparse=False)
ohe.fit(train_df['gender'].values.reshape(-1,1))
train_x = onehot_encoding(ohe, train_df)

In [14]:
# Show the encoded training frame ('gender' replaced by one indicator column per category).
train_x

Unnamed: 0,id,age,respiratory_condition,fever_or_muscle_pain,covid19,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,mfcc_14,mfcc_15,mfcc_16,mfcc_17,mfcc_18,mfcc_19,mfcc_20,mfcc_21,mfcc_22,mfcc_23,mfcc_24,mfcc_25,mfcc_26,mfcc_27,mfcc_28,mfcc_29,mfcc_30,mfcc_31,mfcc_32,mfcc_33,mfcc_34,mfcc_35,mfcc_36,mfcc_37,mfcc_38,mfcc_39,female,male,other
0,1,24,0,1,0,-359.22797,18.725014,-12.463950,-4.104031,-6.273463,-4.566713,-20.701300,-6.922103,-3.791822,2.248050,-5.412954,7.048406,1.235090,3.280158,-5.502744,0.167365,-4.493169,-0.024983,-3.117989,-0.377527,-0.475275,1.558049,-1.643940,1.505849,-0.721705,1.419939,-0.208319,1.542708,-2.935635,0.760228,-0.955219,-0.950241,-1.477706,-0.621423,-1.384570,0.727923,-0.929843,-0.003780,-1.214787,1.0,0.0,0.0
1,2,51,0,0,0,-492.53705,16.620646,-0.525034,-2.884007,1.335213,-0.035296,-5.307189,0.167285,-1.674905,2.779286,-3.689295,1.401150,-3.563730,0.826246,-3.597273,1.188313,-3.704741,-0.677987,-0.264512,0.112274,-1.070155,0.739694,-2.228907,0.373945,-2.121927,-0.562728,-2.234932,0.210999,-0.429721,-0.549178,-0.848390,0.338460,-0.934613,0.334736,-1.014041,-0.243517,-1.470395,-0.239411,-0.960995,0.0,1.0,0.0
2,3,22,0,0,0,-412.76060,55.148693,-26.977858,-4.199242,-16.537760,0.392552,-10.750298,-11.334574,-9.289491,3.190088,-8.900794,1.303755,-1.770142,-4.133532,-4.566784,-2.849132,-2.592480,-2.623648,-6.283154,0.329315,-2.807865,-0.544678,-0.124472,-2.002747,3.138178,1.434716,-0.035400,-1.193442,-0.808593,-0.546102,1.009230,0.394788,0.177333,-0.914746,0.141120,1.593800,0.202482,-0.979783,-0.861908,0.0,1.0,0.0
3,4,29,1,0,0,-355.13740,47.546375,-8.298250,-1.240015,-22.798750,-9.486360,-11.353716,-10.458317,1.107141,-1.550276,-18.361511,6.080814,-10.487546,0.263172,-4.362597,-1.802061,-8.869318,0.316289,-3.559158,0.002278,-3.971929,0.744933,0.147008,4.655498,-2.403361,3.507704,1.695809,2.372703,0.257786,2.024651,-0.611229,1.742659,-2.070349,1.881100,-0.610811,1.431621,-1.562665,-0.590383,-2.107972,1.0,0.0,0.0
4,5,23,0,0,0,-534.59875,7.591898,-7.857255,2.601601,-5.693156,-0.315231,-6.495390,-0.566859,0.094364,1.222945,-5.531987,-1.075687,-4.183568,-0.838670,-2.826192,-1.197865,-3.599933,0.896516,-5.989251,2.291378,-2.225379,0.386904,-0.286982,-0.301649,-1.582595,0.332218,-1.063021,0.858535,-0.795381,1.196680,-0.536345,1.219049,0.112142,1.317136,-1.277842,1.077284,-0.039836,0.021139,-0.312544,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3800,3801,53,0,0,0,-366.78976,57.718204,-34.554462,2.829680,-0.606111,-24.384256,-6.933423,-18.031605,-9.040642,-6.633099,-8.397150,-0.618212,-12.168248,10.507279,-8.390300,2.208860,3.075952,1.087040,-1.306395,3.749767,-9.106146,2.189116,-0.372399,1.230416,-4.279350,0.049114,0.953728,3.304498,-3.458012,2.211299,-3.332170,-2.292739,-1.405284,-2.437312,-1.705466,-2.851438,-2.885133,1.138393,-0.897526,0.0,1.0,0.0
3801,3802,25,0,0,0,-533.21250,31.648153,-5.508970,0.176436,-7.209403,-6.087759,-8.666494,-8.680750,-4.511870,-1.273499,-6.949265,1.292431,-4.917324,1.877427,-5.232191,-0.764924,-3.642919,-0.877957,-4.808671,-2.881746,-4.037108,-0.380661,-1.285087,2.101684,0.460024,2.564777,1.221083,1.468569,-0.207221,1.569732,1.172312,2.426375,1.471325,-0.204833,-0.302684,0.184483,0.487367,0.796958,-0.102586,0.0,1.0,0.0
3802,3803,26,0,0,0,-323.98944,65.870380,12.097858,32.782760,22.395697,10.463599,5.662384,-1.031516,2.429032,3.665551,-16.292486,-1.863466,-5.909410,-7.910620,-8.955866,-5.463309,-8.408904,-4.378131,-10.840342,-2.235950,-6.425963,-2.085424,-2.765914,-2.625386,-2.795446,-1.579491,-3.485840,-1.285817,0.292844,-1.143106,-1.196832,-0.949310,-0.750538,-1.265775,-1.271012,1.222079,-2.045656,-1.664324,-0.389393,1.0,0.0,0.0
3803,3804,27,0,0,0,-160.00093,78.485880,-15.236269,-5.666398,-26.689854,4.183416,-14.796123,-11.879127,-15.297010,14.701807,-12.235196,5.849805,-15.934942,3.287256,-5.784975,0.485237,-15.036746,-2.137737,-5.341454,0.580341,-5.738627,1.056798,-8.478350,6.095975,-5.070590,0.978247,-2.526058,-0.869069,-4.493664,0.067069,-5.668485,-1.019330,-4.683699,-2.660207,-3.907937,-2.579902,-3.493914,0.503437,-2.852232,1.0,0.0,0.0


In [15]:
# Drop the sample id and the first MFCC coefficient before modelling.
# NOTE(review): train_x is reassigned from itself, so re-running this cell
# raises KeyError unless the encoding cell is re-run first.
train_x = train_x.drop(columns=['id','mfcc_1'])
train_x

Unnamed: 0,age,respiratory_condition,fever_or_muscle_pain,covid19,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,mfcc_14,mfcc_15,mfcc_16,mfcc_17,mfcc_18,mfcc_19,mfcc_20,mfcc_21,mfcc_22,mfcc_23,mfcc_24,mfcc_25,mfcc_26,mfcc_27,mfcc_28,mfcc_29,mfcc_30,mfcc_31,mfcc_32,mfcc_33,mfcc_34,mfcc_35,mfcc_36,mfcc_37,mfcc_38,mfcc_39,female,male,other
0,24,0,1,0,18.725014,-12.463950,-4.104031,-6.273463,-4.566713,-20.701300,-6.922103,-3.791822,2.248050,-5.412954,7.048406,1.235090,3.280158,-5.502744,0.167365,-4.493169,-0.024983,-3.117989,-0.377527,-0.475275,1.558049,-1.643940,1.505849,-0.721705,1.419939,-0.208319,1.542708,-2.935635,0.760228,-0.955219,-0.950241,-1.477706,-0.621423,-1.384570,0.727923,-0.929843,-0.003780,-1.214787,1.0,0.0,0.0
1,51,0,0,0,16.620646,-0.525034,-2.884007,1.335213,-0.035296,-5.307189,0.167285,-1.674905,2.779286,-3.689295,1.401150,-3.563730,0.826246,-3.597273,1.188313,-3.704741,-0.677987,-0.264512,0.112274,-1.070155,0.739694,-2.228907,0.373945,-2.121927,-0.562728,-2.234932,0.210999,-0.429721,-0.549178,-0.848390,0.338460,-0.934613,0.334736,-1.014041,-0.243517,-1.470395,-0.239411,-0.960995,0.0,1.0,0.0
2,22,0,0,0,55.148693,-26.977858,-4.199242,-16.537760,0.392552,-10.750298,-11.334574,-9.289491,3.190088,-8.900794,1.303755,-1.770142,-4.133532,-4.566784,-2.849132,-2.592480,-2.623648,-6.283154,0.329315,-2.807865,-0.544678,-0.124472,-2.002747,3.138178,1.434716,-0.035400,-1.193442,-0.808593,-0.546102,1.009230,0.394788,0.177333,-0.914746,0.141120,1.593800,0.202482,-0.979783,-0.861908,0.0,1.0,0.0
3,29,1,0,0,47.546375,-8.298250,-1.240015,-22.798750,-9.486360,-11.353716,-10.458317,1.107141,-1.550276,-18.361511,6.080814,-10.487546,0.263172,-4.362597,-1.802061,-8.869318,0.316289,-3.559158,0.002278,-3.971929,0.744933,0.147008,4.655498,-2.403361,3.507704,1.695809,2.372703,0.257786,2.024651,-0.611229,1.742659,-2.070349,1.881100,-0.610811,1.431621,-1.562665,-0.590383,-2.107972,1.0,0.0,0.0
4,23,0,0,0,7.591898,-7.857255,2.601601,-5.693156,-0.315231,-6.495390,-0.566859,0.094364,1.222945,-5.531987,-1.075687,-4.183568,-0.838670,-2.826192,-1.197865,-3.599933,0.896516,-5.989251,2.291378,-2.225379,0.386904,-0.286982,-0.301649,-1.582595,0.332218,-1.063021,0.858535,-0.795381,1.196680,-0.536345,1.219049,0.112142,1.317136,-1.277842,1.077284,-0.039836,0.021139,-0.312544,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3800,53,0,0,0,57.718204,-34.554462,2.829680,-0.606111,-24.384256,-6.933423,-18.031605,-9.040642,-6.633099,-8.397150,-0.618212,-12.168248,10.507279,-8.390300,2.208860,3.075952,1.087040,-1.306395,3.749767,-9.106146,2.189116,-0.372399,1.230416,-4.279350,0.049114,0.953728,3.304498,-3.458012,2.211299,-3.332170,-2.292739,-1.405284,-2.437312,-1.705466,-2.851438,-2.885133,1.138393,-0.897526,0.0,1.0,0.0
3801,25,0,0,0,31.648153,-5.508970,0.176436,-7.209403,-6.087759,-8.666494,-8.680750,-4.511870,-1.273499,-6.949265,1.292431,-4.917324,1.877427,-5.232191,-0.764924,-3.642919,-0.877957,-4.808671,-2.881746,-4.037108,-0.380661,-1.285087,2.101684,0.460024,2.564777,1.221083,1.468569,-0.207221,1.569732,1.172312,2.426375,1.471325,-0.204833,-0.302684,0.184483,0.487367,0.796958,-0.102586,0.0,1.0,0.0
3802,26,0,0,0,65.870380,12.097858,32.782760,22.395697,10.463599,5.662384,-1.031516,2.429032,3.665551,-16.292486,-1.863466,-5.909410,-7.910620,-8.955866,-5.463309,-8.408904,-4.378131,-10.840342,-2.235950,-6.425963,-2.085424,-2.765914,-2.625386,-2.795446,-1.579491,-3.485840,-1.285817,0.292844,-1.143106,-1.196832,-0.949310,-0.750538,-1.265775,-1.271012,1.222079,-2.045656,-1.664324,-0.389393,1.0,0.0,0.0
3803,27,0,0,0,78.485880,-15.236269,-5.666398,-26.689854,4.183416,-14.796123,-11.879127,-15.297010,14.701807,-12.235196,5.849805,-15.934942,3.287256,-5.784975,0.485237,-15.036746,-2.137737,-5.341454,0.580341,-5.738627,1.056798,-8.478350,6.095975,-5.070590,0.978247,-2.526058,-0.869069,-4.493664,0.067069,-5.668485,-1.019330,-4.683699,-2.660207,-3.907937,-2.579902,-3.493914,0.503437,-2.852232,1.0,0.0,0.0


# Pycaret

In [16]:
# Columns that are not continuous measurements: the target plus the binary / one-hot flags.
cat_col = ['covid19', 'respiratory_condition', 'fever_or_muscle_pain', 'female', 'male', 'other']
# Every remaining column (age and the MFCC means) is declared numeric.
# train_size = 0.999 keeps nearly all rows for training, leaving only a handful held out.
# session_id reuses CFG['SEED'] so the experiment seed stays in sync with the
# seed applied by seed_everything instead of duplicating the literal.
clf = setup(train_x, preprocess = False, train_size = 0.999,
           target = 'covid19', numeric_features = list(train_x.drop(columns = cat_col).columns),
            silent = True, session_id = CFG['SEED'], fold_shuffle = True)

Unnamed: 0,Description,Value
0,session_id,1209
1,Target,covid19
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(3805, 45)"
5,Missing Values,False
6,Numeric Features,39
7,Categorical Features,5
8,Transformed Train Set,"(3801, 44)"
9,Transformed Test Set,"(4, 44)"


In [17]:
# Cross-validate the candidate classifiers with 5 folds, rank them by F1
# and keep the three best (metrics rounded to 3 decimals in the report).
top3_models = compare_models(fold = 5, round = 3, sort = 'F1', n_select = 3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
qda,Quadratic Discriminant Analysis,0.886,0.599,0.232,0.265,0.247,0.185,0.186,0.01
nb,Naive Bayes,0.837,0.68,0.314,0.194,0.238,0.154,0.16,0.158
lda,Linear Discriminant Analysis,0.908,0.684,0.141,0.341,0.198,0.158,0.176,0.012
dt,Decision Tree Classifier,0.84,0.524,0.147,0.117,0.13,0.044,0.044,0.046
svm,SVM - Linear Kernel,0.913,0.0,0.056,0.411,0.083,0.063,0.097,0.012
ada,Ada Boost Classifier,0.912,0.612,0.046,0.237,0.075,0.052,0.072,0.194
gbc,Gradient Boosting Classifier,0.912,0.67,0.033,0.213,0.056,0.036,0.054,0.534
lr,Logistic Regression,0.917,0.678,0.029,0.364,0.054,0.041,0.082,0.356
knn,K Neighbors Classifier,0.915,0.535,0.02,0.209,0.036,0.022,0.041,0.184
et,Extra Trees Classifier,0.919,0.67,0.016,0.5,0.031,0.026,0.076,0.068


# Train

In [18]:
# Tune each of the selected models for F1 (500 search iterations per model).
models = [
    tune_model(candidate, optimize='F1', choose_better=True, n_iter=500)
    for candidate in top3_models
]

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9029,0.6147,0.0968,0.25,0.1395,0.0986,0.1112
1,0.9053,0.697,0.1333,0.2857,0.1818,0.1385,0.15
2,0.9132,0.6879,0.1,0.3333,0.1538,0.1218,0.1469
3,0.9053,0.6843,0.1667,0.3125,0.2174,0.1719,0.1816
4,0.9053,0.7294,0.1,0.25,0.1429,0.1024,0.1145
5,0.9079,0.7263,0.2258,0.3889,0.2857,0.2402,0.2504
6,0.9053,0.6392,0.0968,0.2727,0.1429,0.1046,0.1206
7,0.9263,0.7343,0.1613,0.7143,0.2632,0.2403,0.3167
8,0.9079,0.6757,0.1935,0.375,0.2553,0.2115,0.2247
9,0.9237,0.6978,0.2581,0.5714,0.3556,0.3211,0.35


In [19]:
# Combine the tuned models into a single blended estimator.
voting = blend_models(models, optimize = 'F1')
# Tune the blended estimator for F1.
# NOTE(review): choose_better=True presumably keeps the untuned estimator when
# tuning does not improve the metric — confirm in the pycaret docs.
voting = tune_model(voting,
                   optimize = 'F1',
                   choose_better = True,
                   n_iter = 500)

# Finalize the estimator used for inference below.
# NOTE(review): presumably refits on all data passed to setup — confirm.
voting = finalize_model(voting)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8871,0.6006,0.2258,0.2692,0.2456,0.1851,0.186
1,0.8868,0.6786,0.2333,0.2593,0.2456,0.1846,0.1849
2,0.9079,0.6578,0.1667,0.3333,0.2222,0.179,0.1912
3,0.9026,0.749,0.3667,0.3793,0.3729,0.3201,0.3202
4,0.8737,0.6969,0.2,0.2,0.2,0.1314,0.1314
5,0.8921,0.7029,0.3548,0.3438,0.3492,0.2904,0.2904
6,0.8711,0.6068,0.129,0.1538,0.1404,0.0712,0.0716
7,0.9211,0.7401,0.2903,0.5294,0.375,0.3367,0.3541
8,0.8895,0.6625,0.2581,0.2963,0.2759,0.2163,0.2169
9,0.9132,0.739,0.3226,0.4545,0.3774,0.3321,0.3378


In [20]:
# model = MLPClassifier(random_state=CFG['SEED']) # Sklearn에서 제공하는 Multi-layer Perceptron classifier 사용
# model.fit(train_x, train_y) # Model Train

# Inference

In [21]:
# Apply to the test data the same preprocessing used for the training data:
# load the cached MFCC table, one-hot encode 'gender', drop the unused columns.
# To avoid data leakage, the encoder fitted on the training data only is reused.
test_x = (
    onehot_encoding(ohe, pd.read_csv('./test_mfcc_data(16000,39).csv'))
    .drop(columns=['id','mfcc_1'])
)

In [22]:
# Model inference: predict a label for every row of the test features.
pred = voting.predict(test_x)
pred

array([0, 0, 0, ..., 0, 0, 1])

In [23]:
# # 정확도

# print("훈련 세트 정확도 : {:.3f}".format(voting.score(train_x, train_y)))
# print("테스트 세트 정확도 : {:.3f}".format(voting.score(test_x, preds)))

# Submission

In [24]:
# Fill the sample submission's 'covid19' column with the predictions and save it.
submission = pd.read_csv('./sample_submission.csv').assign(covid19=pred)
submission.to_csv('./submit_220711_2.csv', index=False)







