# Import

In [1]:
import random
import pandas as pd
import numpy as np
import os
import librosa

from tqdm.auto import tqdm

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings(action='ignore') 

In [2]:
import imblearn
import xgboost
import catboost

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from pycaret.classification import *

# Hyperparameter Setting

In [3]:
CFG = {
    'SR':22050,
    'N_MFCC':20, # MFCC 벡터를 추출할 개수
    'SEED':123
}

# Fixed Random-Seed

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(CFG['SEED']) # Seed 고정

# Data Pre-Processing 1

In [5]:
train_df = pd.read_csv('./train_data.csv')
test_df = pd.read_csv('./test_data.csv')

In [6]:
def get_mfcc_feature(df, data_type, save_path):
    # Data Folder path
    root_folder = './wav_dataset'
    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        return
    features = []
    for uid in tqdm(df['id']):
        root_path = os.path.join(root_folder, data_type)
        path = os.path.join(root_path, str(uid).zfill(5)+'.wav')

        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=CFG['SR'])
        
        # librosa패키지를 사용하여 mfcc 추출
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])

        y_feature = []
        # 추출된 MFCC들의 평균을 Feature로 사용
        for e in mfcc:
            y_feature.append(np.mean(e))
        features.append(y_feature)
    
    # 기존의 자가진단 정보를 담은 데이터프레임에 추출된 오디오 Feature를 추가
    mfcc_df = pd.DataFrame(features, columns=['mfcc_'+str(x) for x in range(1,CFG['N_MFCC']+1)])
    df = pd.concat([df, mfcc_df], axis=1)
    df.to_csv(save_path, index=False)
    print('Done.')

In [7]:
get_mfcc_feature(train_df, 'train', './train_mfcc_data20_123.csv')
get_mfcc_feature(test_df, 'test', './test_mfcc_data20_123.csv')

./train_mfcc_data20_123.csv is exist.
./test_mfcc_data20_123.csv is exist.


In [8]:
# train_mfcc_data = pd.read_csv('./train_mfcc_data.csv')
# test_mfcc_data = pd.read_csv('./test_mfcc_data.csv')
# get_mfcc_feature2('unlabeled', './unlabeled_mfcc_data.csv')

# Data Pre-Processing 2

In [9]:
# wav 파일의 MFCC Feature와 상태정보를 합친 학습데이터를 불러옵니다.
train_df = pd.read_csv('./train_mfcc_data20_123.csv')

# 학습데이터를 모델의 input으로 들어갈 x와 label로 사용할 y로 분할
train_x = train_df.drop(columns=['id', 'covid19'])
train_y = train_df['covid19']

In [10]:
def onehot_encoding(ohe, x):
    # 학습데이터로 부터 fit된 one-hot encoder (ohe)를 받아 transform 시켜주는 함수
    encoded = ohe.transform(x['gender'].values.reshape(-1,1))
    encoded_df = pd.DataFrame(encoded, columns=ohe.categories_[0])
    x = pd.concat([x.drop(columns=['gender']), encoded_df], axis=1)
    return x

In [11]:
# 'gender' column의 경우 추가 전처리가 필요 -> OneHotEncoder 적용
ohe = OneHotEncoder(sparse=False)
ohe.fit(train_df['gender'].values.reshape(-1,1))
train_x = onehot_encoding(ohe, train_x)

In [12]:
train_x

Unnamed: 0,age,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,mfcc_14,mfcc_15,mfcc_16,mfcc_17,mfcc_18,mfcc_19,mfcc_20,female,male,other
0,24,0,1,-305.59457,51.977142,-33.470818,7.374570,-23.994840,9.758025,-30.245546,-16.384844,-15.226559,1.544100,-0.490239,-6.273493,8.557936,3.087276,6.395567,-5.806313,-1.785880,-4.300522,-3.020053,-2.702223,1.0,0.0,0.0
1,51,0,0,-337.25710,72.093080,-13.855276,9.463384,-18.600903,24.353014,-25.513569,4.360093,-11.163043,5.821661,2.507646,-6.770346,2.831020,-8.816771,1.718806,-8.076585,-0.251985,-3.758666,-6.999936,-4.127479,0.0,1.0,0.0
2,22,0,0,-455.41095,54.783130,-23.121520,-1.284064,-12.956191,-2.164156,-3.227035,-7.204677,-9.200953,-4.621264,1.595663,-6.384151,-0.210856,0.250801,-3.175273,-3.116248,-2.673163,-1.772126,-1.548385,-2.584018,0.0,1.0,0.0
3,29,1,0,-391.82820,58.368170,-15.205889,8.082242,-22.550287,-5.647254,-13.082260,-5.258176,-12.553550,7.121708,-5.109934,-13.964030,3.944809,-5.349287,-1.244205,-2.000374,-0.838888,-5.662015,-3.186864,-1.911061,1.0,0.0,0.0
4,23,0,0,-546.70090,17.552140,-15.368838,8.755314,-9.293792,3.379225,-7.003066,-0.883937,-2.776162,2.303463,-0.467268,-4.099334,-1.033820,-3.911650,-0.469578,-3.081244,0.149063,-4.029262,-0.077644,-1.582466,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3800,53,0,0,-351.68085,86.487020,-40.589554,-2.390865,2.895489,-12.300478,-25.822184,-1.451508,-26.655594,-2.371466,-10.900512,-8.453163,1.252489,-16.352022,8.974976,-2.680085,-4.187309,2.467289,4.951514,-2.692058,0.0,1.0,0.0
3801,25,0,0,-412.60098,93.115520,-21.278133,11.800548,-20.548487,-3.913667,-23.469475,-11.187430,-24.276222,-2.072995,-6.652527,-13.955736,2.214910,-10.965079,2.611229,-6.951846,-5.652804,-4.464256,-4.713693,-4.796800,0.0,1.0,0.0
3802,26,0,0,-362.44458,62.484260,12.215590,23.770980,22.232940,14.473652,6.543558,6.901525,-2.250514,5.841632,-0.113891,-10.030197,-4.308362,-2.286434,-8.156878,-6.686550,-6.439074,-5.252420,-5.743541,-4.447176,1.0,0.0,0.0
3803,27,0,0,-216.41720,102.442535,-33.941437,16.964209,-43.680378,11.247188,-17.097466,2.984330,-21.722847,-0.496897,5.125669,-7.906786,2.524000,-13.947393,-2.017673,-3.898373,1.995725,-8.732617,-6.260137,-5.297608,1.0,0.0,0.0


# Pycaret

In [13]:
train_x['covid19'] = train_y
train_x.shape

(3805, 27)

# Train

In [14]:
model = setup(data=train_x, target='covid19', session_id=CFG['SEED'], train_size=0.8,
              data_split_stratify=True, data_split_shuffle=True, fold_strategy='stratifiedkfold',
              normalize=True, normalize_method='minmax',
              categorical_features=['respiratory_condition', 'fever_or_muscle_pain', 'female', 'male', 'other'])

# model.fit(train_x, train_y) # Model Train

IntProgress(value=0, description='Processing: ', max=3)

ValueError: Setting a random_state has no effect since shuffle is False. You should leave random_state to its default (None), or set shuffle=True.

# Model

# Inference

In [None]:
# 위의 학습데이터를 전처리한 과정과 동일하게 test data에도 적용
# test_x = pd.read_csv('/content/drive/MyDrive/YDS/DACON/COVID19/test_mfcc_data.csv')

test_x = test_mfcc_data.drop(columns=['id'])
# Data Leakage에 유의하여 train data로만 학습된 ohe를 사용
test_x = onehot_encoding(ohe, test_x)

In [None]:
# Model 추론
preds = model.predict(test_x)

In [None]:
# # 정확도

print("훈련 세트 정확도 : {:.3f}".format(model.score(train_x, train_y)))
print("테스트 세트 정확도 : {:.3f}".format(model.score(test_x, preds)))

# Submission

In [None]:
submission = pd.read_csv('./sample_submission.csv')
submission['covid19'] = preds
submission.to_csv('./submit.csv', index=False)