# Import

In [1]:
import random
import pandas as pd
import numpy as np
import os
import librosa

from tqdm.auto import tqdm

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings(action='ignore') 

# Hyperparameter Setting

In [2]:
CFG = {
    'SR':16000,
    'N_MFCC':32, # MFCC 벡터를 추출할 개수
    'SEED':42
}

# Fixed Random-Seed

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(CFG['SEED']) # Seed 고정

# Data Pre-Processing 1

In [4]:
train_df = pd.read_csv('./train_data.csv')
test_df = pd.read_csv('./test_data.csv')
unlabeled_df = pd.read_csv('./unlabeled_data.csv')

In [None]:
test_label = pd.read_csv('./test_label.csv')

In [None]:
test_tt = pd.merge(test_df, test_label, how='inner')
test_df = test_tt

In [None]:
new_train = pd.concat([train_df, unlabeled_df]).reset_index()
new_train = new_train.drop(columns=['index'])
new_train

In [None]:
new_train = new_train.fillna(1)

In [None]:
new_train = new_train.astype({'covid19' : 'int'})
new_train = new_train.reset_index().drop(columns=['index','level_0'])

In [None]:
new_train

In [None]:
# new_train.to_csv('./new_train.csv')

In [None]:
def get_mfcc_feature(df, data_type, save_path):
    # Data Folder path
    root_folder = './wav_dataset'
    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        return
    features = []
    for uid in tqdm(df['id']):
        root_path = os.path.join(root_folder, data_type)
        path = os.path.join(root_path, str(uid).zfill(5)+'.wav')

        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=CFG['SR'])
        
        # librosa패키지를 사용하여 mfcc 추출
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])

        y_feature = []
        # 추출된 MFCC들의 평균을 Feature로 사용
        for e in mfcc:
            y_feature.append(np.mean(e))
        features.append(y_feature)
    
    # 기존의 자가진단 정보를 담은 데이터프레임에 추출된 오디오 Feature를 추가
    mfcc_df = pd.DataFrame(features, columns=['mfcc_'+str(x) for x in range(1,CFG['N_MFCC']+1)])
    df = pd.concat([df, mfcc_df], axis=1)
    df.to_csv(save_path, index=False)
    print('Done.')

In [None]:
# get_mfcc_feature(unlabeled_df, 'unlabeled', './unlabeled_mfcc_data.csv')
# get_mfcc_feature(new_train, 'new_train', './new_train_mfcc_data2.csv')

In [None]:
'''
get_mfcc_feature(train_df, 'train', './train_mfcc_data.csv')
get_mfcc_feature(test_df, 'test', './test_mfcc_data.csv')
get_mfcc_feature('unlabeled', './unlabeled_mfcc_data.csv')
'''

In [6]:
# train_mfcc_data = pd.read_csv('./train_mfcc_data.csv')
# test_mfcc_data = pd.read_csv('./test_mfcc_data.csv')
new_train_mfcc_data = pd.read_csv('./new_train_mfcc_data.csv')
test_mfcc_data = pd.read_csv('./test_mfcc_data.csv')
unlabeled_mfcc_data = pd.read_csv('./unlabeled_mfcc_data.csv')

In [8]:
new_train_mfcc_data = pd.concat([new_train_mfcc_data, unlabeled_mfcc_data])
new_train_mfcc_data = new_train_mfcc_data.fillna(-999)
new_train_mfcc_data = new_train_mfcc_data.astype({'covid19' : 'int'})
new_train_mfcc_data = new_train_mfcc_data.reset_index()
new_train_mfcc_data = new_train_mfcc_data.drop(columns=['index'])
# new_train_mfcc_data = new_train_mfcc_data.drop(columns=['level_0'])
new_train_mfcc_data

Unnamed: 0,id,age,gender,respiratory_condition,fever_or_muscle_pain,covid19,mfcc_1,mfcc_2,mfcc_3,mfcc_4,...,mfcc_23,mfcc_24,mfcc_25,mfcc_26,mfcc_27,mfcc_28,mfcc_29,mfcc_30,mfcc_31,mfcc_32
0,1,24,female,0,1,0,-276.01898,30.519340,-20.314617,-6.689037,...,-2.679408,2.454339,-1.176285,2.314315,-0.339533,2.514413,-4.784703,1.239072,-1.556883,-1.548770
1,2,51,male,0,0,0,-312.99362,54.141323,-1.748550,-9.437217,...,-7.248304,1.238725,-6.894970,-1.810402,-7.259594,0.715029,-1.372265,-1.760624,-2.735181,1.134190
2,3,22,male,0,0,0,-438.60306,46.675842,-22.771935,-3.527922,...,-0.136723,-1.707353,2.649277,1.208829,-0.033701,-1.008729,-0.687255,-0.472232,0.850565,0.353839
3,4,29,female,1,0,0,-369.26100,47.762012,-8.256503,-2.891349,...,-0.389230,4.033148,-2.658165,2.867084,1.679876,2.136411,0.289792,1.709179,-0.592465,1.754549
4,5,23,male,0,0,0,-535.68915,7.509357,-7.762263,2.567660,...,-0.279360,-0.292286,-1.559678,0.328864,-1.053423,0.844060,-0.788914,1.182740,-0.527028,1.208361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13266,11400,8,female,0,0,-999,-461.32520,22.623890,-6.396159,7.355534,...,-1.262772,0.351229,-0.073305,-0.782535,-0.052364,1.048027,-1.696817,-0.173869,-0.438885,0.898622
13267,11401,29,male,0,1,-999,-333.90408,37.195827,3.925677,9.610576,...,-7.815808,-0.335320,-8.180732,-4.139085,-6.545818,-2.195700,-1.213498,-0.847897,0.001108,2.640274
13268,11402,17,male,0,0,-999,-430.32492,26.273499,-0.465416,1.431988,...,3.291479,1.793235,-1.445930,-2.996969,-2.641327,1.613704,2.414957,3.153693,0.158814,0.487620
13269,11403,22,male,0,0,-999,-648.42773,27.682764,6.621850,16.785809,...,-2.176159,-0.372978,-1.241015,0.524427,-0.349503,-0.422278,-1.628088,0.241716,-1.117643,-1.026300


# Data Pre-Processing 2

In [9]:
# wav 파일의 MFCC Feature와 상태정보를 합친 학습데이터를 불러옵니다.
# train_df = pd.read_csv('./train_mfcc_data.csv')

# 학습데이터를 모델의 input으로 들어갈 x와 label로 사용할 y로 분할
train_x = new_train_mfcc_data.drop(columns=['id', 'covid19'])
train_y = new_train_mfcc_data['covid19']

In [10]:
def onehot_encoding(ohe, x):
    # 학습데이터로 부터 fit된 one-hot encoder (ohe)를 받아 transform 시켜주는 함수
    encoded = ohe.transform(x['gender'].values.reshape(-1,1))
    encoded_df = pd.DataFrame(encoded, columns=ohe.categories_[0])
    x = pd.concat([x.drop(columns=['gender']), encoded_df], axis=1)
    return x

In [11]:
# 'gender' column의 경우 추가 전처리가 필요 -> OneHotEncoder 적용
ohe = OneHotEncoder(sparse=False)
ohe.fit(new_train_mfcc_data['gender'].values.reshape(-1,1))
train_x = onehot_encoding(ohe, train_x)

In [12]:
train_x

Unnamed: 0,age,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,...,mfcc_26,mfcc_27,mfcc_28,mfcc_29,mfcc_30,mfcc_31,mfcc_32,female,male,other
0,24,0,1,-276.01898,30.519340,-20.314617,-6.689037,-10.224930,-7.443150,-33.740433,...,2.314315,-0.339533,2.514413,-4.784703,1.239072,-1.556883,-1.548770,1.0,0.0,0.0
1,51,0,0,-312.99362,54.141323,-1.748550,-9.437217,4.317682,-0.148136,-17.331125,...,-1.810402,-7.259594,0.715029,-1.372265,-1.760624,-2.735181,1.134190,0.0,1.0,0.0
2,22,0,0,-438.60306,46.675842,-22.771935,-3.527922,-13.949551,0.344213,-9.082897,...,1.208829,-0.033701,-1.008729,-0.687255,-0.472232,0.850565,0.353839,0.0,1.0,0.0
3,29,1,0,-369.26100,47.762012,-8.256503,-2.891349,-21.302510,-8.495335,-11.653670,...,2.867084,1.679876,2.136411,0.289792,1.709179,-0.592465,1.754549,1.0,0.0,0.0
4,23,0,0,-535.68915,7.509357,-7.762263,2.567660,-5.632455,-0.318077,-6.422602,...,0.328864,-1.053423,0.844060,-0.788914,1.182740,-0.527028,1.208361,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13266,8,0,0,-461.32520,22.623890,-6.396159,7.355534,-5.021908,-4.876315,-17.696940,...,-0.782535,-0.052364,1.048027,-1.696817,-0.173869,-0.438885,0.898622,1.0,0.0,0.0
13267,29,0,1,-333.90408,37.195827,3.925677,9.610576,-13.433489,7.472061,-2.294662,...,-4.139085,-6.545818,-2.195700,-1.213498,-0.847897,0.001108,2.640274,0.0,1.0,0.0
13268,17,0,0,-430.32492,26.273499,-0.465416,1.431988,-4.389915,2.145267,-7.819598,...,-2.996969,-2.641327,1.613704,2.414957,3.153693,0.158814,0.487620,0.0,1.0,0.0
13269,22,0,0,-648.42773,27.682764,6.621850,16.785809,4.029088,2.122290,5.483944,...,0.524427,-0.349503,-0.422278,-1.628088,0.241716,-1.117643,-1.026300,0.0,1.0,0.0


# Train

In [13]:
model = MLPClassifier(random_state=CFG['SEED']) # Sklearn에서 제공하는 Multi-layer Perceptron classifier 사용
model.fit(train_x, train_y) # Model Train

MLPClassifier(random_state=42)

# Inference

In [14]:
# 위의 학습데이터를 전처리한 과정과 동일하게 test data에도 적용
# test_x = pd.read_csv('/content/drive/MyDrive/YDS/DACON/COVID19/test_mfcc_data.csv')

test_x = test_mfcc_data.drop(columns=['id'])
# Data Leakage에 유의하여 train data로만 학습된 ohe를 사용
test_x = onehot_encoding(ohe, test_x)

In [15]:
# Model 추론
preds = model.predict(test_x)

In [16]:
# # 정확도

print("훈련 세트 정확도 : {:.3f}".format(model.score(train_x, train_y)))
print("테스트 세트 정확도 : {:.3f}".format(model.score(test_x, preds)))

훈련 세트 정확도 : 0.710
테스트 세트 정확도 : 1.000


# Submission

In [17]:
submission = pd.read_csv('./sample_submission.csv')
submission['covid19'] = preds
submission.to_csv('./submit_new.csv', index=False)

In [None]:
submission