# Import

In [1]:
import random
import pandas as pd
import numpy as np
import os
import librosa

from tqdm.auto import tqdm

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings(action='ignore') 

# Hyperparameter Setting

In [2]:
CFG = {
    'SR':32000,
    'N_MFCC':100, # MFCC 벡터를 추출할 개수
    'SEED':42
}

# Fixed Random-Seed

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(CFG['SEED']) # Seed 고정

# Data Pre-Processing 1

In [4]:
train_df = pd.read_csv('./train_data.csv')
test_df = pd.read_csv('./test_data.csv')

In [5]:
unlabeled_df = pd.read_csv('./unlabeled_data.csv')

In [6]:
def get_mfcc_feature(df, data_type, save_path):
    # Data Folder path
    root_folder = './wav_dataset'
    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        return
    features = []
    for uid in tqdm(df['id']):
        root_path = os.path.join(root_folder, data_type)
        path = os.path.join(root_path, str(uid).zfill(5)+'.wav')

        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=CFG['SR'])
        
        # librosa패키지를 사용하여 mfcc 추출
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])

        y_feature = []
        # 추출된 MFCC들의 평균을 Feature로 사용
        for e in mfcc:
            y_feature.append(np.mean(e))
        features.append(y_feature)
    
    # 기존의 자가진단 정보를 담은 데이터프레임에 추출된 오디오 Feature를 추가
    mfcc_df = pd.DataFrame(features, columns=['mfcc_'+str(x) for x in range(1,CFG['N_MFCC']+1)])
    df = pd.concat([df, mfcc_df], axis=1)
    df.to_csv(save_path, index=False)
    print('Done.')

In [7]:
def get_mfcc_feature2(df, data_type, save_path):
    # Data Folder path
    root_folder = './wav_dataset'
    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        return
    features = []
    for uid in tqdm(df['id']):
        root_path = os.path.join(root_folder, data_type)
        path = os.path.join(root_path, str(uid).zfill(5)+'.wav')

        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=CFG['SR'])
        
        # librosa패키지를 사용하여 mfcc 추출
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])

        y_feature = []
        # 추출된 MFCC들의 평균을 Feature로 사용
        for e in mfcc:
            y_feature.append(np.mean(e))
        features.append(y_feature)
    
    # 기존의 자가진단 정보를 담은 데이터프레임에 추출된 오디오 Feature를 추가
    mfcc_df = pd.DataFrame(features, columns=['mfcc_'+str(x) for x in range(1,CFG['N_MFCC']+1)])
    df = pd.concat([df, mfcc_df], axis=1)
    df.to_csv(save_path, index=False)
    print('Done.')

In [8]:
9538 - 3806

5732

In [9]:
get_mfcc_feature(train_df, 'train', './train_mfcc_data.csv')
get_mfcc_feature(test_df, 'test', './test_mfcc_data.csv')

./train_mfcc_data.csv is exist.
./test_mfcc_data.csv is exist.


In [10]:
get_mfcc_feature2(unlabeled_df, 'unlabeled', './unlabeled_mfcc_data.csv')

./unlabeled_mfcc_data.csv is exist.


In [11]:
train_mfcc_data = pd.read_csv('./train_mfcc_data.csv')
test_mfcc_data = pd.read_csv('./test_mfcc_data.csv')

In [12]:
train_mfcc_data

Unnamed: 0,id,age,gender,respiratory_condition,fever_or_muscle_pain,covid19,mfcc_1,mfcc_2,mfcc_3,mfcc_4,...,mfcc_23,mfcc_24,mfcc_25,mfcc_26,mfcc_27,mfcc_28,mfcc_29,mfcc_30,mfcc_31,mfcc_32
0,1,24,female,0,1,0,-276.01898,30.519340,-20.314617,-6.689037,...,-2.679408,2.454339,-1.176285,2.314315,-0.339533,2.514413,-4.784703,1.239072,-1.556883,-1.548770
1,2,51,male,0,0,0,-312.99362,54.141323,-1.748550,-9.437217,...,-7.248304,1.238725,-6.894970,-1.810402,-7.259594,0.715029,-1.372265,-1.760624,-2.735181,1.134190
2,3,22,male,0,0,0,-438.60306,46.675842,-22.771935,-3.527922,...,-0.136723,-1.707353,2.649277,1.208829,-0.033701,-1.008729,-0.687255,-0.472232,0.850565,0.353839
3,4,29,female,1,0,0,-369.26100,47.762012,-8.256503,-2.891349,...,-0.389230,4.033148,-2.658165,2.867084,1.679876,2.136411,0.289792,1.709179,-0.592465,1.754549
4,5,23,male,0,0,0,-535.68915,7.509357,-7.762263,2.567660,...,-0.279360,-0.292286,-1.559678,0.328864,-1.053423,0.844060,-0.788914,1.182740,-0.527028,1.208361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3800,3801,53,male,0,0,0,-329.53840,69.317080,-41.498420,3.398324,...,-0.447236,1.477676,-5.139314,0.058984,1.145387,3.968560,-4.152923,2.655675,-4.001793,-2.753481
3801,3802,25,male,0,0,0,-387.18520,74.883385,-13.053332,0.423010,...,-3.050042,4.993294,1.104003,6.089378,2.936215,3.518800,-0.500065,3.678556,2.742439,5.705073
3802,3803,26,female,0,0,0,-347.74250,58.946648,12.572187,28.366724,...,-2.563241,-2.590208,-2.765020,-1.786291,-3.289306,-1.355590,-0.098087,-1.539166,-1.679146,-1.346460
3803,3804,27,female,0,0,0,-180.65393,72.516770,-16.372229,-8.363718,...,-8.350143,6.216520,-2.514634,2.063279,-1.506954,0.092602,-4.883226,0.641473,-5.265770,-0.327298


In [13]:
test_mfcc_data.mfcc_1.value_counts().sort_index()

-1125.973300    1
-1124.835700    1
-1124.304100    1
-1124.237000    1
-1119.632200    1
               ..
-112.061700     1
-109.896576     1
-100.145630     1
-95.639910      1
-64.710106      1
Name: mfcc_1, Length: 5719, dtype: int64

# Train, Test, Unlabel data 합쳐서 train test 로 분류해서 학습해보기

In [14]:
train_df

Unnamed: 0,id,age,gender,respiratory_condition,fever_or_muscle_pain,covid19
0,1,24,female,0,1,0
1,2,51,male,0,0,0
2,3,22,male,0,0,0
3,4,29,female,1,0,0
4,5,23,male,0,0,0
...,...,...,...,...,...,...
3800,3801,53,male,0,0,0
3801,3802,25,male,0,0,0
3802,3803,26,female,0,0,0
3803,3804,27,female,0,0,0


In [15]:
test_df

Unnamed: 0,id,age,gender,respiratory_condition,fever_or_muscle_pain
0,3806,48,female,1,0
1,3807,24,female,0,0
2,3808,29,male,0,0
3,3809,39,female,0,0
4,3810,34,male,0,0
...,...,...,...,...,...
5727,9533,43,male,0,0
5728,9534,48,female,0,1
5729,9535,44,female,0,0
5730,9536,25,female,0,0


In [17]:
data = pd.concat([train_df.drop(columns=['covid19']), test_df]).reset_index()

In [18]:
data

Unnamed: 0,index,id,age,gender,respiratory_condition,fever_or_muscle_pain
0,0,1,24,female,0,1
1,1,2,51,male,0,0
2,2,3,22,male,0,0
3,3,4,29,female,1,0
4,4,5,23,male,0,0
...,...,...,...,...,...,...
9532,5727,9533,43,male,0,0
9533,5728,9534,48,female,0,1
9534,5729,9535,44,female,0,0
9535,5730,9536,25,female,0,0


# Data Pre-Processing 2

In [None]:
# wav 파일의 MFCC Feature와 상태정보를 합친 학습데이터를 불러옵니다.
train_df = pd.read_csv('./train_mfcc_data.csv')

# 학습데이터를 모델의 input으로 들어갈 x와 label로 사용할 y로 분할
train_x = train_df.drop(columns=['id', 'covid19'])
train_y = train_df['covid19']

In [None]:
def onehot_encoding(ohe, x):
    # 학습데이터로 부터 fit된 one-hot encoder (ohe)를 받아 transform 시켜주는 함수
    encoded = ohe.transform(x['gender'].values.reshape(-1,1))
    encoded_df = pd.DataFrame(encoded, columns=ohe.categories_[0])
    x = pd.concat([x.drop(columns=['gender']), encoded_df], axis=1)
    return x

In [None]:
# 'gender' column의 경우 추가 전처리가 필요 -> OneHotEncoder 적용
ohe = OneHotEncoder(sparse=False)
ohe.fit(train_x['gender'].values.reshape(-1,1))
train_x = onehot_encoding(ohe, train_x)

In [None]:
train_x

In [None]:
train_y.value_counts()

# Modeling

In [None]:
# import tensorflow as tf

In [None]:
# model = tf.keras.models.Sequential([
#   tf.keras.layers.Flatten(input_shape=(28, 28)),
#   tf.keras.layers.Dense(128, activation='relu'),
#   tf.keras.layers.Dropout(0.2),
#   tf.keras.layers.Dense(10, activation='softmax')
# ])

# model.compile(optimizer='adam',
#               loss='binary_crossentropy',
#               metrics=['accuracy'])

# Train

In [None]:
mlp = MLPClassifier(random_state=CFG['SEED']) # Sklearn에서 제공하는 Multi-layer Perceptron classifier 사용
mlp.fit(train_x, train_y) # Model Train

# Inference

In [None]:
# 위의 학습데이터를 전처리한 과정과 동일하게 test data에도 적용
test_x = pd.read_csv('./test_mfcc_data.csv')
test_x = test_x.drop(columns=['id'])
# Data Leakage에 유의하여 train data로만 학습된 ohe를 사용
test_x = onehot_encoding(ohe, test_x)

In [None]:

# Model 추론
preds = mlp.predict(test_x)


In [None]:
preds

In [None]:
# 정확도

print("훈련 세트 정확도 : {:.3f}".format(mlp.score(train_x, train_y)))
print("테스트 세트 정확도 : {:.3f}".format(mlp.score(test_x, preds)))

# Submission

In [None]:
submission = pd.read_csv('./sample_submission.csv')
submission['covid19'] = preds
submission.to_csv('./submit.csv', index=False)