<a href="https://colab.research.google.com/github/konkuk-gaegul/MLP_Classification_AI/blob/main/MFCC_Feature_MLP_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GPU 가속기

In [None]:
# gpu_info = !nvidia-smi
# gpu_info = '\n'.join(gpu_info)
# if gpu_info.find('failed') >= 0:
#   print('Not connected to a GPU')
# else:
#   print(gpu_info)

# 추가 메모리

<p>Colab Pro를 구독하면 고용량 메모리 VM에 액세스할 수 있습니다&#40;사용 가능한 경우&#41;. Pro+에는 더 많은 메모리가 제공됩니다. 고용량 메모리 런타임을 사용하도록 노트북 환경설정을 지정하려면 런타임 &gt; '런타임 유형 변경' 메뉴를 선택한 다음 런타임 구성 드롭다운에서 고용량 RAM을 선택하세요.</p>
<p>언제든지 다음 코드 셀을 실행하여 사용 가능한 메모리 용량을 확인할 수 있습니다.</p>
아래 코드 셀의 실행 결과가 ‘Not using a high-RAM runtime’인 경우 메뉴의 런타임 &gt; 런타임 유형 변경에서 고용량 RAM 런타임을 사용 설정하고 런타임 구성 드롭다운에서 고용량 RAM을 선택한 다음 코드 셀을 다시 실행하면 됩니다.

In [None]:
# from psutil import virtual_memory
# ram_gb = virtual_memory().total / 1e9
# print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# if ram_gb < 20:
#   print('Not using a high-RAM runtime')
# else:
#   print('You are using a high-RAM runtime!')

# 라이브러리

In [None]:
import random
import pandas as pd
import numpy as np
import os
import librosa

import copy, os, shutil
import matplotlib as plt

import matplotlib
import matplotlib.pyplot as plt

from tqdm.auto import tqdm

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings(action='ignore') 

# 시각화

In [None]:
# 폰트 설정을 위한 라이브러리 임포트
from matplotlib import font_manager, rcParams

# 한글 표현이 가능한 폰트를 설치
!apt-get install fonts-nanum*
font_manager.findSystemFonts(fontext='ttf')
font_manager.FontProperties( fname='/usr/share/fonts/truetype/nanum/NanumGothicCoding.ttf').get_name()

# 폰트를 변경
rcParams['font.family'] = 'NanumGothicCoding'

# - 부호도 깨져서 나오기 때문에, 같이 설정
rcParams['axes.unicode_minus'] = False

# 설정을 변경했으면, rebuild를 통해서 변경된 설정을 적용
font_manager._rebuild()

In [None]:
# Global settings shared by the feature-extraction and modelling cells.
CFG = dict(
    SR=16000,    # sampling rate passed to librosa.load
    N_MFCC=32,   # number of MFCC vectors to extract
    SEED=41,     # seed passed to seed_everything and MLPClassifier(random_state=...)
)

In [None]:
def seed_everything(seed):
    """Fix the random state used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's global RNG with ``seed``.
    """
    os.environ.update(PYTHONHASHSEED=str(seed))
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(CFG['SEED']) # Seed 고정

In [None]:
# get_mfcc_feature(train_df, 'train', '/content/drive/MyDrive/음향 데이터 AI 경진대회/wav_dataset/train_mfcc_data.csv')

def get_mfcc_feature(df, data_type, save_path,
                     root_folder='/content/drive/MyDrive/음향 데이터 AI 경진대회/wav_dataset'):
    """Extract mean MFCC features for every recording in ``df`` and save them as CSV.

    For each value in ``df['id']`` the file
    ``<root_folder>/<data_type>/<id zero-padded to 5 digits>.wav`` is loaded
    with ``librosa.load`` at ``CFG['SR']``, ``CFG['N_MFCC']`` MFCCs are computed,
    and every row of the resulting MFCC matrix is reduced to its mean.
    The means are appended to ``df`` as columns ``mfcc_1 .. mfcc_<N_MFCC>`` and
    the combined table is written to ``save_path`` (without the index).

    Args:
        df: DataFrame with an ``id`` column identifying the wav files.
        data_type: Sub-folder of ``root_folder`` that holds the wav files
            (the notebook passes ``'train'`` or ``'test'``).
        save_path: Destination CSV path. If it already exists nothing is
            recomputed: a message is printed and the function returns.
        root_folder: Folder containing the ``data_type`` sub-folders.
            Defaults to the location this notebook was written against.

    Returns:
        None. The result is only written to ``save_path``.
    """
    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        return

    # The wav directory does not change per file, so build it once.
    wav_dir = os.path.join(root_folder, data_type)

    features = []
    for uid in tqdm(df['id']):
        path = os.path.join(wav_dir, str(uid).zfill(5) + '.wav')

        # Load the wav file with librosa.
        y, sr = librosa.load(path, sr=CFG['SR'])

        # Extract the MFCC matrix with librosa.
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])

        # Use the mean of each MFCC row as the feature for that coefficient.
        features.append([np.mean(coef) for coef in mfcc])

    # Reuse df's own index for the feature frame: with a default-indexed
    # feature frame, concat(axis=1) would align on labels and produce
    # NaN-padded, misaligned rows whenever df is filtered or re-indexed.
    mfcc_df = pd.DataFrame(
        features,
        columns=['mfcc_' + str(x) for x in range(1, CFG['N_MFCC'] + 1)],
        index=df.index,
    )
    df = pd.concat([df, mfcc_df], axis=1)
    df.to_csv(save_path, index=False)
    print('Done.')

In [None]:
def onehot_encoding(ohe, x):
    """Replace the ``gender`` column of ``x`` with its one-hot encoded columns.

    Args:
        ohe: An already fitted one-hot encoder (fitted on the training data).
            Its ``transform`` result is passed straight to ``pd.DataFrame``.
            # NOTE(review): this presumably requires a dense (non-sparse)
            # encoder output - confirm against how ``ohe`` is constructed.
        x: DataFrame containing a ``gender`` column.

    Returns:
        A new DataFrame: ``x`` without ``gender``, followed by one column per
        category in ``ohe.categories_[0]``. The row index of ``x`` is kept.
    """
    encoded = ohe.transform(x['gender'].to_numpy().reshape(-1, 1))
    # Carry x's index over so that concat(axis=1) lines rows up one-to-one
    # even when x does not have a default 0..n-1 index.
    encoded_df = pd.DataFrame(encoded, columns=ohe.categories_[0], index=x.index)
    return pd.concat([x.drop(columns=['gender']), encoded_df], axis=1)

# data upload

In [None]:
# Load the train and test tables from the wav_dataset folder.
# The 'id' column is kept (the drop calls below are disabled) because
# get_mfcc_feature uses it to locate each wav file.
train_df = pd.read_csv('/content/drive/MyDrive/음향 데이터 AI 경진대회/wav_dataset/train_data.csv')
# train_df.drop(columns = 'id', inplace=True )
test_df = pd.read_csv('/content/drive/MyDrive/음향 데이터 AI 경진대회/wav_dataset/test_data.csv')
# test_df.drop(columns = 'id', inplace=True )

In [None]:
train_df

Unnamed: 0,id,age,gender,respiratory_condition,fever_or_muscle_pain,covid19
0,1,24,female,0,1,0
1,2,51,male,0,0,0
2,3,22,male,0,0,0
3,4,29,female,1,0,0
4,5,23,male,0,0,0
...,...,...,...,...,...,...
3800,3801,53,male,0,0,0
3801,3802,25,male,0,0,0
3802,3803,26,female,0,0,0
3803,3804,27,female,0,0,0


In [None]:
test_df

Unnamed: 0,id,age,gender,respiratory_condition,fever_or_muscle_pain
0,3806,48,female,1,0
1,3807,24,female,0,0
2,3808,29,male,0,0
3,3809,39,female,0,0
4,3810,34,male,0,0
...,...,...,...,...,...
5727,9533,43,male,0,0
5728,9534,48,female,0,1
5729,9535,44,female,0,0
5730,9536,25,female,0,0


# data preprocessing 1

In [None]:
%cd '/content/drive/MyDrive/음향 데이터 AI 경진대회/wav_dataset'

/content/drive/MyDrive/음향 데이터 AI 경진대회/wav_dataset


In [None]:
!pwd

/content/drive/MyDrive/음향 데이터 AI 경진대회/wav_dataset


In [None]:
# get_mfcc_feature(train_df, 'train', '/content/drive/MyDrive/음향 데이터 AI 경진대회/wav_dataset/train_mfcc_data.csv')
# get_mfcc_feature(test_df, 'test', '/content/drive/MyDrive/음향 데이터 AI 경진대회/wav_dataset/test_mfcc_data.csv')

# data preprocessing 2

In [None]:
# Load the training table that combines the wav files' MFCC features with
# the status (self-report) information.
train_df = pd.read_csv('./train_mfcc_data.csv')

# Split into the label (y) and the model inputs (x); 'id' is not a feature.
train_y = train_df.loc[:, 'covid19']
train_x = train_df.drop(['id', 'covid19'], axis=1)

In [None]:
# The 'gender' column needs extra preprocessing -> apply OneHotEncoder.
# scikit-learn renamed `sparse` to `sparse_output` (deprecated in 1.2, removed
# in 1.4), so try the new keyword first and fall back for older versions.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
ohe.fit(train_x['gender'].values.reshape(-1,1))
train_x = onehot_encoding(ohe, train_x)

In [None]:
train_df.head(5)

Unnamed: 0,id,age,gender,respiratory_condition,fever_or_muscle_pain,covid19,mfcc_1,mfcc_2,mfcc_3,mfcc_4,...,mfcc_23,mfcc_24,mfcc_25,mfcc_26,mfcc_27,mfcc_28,mfcc_29,mfcc_30,mfcc_31,mfcc_32
0,1,24,female,0,1,0,-274.93472,29.345425,-19.152718,-7.836881,...,-2.320942,2.150005,-0.925417,2.11603,-0.19273,2.417784,-4.73665,1.237788,-1.600426,-1.462419
1,2,51,male,0,0,0,-311.55988,52.48278,-0.094471,-11.066645,...,-6.497138,0.543653,-6.26389,-2.386007,-6.744628,0.254181,-0.96756,-2.113267,-2.433431,0.881612
2,3,22,male,0,0,0,-438.28616,46.591045,-22.690268,-3.60658,...,-0.156195,-1.680287,2.617625,1.242552,-0.073267,-0.961353,-0.733599,-0.419574,0.796274,0.412893
3,4,29,female,1,0,0,-368.42603,46.939438,-7.44307,-3.694368,...,-0.155832,3.83932,-2.503331,2.750769,1.758514,2.094565,0.29582,1.737585,-0.6542,1.847927
4,5,23,male,0,0,0,-535.193,7.167374,-7.42083,2.231418,...,-0.144892,-0.412889,-1.451348,0.237435,-0.971688,0.779251,-0.734346,1.141693,-0.498866,1.189332


# 학습 - 1

## train

In [None]:
# Baseline model: default-configured MLP, seeded from CFG for repeatable runs.
model = MLPClassifier(random_state=CFG['SEED']) # Multi-layer Perceptron classifier provided by scikit-learn
model.fit(train_x, train_y) # Train the model

MLPClassifier(random_state=41)

## test

In [None]:
# Apply the same preprocessing used for the training data to the test data:
# drop 'id', then one-hot encode 'gender'.
# To avoid data leakage, reuse the encoder (`ohe`) fitted on training data only.
test_x = onehot_encoding(
    ohe,
    pd.read_csv('./test_mfcc_data.csv').drop(columns=['id']),
)

# Model inference
preds = model.predict(test_x)

In [None]:
preds

array([0, 0, 0, ..., 0, 0, 1])

## submission

In [None]:
# Fill the sample submission's 'covid19' column with the model predictions.
submission = pd.read_csv('./sample_submission.csv').assign(covid19=preds)
# submission.to_csv('.submission/submit_1.csv', index=False)

# 학습 - 2
- 연령대를 파생변수로 생성
- 10세 미만(0), 10대, 20대, ... , 90대, 100세 이상(100)

In [None]:
def age_to_decade(age):
    """Map ages to the lower bound of their 10-year bucket.

    Ages below 10 map to 0, 10-19 to 10, ..., 90-99 to 90, and everything
    from 100 upwards to 100. Values that compare False against every
    threshold (NaN) also land in the last bucket, 100.

    Args:
        age: Array-like of ages (for example a DataFrame column).

    Returns:
        Integer NumPy array with one bucket value per input element.
    """
    decade_edges = np.arange(10, 101, 10)  # 10, 20, ..., 100
    # digitize returns how many edges are <= age, i.e. the bucket number 0..10.
    return np.digitize(np.asarray(age), decade_edges) * 10


# train
train_df = pd.read_csv('./train_mfcc_data.csv')
train_df['age_range'] = age_to_decade(train_df.age)

# test
test_df = pd.read_csv('./test_mfcc_data.csv')
test_df['age_range'] = age_to_decade(test_df.age)
test_df.head(5)

Unnamed: 0,id,age,gender,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,...,mfcc_24,mfcc_25,mfcc_26,mfcc_27,mfcc_28,mfcc_29,mfcc_30,mfcc_31,mfcc_32,age_range
0,3806,48,female,1,0,-619.9879,9.328649,0.821094,2.373343,-0.171163,...,-0.053711,-0.029855,0.043562,-0.405293,-0.246975,-0.110937,0.019196,-0.155829,-0.130705,40
1,3807,24,female,0,0,-493.6057,8.492651,-2.758262,-2.801817,-2.770231,...,-0.643835,-0.822677,-0.113844,-1.210391,0.273594,-0.685921,0.440101,-0.960277,0.371881,20
2,3808,29,male,0,0,-1058.8418,-2.350609,-7.395449,-0.689041,-5.432969,...,3.134083,2.58713,0.463058,-0.127755,1.612876,-3.393354,-2.874998,-1.302709,-2.915723,20
3,3809,39,female,0,0,-362.16165,27.11974,-9.231675,9.747519,-1.013352,...,-0.271802,0.0349,0.414227,-2.836006,0.105023,0.518484,2.139494,0.944032,1.029428,30
4,3810,34,male,0,0,-387.59268,45.165874,-14.454832,-2.965205,-9.931789,...,-3.59422,0.289146,-5.298121,1.614137,3.108044,-5.373443,0.106868,2.0541,-1.994548,30


In [None]:
def onehot_encoding_age_range(ohe2, x):
    """Replace the ``age_range`` column of ``x`` with its one-hot encoded columns.

    Args:
        ohe2: An already fitted one-hot encoder for ``age_range`` (fitted on
            the training data). Its ``transform`` result is passed straight to
            ``pd.DataFrame``.
        x: DataFrame containing an ``age_range`` column.

    Returns:
        A new DataFrame: ``x`` without ``age_range``, followed by one column
        per category in ``ohe2.categories_[0]``. The row index of ``x`` is kept.
    """
    encoded = ohe2.transform(x['age_range'].to_numpy().reshape(-1, 1))
    # Label the columns with the categories of the encoder that produced them
    # (ohe2), not with those of the unrelated global gender encoder, and keep
    # x's index so that concat(axis=1) lines rows up one-to-one.
    encoded_df = pd.DataFrame(encoded, columns=ohe2.categories_[0], index=x.index)
    return pd.concat([x.drop(columns=['age_range']), encoded_df], axis=1)

In [None]:
# Split the training data into the model inputs (x) and the label (y).
train_x = train_df.drop(columns=['id', 'covid19'])
train_y = train_df['covid19']

# The 'gender' column needs extra preprocessing -> apply OneHotEncoder.
# scikit-learn renamed `sparse` to `sparse_output` (deprecated in 1.2, removed
# in 1.4), so try the new keyword first and fall back for older versions.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
ohe.fit(train_x['gender'].values.reshape(-1,1))
train_x = onehot_encoding(ohe, train_x)

# One 0/1 indicator column per age bucket; the column labels are the bucket
# values themselves (0, 10, ..., 90). dtype is pinned so the indicators are
# 0/1 integers regardless of the pandas version's get_dummies default.
age_range_dum = pd.get_dummies(train_x['age_range'], dtype='uint8')
train_x = pd.concat([train_x, age_range_dum], axis = 1 ).drop(columns='age_range')
train_x

Unnamed: 0,age,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,...,0,10,20,30,40,50,60,70,80,90
0,24,0,1,-274.93472,29.345425,-19.152718,-7.836881,-9.094099,-8.553541,-32.653940,...,0,0,1,0,0,0,0,0,0,0
1,51,0,0,-311.55988,52.482780,-0.094471,-11.066645,5.936104,-1.736337,-15.763054,...,0,0,0,0,0,1,0,0,0,0
2,22,0,0,-438.28616,46.591045,-22.690268,-3.606580,-13.870162,0.272160,-9.013411,...,0,0,1,0,0,0,0,0,0,0
3,29,1,0,-368.42603,46.939438,-7.443070,-3.694368,-20.511784,-9.271747,-10.894163,...,0,0,1,0,0,0,0,0,0,0
4,23,0,0,-535.19300,7.167374,-7.420830,2.231418,-5.301193,-0.646569,-6.103723,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3800,53,0,0,-328.48108,68.198110,-40.380050,2.296546,0.350861,-30.353664,-7.293074,...,0,0,0,0,0,1,0,0,0,0
3801,25,0,0,-386.11325,74.041670,-12.082752,-0.463326,-16.093834,-15.316323,-19.726470,...,0,0,1,0,0,0,0,0,0,0
3802,26,0,0,-347.16568,58.591610,12.996252,27.981964,20.635437,10.189990,6.844964,...,0,0,1,0,0,0,0,0,0,0
3803,27,0,0,-179.06174,70.699880,-14.584357,-10.161293,-25.655546,2.021584,-11.036316,...,0,0,1,0,0,0,0,0,0,0


In [None]:
train_x.columns

Index([                  'age', 'respiratory_condition',
        'fever_or_muscle_pain',                'mfcc_1',
                      'mfcc_2',                'mfcc_3',
                      'mfcc_4',                'mfcc_5',
                      'mfcc_6',                'mfcc_7',
                      'mfcc_8',                'mfcc_9',
                     'mfcc_10',               'mfcc_11',
                     'mfcc_12',               'mfcc_13',
                     'mfcc_14',               'mfcc_15',
                     'mfcc_16',               'mfcc_17',
                     'mfcc_18',               'mfcc_19',
                     'mfcc_20',               'mfcc_21',
                     'mfcc_22',               'mfcc_23',
                     'mfcc_24',               'mfcc_25',
                     'mfcc_26',               'mfcc_27',
                     'mfcc_28',               'mfcc_29',
                     'mfcc_30',               'mfcc_31',
                     'mfcc_32',

## train

In [None]:
# Same default-configured MLP as before, now trained on the inputs that include
# the age-bucket indicator columns.
model = MLPClassifier(random_state=CFG['SEED']) # Multi-layer Perceptron classifier provided by scikit-learn
model.fit(train_x, train_y) # Train the model

MLPClassifier(random_state=41)

## test

In [None]:
# test
test_df = pd.read_csv('./test_mfcc_data.csv')
# Bucket ages by decade: <10 -> 0, 10-19 -> 10, ..., 90-99 -> 90, >=100 -> 100.
# np.digitize counts how many of the edges 10, 20, ..., 100 are <= age, which
# is the bucket number 0..10; multiplying by 10 gives the bucket label.
test_df['age_range'] = np.digitize(test_df.age, np.arange(10, 101, 10)) * 10
test_df.head(5)

Unnamed: 0,id,age,gender,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,...,mfcc_24,mfcc_25,mfcc_26,mfcc_27,mfcc_28,mfcc_29,mfcc_30,mfcc_31,mfcc_32,age_range
0,3806,48,female,1,0,-619.9879,9.328649,0.821094,2.373343,-0.171163,...,-0.053711,-0.029855,0.043562,-0.405293,-0.246975,-0.110937,0.019196,-0.155829,-0.130705,40
1,3807,24,female,0,0,-493.6057,8.492651,-2.758262,-2.801817,-2.770231,...,-0.643835,-0.822677,-0.113844,-1.210391,0.273594,-0.685921,0.440101,-0.960277,0.371881,20
2,3808,29,male,0,0,-1058.8418,-2.350609,-7.395449,-0.689041,-5.432969,...,3.134083,2.58713,0.463058,-0.127755,1.612876,-3.393354,-2.874998,-1.302709,-2.915723,20
3,3809,39,female,0,0,-362.16165,27.11974,-9.231675,9.747519,-1.013352,...,-0.271802,0.0349,0.414227,-2.836006,0.105023,0.518484,2.139494,0.944032,1.029428,30
4,3810,34,male,0,0,-387.59268,45.165874,-14.454832,-2.965205,-9.931789,...,-3.59422,0.289146,-5.298121,1.614137,3.108044,-5.373443,0.106868,2.0541,-1.994548,30


In [None]:
test_x = test_df.drop(columns=['id'])

# To avoid data leakage, reuse the encoder (`ohe`) fitted on training data only.
test_x = onehot_encoding(ohe, test_x)

# One 0/1 indicator column per age bucket present in the test data.
age_range_dum = pd.get_dummies(test_x['age_range'], dtype='uint8')
test_x = pd.concat([test_x, age_range_dum], axis = 1 ).drop(columns='age_range')

# Align the test columns with the columns the model was trained on instead of
# dropping a hard-coded bucket: buckets that only occur in the test data
# (e.g. 100) are removed, buckets that only occur in the training data are
# added as all-zero columns, and the column order matches train_x.
# Rows from a test-only bucket therefore end up with every age indicator at 0.
test_x = test_x.reindex(columns=train_x.columns, fill_value=0)

# Model inference
preds = model.predict(test_x)

In [None]:
test_x[test_x.age > 100]

Unnamed: 0,age,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,...,10,20,30,40,50,60,70,80,90,100
5411,102,1,1,-464.60672,15.70595,8.444934,1.737424,1.154192,-1.921177,-0.260033,...,0,0,0,0,0,0,0,0,0,1


## submission

In [None]:
# Fill the sample submission's 'covid19' column with the model predictions.
submission = pd.read_csv('./sample_submission.csv')
submission['covid19'] = preds

# Create the output folder first; to_csv does not create missing directories.
submission_path = '/content/drive/MyDrive/음향 데이터 AI 경진대회/wav_dataset/submission/submit_2.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)