## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os
import librosa
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore') 

In [None]:
def MacroF1Score(y_true, y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``.

    Thin wrapper around ``f1_score`` (imported from ``sklearn.metrics``)
    called with ``average='macro'``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, in the same order as ``y_true``.

    Returns:
        The value produced by ``f1_score(y_true, y_pred, average='macro')``.
    """
    # The result must be returned explicitly; without `return` the function
    # evaluates the score and then hands back None.
    return f1_score(y_true, y_pred, average='macro')

## Hyperparameter Setting

In [2]:
# Notebook-wide settings, looked up by key in the cells below.
CFG = dict(
    SR=16000,      # sampling rate handed to librosa.load
    N_MFCC=128,    # number of MFCC vectors to extract (<= 128)
    SEED=41,       # value passed to seed_everything
)

## Fix the Random Seed

In [3]:
def seed_everything(seed):
    """Seed the random sources used by this notebook.

    Args:
        seed: Seed value given to Python's ``random`` module and NumPy's
            global RNG, and exported (as a string) in ``PYTHONHASHSEED``.

    Returns:
        None.
    """
    random.seed(seed)
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    # so this export presumably only influences child processes -- confirm
    # if stable hashing inside this kernel is required.
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)

seed_everything(CFG['SEED']) # Fix the seed for reproducibility

## Data Pre-processing

In [4]:
import os
# Switch into the dataset directory so the relative './*.csv' paths used below resolve.
# The location can be overridden with the MACHINE_SOUND_DATA_DIR environment variable;
# when it is unset, the original hard-coded path is used unchanged.
os.chdir(os.environ.get('MACHINE_SOUND_DATA_DIR', '/Users/lhs/Desktop/Machine_Sound_Data'))

In [5]:
train_df = pd.read_csv('./train.csv') # all rows are normal samples
test_df = pd.read_csv('./test.csv')

In [6]:
def get_mfcc_feature(df):
    """Build one averaged MFCC vector for every audio file listed in ``df``.

    Args:
        df: DataFrame with a ``SAMPLE_PATH`` column of audio file paths.

    Returns:
        A list with one entry per path; each entry is a list containing the
        mean of every row of the MFCC matrix computed for that file
        (``CFG['N_MFCC']`` coefficients are requested from librosa).
    """
    def _mean_mfcc(wav_path):
        # Load the wav file with librosa at the configured sampling rate.
        signal, sample_rate = librosa.load(wav_path, sr=CFG['SR'])

        # Extract the MFCC matrix with librosa.
        coeffs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=CFG['N_MFCC'])

        # Use the mean of each extracted MFCC row as the feature.
        return [np.mean(row) for row in coeffs]

    # tqdm wraps the path column to show progress while files are processed.
    return [_mean_mfcc(wav_path) for wav_path in tqdm(df['SAMPLE_PATH'])]

In [7]:
# Compute the averaged-MFCC feature lists for the train and test tables
# (one feature vector per SAMPLE_PATH entry).
train_features = get_mfcc_feature(train_df)
test_features = get_mfcc_feature(test_df)

  0%|          | 0/1279 [00:00<?, ?it/s]

  0%|          | 0/1514 [00:00<?, ?it/s]

In [8]:
# Concatenate the MFCC features with the metadata columns of the train data

train1 = pd.DataFrame(data=train_features)
# Pick the metadata columns by name instead of by position (iloc[:, 2:4]) so the
# cell keeps working if the CSV column order changes; this also matches how the
# test cell selects 'FAN_TYPE'.
final_train = pd.concat([train1, train_df[['FAN_TYPE', 'LABEL']]], axis=1)
final_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,FAN_TYPE,LABEL
0,-332.822968,96.714211,-14.923252,21.968349,-8.564011,-2.027139,-11.869204,3.885383,-5.751901,3.539213,...,0.523915,-0.309364,-0.818508,-0.129562,0.530418,0.116271,-0.800855,-0.867277,2,0
1,-438.514435,142.282898,-2.122107,30.592234,0.737141,15.533080,-2.803471,4.224413,-1.894261,3.574963,...,0.042867,0.374269,0.762829,0.738473,0.288381,0.002098,0.344611,0.157145,0,0
2,-419.292419,123.302338,10.127716,21.659258,-1.092546,11.255921,-3.392200,1.564470,3.889135,3.802191,...,0.199252,0.078277,0.520589,0.860090,0.382899,-0.062304,-0.115386,-0.302028,0,0
3,-333.860535,97.461487,-13.956148,22.239996,-9.348626,-2.875100,-11.314520,6.395192,-2.481325,3.891461,...,-0.021310,0.223788,-0.586876,-0.260907,-0.127185,0.113592,-0.935072,-0.169548,2,0
4,-333.146606,90.000252,-21.705933,14.738313,-18.315388,-9.914182,-16.343977,2.563049,-6.700718,-0.878550,...,-0.031247,0.558820,-0.543790,-0.374026,-0.223215,0.091165,-0.612550,-0.216652,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1274,-331.278717,83.516144,-24.639460,17.099073,-17.417221,-9.260210,-16.640516,-0.968224,-8.006650,-0.210976,...,-0.172667,-0.083145,-0.373329,0.001631,-0.215286,-0.068310,-0.687748,-0.080964,2,0
1275,-331.944550,96.614929,-15.033259,22.885084,-9.548154,-3.824131,-13.007987,5.053965,-3.507394,4.024566,...,0.629672,0.535989,-0.188029,-0.090514,0.081146,0.166000,0.017817,-0.548864,2,0
1276,-335.417572,92.459663,-19.939020,14.659496,-14.785930,-6.819523,-14.032729,1.931983,-8.374944,0.402798,...,0.327789,0.258334,-0.339538,-0.661168,-0.750548,-0.497419,-0.981389,-1.050707,2,0
1277,-335.176880,94.095650,-16.676874,17.122639,-13.769090,-7.083161,-12.534894,5.114513,-4.965012,0.256115,...,-0.008657,-0.311247,-0.878406,-0.427334,-0.186461,-0.092595,-0.752961,-0.147661,2,0


In [9]:
# Concatenate the MFCC features with the FAN_TYPE column of the test data

test1 = pd.DataFrame(data = test_features)
final_test = pd.concat([test1, test_df['FAN_TYPE']], axis = 1)
final_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,FAN_TYPE
0,-337.466980,93.499161,-14.531839,18.782629,-12.993633,-8.891464,-15.007888,2.987521,-4.080492,3.411654,...,0.485381,0.375715,0.610794,0.073637,0.207322,0.096070,0.408414,-0.723793,-0.071414,2
1,-336.332123,98.042473,-14.108411,22.853271,-10.276470,-4.724311,-13.290108,5.331754,-3.047695,3.999879,...,0.016831,0.042590,0.843998,0.041890,0.020773,-0.281918,0.173492,-0.634122,0.104590,2
2,-411.103058,146.174515,-9.374091,28.873428,-3.150095,14.049159,-3.056240,6.908220,-0.042742,4.407775,...,0.600820,0.476922,0.563234,0.534703,0.684900,0.188768,0.328292,0.420767,-0.095794,0
3,-432.922638,149.789581,17.103006,15.994092,11.349452,14.492430,4.636677,7.593799,-0.623311,6.458681,...,0.770491,0.940058,0.350600,0.109936,0.470496,0.976864,-1.214070,-1.232217,-0.899378,0
4,-421.299835,147.652374,-8.984773,27.868708,-2.585596,14.579635,-2.599407,6.154584,-1.616730,5.772117,...,0.168322,-0.095196,0.534444,0.721170,1.066953,0.719139,0.225388,-0.045223,-0.460058,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1509,-433.645569,149.396683,15.755502,13.032167,12.392975,16.820803,6.094834,5.686713,-0.725644,6.116574,...,1.318711,0.423131,-0.261257,0.159387,0.486768,1.015050,-0.103403,0.325985,0.371032,0
1510,-400.652466,128.039993,7.505565,9.477396,-8.824609,3.085480,-6.793148,3.817405,-2.430128,-0.906629,...,0.330574,-0.324109,0.174071,-0.162394,-0.051136,0.150210,-0.030394,-0.762911,0.062790,2
1511,-425.808594,140.022720,-5.812835,33.935928,-1.197876,15.227143,-4.239627,4.924495,-3.108877,4.950401,...,0.302331,0.258988,0.435238,0.395255,0.415199,-0.006936,0.052725,0.278880,0.126119,0
1512,-428.836456,142.657898,-6.374496,33.241409,-2.003935,14.646495,-4.510120,4.902259,-3.379752,5.055974,...,0.336713,0.479046,0.773436,0.756806,0.699998,-0.015126,-0.232445,0.278983,0.515977,0


## Model Fit

In [None]:
# Fit the IsolationForest on the train features. This cell must run:
# the prediction cell below calls `model.predict`, and leaving the fit
# commented out makes a fresh-kernel run fail with NameError.
model = IsolationForest(n_estimators=200, max_samples=256, contamination='auto', random_state=CFG['SEED'], verbose=0)
model.fit(train_features)

## Prediction

In [None]:
def get_pred_label(model_pred):
    """Convert IsolationForest outputs to submission labels.

    The model emits 1 for normal and -1 for faulty samples; this maps them to
    0 (normal) and 1 (faulty). Any other value is passed through unchanged.

    Args:
        model_pred: Array-like of model outputs (ndarray, list, tuple, ...).

    Returns:
        A NumPy array with the same shape as ``model_pred`` holding the
        converted labels.
    """
    # Coerce first: comparing a plain list with `== 1` yields a single False
    # instead of an element-wise mask, which would mislabel every entry.
    model_pred = np.asarray(model_pred)
    # Order matters: 1 -> 0 must happen before -1 -> 1, otherwise the freshly
    # written 1s would be turned into 0s as well.
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

In [None]:
test_pred = model.predict(test_features) # model prediction
# Convert the model's 1 / -1 outputs to the 0 (normal) / 1 (faulty) encoding
# before they are written to the LABEL column; the train LABEL column uses 0
# for normal samples, so raw -1/1 values would not match that encoding.
test_pred = get_pred_label(test_pred)

In [None]:
# Display the predictions for a quick visual check.
test_pred

## Submission

In [None]:
# Read sample_submission.csv from the working directory (presumably the
# submission template whose LABEL column is filled in below).
submit = pd.read_csv('./sample_submission.csv')

In [None]:
# Store the predictions in the LABEL column and preview the first rows.
submit['LABEL'] = test_pred
submit.head()

In [None]:
import datetime

In [None]:
# Directory that collects the timestamped submission files.
path = '/Users/lhs/Desktop/GitHub/Dacon/230116_Machine_Error_Sound/result/'
# Create the directory on first use; to_csv does not create missing parent
# directories and would otherwise fail with an OSError.
os.makedirs(path, exist_ok=True)

# A per-run timestamp in the file name keeps earlier submissions from being overwritten.
# NOTE(review): the ':' characters in the timestamp are not valid in Windows file names.
now = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
submit.to_csv(f'{path}{now}.csv',encoding='utf-8', index=False)