## Import

In [2]:
import random
import pandas as pd
import numpy as np
import os
import librosa
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore') 

In [26]:
def MacroF1Score(y_true, y_pred):
    f1_score(y_true, y_pred, average='macro')

## Hyperparameter Setting

In [3]:
CFG = {
    'SR':16000,
    'N_MFCC':128, # MFCC 벡터를 추출할 개수 (<=128)
    'SEED':41
}

## Fixed RandomSeed

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(CFG['SEED']) # Seed 고정

## Data Pre-processing

In [5]:
import os
os.chdir('/Users/lhs/Desktop/Machine_Sound_Data')

In [64]:
train_df = pd.read_csv('./train.csv') # 모두 정상 Sample
test_df = pd.read_csv('./test.csv')

In [61]:
def get_mfcc_feature(df):
    features = []
    for path in tqdm(df['SAMPLE_PATH']):
        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=CFG['SR'])
        
        # librosa패키지를 사용하여 mfcc 추출
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])

        y_feature = []
        # 추출된 MFCC들의 평균을 Feature로 사용
        for e in mfcc:
            y_feature.append(np.mean(e))
        features.append(y_feature)
    return features

In [8]:
train_features = get_mfcc_feature(train_df)
test_features = get_mfcc_feature(test_df)

  0%|          | 0/1279 [00:00<?, ?it/s]

  0%|          | 0/1514 [00:00<?, ?it/s]

In [71]:
# mfcc data와 train data column concat

train1 = pd.DataFrame(data = train_features)
final_train = pd.concat([train1, train_df.iloc[:, 2:4]], axis = 1)
final_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,FAN_TYPE,LABEL
0,-332.822968,96.714211,-14.923252,21.968349,-8.564011,-2.027139,-11.869205,3.885383,-5.751901,3.539213,...,0.523915,-0.309364,-0.818508,-0.129562,0.530418,0.116271,-0.800855,-0.867277,2,0
1,-438.514435,142.282898,-2.122107,30.592234,0.737141,15.533081,-2.803471,4.224413,-1.894261,3.574962,...,0.042868,0.374269,0.762829,0.738473,0.288381,0.002098,0.344611,0.157145,0,0
2,-419.292419,123.302338,10.127716,21.659258,-1.092546,11.255921,-3.392200,1.564470,3.889135,3.802191,...,0.199252,0.078277,0.520589,0.860090,0.382899,-0.062304,-0.115386,-0.302028,0,0
3,-333.860535,97.461487,-13.956148,22.239996,-9.348626,-2.875100,-11.314520,6.395192,-2.481325,3.891461,...,-0.021310,0.223788,-0.586875,-0.260907,-0.127185,0.113592,-0.935072,-0.169548,2,0
4,-333.146606,90.000252,-21.705933,14.738313,-18.315388,-9.914182,-16.343977,2.563049,-6.700718,-0.878550,...,-0.031247,0.558820,-0.543790,-0.374026,-0.223215,0.091165,-0.612550,-0.216652,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1274,-331.278717,83.516144,-24.639460,17.099073,-17.417221,-9.260210,-16.640516,-0.968224,-8.006650,-0.210976,...,-0.172667,-0.083145,-0.373329,0.001631,-0.215286,-0.068310,-0.687748,-0.080964,2,0
1275,-331.944550,96.614929,-15.033259,22.885084,-9.548154,-3.824131,-13.007987,5.053965,-3.507394,4.024566,...,0.629672,0.535989,-0.188029,-0.090514,0.081146,0.166000,0.017817,-0.548864,2,0
1276,-335.417572,92.459663,-19.939020,14.659496,-14.785930,-6.819523,-14.032729,1.931983,-8.374944,0.402798,...,0.327789,0.258334,-0.339538,-0.661168,-0.750548,-0.497419,-0.981389,-1.050707,2,0
1277,-335.176880,94.095650,-16.676874,17.122639,-13.769090,-7.083161,-12.534894,5.114513,-4.965012,0.256115,...,-0.008657,-0.311247,-0.878406,-0.427334,-0.186461,-0.092595,-0.752961,-0.147661,2,0


In [73]:
# mfcc data와 train data column concat

test1 = pd.DataFrame(data = test_features)
final_test = pd.concat([test1, test_df['FAN_TYPE']], axis = 1)
final_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,FAN_TYPE
0,-337.466980,93.499161,-14.531839,18.782629,-12.993633,-8.891464,-15.007888,2.987521,-4.080492,3.411654,...,0.485382,0.375715,0.610794,0.073637,0.207322,0.096070,0.408414,-0.723793,-0.071414,2
1,-336.332123,98.042473,-14.108411,22.853271,-10.276470,-4.724311,-13.290108,5.331754,-3.047695,3.999879,...,0.016831,0.042590,0.843998,0.041890,0.020773,-0.281918,0.173492,-0.634122,0.104590,2
2,-411.103058,146.174515,-9.374091,28.873428,-3.150095,14.049159,-3.056240,6.908220,-0.042742,4.407775,...,0.600820,0.476922,0.563234,0.534703,0.684900,0.188768,0.328292,0.420767,-0.095793,0
3,-432.922638,149.789581,17.103006,15.994092,11.349452,14.492430,4.636677,7.593799,-0.623311,6.458681,...,0.770491,0.940058,0.350600,0.109936,0.470496,0.976864,-1.214070,-1.232217,-0.899378,0
4,-421.299835,147.652374,-8.984773,27.868708,-2.585596,14.579635,-2.599407,6.154584,-1.616730,5.772117,...,0.168322,-0.095196,0.534444,0.721170,1.066953,0.719139,0.225388,-0.045223,-0.460058,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1509,-433.645569,149.396683,15.755502,13.032167,12.392975,16.820803,6.094834,5.686713,-0.725644,6.116574,...,1.318711,0.423131,-0.261257,0.159387,0.486768,1.015050,-0.103403,0.325985,0.371032,0
1510,-400.652466,128.039993,7.505565,9.477396,-8.824609,3.085480,-6.793148,3.817405,-2.430128,-0.906629,...,0.330574,-0.324109,0.174071,-0.162393,-0.051136,0.150210,-0.030394,-0.762911,0.062789,2
1511,-425.808594,140.022720,-5.812835,33.935928,-1.197876,15.227143,-4.239627,4.924495,-3.108877,4.950400,...,0.302331,0.258988,0.435238,0.395255,0.415199,-0.006936,0.052726,0.278880,0.126119,0
1512,-428.836456,142.657898,-6.374496,33.241409,-2.003935,14.646495,-4.510120,4.902259,-3.379752,5.055974,...,0.336712,0.479046,0.773436,0.756806,0.699998,-0.015126,-0.232445,0.278983,0.515977,0


## Model Fit

In [9]:
# model = IsolationForest(n_estimators=200, max_samples=256, contamination='auto', random_state=CFG['SEED'], verbose=0)
# model.fit(train_features)

# GridSearch CV

In [74]:
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

In [82]:
X_train, X_val, y_train, y_val = train_test_split(final_train.iloc[:,:-1], final_train['LABEL'], test_size=0.2, random_state = 2022)



In [88]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,FAN_TYPE
923,-335.863281,97.765419,-15.796762,20.356749,-10.554089,-3.499343,-13.088073,4.177598,-4.561926,2.695228,...,0.538365,0.135021,-0.270394,-0.897908,0.061179,0.103535,0.215751,-1.097392,-0.513721,2
6,-334.768341,97.643181,-16.474014,20.113930,-11.316394,-4.576036,-13.921519,4.081810,-4.064880,3.871529,...,-0.082930,-0.148794,0.347283,-0.199467,-0.061160,-0.226007,0.369359,-0.308052,0.034860,2
218,-390.911102,99.408913,-0.726862,33.246262,-2.331520,12.342574,-2.742883,5.833442,-1.458750,4.398959,...,0.499994,-0.130218,0.156451,0.567097,0.091805,0.339062,0.380822,-0.068756,-0.016941,0
986,-421.920868,136.678192,-5.861570,33.361790,-1.582056,15.921360,-4.179723,6.033989,-2.185930,5.405187,...,0.201563,0.323319,0.746767,0.766282,0.560964,0.164961,0.322844,0.387393,0.351424,0
405,-333.761047,93.761833,-17.791639,16.463968,-14.422776,-7.727587,-13.207055,4.385335,-5.464883,0.168778,...,-0.086782,-0.157032,-0.285320,-0.626236,-0.227667,-0.091748,0.097783,-0.580408,-0.619501,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,-334.622009,96.911003,-14.729789,21.805477,-9.827540,-3.683684,-11.551768,5.732097,-3.692539,3.363894,...,0.282510,-0.112432,-0.100007,-0.596921,-0.010097,-0.028019,0.335168,-0.812406,-0.441684,2
624,-425.526245,140.033768,-6.064503,32.821766,-1.948781,15.550639,-4.123956,5.626962,-2.507173,5.388901,...,0.195876,0.470622,0.487370,0.537508,0.610446,-0.089015,0.102456,0.546928,0.271275,0
173,-334.873962,98.232239,-14.263616,22.629784,-10.237970,-4.473517,-13.412573,4.767087,-4.039220,3.107128,...,0.154476,0.413897,0.581093,-0.037016,0.281799,0.111084,0.218491,-0.424956,0.164271,2
1244,-339.049713,97.750153,-14.241977,22.827490,-9.208010,-2.640785,-11.580046,5.248443,-4.219984,4.018301,...,0.614140,0.126496,0.068913,-0.535674,0.000986,0.495360,0.315336,-0.671182,-0.451917,2


In [89]:
sampler = TPESampler(seed=20) # 젤 많이 쓰이는 sampler, DL에서 optimizer 느낌

def objective(trial):
#     dtrain = Dataset(X_train, label = y_train)
#     dtest = Dataset(X_val, label = y_val)

    params = {
        'verbose': -1,
        'random_state' : 2022,
        'max_features': trial.suggest_loguniform('max_depth', 1e-12, 1.0),     
        'n_estimators': trial.suggest_int('n_estimators', 100, 5000),
    }
    
    model = IsolationForest(**params) # ** = 가변적이라는 뜻
    model.fit(X_train, y_train)
    fl_score = f1_score(y_val, model.predict(X_val), average='macro')
    return fl_score

study_lgb = optuna.create_study(direction='minimize', sampler=sampler)
study_lgb.optimize(objective, n_trials=30)

[32m[I 2022-12-15 17:53:25,624][0m A new study created in memory with name: no-name-bd163fea-ba3d-4481-8cc2-1259a67ced67[0m
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[33m[W 2022-12-15 17:53:27,664][0m Trial 0 failed because of the following error: ZeroDivisionError('integer division or modulo by zero')[0m
Traceback (most recent call last):
  File "/Users/lhs/miniforge3/envs/lhs/lib/python3.8/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/1d/lfkpl6ps02xgh0rdprtbmgvw0000gn/T/ipykernel_7330/757224065.py", line 15, in objective
    model.fit(X_train, y_train)
  File "/Users/lhs/miniforge3/envs/lhs/lib/python3.8/site-packages/sklearn/ensemble/_iforest.py", line 306, in fit
    super()._fit(
  File "/Users/lhs/miniforge3/envs/lhs/lib/python3.8/site-packages/sklearn/ensemble/_bagging.py", line 434, in _fit
    all_results = Parallel(
  File "/Users/lhs/miniforge3/envs/lhs/lib

ZeroDivisionError: integer division or modulo by zero

## Prediction

In [10]:
def get_pred_label(model_pred):
    # IsolationForest 모델 출력 (1:정상, -1:불량) 이므로 (0:정상, 1:불량)로 Label 변환
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

In [54]:
test_pred = model.predict(test_features) # model prediction
# test_pred = get_pred_label(test_pred)

In [55]:
test_pred

array([ 1,  1,  1, ...,  1,  1, -1])

## Submission

In [14]:
submit = pd.read_csv('./sample_submission.csv')

In [15]:
submit['LABEL'] = test_pred
submit.head()

Unnamed: 0,SAMPLE_ID,LABEL
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,1
4,TEST_0004,1


In [23]:
import datetime

In [24]:
path = '/Users/lhs/Desktop/GitHub/Dacon/230116_Machine_Error_Sound/result/'

now = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
submit.to_csv(f'{path}{now}.csv',encoding='utf-8', index=False)