https://dacon.io/competitions/official/235930/codeshare/5994?page=1&dtype=recent

## Import

In [1]:
# !pip install catboost
# !pip install xgboost
# !pip install pycaret[full]

!pip install markupsafe==2.0.1

# from pycaret.utils import enable_colab
import jinja2




In [2]:
import random
import pandas as pd
import numpy as np
import os
import librosa
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore') 

In [3]:
import tensorflow as tf

In [4]:
def seed_everything(seed: int = 42):
    """Apply one seed value to every random source this notebook touches.

    Exports ``PYTHONHASHSEED`` into the process environment and seeds
    Python's ``random`` module, NumPy's global generator and TensorFlow.

    Args:
        seed: Integer seed handed to each generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    # The individual seeding calls are independent of each other.
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)


seed_everything(seed=42)

## Data Pre-processing

In [5]:
import os

# Working directory for the relative reads in this notebook ('./train_.csv',
# './test_.csv', './sample_submission.csv'). Set the MACHINE_SOUND_DATA_DIR
# environment variable to run on another machine; the default is the original
# author's local folder.
DATA_DIR = os.environ.get('MACHINE_SOUND_DATA_DIR', '/Users/lhs/Desktop/Machine_Sound_Data')
os.chdir(DATA_DIR)
# os.chdir('/content/drive/MyDrive/YDS/DACON/230116_Machine_Error_Sound')

In [6]:
# Read both tables and drop the 'Unnamed: 0' column (presumably a saved index — confirm).
# Per the original author's note, every training sample is a normal one.
train = pd.read_csv('./train_.csv').drop('Unnamed: 0', axis=1)
# train_df = train_df.iloc[:,2:]
test = pd.read_csv('./test_.csv').drop('Unnamed: 0', axis=1)
# test_df = test_df.iloc[:,2:]

In [None]:
# Disabled cell: the MFCC feature-extraction code below is wrapped in a bare
# string literal, so get_mfcc_feature is never defined or called. The text
# refers to CFG, train_df and test_df, none of which are defined in the
# visible cells of this notebook.
'''
def get_mfcc_feature(df):
    features = []
    for path in tqdm(df['SAMPLE_PATH']):
        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=CFG['SR'])
        
        # librosa패키지를 사용하여 mfcc 추출
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])

        y_feature = []
        # 추출된 MFCC들의 평균을 Feature로 사용
        for e in mfcc:
            y_feature.append(np.mean(e))
        features.append(y_feature)
        
    return features

train_features = get_mfcc_feature(train_df)
test_features = get_mfcc_feature(test_df)
'''

In [None]:
# Hold out 20% of the rows for validation. random_state pins the shuffle so
# that re-running this cell on its own reproduces the same split instead of
# depending on how far the global NumPy RNG has advanced.
# NOTE(review): the whole `train` frame is passed as X, so X_train / x_val
# still contain the LABEL column. In the visible cells LABEL is only assigned
# further down — confirm the column exists before this cell is run.
X_train, x_val, Y_train, y_val = train_test_split(
    train, train['LABEL'], test_size=0.2, random_state=42
)

## MCD Labeling

In [7]:
from sklearn.covariance import MinCovDet

In [8]:
# Minimum Covariance Determinant estimator fitted on the whole training frame.
# support_fraction=1 presumably keeps every sample in the support — confirm
# against the scikit-learn documentation.
mcd_model = MinCovDet(random_state=44, support_fraction=1)
mcd_model.fit(train)

MinCovDet(random_state=44, support_fraction=1)

In [9]:
import torch
# Treat the top-N samples (largest distances) as anomalies

def get_pred_label(model, x, k):
    """Flag the ``k`` samples with the largest Mahalanobis distance as anomalies.

    Args:
        model: Fitted estimator exposing ``mahalanobis(x)`` (e.g. ``MinCovDet``).
        x: Samples to score; passed unchanged to ``model.mahalanobis``.
        k: Number of samples to label 1. Values larger than the number of
            samples are clamped to that number.

    Returns:
        Tuple ``(pred, prob)``. ``pred`` is a list of 0/1 labels with 1 for the
        ``k`` largest distances; ``prob`` is the list of distances after
        conversion to float32.
    """
    # Score with the estimator that was passed in. The previous version ignored
    # `model` and always read the global `mcd_model`.
    prob = abs(model.mahalanobis(x))
    prob2 = torch.tensor(prob, dtype=torch.float)
    # torch.topk raises when k exceeds the number of elements, so clamp it.
    topk_indices = torch.topk(prob2, k=min(k, prob2.numel()), largest=True).indices

    pred = torch.zeros(len(x), dtype=torch.long)
    pred[topk_indices] = 1
    return pred.tolist(), prob2.tolist()


# Pseudo-label the training frame: the 118 rows with the largest Mahalanobis
# distance get LABEL 1, every other row gets LABEL 0.
# NOTE(review): 118 is a hand-picked count with no visible derivation.
val_pred, val_prob = get_pred_label(model=mcd_model, x=train, k=118)
train["LABEL"] = val_pred  # assign the label values

## PyCaret predict

In [10]:
from pycaret.classification import *

# Initialise the PyCaret classification experiment on the pseudo-labelled frame.
# session_id presumably pins the experiment's random seed — confirm in the PyCaret docs.
reg1 = setup(data=train, target='LABEL', silent=True, session_id=1974)

Unnamed: 0,Description,Value
0,Session id,1974
1,Target,LABEL
2,Target type,classification
3,Data shape,"(1279, 130)"
4,Train data shape,"(895, 130)"
5,Test data shape,"(384, 130)"
6,Numeric features,129
7,Preprocess,True
8,Imputation type,simple
9,Numeric imputation,mean


In [23]:
# Train the candidate models, starting with logistic regression
# (lr_model is one of the two models used for the test-set prediction).
lr_model = create_model(estimator="lr")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9333,0.9204,0.5556,0.7143,0.625,0.589,0.5946
1,0.8778,0.8162,0.4444,0.4,0.4211,0.3529,0.3536
2,0.9444,0.9492,0.6667,0.75,0.7059,0.6753,0.6768
3,0.9111,0.8642,0.5556,0.5556,0.5556,0.5062,0.5062
4,0.9333,0.9259,0.6667,0.6667,0.6667,0.6296,0.6296
5,0.9663,0.966,0.625,1.0,0.7692,0.7521,0.7763
6,0.9326,0.9691,0.75,0.6,0.6667,0.6297,0.6345
7,0.9438,0.9182,0.75,0.6667,0.7059,0.6749,0.6764
8,0.9551,0.8549,0.625,0.8333,0.7143,0.6904,0.6989
9,0.9326,0.8704,0.375,0.75,0.5,0.4681,0.5007


In [24]:
# CatBoost classifier (the second model used for the test-set prediction).
cb_model = create_model(estimator="catboost")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9222,0.9739,0.4444,0.6667,0.5333,0.4928,0.5048
1,0.9111,0.9383,0.5556,0.5556,0.5556,0.5062,0.5062
2,0.9444,0.9849,0.5556,0.8333,0.6667,0.6377,0.6533
3,0.9333,0.8999,0.4444,0.8,0.5714,0.5385,0.5659
4,0.9556,0.9835,0.6667,0.8571,0.75,0.726,0.7329
5,0.9438,0.9969,0.375,1.0,0.5455,0.522,0.5943
6,0.9101,0.9614,0.5,0.5,0.5,0.4506,0.4506
7,0.9775,0.9846,0.75,1.0,0.8571,0.8452,0.8555
8,0.9438,0.9892,0.5,0.8,0.6154,0.5868,0.6057
9,0.9438,0.9799,0.5,0.8,0.6154,0.5868,0.6057


In [25]:
# XGBoost classifier; not referenced by the prediction cells visible in this notebook.
xgb_model = create_model(estimator="xgboost")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9444,0.9698,0.6667,0.75,0.7059,0.6753,0.6768
1,0.9333,0.9602,0.7778,0.6364,0.7,0.6629,0.6671
2,0.9333,0.9794,0.4444,0.8,0.5714,0.5385,0.5659
3,0.9444,0.8807,0.5556,0.8333,0.6667,0.6377,0.6533
4,0.9444,0.9767,0.6667,0.75,0.7059,0.6753,0.6768
5,0.9438,0.9954,0.375,1.0,0.5455,0.522,0.5943
6,0.8989,0.9475,0.375,0.4286,0.4,0.3451,0.346
7,0.9663,0.9784,0.625,1.0,0.7692,0.7521,0.7763
8,0.9326,0.9599,0.5,0.6667,0.5714,0.5357,0.5422
9,0.9551,0.9877,0.625,0.8333,0.7143,0.6904,0.6989


In [26]:
# LightGBM classifier; not referenced by the prediction cells visible in this notebook.
lgbm_model = create_model(estimator="lightgbm")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9556,0.9781,0.7778,0.7778,0.7778,0.7531,0.7531
1,0.9222,0.9492,0.5556,0.625,0.5882,0.5455,0.5466
2,0.9444,0.9808,0.5556,0.8333,0.6667,0.6377,0.6533
3,0.9333,0.882,0.4444,0.8,0.5714,0.5385,0.5659
4,0.9333,0.9671,0.6667,0.6667,0.6667,0.6296,0.6296
5,0.9663,1.0,0.625,1.0,0.7692,0.7521,0.7763
6,0.9326,0.9722,0.625,0.625,0.625,0.588,0.588
7,0.9551,0.9769,0.625,0.8333,0.7143,0.6904,0.6989
8,0.9551,0.9707,0.625,0.8333,0.7143,0.6904,0.6989
9,0.9551,0.983,0.625,0.8333,0.7143,0.6904,0.6989


In [None]:
# Random-forest classifier; not referenced by the prediction cells visible in this notebook.
rf_model = create_model(estimator="rf")

IntProgress(value=0, description='Processing: ', max=4)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC


In [13]:
prediction_test = test  # alias of `test`: same object, no copy is made

def get_pred_label(model,x, k):
    """Flag the ``k`` samples with the lowest first-class probability as anomalies.

    NOTE: this definition shadows the earlier Mahalanobis-based
    ``get_pred_label`` defined higher up in the notebook.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        x: Samples to score; passed unchanged to ``model.predict_proba``.
        k: Number of samples to label 1. Values larger than the number of
            samples are clamped to that number.

    Returns:
        Tuple ``(pred, prob)``. ``pred`` is a list of 0/1 labels with 1 for the
        ``k`` smallest scores; ``prob`` is the list of column-0 probabilities
        (presumably the probability of LABEL 0 — confirm the class order)
        after conversion to float32.
    """
    # Score the samples that were passed in. The previous version ignored `x`
    # and always read the global `test`.
    prob = model.predict_proba(x)[:, 0]
    prob2 = torch.tensor(prob, dtype=torch.float)

    # torch.topk raises when k exceeds the number of elements, so clamp it.
    topk_indices = torch.topk(prob2, k=min(k, prob2.numel()), largest=False).indices

    pred = torch.zeros(len(x), dtype=torch.long)
    pred[topk_indices] = 1

    return pred.tolist(), prob2.tolist()

# Per classifier, label the test rows with the lowest column-0 probability as 1.
# NOTE(review): 311 and 305 are hand-picked counts with no visible derivation;
# val_prob is overwritten by the second call and only keeps the CatBoost scores.
val_pred_lr, val_prob = get_pred_label(model=lr_model, x=prediction_test, k=311)
val_pred_cat, val_prob = get_pred_label(model=cb_model, x=prediction_test, k=305)


print("End!!")


End!!


In [15]:
# Sum the two 0/1 votes per sample; a sample is kept as anomalous only when
# both CatBoost and logistic regression flagged it (sum of 2).
result = np.add(val_pred_cat, val_pred_lr)
temp_result = (result >= 2).astype(int)  # final result

## Submission

In [14]:
# Load the submission template from the current working directory.
submit = pd.read_csv('./sample_submission.csv')

In [17]:
# Write the ensemble output into the template (1 = flagged by both models).
submit['LABEL'] = temp_result
submit.head()

Unnamed: 0,SAMPLE_ID,LABEL
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,0


In [18]:
# Sanity check: how many rows were predicted as 0 and as 1.
submit.LABEL.value_counts()

0    1325
1     189
Name: LABEL, dtype: int64

In [19]:
import datetime

In [20]:
# Output directory for submission files. Set the SUBMISSION_DIR environment
# variable to write elsewhere; the default is the original author's local folder.
path = os.environ.get('SUBMISSION_DIR', '/Users/lhs/Desktop/GitHub/Dacon/230116_Machine_Error_Sound/result/')
# Create the directory up front so to_csv does not fail on a fresh machine.
os.makedirs(path, exist_ok=True)

# Timestamped file name. '-' replaces ':' in the time part because ':' is not
# a valid file-name character on Windows.
now = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
submit.to_csv(os.path.join(path, f'{now}.csv'), encoding='utf-8', index=False)