## Imports

In [1]:
import random
import pandas as pd
import numpy as np
import os
from tqdm.auto import tqdm
import librosa

from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings(action='ignore') 

## Hyperparameter Setting

In [2]:
# Global configuration shared by the cells below.
CFG = {
    'SR':16000, # sample rate passed to librosa.load
    'N_MFCC':32, # number of MFCC coefficients to extract (passed as n_mfcc to librosa.feature.mfcc)
    'SEED':42 # seed passed to seed_everything
}

## Fix Random Seed

In [3]:
def seed_everything(seed):
    """Seed the random number sources this notebook relies on.

    Stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable and
    seeds both the standard-library ``random`` module and NumPy's global
    generator with the same value.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Both seeders take the same argument, so apply them in one pass.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(CFG['SEED']) # Fix the random seed

## Data Pre-Processing

In [4]:
# Load the train/test tables from CSV files in the working directory.
# Later cells read a 'path' column from both frames and a 'label' column from train_df.
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [5]:
def get_mfcc_feature(df):
    """Build a table of averaged MFCC features, one row per audio file.

    Every entry of ``df['path']`` is loaded with librosa using
    ``sr=CFG['SR']``; ``CFG['N_MFCC']`` MFCCs are then extracted and each
    row of the resulting MFCC array is reduced to its mean.

    Args:
        df: DataFrame with a ``'path'`` column of audio file paths.

    Returns:
        DataFrame with columns ``mfcc_1`` .. ``mfcc_<N_MFCC>``, one row per
        path, in the same order as ``df['path']``.
    """
    rows = []
    for wav_path in tqdm(df['path']):
        # Load the wav file with librosa.
        signal, rate = librosa.load(wav_path, sr=CFG['SR'])
        # Extract the MFCCs with librosa.
        coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=CFG['N_MFCC'])
        # Use the mean of each extracted MFCC row as the feature value.
        rows.append([np.mean(coeff_row) for coeff_row in coeffs])

    column_names = ['mfcc_' + str(k) for k in range(1, CFG['N_MFCC'] + 1)]
    return pd.DataFrame(rows, columns=column_names)

In [6]:
# Extract the averaged-MFCC feature table for every train and test file.
train_x = get_mfcc_feature(train_df)
test_x = get_mfcc_feature(test_df)

  0%|          | 0/5001 [00:00<?, ?it/s]

  0%|          | 0/1881 [00:00<?, ?it/s]

In [None]:
# kurtosis = pd.DataFrame(df0.kurtosis(),columns = ['kurtosis'])
# skew = pd.DataFrame(df0.skew(),columns = ['skew'])
# rms = pd.DataFrame(df0.apply(lambda d: np.sqrt((d ** 2).sum()/d.size)),columns = ['rms'])
# std = pd.DataFrame(df0.std(),columns = ['std'])
# mx = pd.DataFrame(df0.apply(lambda d: d.max()),columns = ['max'])
# mn = pd.DataFrame(df0.apply(lambda d: d.min()),columns = ['min'])
# mean = pd.DataFrame(df0.mean(),columns = ['mean'])

# crest = pd.DataFrame(df0.apply(lambda d: abs(d.max())/ np.sqrt ((d ** 2).sum()/d.size)), columns = ['crest'])
# shape = pd.DataFrame(df0.apply(lambda d: np.sqrt ((d ** 2).sum()/d.size) / abs(d.mean())), columns = ['shape'])
# impulse = pd.DataFrame(df0.apply(lambda d: abs(d.max()) / abs(d.mean())), columns = ['impulse'])
# p2p = pd.DataFrame(df0.apply(lambda d: abs(d.max()) + abs(d.min())), columns = ['p2p'])

# clearence = pd.DataFrame(df0.apply(lambda d: ((np.sqrt(d.abs())).sum()/len(d))**2),columns = ['clearence'])
# entropy = pd.DataFrame(df0.apply(lambda d: stats.entropy(pd.cut(d, 500).value_counts())),columns = ['entropy'])

In [7]:
# Attach the target column to the training features (mutates train_x in place;
# test_x is left without a 'label' column).
train_x['label'] = train_df['label']

## Classification Model Fit

In [8]:
# Feature names are taken from test_x because train_x also carries the 'label' column.
features = list(test_x.columns)
target = 'label'

In [9]:
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier
from sklearn.metrics import log_loss, mean_squared_error, mean_absolute_error, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm

# Stratified 10-fold cross-validation: one CatBoost classifier is fitted per
# fold and kept in `models` so the test set can be scored by every fold model.
kf = StratifiedKFold(n_splits = 10)
models = []      # fitted classifier for each fold
val_scores = []  # validation accuracy for each fold
preds = []

params = {
    'n_estimators':2000,
    'max_depth':6,
    'early_stopping_rounds':200,
    'learning_rate':0.005,
    #'one_hot_max_size':3,
    'bootstrap_type':"MVS",
    'l2_leaf_reg':2,
    # NOTE(review): differs from CFG['SEED'] (42) — confirm this is intentional.
    'random_state':41
}

# total= lets tqdm render a real progress bar; a bare generator has no length.
for train_index, val_index in tqdm(kf.split(train_x, train_x[target]), total=kf.get_n_splits()):

        # The fold indices are row positions, not index labels, so select with
        # .iloc; label-based lookup only works while train_x has a default
        # RangeIndex.
        X_train, X_val = train_x[features].iloc[train_index], train_x[features].iloc[val_index]
        y_train, y_val = train_x[target].iloc[train_index], train_x[target].iloc[val_index]

        model = CatBoostClassifier(**params)

        # The held-out fold is passed as eval_set; its metric is the `test:`
        # column in the training log.
        model.fit(X_train, y_train, eval_set = [(X_val, y_val)], verbose = 2000)

        models.append(model)
        score = accuracy_score(y_val, model.predict(X_val))
        print(f'''
        ***********************************
            score : {score}                           
        ***********************************
        ''')

        val_scores.append(score)


# model = DecisionTreeClassifier(random_state=CFG['SEED'])
# model.fit(train_x, train_y)

0it [00:00, ?it/s]

0:	learn: 1.7889203	test: 1.7886515	best: 1.7886515 (0)	total: 382ms	remaining: 12m 43s
1999:	learn: 1.0889217	test: 1.3186659	best: 1.3186659 (1999)	total: 5m 51s	remaining: 0us

bestTest = 1.318665851
bestIteration = 1999



1it [05:51, 351.67s/it]


        ***********************************
            score : 0.47704590818363274                           
        ***********************************
        
0:	learn: 1.7888228	test: 1.7890161	best: 1.7890161 (0)	total: 198ms	remaining: 6m 35s
1999:	learn: 1.0842536	test: 1.3629930	best: 1.3629930 (1999)	total: 5m 40s	remaining: 0us

bestTest = 1.36299302
bestIteration = 1999



2it [11:32, 345.22s/it]


        ***********************************
            score : 0.436                           
        ***********************************
        
0:	learn: 1.7888575	test: 1.7890051	best: 1.7890051 (0)	total: 199ms	remaining: 6m 37s
1999:	learn: 1.0823248	test: 1.3829239	best: 1.3829239 (1999)	total: 5m 33s	remaining: 0us

bestTest = 1.382923904
bestIteration = 1999



3it [17:06, 340.32s/it]


        ***********************************
            score : 0.45                           
        ***********************************
        
0:	learn: 1.7892833	test: 1.7892872	best: 1.7892872 (0)	total: 196ms	remaining: 6m 32s


3it [19:54, 398.10s/it]


KeyboardInterrupt: 

In [None]:
# Collect each fold model's class-probability predictions for the test set,
# stacked along a new leading axis (one entry per model), and display them.
preds = np.array([
    fold_model.predict_proba(test_x[features])
    for fold_model in models
])
preds

In [None]:
# Display the class probabilities averaged over the fold models.
np.mean(preds, axis=0)

In [None]:
# Ensemble: average the per-model class probabilities and pick the most likely
# class for each test row. argmax yields a column position, so it is mapped
# through the classifier's classes_ to recover the actual label values (the
# position equals the label only when labels are exactly 0..K-1).
# Assumes every fold model saw the same set of classes — TODO confirm.
# NOTE: this rebinds `preds` from the probability array to a list of labels,
# so this cell cannot be re-run without first re-running the cell that builds
# the probability array.
mean_proba = preds.mean(axis=0)
class_labels = np.asarray(models[0].classes_)
preds = list(class_labels[np.argmax(mean_proba, axis=1)])

## Inference

In [None]:
# Show how many test rows were assigned to each predicted label.
pd.DataFrame(preds).value_counts()

## Submission

In [None]:
# Read the sample submission, set its 'label' column to the predicted labels,
# and write the result to CSV without the index column.
submission = pd.read_csv('./sample_submission.csv').assign(label=preds)
submission.to_csv('./baseline_submission.csv', index=False)