In [34]:
import random
import pandas as pd
import numpy as np
import os
import librosa
from tqdm import tqdm

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, cross_val_score

from xgboost import XGBClassifier

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
def seed_everything(seed):
    """Seed the random sources used by this notebook.

    Writes ``str(seed)`` into the ``PYTHONHASHSEED`` environment variable and
    seeds both the standard-library ``random`` module and NumPy's global
    generator with ``seed``.

    Parameters
    ----------
    seed : int
        Value handed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Fix the seed once, before any data is loaded or any model is built.
seed_everything(42)

In [3]:
# Training metadata: drop the first two characters of every stored path and
# put './open/' in front, so the path resolves relative to this notebook.
# NOTE(review): the two dropped characters are presumably a leading './' -- confirm against the CSV.
train_df = pd.read_csv('./open/train.csv')
train_df['path'] = './open/' + train_df['path'].str[2:]

# Same rewrite for the test metadata.
test_df = pd.read_csv('./open/test.csv')
test_df['path'] = './open/' + test_df['path'].str[2:]

In [4]:
def get_mfcc_feature(df, n_mfcc=128, sr=16000):
    """Build a table of averaged MFCC features, one row per audio file.

    Every value in ``df['path']`` is loaded with ``librosa.load``, passed to
    ``librosa.feature.mfcc``, and each first-axis slice of the returned array
    is reduced to its mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'path'`` column of audio file paths.
    n_mfcc : int, default 128
        Number of coefficients requested from ``librosa.feature.mfcc``; also
        the number of columns in the returned frame.
    sr : int, default 16000
        Sampling rate passed to ``librosa.load``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'mfcc_1'`` ... ``'mfcc_<n_mfcc>'``; rows follow the order of
        ``df['path']``.
    """
    features = []
    for path in tqdm(df['path']):
        # Load the wav file with librosa at the requested sampling rate.
        y, loaded_sr = librosa.load(path, sr=sr)
        # Extract the MFCCs with librosa.
        mfcc = librosa.feature.mfcc(y=y, sr=loaded_sr, n_mfcc=n_mfcc)
        # Use the mean of each extracted MFCC as the feature value.
        features.append([np.mean(coefficient) for coefficient in mfcc])

    # Column count follows n_mfcc so the names always match the data width.
    columns = ['mfcc_' + str(i) for i in range(1, n_mfcc + 1)]
    return pd.DataFrame(features, columns=columns)

In [5]:
# Extract the MFCC feature table for the training files first, then the test files.
train_x, test_x = [get_mfcc_feature(frame) for frame in (train_df, test_df)]

100%|██████████| 5001/5001 [00:31<00:00, 160.56it/s]
100%|██████████| 1881/1881 [00:08<00:00, 234.78it/s]


In [10]:
# Target vector: the 'label' column of the training metadata.
train_y = train_df.loc[:, 'label']

In [12]:
# Export train_x and test_x so they can be used from Colab.
# Create the target directory first: to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
os.makedirs('./for_colab', exist_ok=True)
train_x.to_csv('./for_colab/train_mfcc.csv', index=False)
test_x.to_csv('./for_colab/test_mfcc.csv', index=False)

In [7]:
# Fit the min-max scaler on the training features only, then apply the same
# fitted scaler to both the training and the test features.
scaler = MinMaxScaler().fit(train_x)
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)

In [26]:
# Base estimator handed to the grid search; the searched hyper-parameters are
# supplied separately through the parameter grid.
model = XGBClassifier(
    n_estimators=200,
    objective='multi:softmax',
    random_state=42,
)

# Hyper-parameter grid: 5 * 3 * 3 * 5 * 3 = 675 combinations.
params = dict(
    max_depth=list(range(6, 11)),
    subsample=[0.5, 0.75, 1],
    colsample_bytree=[0.5, 0.75, 1],
    reg_lambda=[0, 0.5, 1, 1.5, 2],
    reg_alpha=[0, 0.5, 1],
)

In [27]:
# Exhaustive search over the parameter grid, scored by accuracy with 4-fold
# cross-validation; verbose=2 prints one line per fit.
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='accuracy',
    cv=4,
    verbose=2,
)
grid.fit(train_x, train_y)

Fitting 4 folds for each of 675 candidates, totalling 2700 fits
[CV] END colsample_bytree=0.5, max_depth=6, reg_alpha=0, reg_lambda=0, subsample=0.5; total time=   2.1s
[CV] END colsample_bytree=0.5, max_depth=6, reg_alpha=0, reg_lambda=0, subsample=0.5; total time=   2.1s
[CV] END colsample_bytree=0.5, max_depth=6, reg_alpha=0, reg_lambda=0, subsample=0.5; total time=   2.1s
[CV] END colsample_bytree=0.5, max_depth=6, reg_alpha=0, reg_lambda=0, subsample=0.5; total time=   2.1s
[CV] END colsample_bytree=0.5, max_depth=6, reg_alpha=0, reg_lambda=0, subsample=0.75; total time=   2.5s
[CV] END colsample_bytree=0.5, max_depth=6, reg_alpha=0, reg_lambda=0, subsample=0.75; total time=   2.7s
[CV] END colsample_bytree=0.5, max_depth=6, reg_alpha=0, reg_lambda=0, subsample=0.75; total time=   2.6s
[CV] END colsample_bytree=0.5, max_depth=6, reg_alpha=0, reg_lambda=0, subsample=0.75; total time=   2.6s
[CV] END colsample_bytree=0.5, max_depth=6, reg_alpha=0, reg_lambda=0, subsample=1; total ti

In [30]:
# Tabulate the grid-search results for inspection.
res_1 = pd.DataFrame(data=grid.cv_results_)

In [52]:
# Mean 4-fold cross-validated accuracy of the chosen configuration,
# keyed by the random seed (as a string) that was used for the model.
SDICT = {}

rss = [42, 69, 1000]

for rs in tqdm(rss):
    key = str(rs)

    # Pass the loop's seed to the model. random_state was previously pinned
    # to 69, so every iteration evaluated the same model and `rs` was only
    # used as the dictionary key.
    model = XGBClassifier(n_estimators=300, objective='multi:softmax', random_state=rs,
                        colsample_bytree=1, max_depth=6, reg_alpha=0, reg_lambda=2, subsample=.75,
                        learning_rate=.05
                        )

    score = cross_val_score(model, X=train_x, y=train_y, scoring='accuracy', cv=4)
    value = score.mean()

    SDICT[key] = value

100%|██████████| 3/3 [01:42<00:00, 34.12s/it]

{'42': 0.4663034372501998, '69': 0.4737005595523581, '1000': 0.47050103916866504}





In [54]:
# Final model: fit the chosen configuration on the full training set.
model = XGBClassifier(
    n_estimators=300,
    objective='multi:softmax',
    random_state=69,
    colsample_bytree=1,
    max_depth=6,
    reg_alpha=0,
    reg_lambda=2,
    subsample=.75,
    learning_rate=.05,
)
model.fit(train_x, train_y)

# Predict labels for the test features.
preds = model.predict(test_x)

# Put the predictions into the 'label' column of the sample submission
# and write the result out without the index.
submission = pd.read_csv('./open/sample_submission.csv').assign(label=preds)
submission.to_csv('./xgb_grid_submission.csv', index=False)