https://dacon.io/competitions/official/235738/codeshare/2902?page=1&dtype=recent

In [2]:
import os
import sys
import shutil
import numpy as np
import pandas as pd
import datetime
import random
# import glob

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns
import scipy as sc
import scipy.cluster.hierarchy as shc
from scipy.stats import skew
from scipy.stats import spearmanr

from tqdm import tqdm
from glob import glob
from matplotlib import font_manager, rc, cm
plt.rcParams['font.family'] = 'Malgun Gothic'
%matplotlib inline

import sklearn
from sklearn.linear_model import *
from sklearn.svm import SVR
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
# from sktime.forecasting.model_selection import temporal_train_test_split
# from sktime.utils.plotting import plot_series

import lightgbm as lgb
from lightgbm import LGBMRegressor

import catboost
from catboost import CatBoostRegressor
from catboost import Pool
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import warnings
warnings.filterwarnings("ignore")

SEED = 2
np.random.seed(SEED)

# 음성 데이터 분석용 라이브러리
import librosa
import librosa.display

In [3]:
data_path = r'D:\Study\Competition_English_Sound_Classification\Data'

In [60]:
africa_train_paths = glob(data_path + r'\train\africa\*.wav')
australia_train_paths = glob(data_path + r'\train\australia\*.wav')
canada_train_paths = glob(data_path + r'\train\canada\*.wav')
england_train_paths = glob(data_path + r'\train\england\*.wav')
hongkong_train_paths = glob(data_path + r'\train\hongkong\*.wav')
us_train_paths = glob(data_path + r'\train\us\*.wav')

path_list = [africa_train_paths,
             australia_train_paths,
             canada_train_paths,
             england_train_paths,
             hongkong_train_paths,
             us_train_paths]

In [16]:
test_paths = glob(data_path + r'\test\*.wav')

In [43]:
def load_data(paths):
    result = []
    
    for path in tqdm(paths):
        # sr(sampling rate) - 1초당 16000개의 데이터를 샘플링 한다는 의미
        data, sr = librosa.load(path, sr = 16000)
        result.append(data)
        
    result = np.array(result)
    
    return result

def get_features(data, sr = 16000, n_fft = 2048, win_length = 200, hop_length = 160, n_mels = 64):
    mel = []
    for i in tqdm(data):
        mel_ = librosa.feature.melspectrogram(i,
                                              sr = sr,
                                              n_fft = n_fft,
                                              win_length = win_length,
                                              hop_length = hop_length,
                                              n_mels = n_mels)
        mel.append(mel_)
    mel = np.array(mel)
    mel = librosa.power_to_db(mel, ref = np.max)
    
    return mel

def set_length(data, d_mini):
    result = []
    for value in tqdm(data):
        # 입력한 값(d_mini)을 기준으로 음성 데이터를 자른다.
        # 입력한 값보다 더 짧은 데이터의 경우 남는 부분에 0을 채워넣어서 길이를 맞춘다.
        value = value[:d_mini]
        if len(value) < d_mini:
            value = np.append(value, [0] * (d_mini - len(value)))
            
        result.append(np.array(value, dtype = np.float32))
    
    result = np.array(result)
    
    return result

### 작업의 편의성을 위해 데이터를 numpy 파일로 변환하여 저장

In [72]:
if os.path.isdir(os.path.join(data_path, 'npy_data')):
    pass
else:
    os.mkdir(os.path.join(data_path, 'npy_data'))

In [73]:
africa_train_data = load_data(africa_train_paths)
australia_train_data = load_data(australia_train_paths)
canada_train_data = load_data(canada_train_paths)
england_train_data = load_data(england_train_paths)
hongkong_train_data = load_data(hongkong_train_paths)
us_train_data = load_data(us_train_paths)

100%|██████████| 2500/2500 [07:53<00:00,  5.28it/s]
100%|██████████| 1000/1000 [03:10<00:00,  5.24it/s]
100%|██████████| 1000/1000 [03:00<00:00,  5.53it/s]
100%|██████████| 10000/10000 [32:43<00:00,  5.09it/s] 
100%|██████████| 1020/1020 [03:36<00:00,  4.72it/s]
100%|██████████| 10000/10000 [34:17<00:00,  4.86it/s] 


In [76]:
test_data = load_data(test_paths)

100%|██████████| 6100/6100 [18:55<00:00,  5.37it/s] 


In [77]:
# np.save(os.path.join(data_path, r'npy_data/africa.npy'), africa_train_data)
# np.save(os.path.join(data_path, r'npy_data/australia.npy'), australia_train_data)
# np.save(os.path.join(data_path, r'npy_data/canada.npy'), canada_train_data)
# np.save(os.path.join(data_path, r'npy_data/england.npy'), england_train_data)
# np.save(os.path.join(data_path, r'npy_data/hongkong.npy'), hongkong_train_data)
# np.save(os.path.join(data_path, r'npy_data/us.npy'), us_train_data)

# np.save(os.path.join(data_path, r'npy_data/test.npy'), test_data)

In [9]:
# africa_train_data = np.load(os.path.join(data_path, 'npy_data/africa.npy'), allow_pickle = True)
# australia_train_data = np.load(os.path.join(data_path, 'npy_data/australia.npy'), allow_pickle = True)
# canada_train_data = np.load(os.path.join(data_path, 'npy_data/canada.npy'), allow_pickle = True)
# england_train_data = np.load(os.path.join(data_path, 'npy_data/england.npy'), allow_pickle = True)
# hongkong_train_data = np.load(os.path.join(data_path, 'npy_data/hongkong.npy'), allow_pickle = True)
# us_train_data = np.load(os.path.join(data_path, 'npy_data/us.npy'), allow_pickle = True)

train_data_list = [africa_train_data, australia_train_data, canada_train_data, england_train_data, hongkong_train_data, us_train_data]

test_data = np.load(os.path.join(data_path, 'npy_data/test.npy'), allow_pickle = True)

In [13]:
for t_data in train_data_list:
    print(t_data.shape)

print()
print(test_data.shape)

(2500,)
(1000,)
(1000,)
(10000,)
(1020,)
(10000,)

(6100,)


### Data Reshape

In [65]:
np.save(os.path.join(data_path, r'npy_data/X_train_200.npy'), X_train_200)
np.save(os.path.join(data_path, r'npy_data/X_train_400.npy'), X_train_400)
np.save(os.path.join(data_path, r'npy_data/X_train_800.npy'), X_train_800)
np.save(os.path.join(data_path, r'npy_data/X_train_1000.npy'), X_train_1000)

In [64]:
# X_train = np.concatenate(train_data_list, axis = 0)
# X_train = set_length(X_train, 100000)

# X_train_200 = get_features(data = X_train, win_length = 200)
# X_train_400 = get_features(data = X_train, win_length = 400)
# X_train_800 = get_features(data = X_train, win_length = 800)
# X_train_1000 = get_features(data = X_train, win_length = 1000)

# 
# X_train_200 = X_train_200.reshape(X_train_200.shape[0], X_train_200.shape[1], X_train_200.shape[2], 1)
# X_train_400 = X_train_400.reshape(X_train_400.shape[0], X_train_400.shape[1], X_train_400.shape[2], 1)
# X_train_800 = X_train_800.reshape(X_train_800.shape[0], X_train_800.shape[1], X_train_800.shape[2], 1)
# X_train_1000 = X_train_1000.reshape(X_train_1000.shape[0], X_train_1000.shape[1], X_train_1000.shape[2], 1)

X_train_multi = np.concatenate([X_train_200,
                                X_train_400,
                                X_train_800,
                                X_train_1000], -1)
np.save(os.path.join(data_path, r'npy_data/X_train_multi.npy'), X_train_multi)

MemoryError: Unable to allocate 15.2 GiB for an array with shape (25520, 64, 626, 4) and data type float32

In [None]:
X_test = np.load(os.path.join(data_path, r'npy_data/test.npy'), allow_pickle = True)
X_test = set_length(X_test, 100000)

X_test_200 = get_feature(data = X_test, win_length = 200)
X_test_400 = get_feature(data = X_test, win_length = 400)
X_test_800 = get_feature(data = X_test, win_length = 800)
X_test_1000 = get_feature(data = X_test, win_length = 1000)

X_test_200 = X_test_200.reshape(X_test_200.shape[0], X_test_200.shape[1], X_test_200.shape[2], 1)
X_test_400 = X_test_400.reshape(X_test_400.shape[0], X_test_400.shape[1], X_test_400.shape[2], 1)
X_test_800 = X_test_800.reshape(X_test_800.shape[0], X_test_800.shape[1], X_test_800.shape[2], 1)
X_test_1000 = X_test_1000.reshape(X_test_1000.shape[0], X_test_1000.shape[1], X_test_1000.shape[2], 1)
X_test_multi = np.concatenate([X_test_200,
                               X_test_400,
                               X_test_800,
                               X_test_1000], axis = 1)
np.save(os.path.join(data_path, r'npy_data/X_test_multi.npy'), X_test_multi)

# Network & DataLoader

- 네트워크는 Conv2d, BatchNorm2d, ReLU 를 합친 block으로 주로 구성되어있고, output 전 dropout, AvgPool, AdaptiveAvgPool을 적용했습니다.
- 데이터를 배치단위로 불러올 때 train 데이터의 분포로 min-max scaling 해주었습니다.
- 1/2 확률로 데이터를 rolling 하여 augmentation 효과를 노려봤지만 큰 도움이 되지는 않았던거 같아서 이것은 적용하지 않았습니다.

In [66]:
def block(input_, units = 32, dropout_rate = 0.5):
    return

In [None]:
class conv_bn_relu(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size):
        super(conv_bn_relu, self).__init__()
        
        self.conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, padding=kernel_size // 2)
        self.BN = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()
        
        
    def forward(self, x):
        x = self.conv(x)
        x = self.BN(x)
        x = self.relu(x)
        return x
    

class Network(nn.Module):
    def __init__(self, N):
        super(Network, self).__init__()
        self.N = N
        self.AveragePooling = nn.AvgPool2d(2)
        self.MaxPooling = nn.MaxPool2d(2)
        
        
        self.input_conv = conv_bn_relu(in_channels=4, out_channels=self.N, kernel_size=3)
        
        self.block1 = nn.Sequential(
            conv_bn_relu(in_channels=self.N*1, out_channels=self.N*2, kernel_size=3),
            conv_bn_relu(in_channels=self.N*2, out_channels=self.N*4, kernel_size=3),
            conv_bn_relu(in_channels=self.N*4, out_channels=self.N*2, kernel_size=3),
            conv_bn_relu(in_channels=self.N*2, out_channels=self.N*1, kernel_size=3),
        )
        
        self.block2 = nn.Sequential(
            conv_bn_relu(in_channels=self.N*1, out_channels=self.N*2, kernel_size=3),
            conv_bn_relu(in_channels=self.N*2, out_channels=self.N*4, kernel_size=3),
            conv_bn_relu(in_channels=self.N*4, out_channels=self.N*2, kernel_size=3),
            conv_bn_relu(in_channels=self.N*2, out_channels=self.N*1, kernel_size=3),
        )
        
        self.block3 = nn.Sequential(
            conv_bn_relu(in_channels=self.N*1, out_channels=self.N*2, kernel_size=3),
            conv_bn_relu(in_channels=self.N*2, out_channels=self.N*4, kernel_size=3),
            conv_bn_relu(in_channels=self.N*4, out_channels=self.N*2, kernel_size=3),
            conv_bn_relu(in_channels=self.N*2, out_channels=self.N*1, kernel_size=3),
        )
        
        self.block4 = nn.Sequential(
            conv_bn_relu(in_channels=self.N*1, out_channels=self.N*2, kernel_size=3),
            conv_bn_relu(in_channels=self.N*2, out_channels=self.N*4, kernel_size=3),
            conv_bn_relu(in_channels=self.N*4, out_channels=self.N*2, kernel_size=3),
            conv_bn_relu(in_channels=self.N*2, out_channels=self.N*1, kernel_size=3),
        )
        
        self.pool_block = nn.Sequential(
            conv_bn_relu(in_channels=self.N*1, out_channels=self.N*2, kernel_size=3),
            nn.Dropout(0.15),
            nn.AvgPool2d(2),
            conv_bn_relu(in_channels=self.N*2, out_channels=self.N*2, kernel_size=3),
            nn.Dropout(0.15),
            nn.AvgPool2d(2),
            conv_bn_relu(in_channels=self.N*2, out_channels=self.N*2, kernel_size=3),
            nn.Dropout(0.15),
            nn.AdaptiveAvgPool2d(1)
        )
        
        self.output = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=self.N*2, out_features=6),
        )
        
    def forward(self, x, out=''):
        x = self.input_conv(x)
        x = self.MaxPooling(x)
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        x = self.block4(x)
        x = self.pool_block(x)
        x=self.output(x)
        
        if out=='sigmoid':
            x = F.sigmoid(x)
        return x
    

class VoiceDatasetSimple(Dataset):
        def __init__(self, X, y, transform, inference=False, roll=False):
            self.X = X
            self.y = y
            self.transform = transform
            self.inference = inference
            self.roll = roll
        def __len__(self):
            return len(self.X)
        
        def __getitem__(self, idx):
            X = self.X[idx]
            X = (X-train_x_min)/(train_x_max-train_x_min)
            
            if self.inference:
                X = self.transform(X)
                return X
            else:
                if (self.roll==True) and (random.randint(0, 1)==1):
                    X = np.roll(X,random.randint(-200, 200), axis=1)
                    
                X = self.transform(X)    
                y = self.y[idx]
                
                onehot = np.zeros(6)
                onehot[y] = 1.
                y = onehot
                return X, y

def model_save(model, path):
    torch.save({
        'model': model.state_dict(),
    }, path)