In [120]:
import numpy as np
import pandas as pd

import os
import librosa

import scipy
from scipy.stats import skew
from tqdm import tqdm, tqdm_pandas

tqdm.pandas()

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.svm import SVC
from sklearn import svm, metrics

### 모델 설명
- 각 음성데이터에서 MFCC로 추출한 40개 계수 각각에 대해 평균, 표준편차, 왜도, 최댓값, 중앙값, 최솟값을 계산하여 총 240개(40 × 6)의 피쳐 생성
- 프레임의 크기는 25ms, hop은 10ms 로 지정하여 전처리 시행
- 각 학습데이터와 테스트 데이터는 섞지 않은 상태로 학습 진행 후 결과 도출
- 음성의 길이가 서로 다르므로, 모든 음성을 가장 짧은 음성의 길이에 맞춰 자른 뒤 동일한 크기의 프레임으로 나눔. (프레임 크기를 기준으로 맞추어 자르는 방식도 가능함.)

# 01 데이터 전처리

In [121]:
def train_dataset(file_path='fmcc_train.ctl', data_dir='raw16k/train', sr=16000):
    """Load the training utterances named in a control file and label them.

    The control file holds one utterance name per line (no extension). The
    label comes from the first character of the name: ``"M"`` -> 0 (male),
    ``"F"`` -> 1 (female).

    Parameters
    ----------
    file_path : str
        Path of the control file listing the utterance names.
    data_dir : str
        Directory that contains the ``<name>.wav`` files.
    sr : int
        Sample rate passed to ``librosa.load``.

    Returns
    -------
    pandas.DataFrame
        Columns ``data`` (the loaded waveform) and ``label`` (0 or 1), one
        row per labeled utterance, in control-file order.

    Notes
    -----
    Blank lines are ignored. Entries whose name starts with neither "M" nor
    "F" are left out, as before, but they are now skipped *before* the audio
    is decoded and a warning is emitted so the omission is visible.
    """
    import warnings  # local import: only needed for the skipped-entry notice

    with open(file_path) as f:
        # strip() also removes '\r' and stray spaces, not just '\n'.
        names = [line.strip() for line in f]
    names = [name for name in names if name]

    label_by_prefix = {"M": 0, "F": 1}  # 0: male, 1: female
    dataset = []
    for name in names:
        label = label_by_prefix.get(name[0])
        if label is None:
            warnings.warn(
                "train_dataset: skipping %r (name starts with neither 'M' nor 'F')" % name
            )
            continue
        audio, _ = librosa.load(os.path.join(data_dir, name + ".wav"), sr=sr)
        dataset.append([audio, label])

    print("TrainDataset 생성 완료")
    return pd.DataFrame(dataset, columns=['data', 'label'])


def test_dataset():
    file_path = 'fmcc_test900.ctl'
    dataset = []
    with open(file_path) as f:
        lines = f.readlines()
    test_files_names = [i.strip("\n") for i in lines] # \n값 제거
    for test_file in test_files_names:
        audio, sr = librosa.load('raw16k/test/' + test_file + ".wav", sr=16000)
        dataset.append(audio)
    
    print("TestDataset 생성 완료")
    return pd.DataFrame({"data":dataset})


def get_test_label(file_path='fmcc_test900_ref.txt'):
    """Read the reference labels of the test set.

    Each non-blank line is expected to look like ``<utterance> <label>``,
    where the label is ``"feml"`` (-> 1) or ``"male"`` (-> 0). Fields may be
    separated by any run of whitespace.

    Parameters
    ----------
    file_path : str
        Path of the reference file.

    Returns
    -------
    numpy.ndarray
        One integer label per non-blank line, in file order.

    Raises
    ------
    ValueError
        If a non-blank line has no second field or its label is neither
        "feml" nor "male". Such lines used to be dropped silently (or crash
        with IndexError), which shifts every later label against the test
        data; failing loudly with the line number is safer.
    """
    label_by_name = {"male": 0, "feml": 1}
    test_labels = []
    with open(file_path) as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()  # any whitespace; tolerates tabs, double spaces, '\r'
            if not fields:
                continue  # blank line
            if len(fields) < 2 or fields[1] not in label_by_name:
                raise ValueError(
                    "%s:%d: expected '<name> feml|male', got %r"
                    % (file_path, line_no, line.rstrip("\r\n"))
                )
            test_labels.append(label_by_name[fields[1]])

    return np.array(test_labels)


def get_min(data):
    """Return the length of the shortest item in ``data``.

    Parameters
    ----------
    data : iterable of sized objects
        For example, a collection of waveforms.

    Returns
    -------
    int
        The smallest ``len(item)``. For an empty ``data`` the historical
        placeholder value 9999999 is returned, so existing callers that rely
        on it keep working.

    Notes
    -----
    The earlier loop started from 9999999 as a running minimum, so it gave
    that number back whenever every item was longer than it. Using ``min``
    removes that cap.
    """
    return min((len(item) for item in data), default=9999999)



def set_length(data, min_length):
    """Cut every item of ``data`` down to its first ``min_length`` elements.

    Returns the truncated items stacked into a single NumPy array.
    """
    return np.array([sequence[:min_length] for sequence in data])

In [122]:
# Load the raw waveforms (the training frame also carries the labels).
train_wav = train_dataset()
test_wav = test_dataset()

# Training labels, as a pandas Series and as a NumPy array.
train_label = train_wav["label"]
train_labels = np.array(train_label)

# Test labels come from the separate reference file.
test_labels = get_test_label()

TrainDataset 생성 완료
TestDataset 생성 완료


In [123]:
# Pull the waveform columns out as arrays of per-utterance signals.
train_x = np.array(train_wav["data"])
test_x = np.array(test_wav["data"])

# Shortest utterance in each split, then the shortest overall.
train_min, test_min = get_min(train_x), get_min(test_x)

min_data = np.min([train_min, test_min])
print('가장 작은 길이 :', min_data)

가장 작은 길이 : 10880


In [124]:
# Truncate every utterance to the common minimum length so that all
# signals yield the same number of frames.
train_x = set_length(train_x, min_data)
test_x = set_length(test_x, min_data)
print(train_x[0])
# Expected shape of both arrays: (number of utterances, samples per utterance);
# inspect with train_x.shape / test_x.shape if needed.

[ 0.         -0.00064087 -0.00036621 ... -0.13336182 -0.1991272
 -0.28030396]


# 02 특징 추출

In [125]:
def preprocess_dataset(data, sr=16000, n_mfcc=40, n_fft=400, hop_length=160):
    """Turn each waveform into a fixed-length vector of MFCC statistics.

    For every signal the MFCC matrix is computed with ``librosa.feature.mfcc``
    and six statistics are taken along the time axis for each coefficient.
    The statistics are concatenated in this order: mean, standard deviation,
    skewness, maximum, median, minimum. Each vector therefore has
    ``6 * n_mfcc`` entries.

    Parameters
    ----------
    data : iterable of 1-D arrays
        The waveforms to process.
    sr : int
        Sample rate of the waveforms.
    n_mfcc : int
        Number of MFCC coefficients to extract.
    n_fft : int
        Frame length in samples (400 samples = 25 ms at 16 kHz).
    hop_length : int
        Step between frames in samples (160 samples = 10 ms at 16 kHz).

    Returns
    -------
    list of numpy.ndarray
        One feature vector per input signal, in input order.
    """
    # Column order of the output is fixed by the order of this tuple.
    stat_funcs = (np.mean, np.std, skew, np.max, np.median, np.min)

    features = []
    for signal in data:
        mfcc = librosa.feature.mfcc(
            y=signal,
            sr=sr,
            n_mfcc=n_mfcc,
            n_fft=n_fft,
            hop_length=hop_length,
        )
        # axis=1 reduces over frames, leaving one value per coefficient.
        features.append(np.hstack([stat(mfcc, axis=1) for stat in stat_funcs]))
    return features

In [126]:
# Feature matrices: one row of MFCC statistics per utterance.
train_mfccs = np.array(preprocess_dataset(train_x))
test_mfccs = np.array(preprocess_dataset(test_x))

In [127]:
# Sanity check: (number of utterances, number of features) for each split.
for feature_matrix in (train_mfccs, test_mfccs):
    print(feature_matrix.shape)

(10000, 240)
(900, 240)


In [128]:
# Wrap the feature matrices in DataFrames for inspection.
train_data, test_data = (pd.DataFrame(matrix) for matrix in (train_mfccs, test_mfccs))
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,230,231,232,233,234,235,236,237,238,239
0,-380.365814,76.334946,-1.395402,-2.194204,-17.747524,-5.754469,-15.892243,-11.95515,-12.445198,-7.762434,...,-22.29504,-22.244614,-24.729691,-27.391726,-15.319876,-19.618578,-14.013907,-16.075361,-13.573168,-15.176089
1,-343.87326,88.127762,-21.545973,-5.645923,-21.817709,9.816467,-8.49781,-4.265706,-16.999857,-1.066157,...,-15.263478,-11.197349,-17.914913,-14.137145,-17.711437,-13.960834,-11.36907,-9.835249,-11.850672,-14.158845
2,-290.456909,106.672348,-7.071352,7.418191,-36.337719,-15.828415,-11.525275,-21.305012,-26.783209,-1.976714,...,-16.104332,-15.583339,-25.294594,-19.232141,-21.681736,-13.61528,-9.12232,-9.899756,-23.690935,-22.948257
3,-280.25,74.39315,-19.325123,20.682898,-11.374447,-15.076911,-18.467619,-11.006845,-9.498373,-1.645369,...,-19.057915,-17.051262,-16.890179,-22.768242,-23.360519,-13.87118,-15.989393,-19.247276,-13.28759,-11.615672
4,-256.26416,98.770813,-40.115376,21.815762,-28.333029,-10.120652,-21.746471,-18.536825,-11.67527,-3.179917,...,-17.667896,-17.798054,-13.071291,-13.166066,-11.112615,-11.356215,-16.528999,-14.287234,-11.237858,-10.815157


In [129]:
# Standardization: the scaler is fitted on the training features only and
# then applied to both splits.
sc = StandardScaler().fit(train_mfccs)

X_train_std, X_test_std = sc.transform(train_mfccs), sc.transform(test_mfccs)

# 03 SVM 모델 학습

In [130]:
# Train an SVM classifier on the standardized training features.
clf = svm.SVC()
clf.fit(X_train_std, train_labels)

# Evaluate on the held-out test set. accuracy_score takes the ground truth
# first and the predictions second; the score is the same either way, but
# the documented order keeps this correct if the metric is ever swapped for
# a non-symmetric one.
test_pred = clf.predict(X_test_std)
print(accuracy_score(test_labels, test_pred))

0.8544444444444445
