In [64]:
import numpy as np
import pandas as pd

import os
import librosa

import scipy
from scipy.stats import skew
from tqdm import tqdm, tqdm_pandas

tqdm.pandas()

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.svm import SVC
from sklearn import svm, metrics

### 모델 설명
- 각 음성데이터에서 mfcc로 추출된 40가지 특징들의 평균, 표준편차, 왜도, 최댓값, 최솟값, 중앙값으로 피쳐 생성
- 프레임의 크기는 25ms, hop은 10ms 로 지정하여 전처리 시행
- 각 학습데이터와 테스트 데이터는 섞지 않은 상태로 학습 진행 후 결과 도출
- 음성의 길이는 각각 다르므로 프레임의 크기를 맞추어 자르는 방식도 있지만, 가장 작은 음성의 길이로 맞춘 뒤, 프레임을 같은 크기로 나눔.

# 01 데이터 전처리

In [65]:
def train_dataset():
    file_path = 'fmcc_train.ctl'
    dataset = []
    with open(file_path) as f:
        lines = f.readlines()
    train_files_names = [i.strip("\n") for i in lines] # \n값 제거
    
    for train_file in train_files_names:
        audio, sr = librosa.load('raw16k/train/' + train_file + ".wav", sr=16000)
        # 남/녀 별로 labeling
        # 0 : 남자 , 1: 여자
        if "M" in train_file[0]:
            dataset.append([audio, 0])
        elif "F" in train_file[0]:
            dataset.append([audio, 1])
    
    print("TrainDataset 생성 완료")
    return pd.DataFrame(dataset,columns=['data','label'])


def test_dataset():
    file_path = 'fmcc_test900_ref.txt'
    dataset = []
    with open(file_path) as f:
        lines = f.readlines()
    test_files_names = [i.strip("\n") for i in lines] # \n값 제거
    for test_file in test_files_names:
        test_file = test_file.split(" ")
        audio, sr = librosa.load('raw16k/test/' + test_file[0] + ".wav", sr=16000)
        if test_file[1] == "feml":
            dataset.append([audio, 1])
        elif test_file[1] == "male":
            dataset.append([audio, 0])
    
    print("TestDataset 생성 완료")
    return pd.DataFrame(dataset, columns=['data','label'])


def get_test_label():
    test_labels=[]
    file_path = 'fmcc_test900_ref.txt'
    with open(file_path) as f:
        lines = f.readlines()
    test_files_names = [i.strip("\n") for i in lines] # \n값 제거
    for test_label in test_files_names:
        if test_label.split(" ")[1] == "feml":
            test_labels.append(1)
        elif test_label.split(" ")[1] == "male":
            test_labels.append(0)
    
    return np.array(test_labels)


# 음성의 길이 중 가장 작은 길이를 구합니다.

def get_min(data):

    min_data = 9999999
    for i in data:
        if len(i) < min_data:
            min_data = len(i)

    return min_data



def set_length(data, min_length):

    result = []
    for i in data:
        result.append(i[:min_length])
    result = np.array(result)

    return result

In [66]:
train_wav = train_dataset()
test_wav = test_dataset()
train_wav = train_wav.sample(frac=1)  # row 전체 shuffle
#test_wav = train_wav.sample(frac=1)  # row 전체 shuffle

# train 레이블 값 생성
train_label = train_wav.label
train_labels = np.array(train_label)

# 테스트 레이블 값 생성
test_label = test_wav.label
test_labels = np.array(test_label)


TrainDataset 생성 완료
TestDataset 생성 완료


In [67]:
train_x = np.array(train_wav.data)
test_x = np.array(test_wav.data)

train_min = get_min(train_x)
test_min = get_min(test_x)

min_data = np.min([train_min, test_min])
print('가장 작은 길이 :', min_data)

가장 작은 길이 : 10880


In [68]:
train_x = set_length(train_x, min_data)
test_x = set_length(test_x, min_data)
print(train_x[0])
#print('train :', train_x.shape) #(데이터셋 개수, 음성 길이)
#print('test :', test_x.shape)

[0.01660156 0.019104   0.02096558 ... 0.08987427 0.14273071 0.16067505]


# 02 특징 추출

In [69]:
def preprocess_dataset(data):
    mfccs = []
    for i in data:
        mfcc = librosa.feature.mfcc(y=i,sr=16000,n_mfcc=40,   # n_mfcc:return 될 mfcc의 개수를 정해주는 파라미터, 더 다양한 데이터 특징을 추출하려면 값을 증가시키면 됨. 일반적으로 40개 추출
                                                  n_fft=400,  # n_fft:frame의 length를 결정하는 파라미터 
                                                  hop_length=160) # hop_length의 길이만큼 옆으로 가면서 데이터를 읽음(10ms기본)
        ft2 = librosa.feature.zero_crossing_rate(y=i)[0]
        ft3 = librosa.feature.spectral_rolloff(y=i)[0]
        ft4 = librosa.feature.spectral_centroid(y=i)[0]
        ft5 = librosa.feature.spectral_contrast(y=i)[0]
        ft6 = librosa.feature.spectral_bandwidth(y=i)[0]
        ft1_trunc = np.hstack((np.mean(mfcc, axis=1), np.std(mfcc, axis=1), skew(mfcc, axis = 1), np.max(mfcc, axis = 1), np.median(mfcc, axis = 1), np.min(mfcc, axis = 1)))
        ft2_trunc = np.hstack((np.mean(ft2), np.std(ft2), skew(ft2), np.max(ft2), np.median(ft2), np.min(ft2)))
        ft3_trunc = np.hstack((np.mean(ft3), np.std(ft3), skew(ft3), np.max(ft3), np.median(ft3), np.min(ft3)))
        ft4_trunc = np.hstack((np.mean(ft4), np.std(ft4), skew(ft4), np.max(ft4), np.median(ft4), np.min(ft4)))
        ft5_trunc = np.hstack((np.mean(ft5), np.std(ft5), skew(ft5), np.max(ft5), np.median(ft5), np.min(ft5)))
        ft6_trunc = np.hstack((np.mean(ft6), np.std(ft6), skew(ft6), np.max(ft6), np.median(ft6), np.max(ft6)))
        mfccs.append(np.hstack((ft1_trunc, ft2_trunc, ft3_trunc, ft4_trunc, ft5_trunc, ft6_trunc)))
    return pd.DataFrame(mfccs)

In [70]:
train_mfccs = preprocess_dataset(train_x)
train_mfccs = np.array(train_mfccs)

test_mfccs = preprocess_dataset(test_x)
test_mfccs = np.array(test_mfccs)

In [71]:
train_mfccs

array([[-269.36199951,   74.01269531,   16.16133118, ..., 3633.13171755,
        2833.64255075, 3633.13171755],
       [-297.97909546,   72.95867157,  -24.09208488, ..., 3049.33391211,
        2246.37511658, 3049.33391211],
       [-196.43765259,   75.14990997,  -21.70511436, ..., 2989.11097575,
        2415.67663236, 2989.11097575],
       ...,
       [-222.71995544,   93.5573349 ,   15.14359856, ..., 3138.89413604,
        2691.30121787, 3138.89413604],
       [-232.20715332,   90.29489899,   28.48860359, ..., 3299.9285924 ,
        2937.13341082, 3299.9285924 ],
       [-251.08180237,   58.95478439,  -52.06054306, ..., 2449.72786814,
        2265.16015172, 2449.72786814]])

In [72]:
#print(train_mfccs)
print(train_mfccs.shape)
print(test_mfccs.shape)


(10000, 270)
(900, 270)


In [73]:
train_data = pd.DataFrame(train_mfccs)
test_data = pd.DataFrame(test_mfccs)
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,260,261,262,263,264,265,266,267,268,269
0,-269.362,74.012695,16.161331,49.461853,2.084093,15.346108,1.435024,14.484039,-7.915791,12.596985,...,-0.515575,25.802986,19.331053,6.97203,2852.627751,464.265174,-0.01501,3633.131718,2833.642551,3633.131718
1,-297.979095,72.958672,-24.092085,34.787235,-10.491766,-8.983263,-8.902759,-9.953056,-3.805806,-5.13038,...,-0.222293,30.579553,21.677399,7.158816,2344.511208,237.542613,1.633632,3049.333912,2246.375117,3049.333912
2,-196.437653,75.14991,-21.705114,18.888514,-22.395741,-2.079852,-11.970265,1.583614,-8.43371,-13.726496,...,-0.286753,25.445345,19.484178,11.222117,2509.055006,243.486875,0.775139,2989.110976,2415.676632,2989.110976
3,-179.448135,105.576546,-13.916913,9.062522,-23.919323,-5.069688,-22.712831,-0.933076,-20.017641,2.612354,...,-1.423353,32.372792,27.130022,6.550785,2288.426596,309.611066,0.278212,2864.169192,2332.766579,2864.169192
4,-258.555573,63.962608,-7.303214,19.467211,-16.508804,-13.047114,-7.283066,5.969139,-3.581412,-7.383719,...,-0.098361,32.146969,21.187685,8.239988,2720.240784,190.175557,0.139834,3089.602031,2729.035876,3089.602031


In [74]:
# 정규화 작업
sc = StandardScaler()
sc.fit(train_mfccs)
 
X_train_std = sc.transform(train_mfccs)
X_test_std = sc.transform(test_mfccs)
X_train_std.shape

(10000, 270)

In [75]:
# predict 데이터로 trainset 쪼개기
#X_predict_std = X_train_std[8000:]
#X_train_std = X_train_std[:8000]
#print(X_predict_std.shape)
#print(X_train_std.shape)

# SVM 모델 학습

In [76]:
C=1
clf = svm.LinearSVC(C=C, max_iter = 10000)
clf.fit(X_train_std, train_labels)

print(accuracy_score(clf.predict(X_test_std), test_labels))

0.8722222222222222




In [77]:
clf = svm.SVC()
clf.fit(X_train_std, train_labels)

print(accuracy_score(clf.predict(X_test_std), test_labels))

0.8811111111111111


In [78]:
# rbf 활용
clf = svm.SVC(kernel = 'rbf', gamma = 0.7, C=C, max_iter = 10000)
clf.fit(X_train_std, train_labels)

print(accuracy_score(clf.predict(X_test_std), test_labels))

0.4444444444444444


In [79]:
# polynomial 활용

clf = svm.SVC(kernel = 'poly', degree = 3, gamma = 'auto', C=C, max_iter = 10000)    
#3차항으로 설정, degree = 3
#gamma는 sigma^2에 해당하는 scale parameter
#학습 반복횟수 10000

clf.fit(X_train_std, train_labels)

print(accuracy_score(clf.predict(X_test_std), test_labels))

0.87


In [80]:
C=4
clf = svm.LinearSVC(C=C, max_iter = 10000)
clf.fit(X_train_std, train_labels)

print(accuracy_score(clf.predict(X_test_std), test_labels))

0.8744444444444445


