In [1]:
import numpy as np
import pandas as pd

import os
import librosa

import scipy
from scipy.stats import skew
from tqdm import tqdm, tqdm_pandas

tqdm.pandas()

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.svm import SVC
from sklearn import svm, metrics

### 모델 설명
- 각 음성데이터에서 mfcc로 추출된 40가지 특징들의 평균, 표준편차, 왜도, 최댓값, 최솟값, 중앙값으로 피쳐 생성
- 프레임의 크기는 25ms, hop은 10ms 로 지정하여 전처리 시행
- 각 학습데이터와 테스트 데이터는 섞지 않은 상태로 학습 진행 후 결과 도출
- 음성의 길이는 각각 다르므로 프레임의 크기를 맞추어 자르는 방식도 있지만, 가장 작은 음성의 길이로 맞춘 뒤, 프레임을 같은 크기로 나눔.

# 01 데이터 전처리

In [2]:
def train_dataset():
    file_path = 'fmcc_train.ctl'
    dataset = []
    with open(file_path) as f:
        lines = f.readlines()
    train_files_names = [i.strip("\n") for i in lines] # \n값 제거
    
    for train_file in train_files_names:
        audio, sr = librosa.load('raw16k/train/' + train_file + ".wav", sr=16000)
        # 남/녀 별로 labeling
        # 0 : 남자 , 1: 여자
        if "M" in train_file[0]:
            dataset.append([audio, 0])
        elif "F" in train_file[0]:
            dataset.append([audio, 1])
    
    print("TrainDataset 생성 완료")
    return pd.DataFrame(dataset,columns=['data','label'])


def test_dataset():
    file_path = 'fmcc_test900_ref.txt'
    dataset = []
    with open(file_path) as f:
        lines = f.readlines()
    test_files_names = [i.strip("\n") for i in lines] # \n값 제거
    for test_file in test_files_names:
        test_file = test_file.split(" ")
        audio, sr = librosa.load('raw16k/test/' + test_file[0] + ".wav", sr=16000)
        if test_file[1] == "feml":
            dataset.append([audio, 1])
        elif test_file[1] == "male":
            dataset.append([audio, 0])
    
    print("TestDataset 생성 완료")
    return pd.DataFrame(dataset, columns=['data','label'])


def get_test_label():
    test_labels=[]
    file_path = 'fmcc_test900_ref.txt'
    with open(file_path) as f:
        lines = f.readlines()
    test_files_names = [i.strip("\n") for i in lines] # \n값 제거
    for test_label in test_files_names:
        if test_label.split(" ")[1] == "feml":
            test_labels.append(1)
        elif test_label.split(" ")[1] == "male":
            test_labels.append(0)
    
    return np.array(test_labels)


# 음성의 길이 중 가장 작은 길이를 구합니다.

def get_min(data):

    min_data = 9999999
    for i in data:
        if len(i) < min_data:
            min_data = len(i)

    return min_data



def set_length(data, min_length):

    result = []
    for i in data:
        result.append(i[:min_length])
    result = np.array(result)

    return result

In [3]:
train_wav = train_dataset()
test_wav = test_dataset()
train_wav = train_wav.sample(frac=1)  # row 전체 shuffle
#test_wav = train_wav.sample(frac=1)  # row 전체 shuffle

# train 레이블 값 생성
train_label = train_wav.label
train_labels = np.array(train_label)

# 테스트 레이블 값 생성
test_label = test_wav.label
test_labels = np.array(test_label)


TrainDataset 생성 완료
TestDataset 생성 완료


In [4]:
train_x = np.array(train_wav.data)
test_x = np.array(test_wav.data)

train_min = get_min(train_x)
test_min = get_min(test_x)

min_data = np.min([train_min, test_min])
print('가장 작은 길이 :', min_data)

가장 작은 길이 : 10880


In [5]:
train_x = set_length(train_x, min_data)
test_x = set_length(test_x, min_data)
print(train_x[0])
#print('train :', train_x.shape) #(데이터셋 개수, 음성 길이)
#print('test :', test_x.shape)

[ 0.019104    0.02798462  0.03579712 ... -0.07562256  0.0043335
 -0.02819824]


# 02 특징 추출

In [6]:
def preprocess_dataset(data):
    mfccs = []
    for i in data:
        mfcc = librosa.feature.mfcc(y=i,sr=16000,n_mfcc=40,   # n_mfcc:return 될 mfcc의 개수를 정해주는 파라미터, 더 다양한 데이터 특징을 추출하려면 값을 증가시키면 됨. 일반적으로 40개 추출
                                                  n_fft=400,  # n_fft:frame의 length를 결정하는 파라미터 
                                                  hop_length=160) # hop_length의 길이만큼 옆으로 가면서 데이터를 읽음(10ms기본)
        ft2 = librosa.feature.zero_crossing_rate(y=i)[0]
        ft3 = librosa.feature.spectral_rolloff(y=i)[0]
        ft4 = librosa.feature.spectral_centroid(y=i)[0]
        ft5 = librosa.feature.spectral_contrast(y=i)[0]
        ft6 = librosa.feature.spectral_bandwidth(y=i)[0]
        ft1_trunc = np.hstack((np.mean(mfcc, axis=1), np.std(mfcc, axis=1), skew(mfcc, axis = 1), np.max(mfcc, axis = 1), np.median(mfcc, axis = 1), np.min(mfcc, axis = 1)))
        ft2_trunc = np.hstack((np.mean(ft2), np.std(ft2), skew(ft2), np.max(ft2), np.median(ft2), np.min(ft2)))
        ft3_trunc = np.hstack((np.mean(ft3), np.std(ft3), skew(ft3), np.max(ft3), np.median(ft3), np.min(ft3)))
        ft4_trunc = np.hstack((np.mean(ft4), np.std(ft4), skew(ft4), np.max(ft4), np.median(ft4), np.min(ft4)))
        ft5_trunc = np.hstack((np.mean(ft5), np.std(ft5), skew(ft5), np.max(ft5), np.median(ft5), np.min(ft5)))
        ft6_trunc = np.hstack((np.mean(ft6), np.std(ft6), skew(ft6), np.max(ft6), np.median(ft6), np.max(ft6)))
        mfccs.append(np.hstack((ft1_trunc, ft2_trunc, ft3_trunc, ft4_trunc, ft5_trunc, ft6_trunc)))
    return pd.DataFrame(mfccs)

In [7]:
train_mfccs = preprocess_dataset(train_x)
train_mfccs = np.array(train_mfccs)

test_mfccs = preprocess_dataset(test_x)
test_mfccs = np.array(test_mfccs)

In [8]:
train_mfccs

array([[-311.5475769 ,   53.3382225 ,    8.89660454, ..., 3736.34609485,
        2636.606291  , 3736.34609485],
       [-248.69319153,   78.53645325,  -14.07200336, ..., 2806.84934695,
        2403.11558213, 2806.84934695],
       [-324.43707275,  125.96269226,   11.75678635, ..., 3420.47886151,
        2204.67495258, 3420.47886151],
       ...,
       [-253.001297  ,   80.48394012,  -14.25700378, ..., 3419.59913265,
        2637.33905524, 3419.59913265],
       [-178.29191589,   59.30818558,   -4.84354544, ..., 3209.52345952,
        2758.13398485, 3209.52345952],
       [-259.14242554,   66.1275177 ,  -10.90282345, ..., 3199.4337539 ,
        2421.17302229, 3199.4337539 ]])

In [9]:
#print(train_mfccs)
print(train_mfccs.shape)
print(test_mfccs.shape)


(10000, 270)
(900, 270)


In [10]:
train_data = pd.DataFrame(train_mfccs)
test_data = pd.DataFrame(test_mfccs)
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,260,261,262,263,264,265,266,267,268,269
0,-311.547577,53.338223,8.896605,38.776142,-34.816292,-16.477997,-26.165955,-16.90387,-12.493114,-12.947577,...,0.136975,35.152576,17.052583,2.980638,2612.733075,464.700965,0.266037,3736.346095,2636.606291,3736.346095
1,-248.693192,78.536453,-14.072003,35.496078,-18.896502,-15.408017,-18.685266,-18.504139,-8.073468,-6.537329,...,-0.524195,27.251086,19.266709,5.97997,2434.428172,140.168294,0.900074,2806.849347,2403.115582,2806.849347
2,-324.437073,125.962692,11.756786,41.339958,-7.995292,5.50391,-6.733978,-6.75018,-4.354207,-4.57766,...,-0.265892,26.375108,18.967333,10.559089,2303.541076,435.924573,1.341557,3420.478862,2204.674953,3420.478862
3,-264.023804,108.4189,19.892815,-6.441436,-19.824553,-12.067206,-25.062845,-5.971821,-16.30217,-7.671244,...,-0.473144,30.620872,18.389379,3.118209,2599.161969,587.375436,-0.02876,3455.768442,2659.951549,3455.768442
4,-202.88678,89.818893,-7.076195,33.795536,-23.732798,-5.32249,-19.458488,-14.77377,-13.241652,2.982929,...,-1.011367,33.541033,23.697638,6.492317,2343.365744,292.199297,1.465677,3067.47263,2242.848093,3067.47263


In [11]:
# 정규화 작업
sc = StandardScaler()
sc.fit(train_mfccs)
 
X_train_std = sc.transform(train_mfccs)
X_test_std = sc.transform(test_mfccs)
X_train_std.shape

(10000, 270)

In [12]:
# predict 데이터로 trainset 쪼개기
#X_predict_std = X_train_std[8000:]
#X_train_std = X_train_std[:8000]
#print(X_predict_std.shape)
#print(X_train_std.shape)

# SVM 모델 학습

In [13]:
C=1
clf = svm.LinearSVC(C=C, max_iter = 10000)
clf.fit(X_train_std, train_labels)

print(accuracy_score(clf.predict(X_test_std), test_labels))

0.8722222222222222




In [14]:
clf = svm.SVC()
clf.fit(X_train_std, train_labels)

print(accuracy_score(clf.predict(X_test_std), test_labels))

0.8811111111111111


In [15]:
# rbf 활용
clf = svm.SVC(kernel = 'rbf', gamma = 0.7, C=C, max_iter = 10000)
clf.fit(X_train_std, train_labels)

print(accuracy_score(clf.predict(X_test_std), test_labels))

0.4444444444444444


In [16]:
# polynomial 활용

clf = svm.SVC(kernel = 'poly', degree = 3, gamma = 'auto', C=C, max_iter = 10000)    
#3차항으로 설정, degree = 3
#gamma는 sigma^2에 해당하는 scale parameter
#학습 반복횟수 10000

clf.fit(X_train_std, train_labels)

print(accuracy_score(clf.predict(X_test_std), test_labels))

0.87


In [17]:
C=4
clf = svm.LinearSVC(C=C, max_iter = 10000)
clf.fit(X_train_std, train_labels)

print(accuracy_score(clf.predict(X_test_std), test_labels))

0.8722222222222222


