In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import librosa

import scipy
from scipy.stats import skew
from tqdm import tqdm, tqdm_pandas

tqdm.pandas()

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.svm import SVC
from sklearn import svm, metrics
from sklearn.utils import shuffle

### 모델 설명
- 각 음성데이터에서 mfcc로 추출된 40가지 특징들의 평균, 표준편차, 왜도, 최댓값, 최솟값, 중앙값으로 피쳐 생성
- 프레임의 크기는 25ms, hop은 10ms 로 지정하여 전처리 시행
- 각 학습데이터와 테스트 데이터는 섞지 않은 상태로 학습 진행 후 결과 도출
- 음성의 길이는 각각 다르므로 프레임의 크기를 맞추어 자르는 방식도 있지만, 가장 작은 음성의 길이로 맞춘 뒤, 프레임을 같은 크기로 나눔.

# 01 데이터 전처리

In [7]:
def train_dataset():
    file_path = 'fmcc_train.ctl'
    dataset = []
    with open(file_path) as f:
        lines = f.readlines()
    train_files_names = [i.strip("\n") for i in lines] # \n값 제거
    
    for train_file in train_files_names:
        file_name='raw16k/train/' + train_file + ".wav"
        audio, sr = librosa.load(file_name, sr=16000)
        # 남/녀 별로 labeling
        # 1 : 남자 , 0: 여자
        if "M" in train_file[0]:
            #dataset.append([audio, "male"])
            dataset.append([file_name, audio, "male"])
        elif "F" in train_file[0]:
            #dataset.append([audio, "feml"])
            dataset.append([file_name, audio, "feml"])
    
    print("TrainDataset 생성 완료")
    return pd.DataFrame(dataset,columns=['fname', 'data','label'])


def test_dataset():
    file_path = 'fmcc_test900_ref.txt'
    dataset = []
    with open(file_path) as f:
        lines = f.readlines()
    test_files_names = [i.strip("\n") for i in lines] # \n값 제거
    for test_file in test_files_names:
        test_file = test_file.split(" ")
        #fname = 'raw16k/test/' + test_file[0]
        fname = test_file[0]
        audio, sr = librosa.load('raw16k/test/' + fname + ".wav", sr=16000)
        if test_file[1] == "feml":
            #dataset.append([fname+".raw", audio, "feml"])
            dataset.append(['/home/wikim/exp/raw16k/test/'+fname+".raw", audio, "feml"])
        elif test_file[1] == "male":
            #dataset.append([fname+".raw", audio, "male"])
            dataset.append(['/home/wikim/exp/raw16k/test/'+fname+".raw", audio, "male"])
    
    print("TestDataset 생성 완료")
    return pd.DataFrame(dataset, columns=['fname','data','label'])


# 음성의 길이 중 가장 작은 길이를 구합니다.

def get_min(data):

    min_data = 9999999
    for i in data:
        if len(i) < min_data:
            min_data = len(i)

    return min_data


def get_max(data):

    max_data = -999
    for i in data:
        if len(i) > max_data:
            max_data = len(i)

    return max_data

def set_length(data, min_length):

    result = []
    for i in data:
        result.append(i[:min_length])
    result = np.array(result)

    return result

def set_max_length(data, max_length):

    result = []
    for i in range(max_length):
        result.append(0)
    for j in range(len(data)):
        result[j]=data[j]
    result = np.array(result)

    return result

In [8]:
train_wav = train_dataset()
test_wav = test_dataset()
#shuffle_train_wav = train_wav.sample(frac=1)  # row 전체 shuffle
#shuffle_test_wav = test_wav.sample(frac=1)  # row 전체 shuffle

# train 레이블 값 생성
#train_label = train_wav.label
#train_labels = np.array(train_label)

# 테스트 레이블 값 생성
#test_label = test_wav.label
#test_labels = np.array(test_label)

TrainDataset 생성 완료
TestDataset 생성 완료


In [9]:
train_x = np.array(train_wav.data)
test_x = np.array(test_wav.data)

train_max = get_max(train_x)
test_max = get_max(test_x)

max_data = np.max([train_max, test_max])
print('가장 긴 길이 :', max_data)

가장 긴 길이 : 39040


In [10]:
train_x = set_max_length(train_x, max_data)
test_x = set_max_length(test_x, max_data)
print(train_x[0])
print('train :', train_x.shape) #(데이터셋 개수, 음성 길이)
print('test :', test_x.shape)

[ 0.         -0.00064087 -0.00036621 ...  0.07077026  0.05587769
  0.01251221]
train : (39040,)
test : (39040,)


  result = np.array(result)


# 02 특징 추출

In [11]:
def preprocess_dataset(data):
    mfccs = []
    for i in data:
        mfcc = librosa.feature.mfcc(y=i,sr=16000,n_mfcc=40,   # n_mfcc:return 될 mfcc의 개수를 정해주는 파라미터, 더 다양한 데이터 특징을 추출하려면 값을 증가시키면 됨. 일반적으로 40개 추출
                                                  n_fft=400,  # n_fft:frame의 length를 결정하는 파라미터 
                                                  hop_length=160) # hop_length의 길이만큼 옆으로 가면서 데이터를 읽음(10ms기본)
        #print(mfcc)
        #ft1_trunc = np.hstack((np.mean(mfcc, axis=1), np.std(mfcc, axis=1), skew(mfcc, axis = 1), np.max(mfcc, axis = 1), np.median(mfcc, axis = 1), np.min(mfcc, axis = 1)))

        mfccs.append(mfcc)
    return pd.DataFrame(mfccs)

In [12]:
train_mfccs = preprocess_dataset(train_x)
#train_mfccs = np.array(train_mfccs)

test_mfccs = preprocess_dataset(test_x)
#test_mfccs = np.array(test_mfccs)

ParameterError: Audio data must be of type numpy.ndarray

In [74]:
# 데이터셋 재설정하기
train_set = pd.DataFrame()
train_set['fname'] = train_wav['fname']
test_set = pd.DataFrame()
test_set['fname'] = test_wav['fname']

train_set = pd.concat([train_set,train_mfccs],axis=1)
train_set['label'] = train_wav['label']
test_set = pd.concat([test_set,test_mfccs],axis=1)
test_set['label'] = test_wav['label']

In [75]:
train_set

Unnamed: 0,fname,0,1,2,3,4,5,6,7,8,...,255,256,257,258,259,260,261,262,263,label
0,raw16k/train/FCJY0/FCJY0_pbw1001.wav,-357.228851,70.462700,-2.156656,-0.493067,-22.894753,-2.081519,-19.210491,-14.009414,-9.968751,...,5589.590074,2426.171164,1550.108523,13.180541,6.031166,1.112625,31.122784,10.646379,4.345991,feml
1,raw16k/train/FCJY0/FCJY0_pbw1002.wav,-339.398376,91.876221,-22.855867,0.399164,-22.641399,10.525990,-12.593031,-6.533293,-23.694679,...,5057.467507,2306.491905,1636.786297,16.060592,4.799543,0.220403,25.455225,16.402648,8.575869,feml
2,raw16k/train/FCJY0/FCJY0_pbw1003.wav,-307.900024,111.039551,-11.965649,9.581073,-35.134731,-17.317619,-10.091484,-22.156830,-25.859383,...,2997.934197,1882.224883,1532.785595,18.272618,5.497595,-0.909471,28.805922,19.958048,2.794613,feml
3,raw16k/train/FCJY0/FCJY0_pbw1004.wav,-292.031067,86.136795,-23.937466,17.126577,-16.261923,-14.787249,-18.061773,-7.738655,-5.916373,...,4283.754965,2567.367103,1622.480582,16.186572,3.622599,-0.156958,25.025905,15.604625,6.789018,feml
4,raw16k/train/FCJY0/FCJY0_pbw1005.wav,-260.532623,95.999504,-42.201618,20.899199,-28.934317,-10.480850,-23.660446,-19.058420,-10.468557,...,2708.893894,2263.384242,1943.886329,15.980851,5.752206,-0.164352,29.944127,16.218201,0.713301,feml
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,raw16k/train/MLWS0/MLWS0_pbw1196.wav,-270.408875,81.380440,-16.315908,24.434874,-10.781040,-12.447013,-7.800146,-11.365909,-7.647517,...,3564.534317,2462.904548,1822.078680,20.429385,3.391569,0.154286,26.846035,20.365465,13.515818,male
9996,raw16k/train/MLWS0/MLWS0_pbw1197.wav,-353.529144,100.193527,-2.436024,23.333628,-1.828546,-13.569883,-10.589240,3.692477,-14.470538,...,2780.464093,2255.418696,1769.976387,22.119711,5.008375,-0.232421,32.620170,22.825617,11.785901,male
9997,raw16k/train/MLWS0/MLWS0_pbw1198.wav,-264.608917,63.507053,-14.042987,24.930367,-14.630364,8.640616,-15.595984,9.907090,-3.578396,...,3779.008788,3216.256879,2700.302251,21.856846,4.876360,1.770214,39.089601,20.870084,15.851291,male
9998,raw16k/train/MLWS0/MLWS0_pbw1199.wav,-283.717072,113.128601,-35.432854,31.492065,-20.108822,4.374619,-11.871616,9.548581,-15.521561,...,2632.716876,2293.214152,1907.678427,20.011231,3.649737,0.840842,31.296304,19.895945,13.057680,male


In [76]:
shuffle_train=shuffle(train_set, random_state = 20)
shuffle_test=shuffle(test_set, random_state = 30)
shuffle_train

Unnamed: 0,fname,0,1,2,3,4,5,6,7,8,...,255,256,257,258,259,260,261,262,263,label
9957,raw16k/train/MLWS0/MLWS0_pbw1158.wav,-314.979889,82.739449,6.389763,22.278984,-8.398458,-2.677688,-28.834642,-6.055861,-21.753105,...,4577.733236,2695.437898,1423.104798,24.292869,4.334089,-0.780719,31.679443,26.175682,13.715223,male
1687,raw16k/train/FJYJ1/FJYJ1_pbw1088.wav,-226.971939,61.370483,-44.255512,17.489904,-18.518642,-13.435062,-19.188448,-13.666183,-5.024434,...,3486.691716,2902.512043,1999.498535,16.927433,3.472022,0.013004,24.546603,17.061044,8.683390,feml
2116,raw16k/train/FKEJ0/FKEJ0_pbw1117.wav,-333.614380,54.199627,2.391518,23.990250,-24.286991,-8.908220,-20.718632,-23.588701,-20.601360,...,7817.796372,2956.953667,2017.543178,15.413831,3.847512,1.104592,28.447579,15.166905,8.864182,feml
231,raw16k/train/FGBS0/FGBS0_pbw1032.wav,-271.695984,55.087444,-35.524151,15.012136,-19.925812,-1.037488,-15.226067,11.619479,-19.834372,...,6694.134560,3219.182745,1909.169585,13.394664,5.269800,1.848882,33.658785,12.666030,6.284396,feml
2780,raw16k/train/FLGH0/FLGH0_pbw1181.wav,-233.409637,65.965027,-17.979252,20.538010,-24.317638,-10.322409,-21.393955,-6.222099,-11.450763,...,5230.163169,2874.271493,1869.942443,19.435518,7.153778,1.094881,40.066851,16.810454,8.996715,feml
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3915,raw16k/train/FLSH1/FLSH1_pbw1116.wav,-283.220428,81.391106,-20.840357,19.654039,-24.191692,2.744695,-19.306742,-0.115585,-11.568568,...,3756.770218,2668.821525,2186.002595,16.586447,5.309475,2.875838,41.529504,16.205276,9.568842,feml
9620,raw16k/train/MLWJ1/MLWJ1_pbw1021.wav,-227.512299,80.613831,7.365667,13.004544,-9.486271,10.483340,-12.197699,15.385054,-10.213816,...,4023.407457,3133.070700,2086.370913,24.795448,5.531655,-0.092789,35.702722,25.415755,13.129573,male
7068,raw16k/train/MJSG0/MJSG0_pbw1069.wav,-317.243591,91.476395,26.234255,26.352848,-11.294439,-3.940541,-34.887245,-1.258684,-9.730688,...,4469.651358,2516.305359,1073.695064,23.452459,6.489817,-0.028771,43.149696,23.947814,4.907473,male
7391,raw16k/train/MJST0/MJST0_pbw1192.wav,-238.817673,36.579140,-19.445511,0.482900,-19.336252,10.968548,-13.686270,8.191335,-11.287183,...,5178.881262,3830.595624,2927.361247,18.952310,4.080755,-0.183124,30.679547,18.763352,6.097830,male


In [77]:
print(train_set.shape)
print(test_set.shape)

(10000, 266)
(900, 266)


In [78]:
def convert_to_labels(preds, i2c, k=2):
    ans = []
    ids = []
    for p in preds:
        idx = np.argsort(p)[::-1]
        ids.append([i for i in idx[:k]])
        ans.append(' '.join([i2c[i] for i in idx[:k]]))

    return ans, ids

In [47]:
X = train_set.drop(['label', 'fname'], axis=1)
feature_names = list(X.columns)  # 특징 번호 리스트

X = X.values  # 특징벡터 값 전체


# 라벨값 문자열 -> int 변환 (PCA 적용시 변환 필요)
labels= np.sort(np.unique(train_set.label.values))  # 라벨 이름 가져오기

num_class = len(labels)
c2i = {}
i2c = {}
for i, c in enumerate(labels):
    c2i[c] = i  # feml, male -> index = 0 or 1로 변환
    i2c[i] = c  # 0 or 1 -> feml, male 변환
    
y=np.array([c2i[x] for x in train_set.label.values])
y_test=np.array([c2i[x] for x in test_set.label.values])
print(y)

[0 0 0 ... 1 1 1]


In [79]:
X = shuffle_train.drop(['label', 'fname'], axis=1)
feature_names = list(X.columns)  # 특징 번호 리스트

X = X.values  # 특징벡터 값 전체


# 라벨값 문자열 -> int 변환 (PCA 적용시 변환 필요)
labels= np.sort(np.unique(shuffle_train.label.values))  # 라벨 이름 가져오기

num_class = len(labels)
c2i = {}
i2c = {}
for i, c in enumerate(labels):
    c2i[c] = i  # feml, male -> index = 0 or 1로 변환
    i2c[i] = c  # 0 or 1 -> feml, male 변환
    
y=np.array([c2i[x] for x in shuffle_train.label.values])
y_test=np.array([c2i[x] for x in shuffle_test.label.values])
print(y)

[1 0 0 ... 1 1 0]


In [28]:
X_test = test_set.drop(['label', 'fname'], axis=1)
X_test = X_test.values

In [80]:
X_test = shuffle_test.drop(['label', 'fname'], axis=1)
X_test = X_test.values

In [81]:
print(X.shape)
print(X_test.shape)

(10000, 264)
(900, 264)


In [82]:
# PCA를 적용하기 위한 scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)



# 차원축소를 위한 PCA 적용 
pca = PCA(n_components=100).fit(X_scaled)
X_pca = pca.transform(X_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(sum(pca.explained_variance_ratio_)) 

0.913716084356145


In [84]:
# 기존 트레인셋을 분할하여 정확도 테스트
# Fit an SVM model
X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size = 0.3, random_state = 6, shuffle = True)

clf = svm.SVC() # 93.5
#clf.fit(X_pca, train_labels)
#clf = SVC(kernel = 'linear', probability=True)  # 90.6
#clf = SVC(kernel = 'rbf', C = 4, gamma = 0.01, probability=True)  # 93.3
clf.fit(X_train, y_train)

print(accuracy_score(clf.predict(X_val), y_val))

0.9373333333333334


In [85]:
clf = svm.SVC(probability=True)

clf.fit(X_pca, y)
str_preds, _ = convert_to_labels(clf.predict_proba(X_test_pca), i2c, k=3)
print(str_preds)
print(accuracy_score(clf.predict(X_test_pca), y_test))

['feml male', 'feml male', 'feml male', 'male feml', 'male feml', 'male feml', 'feml male', 'male feml', 'male feml', 'male feml', 'male feml', 'male feml', 'feml male', 'feml male', 'feml male', 'male feml', 'feml male', 'male feml', 'feml male', 'male feml', 'male feml', 'feml male', 'male feml', 'feml male', 'feml male', 'feml male', 'feml male', 'male feml', 'male feml', 'feml male', 'feml male', 'feml male', 'male feml', 'feml male', 'feml male', 'male feml', 'feml male', 'male feml', 'feml male', 'feml male', 'feml male', 'feml male', 'male feml', 'male feml', 'feml male', 'male feml', 'male feml', 'male feml', 'feml male', 'male feml', 'male feml', 'male feml', 'male feml', 'male feml', 'male feml', 'male feml', 'male feml', 'male feml', 'male feml', 'feml male', 'feml male', 'male feml', 'male feml', 'feml male', 'male feml', 'male feml', 'male feml', 'feml male', 'feml male', 'feml male', 'male feml', 'male feml', 'feml male', 'male feml', 'feml male', 'feml male', 'feml male'

0.8788888888888889


# SVM 모델 학습

In [24]:
def to_df_test(y_pred, test_wav):
    predict_df = pd.DataFrame()
    pred_name=[]
    for i in range(len(y_pred)):
        if(y_pred[i] == 0):
            pred_name.append("male")
        elif(y_pred[i] == 1):
            pred_name.append("feml")
    
    predict_df["data"] = test_wav["fname"]
    predict_df["label"] = pred_name
    #predict_df = predict_df.data
        
    return predict_df

def to_txt_test(predict_df):
    predict_df.to_csv('강력한컴공_test_results.txt', sep = " ", index=False, header=False, lineterminator='\n')
    

In [25]:
C=1
clf = svm.LinearSVC(C=C, max_iter = 10000)
clf.fit(X_pca, train_labels)

print(accuracy_score(clf.predict(X_test_pca), test_labels))
y_pred = clf.predict(X_test_pca)
#predict_df = to_df_test(y_pred, test_wav)
#to_txt_test(predict_df)


0.5911111111111111
                                               data label
0    /home/wikim/exp/raw16k/test/fmcc_test_0001.raw  feml
1    /home/wikim/exp/raw16k/test/fmcc_test_0002.raw  feml
2    /home/wikim/exp/raw16k/test/fmcc_test_0003.raw  feml
3    /home/wikim/exp/raw16k/test/fmcc_test_0004.raw  feml
4    /home/wikim/exp/raw16k/test/fmcc_test_0005.raw  feml
..                                              ...   ...
895  /home/wikim/exp/raw16k/test/fmcc_test_0996.raw  male
896  /home/wikim/exp/raw16k/test/fmcc_test_0997.raw  male
897  /home/wikim/exp/raw16k/test/fmcc_test_0998.raw  male
898  /home/wikim/exp/raw16k/test/fmcc_test_0999.raw  male
899  /home/wikim/exp/raw16k/test/fmcc_test_1000.raw  male

[900 rows x 2 columns]




In [26]:
clf = svm.SVC()
clf.fit(X_pca, train_labels)

print(accuracy_score(clf.predict(X_test_pca), test_labels))
y_pred = clf.predict(X_test_pca)
predict_df = to_df_test(y_pred, test_wav)
to_txt_test(predict_df)



0.5633333333333334


In [27]:
C=8
clf = svm.LinearSVC(C=C, max_iter = 10000)
clf.fit(X_pca, train_labels)

print(accuracy_score(clf.predict(X_test_pca), test_labels))
y_pred = clf.predict(X_test_pca)
predict_df = to_df_test(y_pred, test_wav)
to_txt_test(predict_df)


0.5944444444444444




In [77]:
# polynomial 활용

clf = svm.SVC(kernel = 'poly', degree = 3, gamma = 'auto', C=C, max_iter = 10000)    
#3차항으로 설정, degree = 3
#gamma는 sigma^2에 해당하는 scale parameter
#학습 반복횟수 10000

clf.fit(X_pca, train_labels)

print(accuracy_score(clf.predict(X_test_pca), test_labels))
y_pred = clf.predict(X_test_pca)
predict_df = to_df_test(y_pred, test_wav)
to_txt_test(predict_df)


0.9066666666666666


In [78]:
C=4
clf = svm.LinearSVC(C=C, max_iter = 10000)
clf.fit(X_pca, train_labels)

print(accuracy_score(clf.predict(X_test_pca), test_labels))
y_pred = clf.predict(X_test_pca)
predict_df = to_df_test(y_pred, test_wav)
to_txt_test(predict_df)


0.9233333333333333




In [79]:
# 함수 정의
def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    return xx, yy


def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out

In [80]:
# 데이터 로드하기
X = X_pca[:, :2]
y = train_labels
X.shape

(10000, 2)

In [82]:
# 모델 정의&피팅
C = 1.0 #regularization parameter
models = (svm.SVC(kernel='linear', C=C),
          svm.LinearSVC(C=C, max_iter=10000),
          svm.SVC(),
          svm.SVC(kernel='poly', degree=3, gamma='auto', C=C))
models = (clf.fit(X, y) for clf in models)

In [84]:
# plot title 형성
titles = ('SVC with linear kernel',
          'LinearSVC (linear kernel)',
          'SVC',
          'SVC with polynomial (degree 3) kernel')

In [None]:
# plot 그리기

fig, sub = plt.subplots(2, 2)
plt.subplots_adjust(wspace=0.4, hspace=0.4)

X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)

for clf, title, ax in zip(models, titles, sub.flatten()):
    plot_contours(ax, clf, xx, yy,
                  cmap=plt.cm.coolwarm, alpha=0.8)
    ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xlabel('Sepal length')
    ax.set_ylabel('Sepal width')
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(title)

plt.show()