In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import librosa

import scipy
from scipy.stats import skew
from tqdm import tqdm, tqdm_pandas

tqdm.pandas()

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import VotingClassifier

from sklearn.svm import SVC
from sklearn import svm, metrics
from sklearn.utils import shuffle
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline

### 모델 설명
- 각 음성데이터에서 mfcc로 추출된 40가지 특징들의 평균, 표준편차, 왜도, 최댓값, 최솟값, 중앙값으로 피쳐 생성
- 프레임의 크기는 25ms, hop은 10ms 로 지정하여 전처리 시행
- 각 학습데이터와 테스트 데이터는 섞지 않은 상태로 학습 진행 후 결과 도출
- 음성의 길이는 각각 다르므로 프레임의 크기를 맞추어 자르는 방식도 있지만, 가장 작은 음성의 길이로 맞춘 뒤, 프레임을 같은 크기로 나눔.

# 01 데이터 전처리

In [2]:
def train_dataset():
    file_path = 'fmcc_train.ctl'
    dataset = []
    with open(file_path) as f:
        lines = f.readlines()
    train_files_names = [i.strip("\n") for i in lines] # \n값 제거
    
    for train_file in train_files_names:
        file_name='raw16k/train/' + train_file + ".wav"
        audio, sr = librosa.load(file_name, sr=16000)
        # 남/녀 별로 labeling
        # 1 : 남자 , 0: 여자
        if "M" in train_file[0]:
            #dataset.append([audio, "male"])
            dataset.append([file_name, audio, "male"])
        elif "F" in train_file[0]:
            #dataset.append([audio, "feml"])
            dataset.append([file_name, audio, "feml"])
    
    print("TrainDataset 생성 완료")
    return pd.DataFrame(dataset,columns=['fname', 'data','label'])


def test_dataset():
    file_path = 'fmcc_test900_ref.txt'
    dataset = []
    with open(file_path) as f:
        lines = f.readlines()
    test_files_names = [i.strip("\n") for i in lines] # \n값 제거
    for test_file in test_files_names:
        test_file = test_file.split(" ")
        #fname = 'raw16k/test/' + test_file[0]
        fname = test_file[0]
        audio, sr = librosa.load('raw16k/test/' + fname + ".wav", sr=16000)
        if test_file[1] == "feml":
            #dataset.append([fname+".raw", audio, "feml"])
            dataset.append(['/home/wikim/exp/raw16k/test/'+fname+".raw", audio, "feml"])
        elif test_file[1] == "male":
            #dataset.append([fname+".raw", audio, "male"])
            dataset.append(['/home/wikim/exp/raw16k/test/'+fname+".raw", audio, "male"])
    
    print("TestDataset 생성 완료")
    return pd.DataFrame(dataset, columns=['fname','data','label'])


# 음성의 길이 중 가장 작은 길이를 구합니다.

def get_min(data):

    min_data = 9999999
    for i in data:
        if len(i) < min_data:
            min_data = len(i)

    return min_data


def get_max(data):

    max_data = -999
    for i in data:
        if len(i) > max_data:
            max_data = len(i)

    return max_data

def set_length(data, min_length):

    result = []
    for i in data:
        result.append(i[:min_length])
    result = np.array(result)

    return result

def set_max_length(data, max_length):

    result = []
    for i in range(max_length):
        result.append(0)
    for j in range(len(data)):
        result[j]=data[j]
    result = np.array(result)

    return result

In [3]:
train_wav = train_dataset()
test_wav = test_dataset()
#shuffle_train_wav = train_wav.sample(frac=1)  # row 전체 shuffle
#shuffle_test_wav = test_wav.sample(frac=1)  # row 전체 shuffle

# train 레이블 값 생성
#train_label = train_wav.label
#train_labels = np.array(train_label)

# 테스트 레이블 값 생성
#test_label = test_wav.label
#test_labels = np.array(test_label)

TrainDataset 생성 완료
TestDataset 생성 완료


In [4]:
train_x = np.array(train_wav.data)
test_x = np.array(test_wav.data)

train_max = get_max(train_x)
test_max = get_max(test_x)

max_data = np.max([train_max, test_max])
print('가장 긴 길이 :', max_data)

가장 긴 길이 : 39040


In [5]:
train_x = set_length(train_x, max_data)
test_x = set_length(test_x, max_data)
print(train_x[0])
print('train :', train_x.shape) #(데이터셋 개수, 음성 길이)
print('test :', test_x.shape)

[ 0.         -0.00064087 -0.00036621 ...  0.07077026  0.05587769
  0.01251221]
train : (10000,)
test : (900,)


  result = np.array(result)


# 02 특징 추출

In [6]:

def preprocess_dataset(data):
    '''
    mel_spectrogram = []
    log_mel_spectrogram = []
    for i in data:
        mel_spectrogram = librosa.feature.melspectrogram(y=i, sr=16000)
        log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
    return pd.DataFrame(log_mel_spectrogram)

    '''
    mfccs = []
    for i in data:
        mfcc = librosa.feature.mfcc(y=i,sr=16000,n_mfcc=40,   # n_mfcc:return 될 mfcc의 개수를 정해주는 파라미터, 더 다양한 데이터 특징을 추출하려면 값을 증가시키면 됨. 일반적으로 40개 추출
                                                  n_fft=400,  # n_fft:frame의 length를 결정하는 파라미터 
                                                  hop_length=160) # hop_length의 길이만큼 옆으로 가면서 데이터를 읽음(10ms기본)
        ft2 = librosa.feature.zero_crossing_rate(y=i)[0]
        ft3 = librosa.feature.spectral_rolloff(y=i)[0]
        ft4 = librosa.feature.spectral_centroid(y=i)[0]
        ft5 = librosa.feature.spectral_contrast(y=i)[0]
        ft6 = librosa.feature.spectral_bandwidth(y=i)[0]
        ft1_trunc = np.hstack((np.mean(mfcc, axis=1), np.std(mfcc, axis=1), skew(mfcc, axis = 1), np.max(mfcc, axis = 1), np.median(mfcc, axis = 1), np.min(mfcc, axis = 1)))
        ft2_trunc = np.hstack((np.mean(ft2), np.std(ft2), skew(ft2), np.max(ft2), np.median(ft2), np.min(ft2)))
        ft3_trunc = np.hstack((np.mean(ft3), np.std(ft3), skew(ft3), np.max(ft3), np.median(ft3), np.min(ft3)))
        ft4_trunc = np.hstack((np.mean(ft4), np.std(ft4), skew(ft4), np.max(ft4), np.median(ft4), np.min(ft4)))
        ft5_trunc = np.hstack((np.mean(ft5), np.std(ft5), skew(ft5), np.max(ft5), np.median(ft5), np.min(ft5)))
        ft6_trunc = np.hstack((np.mean(ft6), np.std(ft6), skew(ft6), np.max(ft6), np.median(ft6), np.max(ft6)))
        mfccs.append(np.hstack((ft1_trunc, ft2_trunc, ft3_trunc, ft4_trunc, ft5_trunc, ft6_trunc)))
    return pd.DataFrame(mfccs)
    

In [7]:
train_mfccs = preprocess_dataset(train_x)
#train_mfccs = np.array(train_mfccs)

test_mfccs = preprocess_dataset(test_x)
#test_mfccs = np.array(test_mfccs)

In [8]:
# 데이터셋 재설정하기
train_set = pd.DataFrame()
train_set['fname'] = train_wav['fname']
test_set = pd.DataFrame()
test_set['fname'] = test_wav['fname']

train_set = pd.concat([train_set,train_mfccs],axis=1)
train_set['label'] = train_wav['label']
test_set = pd.concat([test_set,test_mfccs],axis=1)
test_set['label'] = test_wav['label']

In [9]:
train_set

Unnamed: 0,fname,0,1,2,3,4,5,6,7,8,...,261,262,263,264,265,266,267,268,269,label
0,raw16k/train/FCJY0/FCJY0_pbw1001.wav,-357.228851,70.462700,-2.156656,-0.493067,-22.894753,-2.081519,-19.210491,-14.009414,-9.968751,...,31.122784,10.646379,4.345991,2452.006033,380.447868,-0.274310,3087.168294,2542.999922,3087.168294,feml
1,raw16k/train/FCJY0/FCJY0_pbw1002.wav,-339.398376,91.876221,-22.855867,0.399164,-22.641399,10.525990,-12.593031,-6.533293,-23.694679,...,25.455225,16.402648,8.575869,2320.306546,318.681228,0.485393,2900.275372,2237.504570,2900.275372,feml
2,raw16k/train/FCJY0/FCJY0_pbw1003.wav,-307.900024,111.039551,-11.965649,9.581073,-35.134731,-17.317619,-10.091484,-22.156830,-25.859383,...,28.805922,19.958048,2.794613,2133.477947,380.309140,0.422592,2836.961514,1976.137887,2836.961514,feml
3,raw16k/train/FCJY0/FCJY0_pbw1004.wav,-292.031067,86.136795,-23.937466,17.126577,-16.261923,-14.787249,-18.061773,-7.738655,-5.916373,...,25.025905,15.604625,6.789018,2359.878304,363.083989,-0.146681,3135.044580,2413.562375,3135.044580,feml
4,raw16k/train/FCJY0/FCJY0_pbw1005.wav,-260.532623,95.999504,-42.201618,20.899199,-28.934317,-10.480850,-23.660446,-19.058420,-10.468557,...,29.944127,16.218201,0.713301,2057.241933,109.816465,1.382232,2434.342142,2033.184051,2434.342142,feml
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,raw16k/train/MLWS0/MLWS0_pbw1196.wav,-270.408875,81.380440,-16.315908,24.434874,-10.781040,-12.447013,-7.800146,-11.365909,-7.647517,...,26.846035,20.365465,13.515818,2401.667903,239.099928,1.163928,3088.359598,2324.652450,3088.359598,male
9996,raw16k/train/MLWS0/MLWS0_pbw1197.wav,-353.529144,100.193527,-2.436024,23.333628,-1.828546,-13.569883,-10.589240,3.692477,-14.470538,...,32.620170,22.825617,11.785901,2466.446888,291.635634,-0.661532,2868.351784,2591.246031,2868.351784,male
9997,raw16k/train/MLWS0/MLWS0_pbw1198.wav,-264.608917,63.507053,-14.042987,24.930367,-14.630364,8.640616,-15.595984,9.907090,-3.578396,...,39.089601,20.870084,15.851291,2584.654475,121.432371,-0.129680,2787.535098,2570.070366,2787.535098,male
9998,raw16k/train/MLWS0/MLWS0_pbw1199.wav,-283.717072,113.128601,-35.432854,31.492065,-20.108822,4.374619,-11.871616,9.548581,-15.521561,...,31.296304,19.895945,13.057680,2052.846627,156.003709,-0.115914,2323.153046,2046.746497,2323.153046,male


In [10]:
shuffle_train=shuffle(train_set, random_state = 20)
shuffle_test=shuffle(test_set, random_state = 30)
shuffle_train

Unnamed: 0,fname,0,1,2,3,4,5,6,7,8,...,261,262,263,264,265,266,267,268,269,label
9957,raw16k/train/MLWS0/MLWS0_pbw1158.wav,-314.979889,82.739449,6.389763,22.278984,-8.398458,-2.677688,-28.834642,-6.055861,-21.753105,...,31.679443,26.175682,13.715223,2572.844005,470.501218,-0.500636,3209.817255,2743.204423,3209.817255,male
1687,raw16k/train/FJYJ1/FJYJ1_pbw1088.wav,-226.971939,61.370483,-44.255512,17.489904,-18.518642,-13.435062,-19.188448,-13.666183,-5.024434,...,24.546603,17.061044,8.683390,2213.536586,126.212179,-1.023469,2418.656143,2239.347996,2418.656143,feml
2116,raw16k/train/FKEJ0/FKEJ0_pbw1117.wav,-333.614380,54.199627,2.391518,23.990250,-24.286991,-8.908220,-20.718632,-23.588701,-20.601360,...,28.447579,15.166905,8.864182,2639.507916,366.952242,1.313544,3722.132011,2501.836310,3722.132011,feml
231,raw16k/train/FGBS0/FGBS0_pbw1032.wav,-271.695984,55.087444,-35.524151,15.012136,-19.925812,-1.037488,-15.226067,11.619479,-19.834372,...,33.658785,12.666030,6.284396,2543.230159,427.162816,0.446200,3647.027326,2516.430635,3647.027326,feml
2780,raw16k/train/FLGH0/FLGH0_pbw1181.wav,-233.409637,65.965027,-17.979252,20.538010,-24.317638,-10.322409,-21.393955,-6.222099,-11.450763,...,40.066851,16.810454,8.996715,2515.367098,354.733230,0.465549,3245.112108,2474.969771,3245.112108,feml
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3915,raw16k/train/FLSH1/FLSH1_pbw1116.wav,-283.220428,81.391106,-20.840357,19.654039,-24.191692,2.744695,-19.306742,-0.115585,-11.568568,...,41.529504,16.205276,9.568842,2343.338266,242.071781,-0.124005,2716.882535,2318.393003,2716.882535,feml
9620,raw16k/train/MLWJ1/MLWJ1_pbw1021.wav,-227.512299,80.613831,7.365667,13.004544,-9.486271,10.483340,-12.197699,15.385054,-10.213816,...,35.702722,25.415755,13.129573,2799.760678,321.536779,1.129495,3612.274069,2698.834229,3612.274069,male
7068,raw16k/train/MJSG0/MJSG0_pbw1069.wav,-317.243591,91.476395,26.234255,26.352848,-11.294439,-3.940541,-34.887245,-1.258684,-9.730688,...,43.149696,23.947814,4.907473,2638.234091,494.215983,-0.433410,3458.585506,2700.237926,3458.585506,male
7391,raw16k/train/MJST0/MJST0_pbw1192.wav,-238.817673,36.579140,-19.445511,0.482900,-19.336252,10.968548,-13.686270,8.191335,-11.287183,...,30.679547,18.763352,6.097830,2794.525247,90.780836,0.269943,3000.972053,2790.566899,3000.972053,male


In [11]:
print(train_set.shape)
print(test_set.shape)

(10000, 272)
(900, 272)


In [12]:
def convert_to_labels(preds, i2c, k=2):
    ans = []
    ids = []
    for p in preds:
        idx = np.argsort(p)[::-1]
        ids.append([i for i in idx[:k]])
        ans.append(' '.join([i2c[i] for i in idx[:k]]))

    return ans, ids

In [13]:
X = train_set.drop(['label', 'fname'], axis=1)
feature_names = list(X.columns)  # 특징 번호 리스트

X = X.values  # 특징벡터 값 전체


# 라벨값 문자열 -> int 변환 (PCA 적용시 변환 필요)
labels= np.sort(np.unique(train_set.label.values))  # 라벨 이름 가져오기

num_class = len(labels)
c2i = {}
i2c = {}
for i, c in enumerate(labels):
    c2i[c] = i  # feml, male -> index = 0 or 1로 변환
    i2c[i] = c  # 0 or 1 -> feml, male 변환
    
y=np.array([c2i[x] for x in train_set.label.values])
y_test=np.array([c2i[x] for x in test_set.label.values])
print(y)

[0 0 0 ... 1 1 1]


In [14]:
X = shuffle_train.drop(['label', 'fname'], axis=1)
feature_names = list(X.columns)  # 특징 번호 리스트

X = X.values  # 특징벡터 값 전체


# 라벨값 문자열 -> int 변환 (PCA 적용시 변환 필요)
labels= np.sort(np.unique(shuffle_train.label.values))  # 라벨 이름 가져오기

num_class = len(labels)
c2i = {}
i2c = {}
for i, c in enumerate(labels):
    c2i[c] = i  # feml, male -> index = 0 or 1로 변환
    i2c[i] = c  # 0 or 1 -> feml, male 변환
    
y=np.array([c2i[x] for x in shuffle_train.label.values])
y_test=np.array([c2i[x] for x in shuffle_test.label.values])
print(y)

[1 0 0 ... 1 1 0]


In [15]:
#X_test = test_set.drop(['label', 'fname'], axis=1)
#X_test = X_test.values

In [16]:
X_test = shuffle_test.drop(['label', 'fname'], axis=1)
X_test = X_test.values

In [17]:
print(X.shape)
print(X_test.shape)

(10000, 270)
(900, 270)


In [18]:
# PCA를 적용하기 위한 scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

#pca_components = 100  # PCA의 구성 요소 개수
lda_components = 1   # LDA의 구성 요소 개수

#차원축소를 위한 PCA 적용 
#pca = PCA(n_components=pca_components).fit(X_scaled)
#X_pca = pca.transform(X_scaled)
#X_test_pca = pca.transform(X_test_scaled)

#print(sum(pca.explained_variance_ratio_)) 

# LDA를 사용한 차원 축소
lda = LinearDiscriminantAnalysis(n_components=lda_components)
X_lda = lda.fit_transform(X_scaled, y)
X_test_lda = lda.transform(X_test_scaled)
#X_lda.shape
#X.shape

In [29]:
# 기존 트레인셋을 분할하여 정확도 테스트
# Fit an SVM model
X_train, X_test, y_train, y_test = train_test_split(X_lda, y, test_size = 0.2, random_state = 42, shuffle = True)


clf = svm.SVC() # 93.5
#clf.fit(X_pca, train_labels)
#clf = SVC(kernel = 'linear', probability=True)  # 90.6
#clf = SVC(kernel = 'rbf', C = 4, gamma = 0.01, probability=True)  # 93.3
clf.fit(X_lda, y)

# 테스트 데이터에 대한 예측
y_pred = clf.predict(X_test)

# 정확도 평가
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

#print(accuracy_score(clf.predict(X_val), y_val))

Accuracy: 0.9305


In [30]:
clf = svm.SVC(probability=True)

clf.fit(X_lda, y)
str_preds, _ = convert_to_labels(clf.predict_proba(X_lda), i2c, k=3)
#print(str_preds)
print(accuracy_score(clf.predict(X_test), y_test))

0.9305


In [None]:
# 2차원 시각화를 위해 새로운 데이터 생성
x_min, x_max = X_lda.min() - 1, X_lda.max() + 1
xx = np.linspace(x_min, x_max, 100).reshape(-1, 1)
'''
# 생성된 데이터를 SVM 모델로 분류
y_pred = clf.predict(xx)

# 시각화
plt.scatter(X_lda, y, c=y, cmap='viridis')
plt.plot(xx, y_pred, color='red', linewidth=2)
plt.xlabel('LDA Projection')
plt.ylabel('Gender')
plt.title('LDA Projection of Gender Classification')
plt.show()
'''

Z = clf.predict(xx)

# 결정 경계 시각화
plt.scatter(X_lda, np.zeros_like(X_lda), c=y, cmap='viridis')
plt.plot(xx, np.zeros_like(xx), 'k--')
plt.scatter(clf.support_vectors_[:, 0], np.zeros_like(clf.support_vectors_[:, 0]), color='red', marker='x', label='Support Vectors')
plt.xlabel('LDA Component')
plt.title('SVM Decision Boundary in LDA Space')
plt.legend()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, X, y, cv, train_sizes=np.linspace(0.1, 1.0, 10)):
    train_sizes, train_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, train_sizes=train_sizes, scoring='accuracy')
    
    train_scores_mean = np.mean(train_scores, axis=1)
    val_scores_mean = np.mean(val_scores, axis=1)
    
    plt.figure()
    plt.plot(train_sizes, train_scores_mean, 'o-', label='Training accuracy')
    plt.plot(train_sizes, val_scores_mean, 'o-', label='Validation accuracy')
    plt.xlabel('Training Set Size')
    plt.ylabel('Accuracy')
    plt.title('SVM Learning Curve')
    plt.legend()
    plt.show()

# SVM 모델 생성
#svm_model = SVC()

# 학습 곡선 그리기
plot_learning_curve(clf, X_train, y_train, cv=5)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, X, y, cv, train_sizes=np.linspace(0.1, 1.0, 10)):
    train_sizes, train_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, train_sizes=train_sizes, scoring='neg_mean_squared_error')
    
    train_losses = -np.mean(train_scores, axis=1)
    val_losses = -np.mean(val_scores, axis=1)
    
    plt.figure()
    plt.plot(train_sizes, train_losses, 'o-', label='Training Loss')
    plt.plot(train_sizes, val_losses, 'o-', label='Validation Loss')
    plt.xlabel('Training Set Size')
    plt.ylabel('Loss')
    plt.title('SVM Learning Curve')
    plt.legend()
    plt.show()

# SVM 모델 생성
svm_model = SVC()

# 학습 곡선 그리기
plot_learning_curve(svm_model, X_train, y_train, cv=5)


# SVM 모델 학습

In [None]:
def to_df_test(y_pred, test_wav):
    predict_df = pd.DataFrame()
    pred_name=[]
    for i in range(len(y_pred)):
        if(y_pred[i] == 0):
            pred_name.append("male")
        elif(y_pred[i] == 1):
            pred_name.append("feml")
    
    predict_df["data"] = test_wav["fname"]
    predict_df["label"] = pred_name
    #predict_df = predict_df.data
        
    return predict_df

def to_txt_test(predict_df):
    predict_df.to_csv('강력한컴공_test_results.txt', sep = " ", index=False, header=False, lineterminator='\n')
    

In [None]:
C=1
clf = svm.LinearSVC(C=C, max_iter = 10000)
clf.fit(X_pca, train_labels)

print(accuracy_score(clf.predict(X_test_pca), test_labels))
y_pred = clf.predict(X_test_pca)
#predict_df = to_df_test(y_pred, test_wav)
#to_txt_test(predict_df)


In [None]:
clf = svm.SVC()
clf.fit(X_pca, train_labels)

print(accuracy_score(clf.predict(X_test_pca), test_labels))
y_pred = clf.predict(X_test_pca)
predict_df = to_df_test(y_pred, test_wav)
to_txt_test(predict_df)



In [None]:
C=8
clf = svm.LinearSVC(C=C, max_iter = 10000)
clf.fit(X_pca, train_labels)

print(accuracy_score(clf.predict(X_test_pca), test_labels))
y_pred = clf.predict(X_test_pca)
predict_df = to_df_test(y_pred, test_wav)
to_txt_test(predict_df)


In [None]:
# polynomial 활용

clf = svm.SVC(kernel = 'poly', degree = 3, gamma = 'auto', C=C, max_iter = 10000)    
#3차항으로 설정, degree = 3
#gamma는 sigma^2에 해당하는 scale parameter
#학습 반복횟수 10000

clf.fit(X_pca, train_labels)

print(accuracy_score(clf.predict(X_test_pca), test_labels))
y_pred = clf.predict(X_test_pca)
predict_df = to_df_test(y_pred, test_wav)
to_txt_test(predict_df)


In [None]:
C=4
clf = svm.LinearSVC(C=C, max_iter = 10000)
clf.fit(X_pca, train_labels)

print(accuracy_score(clf.predict(X_test_pca), test_labels))
y_pred = clf.predict(X_test_pca)
predict_df = to_df_test(y_pred, test_wav)
to_txt_test(predict_df)


In [None]:
# 함수 정의
def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    return xx, yy


def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out

In [None]:
# 데이터 로드하기
X = X_pca[:, :2]
y = train_labels
X.shape

In [None]:
# 모델 정의&피팅
C = 1.0 #regularization parameter
models = (svm.SVC(kernel='linear', C=C),
          svm.LinearSVC(C=C, max_iter=10000),
          svm.SVC(),
          svm.SVC(kernel='poly', degree=3, gamma='auto', C=C))
models = (clf.fit(X, y) for clf in models)

In [None]:
# plot title 형성
titles = ('SVC with linear kernel',
          'LinearSVC (linear kernel)',
          'SVC',
          'SVC with polynomial (degree 3) kernel')

In [None]:
# plot 그리기

fig, sub = plt.subplots(2, 2)
plt.subplots_adjust(wspace=0.4, hspace=0.4)

X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)

for clf, title, ax in zip(models, titles, sub.flatten()):
    plot_contours(ax, clf, xx, yy,
                  cmap=plt.cm.coolwarm, alpha=0.8)
    ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xlabel('Sepal length')
    ax.set_ylabel('Sepal width')
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(title)

plt.show()