In [1]:
import os

import numpy as np
from sklearn import svm, datasets, preprocessing
import joblib
import matplotlib.pyplot as plt

In [2]:
# Load the Iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

# Pull the sample matrix, the labels and their human-readable names out of
# the returned container.
feature, target = iris.data, iris.target
feature_names, target_names = iris.feature_names, iris.target_names

# Rows are samples, columns are features.
sample_cnt, feature_dim = feature.shape

# Sanity dump, one item per line.
for summary_item in (
    feature.shape,
    target.shape,
    target,
    feature_names,
    target_names,
    sample_cnt,
    feature_dim,
):
    print(summary_item)

(150, 4)
(150,)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
['setosa' 'versicolor' 'virginica']
150
4


In [3]:
# Shuffle the dataset.
# A fixed seed makes the permutation, and therefore the train/test split and
# every accuracy reported further down, identical on each Restart & Run All.
shuffle_rng = np.random.default_rng(42)
shuffling_indices = shuffle_rng.permutation(target.shape[0])
# Apply the same permutation to both arrays so rows stay paired with labels.
feature = feature[shuffling_indices, :]
target = target[shuffling_indices]

print(feature.shape)
print(target.shape)
print(target)

(150, 4)
(150,)
[1 0 1 2 1 2 2 0 0 2 0 2 0 1 0 1 0 0 1 0 2 1 1 2 0 2 1 1 2 2 0 2 2 1 0 2 1
 1 2 0 0 1 0 2 1 0 0 2 2 2 0 0 2 2 1 0 1 1 0 0 1 0 0 0 2 0 1 2 2 2 1 1 2 1
 1 0 2 2 2 1 0 2 0 1 0 2 0 1 1 0 0 1 0 0 1 1 2 2 0 1 1 0 2 2 0 1 1 2 2 0 0
 2 2 1 1 2 1 1 2 1 0 0 1 2 1 0 1 2 2 2 1 0 1 1 1 0 0 2 2 0 2 0 1 0 2 1 0 2
 1 2]


In [4]:
# Standardisation pre-processing: fit a StandardScaler and apply it.
# NOTE(review): the statistics are estimated from every sample, including the
# rows that are held out as the test set further down; fitting on the
# training portion only would avoid leaking test information (confirm intent).
# NOTE(review): `feature` is overwritten below, so re-running this cell on its
# own refits the scaler on already-scaled data.
scaler = preprocessing.StandardScaler().fit(feature)
for fitted_stat in (scaler.mean_, scaler.scale_):
    print(fitted_stat)

feature = scaler.transform(feature)

[5.84333333 3.05733333 3.758      1.19933333]
[0.82530129 0.43441097 1.75940407 0.75969263]


In [5]:
# Hold out the last 20 samples as the test set.
test_cnt = 20
holdout_start = feature.shape[0] - test_cnt

# Rows before the boundary are used for training/validation, the rest for testing.
trainval_feature, test_feature = feature[:holdout_start], feature[holdout_start:]
trainval_target, test_target = target[:holdout_start], target[holdout_start:]

for part in (trainval_feature, trainval_target, test_feature, test_target):
    print(part.shape)

(130, 4)
(130,)
(20, 4)
(20,)


In [6]:
# k-fold cross-validation: cut the train/validation data into k equal folds.
k = 10
# -1 lets NumPy infer the fold size; this only works when the number of
# train/validation samples is divisible by k.
feature_tv_list = trainval_feature.reshape(k, -1, feature_dim)
target_tv_list = trainval_target.reshape(k, -1)

print(feature_tv_list.shape, target_tv_list.shape, target_tv_list, sep='\n')

(10, 13, 4)
(10, 13)
[[1 0 1 2 1 2 2 0 0 2 0 2 0]
 [1 0 1 0 0 1 0 2 1 1 2 0 2]
 [1 1 2 2 0 2 2 1 0 2 1 1 2]
 [0 0 1 0 2 1 0 0 2 2 2 0 0]
 [2 2 1 0 1 1 0 0 1 0 0 0 2]
 [0 1 2 2 2 1 1 2 1 1 0 2 2]
 [2 1 0 2 0 1 0 2 0 1 1 0 0]
 [1 0 0 1 1 2 2 0 1 1 0 2 2]
 [0 1 1 2 2 0 0 2 2 1 1 2 1]
 [1 2 1 0 0 1 2 1 0 1 2 2 2]]


In [7]:
# One untrained SVM with a Gaussian (RBF) kernel per fold, kept in a list.
svm_models = [svm.SVC(kernel='rbf', gamma='scale') for _ in range(k)]
    

In [8]:
val_accuracies = []
# Train and validate k times, holding out a different fold each round.
for i in range(k):
    # Round i validates on fold k-1-i (last fold first). Deriving the index
    # from k, instead of a literal 9, keeps the loop correct when k changes.
    val_fold = k - 1 - i
    # Every fold except the held-out one, flattened back to (samples, features).
    train_feature = np.delete(feature_tv_list, val_fold, axis=0).reshape([-1, feature_dim])
    train_target = np.delete(target_tv_list, val_fold, axis=0).reshape([-1])
    # The held-out fold is the validation set for this round.
    val_feature = feature_tv_list[val_fold]
    val_target = target_tv_list[val_fold]
    # Train
    svm_models[i].fit(train_feature, train_target)
    # Evaluate: fraction of validation samples predicted correctly.
    val_pred = svm_models[i].predict(val_feature)
    accuracy = np.sum(val_pred == val_target) / val_target.shape[0]
    val_accuracies.append(accuracy)
    print(f'SVM No.{i} training completed with accuracy {accuracy:.2%} on validating')

SVM No.0 training completed with accuracy 100.00% on validating
SVM No.1 training completed with accuracy 76.92% on validating
SVM No.2 training completed with accuracy 100.00% on validating
SVM No.3 training completed with accuracy 100.00% on validating
SVM No.4 training completed with accuracy 100.00% on validating
SVM No.5 training completed with accuracy 92.31% on validating
SVM No.6 training completed with accuracy 100.00% on validating
SVM No.7 training completed with accuracy 100.00% on validating
SVM No.8 training completed with accuracy 100.00% on validating
SVM No.9 training completed with accuracy 84.62% on validating


In [9]:
# Test: score every fold's model on the held-out test set.
for i, fold_model in enumerate(svm_models):
    test_pred = fold_model.predict(test_feature)
    # Mean of the element-wise match mask is the fraction predicted correctly.
    accuracy = np.mean(test_pred == test_target)
    print(f'SVM No.{i} inferred with accuracy {accuracy:.2%} on testing')

SVM No.0 inferred with accuracy 95.00% on testing
SVM No.1 inferred with accuracy 100.00% on testing
SVM No.2 inferred with accuracy 95.00% on testing
SVM No.3 inferred with accuracy 95.00% on testing
SVM No.4 inferred with accuracy 100.00% on testing
SVM No.5 inferred with accuracy 100.00% on testing
SVM No.6 inferred with accuracy 100.00% on testing
SVM No.7 inferred with accuracy 95.00% on testing
SVM No.8 inferred with accuracy 95.00% on testing
SVM No.9 inferred with accuracy 95.00% on testing


In [10]:
# Save the scaler together with the model from the fold that reached the
# highest validation accuracy (np.argmax returns the first index on ties).
svm_to_save = np.argmax(val_accuracies)
# joblib.dump does not create missing directories, so make sure the output
# folder exists before writing.
os.makedirs('./saves', exist_ok=True)
joblib.dump(scaler, './saves/scaler.pkl')
joblib.dump(svm_models[svm_to_save], './saves/svm.pkl')

['./saves/svm.pkl']