# 基于Scikit-Learn的SVC模板
最后更新：2021.10.22  戴以恒  

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from multiprocessing import Pool
from IPython.display import clear_output as clear
import time
# Take a single clock reading so that both stamps describe the same instant;
# two separate time.localtime() calls could straddle a second boundary.
_start_time = time.localtime()
# Compact stamp used in output file and directory names.
c_time = time.strftime("%Y%m%d_%H%M%S", _start_time)
# Human-readable stamp written to the Excel run log.
c_time_m = time.strftime("%Y-%m-%d %H:%M:%S", _start_time)

In [None]:
# Parameters
# ======== System Setup ========
# Version tag embedded in the output directory name and the Excel log.
Version = 'V5.0'
# Number of fit/evaluate rounds of the main loop.
EPOCH = 1000
# ======== Fit Data Input ========
# Comma-separated files holding the feature matrix, the target values and
# the feature titles.
INPUT_X = 'Features_68_5_c.csv'
INPUT_Y = 'Values_YF_68.csv'
INPUT_TITLE = 'Title_5_c.csv'
# Feature window [BEGIN_INDEX, END_INDEX_PLUS_ONE) applied when loading.
BEGIN_INDEX = 0
END_INDEX_PLUS_ONE = None   # Set to None (or set the window to 0 and the number of features) to use all features
# ======== Find Split Settings ========
# File with the training-row indices; only read when FIND_SPLIT is False.
INPUT_SPLIT = None
# True: draw a new random train/test split every round.
# False: reuse the fixed split stored in INPUT_SPLIT.
FIND_SPLIT = True
# Accumulate per-sample prediction errors over all rounds and write a report.
CAL_MAE_LOOP = True
# Dump the fitted classifier with joblib for every round that is saved.
SAVE_MODEL = False
# ======== Other Fitting Settings ========
# Fraction of the samples placed in the training part of a split.
TRAIN_TEST_SPLIT = 0.75
# Number of repetitions of the shuffled cross-validation.
CV_LOOP_EPOCH = 20
# Value passed as `cv` to cross_val_score.
FOLD = 7
# With a fixed split: True resamples the training rows with replacement each
# round, False only reshuffles them.
TEST_SPLIT_OOB = True
# ======== Data Output ========
LOG_NAME = 'SVC_05_Log_'+c_time+'.txt'

In [None]:
# Load the feature matrix and the feature titles once, then optionally
# restrict both to the feature window [BEGIN_INDEX, END_INDEX_PLUS_ONE).
X = np.loadtxt(INPUT_X, delimiter=',')
# '!' is used as the comment marker for the title file.
title = np.loadtxt(INPUT_TITLE, dtype='str', delimiter=',', comments='!')
if END_INDEX_PLUS_ONE is not None:
    X = X[:, BEGIN_INDEX:END_INDEX_PLUS_ONE]
    title = title[BEGIN_INDEX:END_INDEX_PLUS_ONE]
# Target values, one per sample.
y = np.loadtxt(INPUT_Y, delimiter=',', dtype=float)
print('X:', X.shape, '   y:', y.shape)

In [None]:
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
import joblib
from sklearn import svm, datasets
from sklearn.metrics import auc
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import StratifiedKFold
# Unpack the sample and feature counts from X (requires X to be 2-D).
n_samples, n_features = X.shape
from sklearn.model_selection import ShuffleSplit

In [None]:
import os
from pathlib import Path
# Output directory: named after the version, the run mode, the number of
# features and the start-time stamp, then created on disk.
DIR = 'SVC_{}_{}_{}_Fs_{}'.format(
    Version,
    'FindSplit' if FIND_SPLIT else 'TestSplit',
    str(X.shape[1]),
    c_time,
)
os.mkdir(DIR)

In [None]:
# Shuffle and split the data set.
# Number of samples assigned to the training part of every split.
point = round(X.shape[0]*TRAIN_TEST_SPLIT)
if not FIND_SPLIT:
    # Fixed split: INPUT_SPLIT lists the row indices of the training samples;
    # every other row becomes a test sample.
    permutation = np.loadtxt(INPUT_SPLIT).astype(int).flatten().tolist()
    train_members = set(permutation)  # constant-time membership tests
    train_idx = [i for i in range(X.shape[0]) if i in train_members]
    test_idx = [i for i in range(X.shape[0]) if i not in train_members]
    X_train = X[train_idx, :]
    y_train = y[train_idx]
    X_test = X[test_idx, :]
    y_test = y[test_idx]
    # Keep the untouched training split. The fitting loop resamples from
    # X_train_o / y_train_o every round; they were previously never defined,
    # which raised NameError whenever FIND_SPLIT was False.
    X_train_o = X_train
    y_train_o = y_train

In [None]:
def cm_plot(y, yp, path, acc):
    """Plot the confusion matrix of one round and save it as an image.

    Args:
        y: True labels.
        yp: Predicted labels.
        path: Destination of the saved figure.
        acc: Accuracy shown in the figure title, rounded to 2 decimals.
    """
    # Confusion matrix: rows are true labels, columns are predicted labels.
    cm = confusion_matrix(y, yp)
    plt.figure(figsize=(5, 5), dpi=300)
    # fignum=0 draws into the figure created above; by default matshow opens
    # a second figure and leaves this one empty. vmax is the largest column
    # total, which equals the former two-class formula but works for any
    # matrix size.
    plt.matshow(cm, fignum=0, cmap='Greens', vmin=0, vmax=cm.sum(axis=0).max())
    plt.colorbar()
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            # matshow puts columns on the x axis and rows on the y axis, so
            # the count cm[row, col] belongs at xy=(col, row).
            plt.annotate(str(cm[row, col]), xy=(col, row), horizontalalignment='center',
                         verticalalignment='center', fontsize=14)
    plt.ylabel('True label', fontsize=15)  # axis label
    plt.xlabel('Predicted label', fontsize=15)  # axis label
    plt.suptitle('SVC Round ConfusionMatrix'+'\nMean Acc: '+str(round(acc, 2)), fontsize=18)
    plt.savefig(path, bbox_inches='tight', dpi=300)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.clf()
    plt.close('all')

In [None]:
# Reference RBF-kernel classifier with fixed hyper-parameters.
clf = SVC(
    kernel='rbf',
    gamma=np.exp(-6.964045492214664),
    C=13.589042049663163,
    verbose=1,
    max_iter=-1,
    cache_size=2048,
)
# Full parameter dict, reused to configure every estimator built later.
paras = clf.get_params()

In [None]:
# Initialise the per-sample error accumulators (one row per sample).
if CAL_MAE_LOOP:
    # mae_m columns: 0 = summed |error| as training sample, 1 = summed |error|
    # as test sample, 2 = summed signed error (train), 3 = same (test).
    # mae_count columns: 0 = rounds spent in the training part, 1 = in the
    # test part.
    mae_m, mae_count = (np.zeros((len(X), width)) for width in (4, 2))

In [None]:
# Main loop: fit and evaluate the classifier EPOCH times.
# NOTE: when FIND_SPLIT is False this cell expects X_train_o / y_train_o (the
# untouched training split) as well as X_test, y_test, train_idx and test_idx
# to exist before it runs.
mse_list = []
acc_list = []
mean_acc_list = []       # running mean of acc_list after every round
max_acc = -999.99
full_m = []              # predictions for the whole data set, one row per round
split_l = []             # training indices of every round (FIND_SPLIT only)
if FIND_SPLIT:
    test_idx_m = []      # test indices of every round
for epoch in range(EPOCH):
    if FIND_SPLIT:
        # Draw a fresh random train/test split.
        permutation = np.random.permutation(y.shape[0])
        train_idx = permutation[:point]
        test_idx = permutation[point:]
        X_train = X[train_idx, :]
        y_train = y[train_idx]
        X_test = X[test_idx, :]
        y_test = y[test_idx]
        split_l.append(train_idx)
        test_idx_m.append(test_idx)
    else:
        if TEST_SPLIT_OOB:
            # Resample the fixed training rows with replacement.
            idx_t = np.random.choice(X_train_o.shape[0], size=X_train_o.shape[0], replace=True).flatten().tolist()
            X_train = X_train_o[idx_t, :]
            y_train = y_train_o[idx_t]
        else:
            # Only reshuffle the fixed training rows.
            perm_train = np.random.permutation(X_train.shape[0])
            X_train = X_train_o[perm_train, :]
            y_train = y_train_o[perm_train]
    # Fresh estimator configured like the reference classifier.
    clf_new = SVC(**paras)
    # Fit the model.
    clf_new.fit(X_train, y_train)
    # Evaluate on the test part.
    y_pred = clf_new.predict(X_test)
    acc = np.count_nonzero(y_pred == y_test)*100/X_test.shape[0]
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)
    acc_list.append(acc)
    mean_acc = np.mean(acc_list)
    mean_acc_list.append(mean_acc)
    print('Round:', epoch+1, "MSE: %.4f" % mse, '  Accuracy: %.4f' % acc, '  current mean acc: %.4f' % mean_acc)
    y_full_pred = clf_new.predict(X)
    full_m.append(y_full_pred)

    # Accumulate the prediction error of every sample.
    if CAL_MAE_LOOP:
        # train_idx and test_idx are disjoint and duplicate-free in both
        # modes (slices of one permutation, or a partition of the row range),
        # so the whole update can be done with index arrays instead of one
        # membership test per sample.
        err = y_full_pred - y
        tr = np.asarray(train_idx, dtype=int)
        te = np.asarray(test_idx, dtype=int)
        mae_m[tr, 0] += np.abs(err[tr])
        mae_m[tr, 2] += err[tr]
        mae_count[tr, 0] += 1
        mae_m[te, 1] += np.abs(err[te])
        mae_m[te, 3] += err[te]
        mae_count[te, 1] += 1
    # Save artefacts for every new best accuracy, and for every round whose
    # MSE drops below 0.015.
    if acc > max_acc or mse < 0.015:
        max_acc = max(max_acc, acc)
        if SAVE_MODEL:
            clf_name = str(round(acc, 4))+'_SVC.pkl'
            clf_name = Path('.', DIR, clf_name)
            joblib.dump(clf_new, clf_name)
        PLOT_NAME2 = str(round(acc, 4))+'_SVC_ConfusionMatrix_'+c_time+'.png'
        PLOT_NAME2 = Path('.', DIR, PLOT_NAME2)
        # y_pred already holds the test-set predictions of this model.
        cm_plot(y_test, y_pred, PLOT_NAME2, acc)
        # Save the training indices of this split.
        if FIND_SPLIT:
            SPLIT_NAME = str(round(acc, 4))+'_Split.csv'
            SPLIT_NAME = Path('.', DIR, SPLIT_NAME)
            np.savetxt(SPLIT_NAME, np.asarray(train_idx).reshape(-1, 1), fmt='%d')
    del clf_new, y_full_pred
clear()
print('Mean accuracy:', np.mean(acc_list))

In [None]:
# Confusion matrix of the round-averaged predictions on the whole data set.
full_m = np.array(full_m)
# Mean prediction of every sample over all rounds, kept as an (n, 1) column.
y_mean = full_m.mean(axis=0).reshape(-1, 1)
# A sample is voted into class 1 when its mean prediction reaches 0.5.
voted = (y_mean[:, 0] >= 0.5).astype(int)
# cm[true, predicted]: rows are true labels so that the image drawn by
# matshow agrees with the 'True label' / 'Predicted label' axes. Samples
# whose true label is neither 0 nor 1 are not counted.
cm = np.zeros((2, 2), dtype=int)
for true_cls in (0, 1):
    for pred_cls in (0, 1):
        cm[true_cls, pred_cls] = np.count_nonzero((y == true_cls) & (voted == pred_cls))
plt.figure(figsize=(5, 6), dpi=300)
# fignum=0 draws into the figure created above instead of opening a second
# one; vmax is the size of the larger true class.
plt.matshow(cm, fignum=0, cmap='Blues', vmin=0, vmax=cm.sum(axis=1).max())
plt.colorbar()  # colour scale
for true_cls in range(cm.shape[0]):  # cell labels
    for pred_cls in range(cm.shape[1]):
        # x axis = predicted label (column), y axis = true label (row)
        plt.annotate(str(cm[true_cls, pred_cls]), xy=(pred_cls, true_cls), horizontalalignment='center',
                     verticalalignment='center', fontsize=14)
plt.ylabel('True label', fontsize=15)  # axis label
plt.xlabel('Predicted label', fontsize=15)  # axis label
plt.suptitle('SVC Full ConfusionMatrix'+'\nMean Acc: '+str(round(np.mean(acc_list), 2)), fontsize=18)
save_name = 'SVC_01a_Full-Mean_ConfusionMatrix_'+c_time+'.png'
save_name = Path('.', DIR, save_name)
plt.savefig(save_name, bbox_inches='tight', dpi=300)

In [None]:
# Confusion matrix built only from predictions made while a sample was in
# the test part of a split (random-split mode only).
if FIND_SPLIT:
    test_idx_m = np.array(test_idx_m)
    save_name = 'SVC_01b_Test_Index_'+c_time+'.csv'
    save_name = Path('.', DIR, save_name)
    np.savetxt(save_name, test_idx_m, fmt='%d', delimiter=',')
    full_m = np.asarray(full_m)  # one row of full-data predictions per round
    # Collect, for every sample, the predictions of the rounds in which it
    # was a test sample.
    test_data_m = [[] for _ in range(X.shape[0])]
    for round_preds, round_test_idx in zip(full_m, test_idx_m):
        for sample in round_test_idx:
            test_data_m[sample].append(round_preds[sample])
    # Mean test prediction per sample; NaN for a sample that was never tested.
    test_mean_l = [np.mean(votes) if votes else np.nan for votes in test_data_m]
    test_mean = np.asarray(test_mean_l, dtype=float)
    tested = ~np.isnan(test_mean)  # never-tested samples are left out
    voted = test_mean >= 0.5       # True -> voted into class 1
    # cm_2[true, predicted]: rows are true labels so that the image drawn by
    # matshow agrees with the 'True label' / 'Predicted label' axes.
    cm_2 = np.zeros((2, 2), dtype=int)
    for true_cls in (0, 1):
        for pred_cls in (0, 1):
            cm_2[true_cls, pred_cls] = np.count_nonzero(
                tested & (y == true_cls) & (voted == bool(pred_cls)))
    plt.figure(figsize=(5, 6), dpi=300)
    # fignum=0 draws into the figure created above instead of opening a
    # second one; vmax is the size of the larger true class.
    plt.matshow(cm_2, fignum=0, cmap='Oranges', vmin=0, vmax=cm_2.sum(axis=1).max())
    plt.colorbar()  # colour scale
    for true_cls in range(cm_2.shape[0]):  # cell labels
        for pred_cls in range(cm_2.shape[1]):
            # x axis = predicted label (column), y axis = true label (row)
            plt.annotate(str(cm_2[true_cls, pred_cls]), xy=(pred_cls, true_cls), horizontalalignment='center',
                         verticalalignment='center', fontsize=14)
    plt.ylabel('True label', fontsize=15)  # axis label
    plt.xlabel('Predicted label', fontsize=15)  # axis label
    plt.suptitle('SVC Test ConfusionMatrix'+'\nMean Acc: '+str(round(np.mean(acc_list), 2)), fontsize=18)
    save_name = 'SVC_01c_Test-Mean_ConfusionMatrix_'+c_time+'.png'
    save_name = Path('.', DIR, save_name)
    plt.savefig(save_name, bbox_inches='tight', dpi=300)

In [None]:
# Write the per-sample error report, worst test error first.
if CAL_MAE_LOOP:
    MAE_NAME = 'SVC_00_MAE_Counts_'+c_time+'.txt'
    MAE_NAME = Path('.', DIR, MAE_NAME)
    # Turn the accumulated sums into means, in place. A sample that was never
    # in the training (or test) part has a zero count and ends up as NaN.
    # NOTE: because mae_m is modified in place, running this cell twice
    # divides twice.
    with np.errstate(divide='ignore', invalid='ignore'):
        mae_m[:, [0, 2]] /= mae_count[:, [0]]
        mae_m[:, [1, 3]] /= mae_count[:, [1]]
    # Sample order by descending mean absolute test error.
    idx = np.argsort(-mae_m[:, 1])
    # The context manager closes the file even if a write fails.
    with open(MAE_NAME, 'w') as f2:
        f2.write('MAE Calculation of all samples:\n')
        f2.write('Total Rounds: '+str(EPOCH)+'\n\n')
        for index in idx:
            f2.write('Point No.'+str(index+1)+'\n')
            f2.write('  Feature Data: '+str(X[index, :].flatten().tolist())+'\n')
            f2.write('    True Values: '+str(y[index])+'\n')
            f2.write('    Training MAE: '+str(mae_m[index, 0])+'\n')
            f2.write('    Training ME: '+str(mae_m[index, 2])+'\n')
            f2.write('    Testing MAE: '+str(mae_m[index, 1])+'\n')
            f2.write('    Testing ME: '+str(mae_m[index, 3])+'\n\n\n')

In [None]:
# Repeated cross-validation on freshly shuffled copies of the data.
cv_acc_list = []
cv_mean_acc_list = []    # running mean of cv_acc_list after every round
for cv_round in range(CV_LOOP_EPOCH):
    # Shuffle into local copies: overwriting X and y would leave the global
    # data in a random order for every cell that runs afterwards.
    permutation = np.random.permutation(y.shape[0])
    X_shuffled = X[permutation, :]
    y_shuffled = y[permutation]
    # Fresh estimator configured like the reference classifier.
    clf_brand_new = SVC(**paras)
    scores = 100*cross_val_score(clf_brand_new, X_shuffled, y_shuffled, cv=FOLD, n_jobs=-1)
    cv_acc_list.append(np.mean(scores))
    cv_mean_acc_list.append(np.mean(cv_acc_list))
    print('round:', cv_round+1, '  accuarcy: %.4f' % (np.mean(scores)), '  current mean acc: %.4f' % np.mean(cv_acc_list))
clear()
print('Mean accuracy of CV-Loop:', np.mean(cv_acc_list))

In [None]:
from scipy.stats import norm
# Histogram of the per-round accuracies of the main loop, overlaid with the
# normal density that has the same mean and standard deviation.
mu, sigma = np.mean(acc_list), np.std(acc_list)
acc_array = np.array(acc_list)[:, np.newaxis]   # column vector
acc_sorted = np.sort(acc_array, axis=0)
x_arg = np.linspace(1, acc_sorted.shape[0], acc_sorted.shape[0])
fig = plt.figure(figsize=(8, 8), dpi=300)
ax = fig.add_axes([0.11, 0.08, 0.85, 0.81])
n, bins, patches = ax.hist(
    acc_sorted,
    bins=30,
    density=1,
    facecolor='#3CB371',
    edgecolor='#006400',
    alpha=0.75,
    linewidth=1.6,
)
# Normal density evaluated at the histogram bin edges.
acc_N = norm.pdf(bins, mu, sigma)
ax.plot(bins, acc_N, color='#8B0000', linestyle=':', linewidth=3)
title_text = 'Distribution of SVC Acc of %s Rounds Loop\nMean Acc: %s  Max Acc: %s' % (
    str(EPOCH),
    str(round(np.mean(acc_list), 3)),
    str(round(max(acc_list), 3)),
)
fig.suptitle(title_text, fontsize=22)
ax.set_ylabel('Possibility', fontsize=17)
ax.set_xlabel('Accuracy', fontsize=17)
save_name = Path('.', DIR, 'SVC_02a_Acc_Distribution_NormalLoop_'+c_time+'.png')
fig.savefig(save_name)

In [None]:
# Histogram of the mean accuracies of the cross-validation rounds, overlaid
# with the normal density that has the same mean and standard deviation.
mu = np.mean(cv_acc_list)
sigma = np.std(cv_acc_list)
acc_array = np.array(cv_acc_list).reshape(len(cv_acc_list), 1)
acc_sorted = np.sort(acc_array, axis=0)
x_arg = np.linspace(1, acc_sorted.shape[0], acc_sorted.shape[0])
fig = plt.figure(figsize=(8, 8), dpi=300)
ax = fig.add_axes([0.11, 0.08, 0.85, 0.81])
n, bins, patches = ax.hist(acc_sorted, bins=30, density=1, facecolor='#FF6347', edgecolor='#FF4500', alpha=0.75, linewidth=1.6)
# Normal density evaluated at the histogram bin edges.
acc_N = norm.pdf(bins, mu, sigma)
ax.plot(bins, acc_N, color='#006400', linestyle=':', linewidth=3)
# The CV loop runs CV_LOOP_EPOCH rounds, so that is the count to report
# (the title previously showed EPOCH, the round count of the main loop).
plt.suptitle('Distribution of SVC Acc of '+str(CV_LOOP_EPOCH)+' Rounds CV'+
             '\nMean Acc: '+str(round(np.mean(cv_acc_list), 3))+'  Max Acc: '+str(round(max(cv_acc_list), 3)), fontsize=22)
ax.set_ylabel('Possibility', fontsize=17)
ax.set_xlabel('Accuracy', fontsize=17)
save_name = 'SVC_02b_Acc_Distribution_CV_'+c_time+'.png'
save_name = Path('.', DIR, save_name)
plt.savefig(save_name)

In [None]:
# Accuracy per round (red dots) and its running mean (blue dotted line) for
# the main loop (top panel) and the CV loop (bottom panel).
plt.figure(figsize=(11, 11), dpi=300)
x_idx = np.linspace(1, len(acc_list), len(acc_list)).tolist()
cv_x_idx = np.linspace(1, len(cv_acc_list), len(cv_acc_list)).tolist()
panels = (
    (211, 'Acc Curve of Normal Loop', x_idx, acc_list, mean_acc_list),
    (212, 'Acc Curve of CV-Loop', cv_x_idx, cv_acc_list, cv_mean_acc_list),
)
for position, heading, rounds, per_round, running_mean in panels:
    panel = plt.subplot(position)
    panel.scatter(rounds, per_round, color='r')
    panel.plot(rounds, running_mean, 'b:')
    panel.set_title(heading, fontsize=18)
    panel.set_ylabel('Accuracy', fontsize=15)
    panel.set_xlabel('Epoch', fontsize=15)
plt.suptitle('SVC Accuracy-Epoch Curves\n'+'Epoch of normal loop: '+str(EPOCH)+'   Epoch of CV-loop: '+str(CV_LOOP_EPOCH), fontsize=20)
save_name = Path('.', DIR, 'SVC_03_Mean-Acc_Change_Plot_'+c_time+'.png')
plt.savefig(save_name)

In [None]:
# Write the text summary of the run into the output directory.
# Only the file-name part of LOG_NAME is reused, so running this cell again
# does not nest the directory inside an already-prefixed path.
LOG_NAME = Path('.', DIR, Path(LOG_NAME).name)
# The context manager closes the file even if a write fails.
with open(LOG_NAME, 'w') as f1:
    f1.write('SVC Log\n\n')
    f1.write('Input data: '+INPUT_X+' and '+INPUT_Y+'\n')
    f1.write('Data Shape:'+str(X.shape)+', '+str(y.shape)+'\n\n')
    f1.write('Epoch of normal loop: '+str(EPOCH)+'\n')
    f1.write('Epoch of CV-loop: '+str(CV_LOOP_EPOCH)+'\n')
    f1.write('Fold number of CV-loop: '+str(FOLD)+'\n\n')
    f1.write('Classifier parameters:\n')
    f1.write(str(paras)+'\n\n')
    f1.write('Mean accuracy of Normal Loop: '+str(np.mean(acc_list))+'\n')
    f1.write('Mean accuracy of CV-Loop: '+str(np.mean(cv_acc_list))+'\n\n')
    f1.write('Mean MSE:'+str(np.mean(mse_list))+'\n')

In [None]:
# ROC curves over random shuffle splits, plus their mean and +/- 1 std band.
rand_state = np.random.randint(5000)
print('Random state:', rand_state)
cv = ShuffleSplit(n_splits=10, test_size=.15, random_state=rand_state)
tprs = []    # TPR of every split, interpolated onto mean_fpr
aucs = []    # AUC of every split
mean_fpr = np.linspace(0, 1, 100)
# Create the figure and its axes in one call; a separate plt.figure(...)
# followed by plt.subplots() left an empty figure behind and the requested
# size was never applied to the plot.
fig, ax = plt.subplots(figsize=(8, 5), dpi=300)
# One estimator per split, each configured like the reference classifier.
clf_list = [SVC(**paras) for _ in range(cv.get_n_splits())]
for i, (train, test) in enumerate(cv.split(X, y)):
    clf_list[i].fit(X[train], y[train])
    # NOTE(review): plot_roc_curve was removed in scikit-learn 1.2;
    # RocCurveDisplay.from_estimator is its replacement on newer versions.
    viz = plot_roc_curve(clf_list[i], X[test], y[test],
                         name='ROC fold {}'.format(i),
                         alpha=0.3, lw=1, ax=ax)
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0   # force the curve through (0, 0)
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)
clear()
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0        # force the mean curve through (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b',
        label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
        lw=2, alpha=.8)
std_tpr = np.std(tprs, axis=0)
# Band of one standard deviation around the mean curve, clipped to [0, 1].
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                label=r'$\pm$ 1 std. dev.')
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
       title="SVC Receiver operating characteristic curve\n"+'Random state: '+str(rand_state))
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
save_name = 'SVC_04_ROC_Plot_Rand-'+str(rand_state)+'.png'
save_name = Path('.', DIR, save_name)
plt.savefig(save_name, bbox_inches='tight', dpi=300)

In [None]:
import openpyxl
# Append one summary row for this run to the shared Excel log.
XLSX_FILE = r'/home/jyb/dyh/Python/MachineLearningLog_DYH.xlsx'
data = openpyxl.load_workbook(XLSX_FILE)
# Write to the active sheet. The former lookup of data.sheetnames[1] was
# overwritten immediately and raised IndexError on a single-sheet workbook.
table = data.active
nrows = table.max_row
# Name of the split file used, or 'None' when splits were drawn at random.
SPLIT_STR = 'None' if FIND_SPLIT else INPUT_SPLIT
out_excel = [c_time_m, 'Yiheng Dai', 'SVC '+Version, INPUT_X, INPUT_Y, INPUT_TITLE, SPLIT_STR,
             str(clf.get_params()), str(EPOCH), str(max(acc_list)), str(np.mean(acc_list)), str(FOLD), str(CV_LOOP_EPOCH),
             str(max(cv_acc_list)), str(np.mean(cv_acc_list)), str(os.getcwd())+'/'+DIR, 'None']
# Fill the first row below the existing content (columns are 1-based).
for column, value in enumerate(out_excel, start=1):
    table.cell(nrows+1, column).value = value
data.save(XLSX_FILE)