# 支持向量机分类器

#### 导入包

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.stats import ttest_ind, levene
from sklearn.utils import shuffle
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC #

#### 导入数据

In [2]:
# Locations of the two feature tables, one Excel file per group.
xlsx_a, xlsx_b = 'data/featureTable/aa.xlsx', 'data/featureTable/bb.xlsx'
# Read each workbook into its own DataFrame.
data_a, data_b = (pd.read_excel(path) for path in (xlsx_a, xlsx_b))
# Show (rows, columns) of both tables as a quick sanity check.
print(data_a.shape, data_b.shape)

(212, 30) (357, 30)


#### t检验特征筛选

In [3]:
# Collect the columns whose values differ between the two groups at p < 0.05.
# Levene's test picks the t-test variant for each column: when its p-value is
# above 0.05 the equal-variance t-test is used, otherwise the unequal-variance
# one (equal_var=False).
index = []
for colName in data_a.columns:
    values_a, values_b = data_a[colName], data_b[colName]
    equal_var = bool(levene(values_a, values_b)[1] > 0.05)
    p_value = ttest_ind(values_a, values_b, equal_var=equal_var)[1]
    if p_value < 0.05:
        index.append(colName)
print(len(index))
print(index)

25
['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'M', 'N', 'P', 'Q', 'R', 'U', 'V', 'W', 'X', 'Y', 'Z', 'AA', 'AB', 'AC', 'AD']


#### t检验后数据处理

In [4]:
# Keep only the columns listed in `index`. The explicit .copy() makes the
# following insert() operate on independent frames rather than on selections
# of the loaded tables.
data_a = data_a[index].copy()
data_b = data_b[index].copy()
rows_a, cols_a = data_a.shape
rows_b, cols_b = data_b.shape
# Class labels: every row of table A is 0, every row of table B is 1.
labels_a = np.zeros(rows_a)
labels_b = np.ones(rows_b)
# The label is stored as the first column so features start at column 1.
data_a.insert(0, 'label', labels_a)
data_b.insert(0, 'label', labels_b)
# Stack both groups and shuffle the rows. A fixed random_state makes the row
# order (and everything derived from it) identical on every run.
data = shuffle(pd.concat([data_a, data_b]), random_state=42)
data.index = range(len(data))
X = data[data.columns[1:]]
y = data['label']
colNames = X.columns
# errors='ignore' is deprecated in recent pandas releases and would leave
# unparseable columns untouched, making astype(float64) fail. With
# errors='coerce' unparseable entries become NaN and are zero-filled together
# with the other missing values by fillna(0).
X = X.apply(pd.to_numeric, errors='coerce')
X = X.fillna(0)
X = X.astype(np.float64)
# NOTE(review): the scaler is fitted on all samples, before any train/test
# split, so statistics of the later test rows influence the scaling.
X = pd.DataFrame(StandardScaler().fit_transform(X), columns=colNames)
print(data.shape)

(569, 26)


#### LASSO特征筛选

In [5]:
# Candidate regularisation strengths: 50 values log-spaced from 1e-4 to 1e1.
alphas = np.logspace(-4, 1, 50)
# Cross-validated LASSO over the candidate alphas, fitted on (X, y).
model_lassoCV = LassoCV(alphas=alphas, max_iter=100000)
model_lassoCV.fit(X, y)
# One coefficient per feature, labelled with the feature name.
coef = pd.Series(model_lassoCV.coef_, index=X.columns)
print(model_lassoCV.alpha_)
# Features with a non-zero coefficient are the ones LASSO retained.
nonzero = coef != 0
selected = coef[nonzero]
print('%s %d' % ('Lasso picked', nonzero.sum()))
print(selected)
# Narrow X to the retained features for the steps that follow.
index = selected.index
X = X[index]

0.0006551285568595509
Lasso picked 20
B    -0.034474
E     0.015739
F     0.176224
G    -0.107364
H    -0.096814
I     0.000767
K    -0.147710
M     0.007740
N     0.089843
P    -0.008272
Q     0.107749
R    -0.063408
U    -0.491343
V    -0.031098
X     0.325386
Y    -0.063509
AA   -0.079608
AB   -0.014173
AC   -0.047987
AD   -0.054080
dtype: float64


#### SVM分类

In [6]:
# Print numpy arrays in full instead of truncating them with "...".
np.set_printoptions(threshold=np.inf)
# Hold out 30% of the samples for testing. random_state fixes the split so the
# run is reproducible; stratify=y keeps the class proportions the same in the
# training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
# RBF-kernel SVM; probability=True enables predict_proba.
# NOTE(review): the probability estimates are not seeded here (SVC's
# random_state is left at its default) — consider fixing it if predict_proba
# results need to be reproducible.
model_svm = SVC(kernel='rbf', gamma='scale', probability=True)
model_svm.fit(X_train, y_train)
# Optional diagnostics on the held-out set (left disabled as in the original):
# print(model_svm.score(X_test,y_test))
# print(model_svm.predict(X_test))
# print(model_svm.predict_proba(X_test))
print(model_svm.get_params())

{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': True, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
