# 影像组学特征筛选和降维

## T检验结合LASSO实现影像组学特征筛选

#### 导入包

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler #用于数据归一化处理
from scipy.stats import ttest_ind, levene
from sklearn.linear_model import LassoCV
from sklearn.utils import shuffle

#### 导入数据

In [2]:
# Locations of the two per-group feature tables (one Excel workbook per group).
xlsx_a = 'data/featureTable/aa.xlsx'
xlsx_b = 'data/featureTable/bb.xlsx'
# Read both workbooks, in order, into their own DataFrames.
data_a, data_b = (pd.read_excel(path) for path in (xlsx_a, xlsx_b))
# Show (rows, columns) of each table as a quick sanity check.
print(data_a.shape, data_b.shape)

(212, 30) (357, 30)


#### t检验特征筛选

In [3]:
# Levene's test on feature 'A': the null hypothesis is that both groups
# have equal variances, so a small p-value argues against equal variances.
levene_result_A = levene(data_a['A'], data_b['A'])
print(levene_result_A)

LeveneResult(statistic=90.47705934341127, pvalue=5.279775501703329e-20)


In [4]:
# Two-sample t-test on feature 'A' with equal_var=False, i.e. Welch's t-test,
# which does not assume the two groups share the same variance.
ttest_result_A = ttest_ind(data_a['A'], data_b['A'], equal_var=False)
print(ttest_result_A)

Ttest_indResult(statistic=22.208797758464524, pvalue=1.6844591259582747e-64)


In [5]:
# Screen every column with a two-sample t-test and keep those with p < 0.05.
# Levene's test chooses the t-test variant per column:
#   Levene p > 0.05  -> variances treated as equal   -> Student's t-test
#   otherwise        -> variances treated as unequal -> Welch's t-test
index = []
for colName in data_a.columns:
    group_a, group_b = data_a[colName], data_b[colName]
    variances_equal = bool(levene(group_a, group_b)[1] > 0.05)
    p_value = ttest_ind(group_a, group_b, equal_var=variances_equal)[1]
    if p_value < 0.05:
        index.append(colName)
# Report how many features survived, then their names.
print(len(index))
print(index)

25
['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'M', 'N', 'P', 'Q', 'R', 'U', 'V', 'W', 'X', 'Y', 'Z', 'AA', 'AB', 'AC', 'AD']


#### t检验后数据处理

In [6]:
# Fixed seed so the shuffled row order is the same on every run; anything
# downstream that depends on row order (e.g. un-shuffled K-fold splits)
# then becomes reproducible as well.
RANDOM_STATE = 42

# Keep only the features that passed the t-test screening. `.copy()` makes
# the column subsets independent frames before they are modified in place.
data_a = data_a[index].copy()
data_b = data_b[index].copy()
rows_a, cols_a = data_a.shape
rows_b, cols_b = data_b.shape

# Class labels: 0.0 for every row of group a, 1.0 for every row of group b,
# inserted as the first column of each table.
labels_a = np.zeros(rows_a)
labels_b = np.ones(rows_b)
data_a.insert(0, 'label', labels_a)
data_b.insert(0, 'label', labels_b)

# Stack both groups, shuffle the rows with the fixed seed, and renumber the
# index 0..n-1 so it lines up with the frame rebuilt after scaling.
data = pd.concat([data_a, data_b])
data = shuffle(data, random_state=RANDOM_STATE).reset_index(drop=True)

# Split into target and features (everything except the label column).
y = data['label']
X = data.drop(columns='label')
colNames = X.columns  # feature names, restored after scaling returns a bare array

# Convert every feature column to a numeric dtype. Values that cannot be
# parsed raise here instead of being passed through unchanged and failing
# later in astype().
X = X.apply(pd.to_numeric)
# Missing values are imputed with 0 before scaling.
X = X.fillna(0).astype(np.float64)

# Standardise each feature to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows; if the features are later
# evaluated with cross-validation or a hold-out split, consider fitting it
# on the training portion only — confirm against the intended protocol.
X = pd.DataFrame(StandardScaler().fit_transform(X), columns=colNames)

In [7]:
# Shape of the labelled table, followed by the first rows of the scaled features.
print(data.shape, X.head(), sep='\n')

(569, 26)
          A         B         C         D         E         F         G  \
0 -0.473535 -1.503204 -0.541199 -0.505082 -1.611206 -1.211208 -1.024816   
1  1.551487  1.328837  1.471766  1.524754  0.486752 -0.106715  0.962975   
2  2.874993  0.211845  3.057588  3.145893  3.440117  3.455973  4.243589   
3 -0.121357 -0.383884 -0.173371 -0.238305  0.223439 -0.469447 -0.543873   
4 -0.263364 -0.807410 -0.325363 -0.334435 -0.800631 -0.982274 -1.096530   

          H         I         K  ...         U         V         W         X  \
0 -0.965447 -0.725145 -0.279974  ... -0.637646 -1.517252 -0.715492 -0.609263   
1  1.075889 -0.542598  0.224594  ...  1.070784  0.860267  0.969195  0.950006   
2  3.927930  3.079138  3.983947  ...  2.019222 -0.274754  2.193393  2.096165   
3 -0.446730 -0.290683 -0.584952  ... -0.271110 -0.349662 -0.341978 -0.341181   
4 -1.177705 -0.655777 -0.775518  ... -0.385006 -0.851221 -0.454568 -0.428374   

          Y         Z        AA        AB        AC       

#### LASSO特征筛选

In [8]:
# Candidate regularisation strengths: 50 values log-spaced from 1e-4 to 1e1.
alphas = np.logspace(-4, 1, 50)

# Cross-validated LASSO over that grid; the generous max_iter gives the
# solver room to converge at the smallest alphas.
model_lassoCV = LassoCV(alphas=alphas, max_iter=100000)
model_lassoCV.fit(X, y)

# One coefficient per feature, labelled with the feature name.
coef = pd.Series(model_lassoCV.coef_, index=X.columns)
is_selected = coef != 0  # features LASSO did not shrink to exactly zero

print(model_lassoCV.alpha_)  # alpha chosen by cross-validation
print('%s %d' % ('Lasso picked', is_selected.sum()))
print(coef[is_selected])

0.00040949150623804275
Lasso picked 21
B    -0.034295
D     0.002491
E     0.017027
F     0.184738
G    -0.122652
H    -0.092809
I     0.001634
K    -0.157311
M     0.018898
N     0.086753
P    -0.012537
Q     0.116439
R    -0.067602
U    -0.516894
V    -0.030768
X     0.349906
Y    -0.065249
AA   -0.078566
AB   -0.010136
AC   -0.048976
AD   -0.056062
dtype: float64
