# 随机森林分类器

#### 导入包

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.stats import ttest_ind, levene
from sklearn.linear_model import LassoCV
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier #
from sklearn.model_selection import train_test_split #

#### 导入数据

In [2]:
# Locations of the two feature tables, one spreadsheet per group.
xlsx_a, xlsx_b = 'data/featureTable/aa.xlsx', 'data/featureTable/bb.xlsx'

# Read both spreadsheets into DataFrames and report their (rows, columns).
data_a, data_b = (pd.read_excel(path) for path in (xlsx_a, xlsx_b))
print(data_a.shape, data_b.shape)

(212, 30) (357, 30)


#### t检验特征筛选

In [3]:
# Screen features column by column: keep a column when a two-sample t-test
# between group a and group b gives p < 0.05.
# Levene's test chooses the t-test variant: p > 0.05 -> pooled-variance test
# (equal_var=True), otherwise the unequal-variance test (equal_var=False).
index = []
for colName in data_a.columns:
    values_a, values_b = data_a[colName], data_b[colName]
    equal_var = bool(levene(values_a, values_b)[1] > 0.05)
    p_value = ttest_ind(values_a, values_b, equal_var=equal_var)[1]
    if p_value < 0.05:
        index.append(colName)

# Number of retained features, then their column names.
print(len(index))
print(index)

25
['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'M', 'N', 'P', 'Q', 'R', 'U', 'V', 'W', 'X', 'Y', 'Z', 'AA', 'AB', 'AC', 'AD']


#### t检验后数据处理

In [4]:
# Build the modelling table from the columns kept by the t-test screen:
# restrict both groups to those columns, label group a as 0 and group b as 1,
# stack and shuffle the rows, then standardise the feature matrix.
#
# NOTE(review): the scaler below is fit on every row, i.e. before the
# train/test split done in the random-forest cell, so held-out rows influence
# the scaling statistics. Consider fitting on the training rows only.

# Fixed seed: without it the row order changes on every run, and the seeded
# train/test split further down would still select different samples.
SHUFFLE_SEED = 15

# Explicit copies so insert() works on frames independent of the loaded ones.
data_a = data_a[index].copy()
data_b = data_b[index].copy()
rows_a, cols_a = data_a.shape
rows_b, cols_b = data_b.shape
labels_a = np.zeros(rows_a)
labels_b = np.ones(rows_b)
data_a.insert(0, 'label', labels_a)
data_b.insert(0, 'label', labels_b)

# Stack both groups, shuffle reproducibly, and renumber rows 0..n-1.
data = pd.concat([data_a, data_b])
data = shuffle(data, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Column 0 is the label; everything after it is a feature.
X = data[data.columns[1:]]
y = data['label']
colNames = X.columns

# Strict numeric conversion: errors='ignore' is deprecated in recent pandas and
# only postponed the failure to the astype() call below.
X = X.apply(pd.to_numeric)
X = X.fillna(0)  # missing values are replaced by 0 before scaling
X = X.astype(np.float64)

# StandardScaler returns a bare array, so restore the column names.
X = pd.DataFrame(StandardScaler().fit_transform(X), columns=colNames)
print(data.shape)

(569, 26)


#### LASSO特征筛选

In [5]:
# Candidate regularisation strengths: 50 values log-spaced from 1e-4 to 1e1.
alphas = np.logspace(-4, 1, 50)

# Cross-validated LASSO over those candidates, fitted on the feature matrix.
model_lassoCV = LassoCV(alphas=alphas, max_iter=100000)
model_lassoCV.fit(X, y)

# One coefficient per feature; a feature counts as selected when its
# coefficient is non-zero.
coef = pd.Series(model_lassoCV.coef_, index=X.columns)
nonzero = coef != 0

print(model_lassoCV.alpha_)  # alpha chosen by cross-validation
print('%s %d'%('Lasso picked', nonzero.sum()))
print(coef[nonzero])

# Keep only the selected columns for the cells that follow.
index = coef[nonzero].index
X = X[index]

0.0008286427728546842
Lasso picked 19
B    -0.034716
E     0.014874
F     0.170354
G    -0.096859
H    -0.100157
I     0.000167
K    -0.140338
N     0.091212
P    -0.005341
Q     0.101661
R    -0.060501
U    -0.474987
V    -0.031190
X     0.308392
Y    -0.062234
AA   -0.080219
AB   -0.016723
AC   -0.047266
AD   -0.052621
dtype: float64


#### 随机森林分类

In [6]:
# Print numpy arrays in full, without truncation, for the inspection lines below.
np.set_printoptions(threshold=np.inf)

# Hold out 30% of the samples for testing, with a fixed split seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=15
)

# Forest settings; the trailing notes record the defaults being overridden.
rf_params = {
    'n_estimators': 200,         # default 100
    'criterion': 'entropy',      # supports 'gini' and 'entropy', default 'gini'
    'random_state': 20,          # default None
    'class_weight': 'balanced',  # default None
}
model_rf = RandomForestClassifier(**rf_params)
model_rf.fit(X_train, y_train)

# Optional inspection of the fitted model (uncomment as needed):
# print(model_rf.score(X_test,y_test))
# print(model_rf.predict(X_test))
# print(model_rf.predict_proba(X_test))
# print(model_rf.n_features_)
# print(model_rf.feature_importances_)

# Show the full hyperparameter set the estimator was built with.
print(model_rf.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': False, 'random_state': 20, 'verbose': 0, 'warm_start': False}
