In [1]:
#决策树（没有限制深度）
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
cancer =load_breast_cancer()
X_train,X_test,y_train,y_test=train_test_split(cancer.data,cancer.target,stratify=cancer.target,random_state=42)
tree=DecisionTreeClassifier(random_state=0)
tree.fit(X_train,y_train)
print('Accuracy on train set:{:3f}'.format(tree.score(X_train,y_train)))
print('Accuracy on test set:{:3f}'.format(tree.score(X_test,y_test)))

Accuracy on train set:1.000000
Accuracy on test set:0.937063


因为没有限制树的深度，所以训练集的成绩为1，不过测试集的成绩比之前的线性模型要差一些

In [7]:
#决策树限制深度
tree=DecisionTreeClassifier(max_depth=4,random_state=0)
tree.fit(X_train,y_train)
print('Accuracy on train set:{:3f}'.format(tree.score(X_train,y_train)))
print('Accuracy on test set:{:3f}'.format(tree.score(X_test,y_test)))

Accuracy on train set:0.988263
Accuracy on test set:0.951049


限制深度也就是预剪枝之后果然好了很多

决策树虽然容易可视化，也不熟数据缩放的影响，但是太容易过拟合，一般我们会使用一些继承操作的方法来改进它

In [8]:
#随机森林
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons
X,y=make_moons(n_samples=100,noise=0.25,random_state=3)
X_train,X_test,y_train,y_test=train_test_split(cancer.data,cancer.target,stratify=cancer.target,random_state=42)
forest=RandomForestClassifier(n_estimators=5,random_state=2)
forest.fit(X_train,y_train)
print('Accuracy on train set:{:3f}'.format(forest.score(X_train,y_train)))
print('Accuracy on test set:{:3f}'.format(forest.score(X_test,y_test)))

Accuracy on train set:1.000000
Accuracy on test set:0.958042


In [7]:
from sklearn.ensemble import RandomForestClassifier
cancer =load_breast_cancer()
X_train,X_test,y_train,y_test=train_test_split(cancer.data,cancer.target,stratify=cancer.target,random_state=0)
forest=RandomForestClassifier(n_estimators=100,random_state=0)
forest.fit(X_train,y_train)
print('Accuracy on train set:{:3f}'.format(forest.score(X_train,y_train)))
print('Accuracy on test set:{:3f}'.format(forest.score(X_test,y_test)))

Accuracy on train set:0.997653
Accuracy on test set:0.944056


随机森林是一个非常强大而且经常用到的机器学习算法，为了得到更好的模型，调整参数是非常有必要的

In [11]:
#使用网格搜索来寻找最佳模型
params_rf = {'n_estimators':[100,350,500],
    'max_features':['log2','auto','sqrt'],
    'min_samples_leaf':[2,10,30]
}
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV
rf=RandomForestClassifier(random_state=0)
# Instantiate grid_rf
grid_rf = GridSearchCV(estimator=rf,
                       param_grid=params_rf,
                       scoring='neg_mean_squared_error',
                       cv=3,
                       verbose=1,
                       n_jobs=-1)
grid_rf.fit(X_train,y_train)
# Extract the best estimator
best_model = grid_rf.best_estimator_
print('Accuracy on train set:{:3f}'.format(best_model.score(X_train,y_train)))
print('Accuracy on test set:{:3f}'.format(best_model.score(X_test,y_test)))

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  66 out of  81 | elapsed:    4.5s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:    5.5s finished


Accuracy on train set:0.992958
Accuracy on test set:0.958042


通过上面的例子可以看到虽然测试集的成绩不如没有使用网格搜索好，但是测试集的数据确实提高了

In [13]:
# 梯度提升机
from sklearn.ensemble import GradientBoostingClassifier
X_train,X_test,y_train,y_test=train_test_split(cancer.data,cancer.target,stratify=cancer.target,random_state=0)
gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train,y_train)
print('Accuracy on train set:{:3f}'.format(gbrt.score(X_train,y_train)))
print('Accuracy on test set:{:3f}'.format(gbrt.score(X_test,y_test)))

Accuracy on train set:1.000000
Accuracy on test set:0.958042


梯度提升机对于参数非常敏感，这也是为什么他是机器学习竞赛中经常成为冠军的原因

In [16]:
gbrt = GradientBoostingClassifier(max_depth=1,random_state=0)
gbrt.fit(X_train,y_train)
print('Accuracy on train set:{:.3f}'.format(gbrt.score(X_train,y_train)))
print('Accuracy on test set:{:.3f}'.format(gbrt.score(X_test,y_test)))

Accuracy on train set:0.995
Accuracy on test set:0.965


In [28]:
gbrt = GradientBoostingClassifier(random_state=0,learning_rate=0.01)
gbrt.fit(X_train,y_train)
print('Accuracy on train set:{:.3f}'.format(gbrt.score(X_train,y_train)))
print('Accuracy on test set:{:.3f}'.format(gbrt.score(X_test,y_test)))

Accuracy on train set:0.995
Accuracy on test set:0.944


In [27]:
params_rf = {'n_estimators':[20,50,100,350,500],
    'max_features':['log2','auto','sqrt'],
    'max_depth':[1,2,3,4,5],
    'learning_rate':[0.001,0.01,0.05,0.1]
}
gbrt = GradientBoostingClassifier(random_state=0)
grid_rf = GridSearchCV(estimator=gbrt,
                       param_grid=params_rf,
                       scoring='roc_auc',
                       cv=3,
                       verbose=1,
                       n_jobs=-1)
grid_rf.fit(X_train,y_train)
best_model = grid_rf.best_estimator_
print('Accuracy on train set:{:3f}'.format(best_model.score(X_train,y_train)))
print('Accuracy on test set:{:3f}'.format(best_model.score(X_test,y_test)))

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 832 tasks      | elapsed:   14.5s


Accuracy on train set:1.000000
Accuracy on test set:0.958042


[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   15.4s finished
