In [2]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_wine
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')

wine = load_wine()
X_train, X_test, y_train, y_test=train_test_split(wine.data, 
                                                 wine.target,
                                                 random_state=38)


In [None]:
best_score = 0
for alpha in [0.01,0.1,1.0,10.0]:
    for max_iter in [100,1000,5000,10000]:
        lasso = Lasso(alpha=alpha,max_iter=max_iter)
        lasso.fit(X_train, y_train)
        score = lasso.score(X_test, y_test)
        if score > best_score:
            best_score = score
            best_parameters={'alpha':alpha,'最大迭代次数':max_iter}
            
print("模型最高分为：{:.3f}".format(best_score))
print('最佳参数设置：{}'.format(best_parameters))

In [3]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(wine.data, 
                                                 wine.target,
                                                 random_state=0)
best_score = 0
for alpha in [0.01,0.1,1.0,10.0]:
    for max_iter in [100,1000,5000,10000]:
        lasso = Lasso(alpha=alpha,max_iter=max_iter)
        lasso.fit(X_train, y_train)
        score = lasso.score(X_test, y_test)
        if score > best_score:
            best_score = score
            best_parameters={'alpha':alpha,'最大迭代次数':max_iter}
print("模型最高分为：{:.3f}".format(best_score))
print('最佳参数设置：{}'.format(best_parameters))            

模型最高分为：0.830
最佳参数设置：{'alpha': 0.1, '最大迭代次数': 100}


In [4]:
import numpy as np
for alpha in [0.01,0.1,1.0,10.0]:
    for max_iter in [100,1000,5000,10000]:
        lasso = Lasso(alpha=alpha,max_iter=max_iter)
        scores = cross_val_score(lasso, X_train, y_train, cv=6)
        score = np.mean(scores)
        if score > best_score:
            best_score = score
            best_parameters={'alpha':alpha, '最大迭代数':max_iter}
            
print("模型最高分为：{:.3f}".format(best_score))
print('最佳参数设置：{}'.format(best_parameters))

模型最高分为：0.865
最佳参数设置：{'alpha': 0.01, '最大迭代数': 100}


In [5]:
lasso = Lasso(alpha=0.01, max_iter=100).fit(X_train, y_train)
print('测试数据集得分：{:.3f}'.format(lasso.score(X_test,y_test)))

测试数据集得分：0.819


In [2]:
from sklearn.model_selection import GridSearchCV
params = {'alpha':[0.01,0.1,1.0,10.0],
         'max_iter':[100,1000,5000,10000]}
grid_search = GridSearchCV(lasso,params,cv=6)
grid_search.fit(X_train, y_train)
print('模型最高分：{:.3f}'.format(grid_search.score(X_test, y_test)))
print('最优参数：{}'.format(grid_search.best_params_))
print('交叉验证最高得分：{:.3f}'.format(grid_search.best_score_))

NameError: name 'lasso' is not defined

使用网格搜索优化决策树模型参数，使模型在鸢尾花数据集上的交叉验证得分最高。请按照以下步骤进行操作：

1.从 scikit-learn 中导入 iris 数据集。

2.将 iris 数据集随机划分为训练集和测试集，其中测试集大小为数据集大小的 30%。

3.定义决策树模型，并通过交叉验证寻找最佳模型参数组合。模型参数组合包括以下两个参数：
criterion：用于衡量节点分裂的质量。可选值包括 "gini" 和 "entropy"。
max_depth：决策树的最大深度，用于控制树的复杂度。 
搜索的参数组合为：
criterion：["gini", "entropy"]
max_depth：[2, 3, 4, 5, 6, 7, 8, 9, 10]

4.使用最佳参数组合在整个训练集上重新训练模型，并在测试集上计算模型得分。输出测试集得分以及最佳参数组合。

5.（选做）尝试使用随机搜索、贝叶斯优化或遗传算法等其他超参数优化方法，与网格搜索进行比较并尝试找到更优的超参数组合。

In [3]:
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test=train_test_split(iris.data, 
                                                 iris.target,
                                                 test_size=0.3)

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn import tree


dtree = tree.DecisionTreeClassifier()#.fit(X_train,y_train)


params = {'criterion':["gini", "entropy"],
          'max_depth':[2, 3, 4, 5, 6, 7, 8, 9, 10]}
grid_search = GridSearchCV(dtree,params,cv=5)
grid_search.fit(X_train, y_train)
# print('模型最高分：{:.3f}'.format(grid_search.score(X_test, y_test)))
# print('最优参数：{}'.format(grid_search.best_params_))
# print('交叉验证最高得分：{:.3f}'.format(grid_search.best_score_))

In [5]:
dtreep = tree.DecisionTreeClassifier(max_depth=grid_search.best_params_['max_depth'],
                                     criterion=grid_search.best_params_['criterion'])
dtreep.fit(X_train,y_train)
print('DecisionTree网格搜索测试集得分：\n{}'.format(dtreep.score(X_test,y_test)))
print('网格搜索最优参数：{}'.format(grid_search.best_params_))

DecisionTree网格搜索测试集得分：
0.9777777777777777
网格搜索最优参数：{'criterion': 'gini', 'max_depth': 3}


## 5.（选做）尝试使用随机搜索、贝叶斯优化或遗传算法等其他超参数优化方法， 与网格搜索进行比较并尝试找到更优的超参数组合。

In [6]:
# 随机搜索
from scipy.stats import randint as sp_randint 
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.ensemble import RandomForestClassifier 

dtree = tree.DecisionTreeClassifier()
# 给定参数搜索范围：list or distribution 
param_dist = {"max_depth":  [2, 3, 4, 5, 6, 7, 8, 9, 10],  # 给定list 
              "criterion": ["gini", "entropy"]}  # 给定list 
# 用RandomSearch+CV选取超参数 
              
random_search = RandomizedSearchCV(dtree, param_dist,cv=5,scoring='accuracy') 
random_search.fit(X_train,y_train)
print('DecisionTree随机搜索测试集得分：\n{}'.format(random_search.score(X_test,y_test)))
print('随机搜索最优参数：{}'.format(random_search.best_params_))


DecisionTree随机搜索测试集得分：
0.9777777777777777
随机搜索最优参数：{'max_depth': 3, 'criterion': 'entropy'}


In [None]:
#遗传算法