In [2]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_wine
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')

wine = load_wine()
X_train, X_test, y_train, y_test=train_test_split(wine.data, 
                                                 wine.target,
                                                 random_state=38)


In [None]:
best_score = 0
for alpha in [0.01,0.1,1.0,10.0]:
    for max_iter in [100,1000,5000,10000]:
        lasso = Lasso(alpha=alpha,max_iter=max_iter)
        lasso.fit(X_train, y_train)
        score = lasso.score(X_test, y_test)
        if score > best_score:
            best_score = score
            best_parameters={'alpha':alpha,'最大迭代次数':max_iter}
            
print("模型最高分为：{:.3f}".format(best_score))
print('最佳参数设置：{}'.format(best_parameters))

In [3]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(wine.data, 
                                                 wine.target,
                                                 random_state=0)
best_score = 0
for alpha in [0.01,0.1,1.0,10.0]:
    for max_iter in [100,1000,5000,10000]:
        lasso = Lasso(alpha=alpha,max_iter=max_iter)
        lasso.fit(X_train, y_train)
        score = lasso.score(X_test, y_test)
        if score > best_score:
            best_score = score
            best_parameters={'alpha':alpha,'最大迭代次数':max_iter}
print("模型最高分为：{:.3f}".format(best_score))
print('最佳参数设置：{}'.format(best_parameters))            

模型最高分为：0.830
最佳参数设置：{'alpha': 0.1, '最大迭代次数': 100}


In [4]:
import numpy as np
for alpha in [0.01,0.1,1.0,10.0]:
    for max_iter in [100,1000,5000,10000]:
        lasso = Lasso(alpha=alpha,max_iter=max_iter)
        scores = cross_val_score(lasso, X_train, y_train, cv=6)
        score = np.mean(scores)
        if score > best_score:
            best_score = score
            best_parameters={'alpha':alpha, '最大迭代数':max_iter}
            
print("模型最高分为：{:.3f}".format(best_score))
print('最佳参数设置：{}'.format(best_parameters))

模型最高分为：0.865
最佳参数设置：{'alpha': 0.01, '最大迭代数': 100}


In [5]:
lasso = Lasso(alpha=0.01, max_iter=100).fit(X_train, y_train)
print('测试数据集得分：{:.3f}'.format(lasso.score(X_test,y_test)))

测试数据集得分：0.819


In [2]:
from sklearn.model_selection import GridSearchCV
params = {'alpha':[0.01,0.1,1.0,10.0],
         'max_iter':[100,1000,5000,10000]}
grid_search = GridSearchCV(lasso,params,cv=6)
grid_search.fit(X_train, y_train)
print('模型最高分：{:.3f}'.format(grid_search.score(X_test, y_test)))
print('最优参数：{}'.format(grid_search.best_params_))
print('交叉验证最高得分：{:.3f}'.format(grid_search.best_score_))

NameError: name 'lasso' is not defined

使用网格搜索优化决策树模型参数，使模型在鸢尾花数据集上的交叉验证得分最高。请按照以下步骤进行操作：

1.从 scikit-learn 中导入 iris 数据集。

2.将 iris 数据集随机划分为训练集和测试集，其中测试集大小为数据集大小的 30%。

3.定义决策树模型，并通过交叉验证寻找最佳模型参数组合。模型参数组合包括以下两个参数：
criterion：用于衡量节点分裂的质量。可选值包括 "gini" 和 "entropy"。
max_depth：决策树的最大深度，用于控制树的复杂度。 
搜索的参数组合为：
criterion：["gini", "entropy"]
max_depth：[2, 3, 4, 5, 6, 7, 8, 9, 10]

4.使用最佳参数组合在整个训练集上重新训练模型，并在测试集上计算模型得分。输出测试集得分以及最佳参数组合。

5.（选做）尝试使用随机搜索、贝叶斯优化或遗传算法等其他超参数优化方法，与网格搜索进行比较并尝试找到更优的超参数组合。

In [3]:
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test=train_test_split(iris.data, 
                                                 iris.target,
                                                 test_size=0.3)

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn import tree


dtree = tree.DecisionTreeClassifier()#.fit(X_train,y_train)


params = {'criterion':["gini", "entropy"],
          'max_depth':[2, 3, 4, 5, 6, 7, 8, 9, 10]}
grid_search = GridSearchCV(dtree,params,cv=5)
grid_search.fit(X_train, y_train)
# print('模型最高分：{:.3f}'.format(grid_search.score(X_test, y_test)))
# print('最优参数：{}'.format(grid_search.best_params_))
# print('交叉验证最高得分：{:.3f}'.format(grid_search.best_score_))

In [5]:
dtreep = tree.DecisionTreeClassifier(max_depth=grid_search.best_params_['max_depth'],
                                     criterion=grid_search.best_params_['criterion'])
dtreep.fit(X_train,y_train)
print('DecisionTree网格搜索测试集得分：\n{}'.format(dtreep.score(X_test,y_test)))
print('网格搜索最优参数：{}'.format(grid_search.best_params_))

DecisionTree网格搜索测试集得分：
0.9777777777777777
网格搜索最优参数：{'criterion': 'gini', 'max_depth': 3}


## 5.（选做）尝试使用随机搜索、贝叶斯优化或遗传算法等其他超参数优化方法， 与网格搜索进行比较并尝试找到更优的超参数组合。

In [6]:
# 随机搜索
from scipy.stats import randint as sp_randint 
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.ensemble import RandomForestClassifier 

dtree = tree.DecisionTreeClassifier()
# 给定参数搜索范围：list or distribution 
param_dist = {"max_depth":  [2, 3, 4, 5, 6, 7, 8, 9, 10],  # 给定list 
              "criterion": ["gini", "entropy"]}  # 给定list 
# 用RandomSearch+CV选取超参数 
              
random_search = RandomizedSearchCV(dtree, param_dist,cv=5,scoring='accuracy') 
random_search.fit(X_train,y_train)
print('DecisionTree随机搜索测试集得分：\n{}'.format(random_search.score(X_test,y_test)))
print('随机搜索最优参数：{}'.format(random_search.best_params_))


DecisionTree随机搜索测试集得分：
0.9777777777777777
随机搜索最优参数：{'max_depth': 3, 'criterion': 'entropy'}


In [None]:
#遗传算法


In [35]:
#coding=utf-8
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import random
import math
from sklearn import metrics
from sklearn.model_selection import train_test_split
 
 
generations = 10   # 繁殖代数 100
pop_size = 20      # 种群数量  500
max_value = 10      # 基因中允许出现的最大值  
chrom_length = 8    # 染色体长度  
pc = 0.6            # 交配概率  
pm = 0.01           # 变异概率  
results = [[]]      # 存储每一代的最优解，N个三元组（auc最高值, n_estimators, max_depth）  
fit_value = []      # 个体适应度  
fit_mean = []       # 平均适应度 
pop = [[0, 1, 0, 1, 0, 1, 0, 1] for i in range(pop_size)] # 初始化种群中所有个体的基因初始序列
 
 
 
 
'''
n_estimators 取 {10、20、30、40、50、60、70、80、90、100、110、120、130、140、150、160}
max_depth 取 {1、2、3、4、5、6、7、8、9、10、11、12、13、14、15、16} 
（1111，1111）基因组8位长
'''
def randomForest(n_estimators_value, max_depth_value):
 
 
    # print("n_estimators_value: " + str(n_estimators_value))
    # print("max_depth_value: " + str(max_depth_value))
 
 
    train_xy = loadFile("data.csv")
    train_xy = train_xy.drop('ID', axis=1)  # 删除训练集的ID
    # 将训练集划分成7:3（训练集与测试集比例）的比例
    train, val = train_test_split(
        train_xy, test_size=0.3, random_state=80)
    train_y = train['Kind']  # 训练集类标
    val_y = val['Kind']  # 测试集类标
 
 
    train = train.drop('Kind', axis=1)  # 删除训练集的类标
    val = val.drop('Kind', axis=1)  # 删除测试集的类标
 
 
    rf = RandomForestClassifier(n_estimators=n_estimators_value,
                                max_depth=max_depth_value,
                                n_jobs=2)
    rf.fit(train, train_y)  # 训练分类器
    predict_test = rf.predict_proba(val)[:, 1]
    roc_auc = metrics.roc_auc_score(val_y, predict_test)
    return roc_auc
 
 
def loadFile(filePath):
    fileData = pd.read_csv(filePath)
    return fileData
 
 
 
 
# Step 1 : 对参数进行编码（用于初始化基因序列，可以选择初始化基因序列，本函数省略）
def geneEncoding(pop_size, chrom_length):  
    pop = [[]]
    for i in range(pop_size):
        temp = []
        for j in range(chrom_length):
            temp.append(random.randint(0, 1))
        pop.append(temp)
    return pop[1:]
 
 
# Step 2 : 计算个体的目标函数值
def cal_obj_value(pop):
    objvalue = []
    variable = decodechrom(pop)
    for i in range(len(variable)):
        tempVar = variable[i]
        n_estimators_value = (tempVar[0] + 1) * 10
        max_depth_value = tempVar[1] + 1
        aucValue = randomForest(n_estimators_value, max_depth_value)
        objvalue.append(aucValue)
    return objvalue #目标函数值objvalue[m] 与个体基因 pop[m] 对应 
 
 
 
 
# 对每个个体进行解码，并拆分成单个变量，返回 n_estimators 和 max_depth
def decodechrom(pop):
    variable = []
    n_estimators_value = []
    max_depth_value = []
    for i in range(len(pop)):
        res = []
        
        # 计算第一个变量值，即 0101->10(逆转)
        temp1 = pop[i][0:4]
        preValue = 0
        for pre in range(4):
            preValue += temp1[pre] * (math.pow(2, pre))
        res.append(int(preValue))
        
        # 计算第二个变量值
        temp2 = pop[i][4:8]
        aftValue = 0
        for aft in range(4):
            aftValue += temp2[aft] * (math.pow(2, aft))
        res.append(int(aftValue))
        variable.append(res)
    return variable
 
 
 
 
# Step 3: 计算个体的适应值（计算最大值，于是就淘汰负值就好了）
def calfitvalue(obj_value):
    fit_value = []
    temp = 0.0
    Cmin = 0
    for i in range(len(obj_value)):
        if(obj_value[i] + Cmin > 0):
            temp = Cmin + obj_value[i]
        else:
            temp = 0.0
        fit_value.append(temp)
    return fit_value
 
 
 
 
# Step 4: 找出适应函数值中最大值，和对应的个体
def best(pop, fit_value):
    best_individual = pop[0]
    best_fit = fit_value[0]
    for i in range(1, len(pop)):
        if(fit_value[i] > best_fit):
            best_fit = fit_value[i]
            best_individual = pop[i]
    return [best_individual, best_fit]
 
 
 
 
# Step 5: 每次繁殖，将最好的结果记录下来(将二进制转化为十进制)
def b2d(best_individual):
    temp1 = best_individual[0:4]
    preValue = 0
    for pre in range(4):
        preValue += temp1[pre] * (math.pow(2, pre))
    preValue = preValue + 1
    preValue = preValue * 10
    
    # 计算第二个变量值
    temp2 = best_individual[4:8]
    aftValue = 0
    for aft in range(4):
        aftValue += temp2[aft] * (math.pow(2, aft))
    aftValue = aftValue + 1
    return int(preValue), int(aftValue)
 
 
 
 
# Step 6: 自然选择（轮盘赌算法）
def selection(pop, fit_value):
    # 计算每个适应值的概率
    new_fit_value = []
    total_fit = sum(fit_value)
    for i in range(len(fit_value)):
        new_fit_value.append(fit_value[i] / total_fit)
    # 计算每个适应值的累积概率
    cumsum(new_fit_value)
    # 生成随机浮点数序列
    ms = []
    pop_len = len(pop)
    for i in range(pop_len):
        ms.append(random.random())
    # 对生成的随机浮点数序列进行排序
    ms.sort()
    # 轮盘赌算法（选中的个体成为下一轮，没有被选中的直接淘汰，被选中的个体代替）
    fitin = 0
    newin = 0
    newpop = pop
    while newin < pop_len:
        if(ms[newin] < new_fit_value[fitin]):
            newpop[newin] = pop[fitin]
            newin = newin + 1
        else:
            fitin = fitin + 1
    pop = newpop
 
 
# 求适应值的总和
def sum(fit_value):
    total = 0
    for i in range(len(fit_value)):
        total += fit_value[i]
    return total
 
 
# 计算累积概率
def cumsum(fit_value):
    temp=[]
    for i in range(len(fit_value)):
        t = 0
        j = 0
        while(j <= i):
            t += fit_value[j]
            j = j + 1
        temp.append(t)
    for i in range(len(fit_value)):
        fit_value[i]=temp[i]
 
 
# Step 7: 交叉繁殖
def crossover(pop, pc): #个体间交叉，实现基因交换
    poplen = len(pop)
    for i in range(poplen - 1):
        if(random.random() < pc):
            cpoint = random.randint(0,len(pop[0]))
            temp1 = []
            temp2 = []
            temp1.extend(pop[i][0 : cpoint])
            temp1.extend(pop[i+1][cpoint : len(pop[i])])
            temp2.extend(pop[i+1][0 : cpoint])
            temp2.extend(pop[i][cpoint : len(pop[i])])
            pop[i] = temp1
            pop[i+1] = temp2
 
 
 
 
# Step 8: 基因突变
def mutation(pop, pm):
    px = len(pop)
    py = len(pop[0])
    for i in range(px):
        if(random.random() < pm):
            mpoint = random.randint(0,py-1)
            if(pop[i][mpoint] == 1):
                pop[i][mpoint] = 0
            else:
                pop[i][mpoint] = 1
 
 
 
 
if __name__ == '__main__':
    # pop = geneEncoding(pop_size, chrom_length)
    for i in range(generations):
        print("第 " + str(i) + " 代开始繁殖......")
        obj_value = cal_obj_value(pop) # 计算目标函数值
        # print(obj_value)
        fit_value = calfitvalue(obj_value) #计算个体的适应值
        # print(fit_value)
        [best_individual, best_fit] = best(pop, fit_value) #选出最好的个体和最好的函数值
        # print("best_individual: "+ str(best_individual))
        temp_n_estimator, temp_max_depth = b2d(best_individual)
        results.append([best_fit, temp_n_estimator, temp_max_depth]) #每次繁殖，将最好的结果记录下来
        print(str(best_individual) + " " + str(best_fit))
        selection(pop, fit_value) #自然选择，淘汰掉一部分适应性低的个体
        crossover(pop, pc) #交叉繁殖
        mutation(pop, pc) #基因突变
    # print(results)
    results.sort()
    print(results[-1])

第 0 代开始繁殖......


FileNotFoundError: [Errno 2] No such file or directory: 'data.csv'