<img src="image/3/kd_tree_image.png" alt="alt text" style="width: 30%; height: auto;">


In [3]:
import math
# kd-tree每个结点中主要包含的数据结构如下
class KdNode(object):
    def __init__(self, dom_elt, split,parent, left, right, val):
        self.dom_elt = dom_elt  # k维向量节点(k维空间中的一个样本点)
        self.split = split  # 整数（进行分割维度的序号）
        self.parent = parent  # 父节点
        self.left = left  # 该结点分割超平面左子空间构成的kd-tree
        self.right = right  # 该结点分割超平面右子空间构成的kd-tree
        self.val=val  # 考虑到分类的需要，记录训练集的分类类型

class KdTree(object):
    def __init__(self, data, y):
        if not data:
            return None
        if len(data)!=len(y):
            return print("check X Y")
        k = len(data[0])  # 数据维度

        def CreateNode(parent_node,split, data_set, y_set):  # 按第split维划分数据集exset创建KdNode
            if not data_set:  # 数据集为空
                return None
            # key参数的值为一个函数，此函数只有一个参数且返回一个值用来进行比较
            # operator模块提供的itemgetter函数用于获取对象的哪些维的数据，参数为需要获取的数据在对象中的序号
            #data_set.sort(key=itemgetter(split)) # 按要进行分割的那一维数据排序
            data_set.sort(key=lambda x: x[split])
            split_pos = len(data_set) // 2  # //为Python中的整数除法
            median = data_set[split_pos]  # 中位数分割点
            split_next = (split + 1) % k  # cycle coordinates
            val=y_set[split_pos]

            # 递归的创建kd树
            now_node = KdNode(
                median,
                split,
                parent_node,
                None,
                None,
                val
            )  # 创建当前节点
            now_node.left = CreateNode(now_node, split_next, data_set[:split_pos],y_set[:split_pos])  # 创建左子树
            now_node.right = CreateNode(now_node, split_next, data_set[split_pos + 1:],y_set[split_pos + 1:])  # 创建右子树
            return now_node
            
        self.root = CreateNode(None,0, data, y)  # 从第0维分量开始构建kd树,返回根节点



# KDTree的前序遍历
def preorder(root):
    print(root.dom_elt,root.split,root.val)
    if root.left:  # 节点不为空
        preorder(root.left)
    if root.right:
        preorder(root.right)
 

In [None]:
data = [[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5],[6,6,6],[7,7,7],[8,8,8],[9,9,9],[10,10,10]]
y=[1,2,3,4,5,6,7,8,9,10]
kd = KdTree(data,y)
preorder(kd.root)

In [None]:
print(kd.root.left.parent.dom_elt)

<img src="image/3/search_kd_tree_image.png" alt="alt text" style="width: 30%; height: auto;">

In [36]:
class Searcher:
    def __init__(self,new_point,kd_tree):
        self.new_point = new_point
        self.kd_tree = kd_tree
        self.nearest_point = None
        self.nearest_distance = float("inf")
    
    # 距离计算
    def caculate_Euclidean_distance(self,point1,point2):
        if point1 is None or point2 is None:
            return None
        if len(point1)!= len(point2):
            return None
        sum = 0
        for i in range(len(point1)):
            sum += (point1[i]-point2[i])**2
        return math.sqrt(sum)
    def caculate_Manhattan_distance(self,point1,point2):
        if point1 is None or point2 is None:
            return None
        if len(point1)!= len(point2):
            return None
        sum = 0
        for i in range(len(point1)):
            sum += math.fabs(point1[i]-point2[i])
        return sum
    
    # 返回距离输入点最近的区域节点，和他们之间的距离
    def find_pre_nearest_point(self,root):
            
            new_point=self.new_point
            if root is None:
                return None
            if root.left is None and root.right is None:
                return root,self.caculate_Euclidean_distance(root.dom_elt,new_point)
            split=root.split
            if new_point[split] <= root.dom_elt[split] and root.left is not None:
                return self.find_pre_nearest_point(root.left)
            if new_point[split] > root.dom_elt[split] and root.right is not None:
                return self.find_pre_nearest_point(root.right)
            else:
                 return root,self.caculate_Euclidean_distance(root.dom_elt,new_point)

    # 搜索
    def search(self, now_node, nearest_node, nearest_distance):
        """
        在 KD 树中搜索最近邻节点

        参数:
            now_node: 当前节点
            nearest_node: 当前最近邻节点
            nearest_distance: 当前最近邻距离

        返回:
            最近邻节点和最近邻距离
        """
        new_point = self.new_point
        # 当前节点为空或者已经是根节点，返回
        if now_node is None or now_node.parent is None:
            return None
        distance = self.caculate_Euclidean_distance(now_node.dom_elt, new_point)
        if distance < nearest_distance:
            nearest_distance = distance
            nearest_node = now_node
            if now_node.left is not None:
                left_nearest_node, left_nearest_distance = self.search(now_node.left, nearest_node, nearest_distance)
                if left_nearest_distance < nearest_distance:
                    nearest_distance = left_nearest_distance
                    nearest_node = left_nearest_node
            if now_node.right is not None:
                right_nearest_node, right_nearest_distance = self.search(now_node.right, nearest_node, nearest_distance)
                if right_nearest_distance < nearest_distance:
                    nearest_distance = right_nearest_distance
                    nearest_node = right_nearest_node
            else:
                return self.search(now_node.parent, nearest_node, nearest_distance)
        self.nearest_distance = nearest_distance
        self.nearest_point = nearest_node
        return nearest_node, nearest_distance

In [None]:
new_point = [6.5, -1,9]

searcher=Searcher(new_point,kd)
a,b=searcher.find_pre_nearest_point(kd.root)
print(a.dom_elt)
print(b)

In [None]:
new_point = [3,7,4]

searcher=Searcher(new_point,kd)
a,b=searcher.find_pre_nearest_point(kd.root)
nearest_point,nearest_distance =searcher.search(a.parent,a,b)
print(a.dom_elt)
print(b)

In [None]:

print(nearest_point.dom_elt,nearest_distance)
print(searcher.nearest_point.dom_elt)
print(searcher.nearest_distance)

In [None]:
import pandas as pd  
from sklearn.model_selection import KFold, cross_val_score  
from sklearn.preprocessing import StandardScaler  # 可选，用于标准化特征  
from sklearn.base import BaseEstimator, ClassifierMixin  # 假设我们使用的是分类器  
  
# 假设你有一个包含xls文件路径的列表  
xls_files = ["C:\\Users\\admin\\Desktop\\三个月学习\\算法学习测试数据集\\bal.xls"]  # 替换为你的xls文件路径  
  
# 定义一个函数来读取xls文件并返回数据  
def read_xls(file_path):  
    return pd.read_excel(file_path)  
  
# 定义一个函数来执行十次十折交叉验证  
def ten_times_ten_fold_cv(estimator, X, y):  
    kf = KFold(n_splits=10, shuffle=True, random_state=42)  # 设置随机种子以确保可重复性  
    scores = []  
    for _ in range(10):  # 重复10次  
        fold_scores = cross_val_score(estimator, X, y, cv=kf, scoring='accuracy')  # 根据需要更改评分指标  
        scores.append(fold_scores)  
    # 将结果展平并计算平均值和标准差  
    flat_scores = [score for sublist in scores for score in sublist]  
    mean_score = sum(flat_scores) / len(flat_scores)  
    std_score = pd.Series(flat_scores).std()  
    return mean_score, std_score  
  
# 遍历每个xls文件，并对其执行交叉验证  
for file in xls_files:  
    data = read_xls(file)  
    # 假设最后一列是目标变量，其余列是特征  
    X = data.iloc[:, :-1]  
    y = data.iloc[:, -1]  
      
    # 可选：对特征进行标准化  
    scaler = StandardScaler()  
    X_scaled = scaler.fit_transform(X)  
      
    # 选择或定义你的模型（这里以随机森林分类器为例）  
    from sklearn.ensemble import RandomForestClassifier  
    model = RandomForestClassifier(random_state=42)  
      
    # 执行十次十折交叉验证  
    mean_score, std_score = ten_times_ten_fold_cv(model, X_scaled, y)  
      
    print(f"Results for {file}: Mean Score = {mean_score:.4f}, Std Score = {std_score:.4f}")  
  
# 注意：上面的代码假设你的目标变量是分类的。如果你的任务是回归，你需要更改`RandomForestClassifier`为`RandomForestRegressor`，  
# 并相应地调整`scoring`参数（例如，使用'neg_mean_squared_error'作为回归的评分指标）。

整合

In [9]:
import math  
import heapq
from collections import Counter  
from collections import deque  
  
import math
# kd-tree每个结点中主要包含的数据结构如下
class KdNode(object):
    def __init__(self, dom_elt, split,parent, left, right, val):
        self.dom_elt = dom_elt  # k维向量节点(k维空间中的一个样本点)
        self.split = split  # 整数（进行分割维度的序号）
        self.parent = parent  # 父节点
        self.left = left  # 该结点分割超平面左子空间构成的kd-tree
        self.right = right  # 该结点分割超平面右子空间构成的kd-tree
        self.val=val  # 考虑到分类的需要，记录训练集的分类类型

class KdTree(object):
    def __init__(self, data, y):
        if not data:
            return None
        if len(data)!=len(y):
            return print("check X Y")
        k = len(data[0])  # 数据维度

        def CreateNode(parent_node,split, data_set, y_set):  # 按第split维划分数据集exset创建KdNode
            if not data_set:  # 数据集为空
                return None
            # key参数的值为一个函数，此函数只有一个参数且返回一个值用来进行比较
            # operator模块提供的itemgetter函数用于获取对象的哪些维的数据，参数为需要获取的数据在对象中的序号
            #data_set.sort(key=itemgetter(split)) # 按要进行分割的那一维数据排序
            data_set.sort(key=lambda x: x[split])
            split_pos = len(data_set) // 2  # //为Python中的整数除法
            median = data_set[split_pos]  # 中位数分割点
            split_next = (split + 1) % k  # cycle coordinates
            val=y_set[split_pos]

            # 递归的创建kd树
            now_node = KdNode(
                median,
                split,
                parent_node,
                None,
                None,
                val
            )  # 创建当前节点
            now_node.left = CreateNode(now_node, split_next, data_set[:split_pos],y_set[:split_pos])  # 创建左子树
            now_node.right = CreateNode(now_node, split_next, data_set[split_pos + 1:],y_set[split_pos + 1:])  # 创建右子树
            return now_node
            
        self.root = CreateNode(None,0, data, y)  # 从第0维分量开始构建kd树,返回根节点


# KDTree的前序遍历
def preorder(root):
    print(root.dom_elt,root.split,root.val)
    if root.left:  # 节点不为空
        preorder(root.left)
    if root.right:
        preorder(root.right)
 
class Searcher:
    def __init__(self,new_point,kd_tree):
        self.new_point = new_point
        self.kd_tree = kd_tree
        self.nearest_point = []
        self.nearest_distance = []
    
    # 距离计算
    def caculate_Euclidean_distance(self,point1,point2):
        if point1 is None or point2 is None:
            return None
        if len(point1)!= len(point2):
            return None
        sum = 0
        for i in range(len(point1)):
            sum += (point1[i]-point2[i])**2
        return math.sqrt(sum)
    def caculate_Manhattan_distance(self,point1,point2):
        if point1 is None or point2 is None:
            return None
        if len(point1)!= len(point2):
            return None
        sum = 0
        for i in range(len(point1)):
            sum += math.fabs(point1[i]-point2[i])
        return sum
    
    # 返回距离输入点最近的区域节点，和他们之间的距离
    def find_pre_nearest_point(self,root):
            
            new_point=self.new_point
            if root is None:
                return None
            if root.left is None and root.right is None:
                return root,self.caculate_Euclidean_distance(root.dom_elt,new_point)
            split=root.split
            if new_point[split] < root.dom_elt[split] and root.left is not None:
                return self.find_pre_nearest_point(root.left)
            if new_point[split] >= root.dom_elt[split] and root.right is not None:
                return self.find_pre_nearest_point(root.right)
            else:
                 return root,self.caculate_Euclidean_distance(root.dom_elt,new_point)

    # 搜索
    def search(self, now_node, nearest_node, nearest_distance, k):
        """
        在 KD 树中搜索最近邻节点

        参数:
            now_node: 当前节点
            nearest_node: 当前最近邻节点
            nearest_distance: 当前最近邻距离
            k: 要返回的最近邻节点的数量

        返回:
            最近邻节点和最近邻距离的列表
        """
        new_point = self.new_point
        # 当前节点为空或者已经是根节点，返回
        if now_node is None or now_node.parent is None:
            return None
        distance = self.caculate_Euclidean_distance(now_node.dom_elt, new_point)
        if distance < nearest_distance:
            print(1)
            print(distance, now_node.dom_elt)
            # 为了避免重复输出父节点，此处不添加
            # 添加历史结点
            self.nearest_distance.append(distance)
            self.nearest_point.append(now_node)
            nearest_distance = distance
            nearest_node = now_node
            if now_node.left is not None:
                
                left_nearest_node, left_nearest_distance = self.search(now_node.left, nearest_node, nearest_distance, k)
                print(1.1)
                print(left_nearest_distance, left_nearest_node.dom_elt)

                # 添加历史结点
                self.nearest_distance.append(left_nearest_distance)
                self.nearest_point.append(left_nearest_node)
                if left_nearest_distance < nearest_distance:

                    nearest_distance = left_nearest_distance
                    nearest_node = left_nearest_node
            if now_node.right is not None:
                print(1.2)
                print(right_nearest_distance, right_nearest_node.dom_elt)
                right_nearest_node, right_nearest_distance = self.search(now_node.right, nearest_node, nearest_distance, k)
                # 添加历史结点
                self.nearest_distance.append(right_nearest_distance)
                self.nearest_point.append(right_nearest_node)
                if right_nearest_distance < nearest_distance:

                    nearest_distance = right_nearest_distance
                    nearest_node = right_nearest_node
            # 左右子树为空，返回父节点，回溯
            else:
                print(1.3)
                return self.search(now_node.parent, nearest_node, nearest_distance, k)
        
        print(2)
        print(distance, now_node.dom_elt)
        

        self.nearest_distance.append(distance)
        self.nearest_point.append(now_node)
        # kd子树只搜索logn个结点，故当节点比较少的时候会出现需求的查找最近k个节点超出搜索过的节点数的情况，此时我们不需考虑k，而尊重kd树的结果，唯一可能造成误差的是根节点的另一节点距离，但一般不会影响大局
        k=min(k,len(self.nearest_point))
        # # 对历史结点进行排序
        # self.nearest_distance.sort()
        # self.nearest_point.sort(key=lambda x: self.nearest_distance[self.nearest_point.index(x)])
        # # 取前k个结点
        # nearest_point = self.nearest_point[:k]
        # nearest_distance = self.nearest_distance[:k]

        return nearest_node, nearest_distance
    def predict(self,k):
        k=min(k,len(self.nearest_point))
        
        # 对数组去重，原因见下方markdown文件
        # 如果涉及到训练集中有重复的点，后期考虑在构建kd树时加入一个计数器，在搜索时对计数器进行判断，若计数器大于1，则加入历史结点，不大于1则不加入，如此下方的去重操作可以不执行
        # 使用dict.fromkeys()保持顺序地去重
        def remove_duplicates(lst):
            return list(dict.fromkeys(lst))
        self.nearest_distance = remove_duplicates(self.nearest_distance)
        self.nearest_point = remove_duplicates(self.nearest_point)
        
        # # 取前k个结点
        re1 = map(self.nearest_distance.index, heapq.nsmallest(k, self.nearest_distance)) #求ditance最小的k个索引    nsmallest与nlargest相反，求最小
        val_predict=[]
        for i in re1:
            val_predict.append(self.nearest_point[i].val)
        # 使用 Counter 计算每个元素的出现次数  
        counter = Counter(val_predict)  
        
        # 找出出现次数最多的元素及其次数,当数组中的元素没有严格意义上“最多”的重复次数（即多个元素具有相同的最高重复次数）时，collections.Counter 的 most_common() 方法会按照元素在数组中首次出现的顺序返回这些元素中的一个。具体来说，most_common(n) 方法会返回一个列表，其中包含前 n 个最常见元素及其计数，按计数降序排列；如果计数相同，则按元素在输入中的首次出现顺序排列。这是符合我们的需求的  
        most_common_element, count = counter.most_common(1)[0]       
        # print(f"重复次数最多的元素是: {most_common_element}，出现了 {count} 次")
        return most_common_element

        # # 对历史结点进行排序
        # temp=self.nearest_distance.sort()
        # self.nearest_point.sort(key=lambda x: self.nearest_distance[self.nearest_point.index(x)])
        # # 取前k个结点
        # nearest_point = self.nearest_point[:k]
        # nearest_distance = self.nearest_distance[:k]
        # return nearest_point,nearest_distance

还存在一定的问题，即未加入是否访问数据属性，导致，如果初始父节点满足距离小于当前最小距离，那么会继续访问该节点的子节点，而子节点在算法中必然会回溯回父节点，造成重复输入。具体情况见下两个代码单元

In [None]:
new_point = [5,5,4]

searcher=Searcher(new_point,kd)
a,b=searcher.find_pre_nearest_point(kd.root)
nearest_point,nearest_distance =searcher.search(a.parent,a,b,2)
print(a.dom_elt)
print(b)

In [None]:
print(nearest_point.dom_elt,nearest_distance)

for i in range(len(searcher.nearest_point)):
    print(searcher.nearest_point[i].dom_elt)


print(searcher.nearest_distance)

predict实现了选取和目标最近的k个点，然后根据k个点的标签进行投票，返回最多的标签。

In [12]:
predict=searcher.predict(3)

In [None]:
print(predict)

整合（列表输入，列表返回）

In [7]:
import math  
import heapq
from collections import Counter  
from collections import deque  
  
import math
# kd-tree每个结点中主要包含的数据结构如下
class KdNode(object):
    def __init__(self, dom_elt, split,parent, left, right, val):
        self.dom_elt = dom_elt  # k维向量节点(k维空间中的一个样本点)
        self.split = split  # 整数（进行分割维度的序号）
        self.parent = parent  # 父节点
        self.left = left  # 该结点分割超平面左子空间构成的kd-tree
        self.right = right  # 该结点分割超平面右子空间构成的kd-tree
        self.val=val  # 考虑到分类的需要，记录训练集的分类类型

class KdTree(object):
    def __init__(self, data, y):
        if not data:
            return None
        if len(data)!=len(y):
            return print("check X Y")
        k = len(data[0])  # 数据维度

        def CreateNode(parent_node,split, data_set, y_set):  # 按第split维划分数据集exset创建KdNode
            if not data_set:  # 数据集为空
                return None
            # key参数的值为一个函数，此函数只有一个参数且返回一个值用来进行比较
            # operator模块提供的itemgetter函数用于获取对象的哪些维的数据，参数为需要获取的数据在对象中的序号
            #data_set.sort(key=itemgetter(split)) # 按要进行分割的那一维数据排序
            data_set.sort(key=lambda x: x[split])
            split_pos = len(data_set) // 2  # //为Python中的整数除法
            median = data_set[split_pos]  # 中位数分割点
            split_next = (split + 1) % k  # cycle coordinates
            val=y_set[split_pos]

            # 递归的创建kd树
            now_node = KdNode(
                median,
                split,
                parent_node,
                None,
                None,
                val
            )  # 创建当前节点
            now_node.left = CreateNode(now_node, split_next, data_set[:split_pos],y_set[:split_pos])  # 创建左子树
            now_node.right = CreateNode(now_node, split_next, data_set[split_pos + 1:],y_set[split_pos + 1:])  # 创建右子树
            return now_node
            
        self.root = CreateNode(None,0, data, y)  # 从第0维分量开始构建kd树,返回根节点


# KDTree的前序遍历
def preorder(root):
    print(root.dom_elt,root.split,root.val)
    if root.left:  # 节点不为空
        preorder(root.left)
    if root.right:
        preorder(root.right)
 
class Searcher:
    def __init__(self,new_points,kd_tree):
        self.new_points = new_points
        self.kd_tree = kd_tree
        self.nearest_point = []
        self.nearest_distance = []
        self.temp=[]
    
    # 距离计算
    def caculate_Euclidean_distance(self,point1,point2):
        if point1 is None or point2 is None:
            return None
        if len(point1)!= len(point2):
            return None
        sum = 0
        for i in range(len(point1)):
            sum += (point1[i]-point2[i])**2
        return math.sqrt(sum)
    def caculate_Manhattan_distance(self,point1,point2):
        if point1 is None or point2 is None:
            return None
        if len(point1)!= len(point2):
            return None
        sum = 0
        for i in range(len(point1)):
            sum += math.fabs(point1[i]-point2[i])
        return sum
    
    def find_pre_nearest_point(self,root,new_point):
            
            if root is None:
                return None
            if root.left is None and root.right is None:
                return root,self.caculate_Euclidean_distance(root.dom_elt,new_point)
            split=root.split
            if new_point[split] < root.dom_elt[split] and root.left is not None:
                return self.find_pre_nearest_point(root.left,new_point)
            if new_point[split] >= root.dom_elt[split] and root.right is not None:
                return self.find_pre_nearest_point(root.right,new_point)
            else:
                 return root,self.caculate_Euclidean_distance(root.dom_elt,new_point)
    # 返回距离输入点最近的区域节点，和他们之间的距离
    def find_pre_nearest_points(self,root):
        roots=[]
        distances=[]
        new_points=self.new_points
        for new_point in new_points:
            # 给每个当前节点查找最近的初始叶子节点，并记录   
            root_temp,distance_temp=self.find_pre_nearest_point(root,new_point)
            roots.append(root_temp)
            distances.append(distance_temp)
            continue
            
            # if root is None:
            #     continue
            # if root.left is None and root.right is None:
            #     # 给每个当前节点查找最近的初始叶子节点，并记录
            #     roots.append(root)
            #     distances.append(self.caculate_Euclidean_distance(root.dom_elt,new_point))
            #     continue
            #     # return root,self.caculate_Euclidean_distance(root.dom_elt,new_point)
            # split=root.split
            # if new_point[split] < root.dom_elt[split] and root.left is not None:
            #     return self.find_pre_nearest_point(root.left)
            # if new_point[split] >= root.dom_elt[split] and root.right is not None:
            #     return self.find_pre_nearest_point(root.right)
            # else:
            #      # 给每个当前节点查找最近的初始叶子节点，并记录
            #     roots.append(root)
            #     distances.append(self.caculate_Euclidean_distance(root.dom_elt,new_point))
            #     continue
            #     # return root,self.caculate_Euclidean_distance(root.dom_elt,new_point)
        return roots,distances


    # 搜索
    def search(self, new_point, now_node, nearest_node, nearest_distance, k):
        """
        在 KD 树中搜索最近邻节点

        参数:
            now_node: 当前节点
            nearest_node: 当前最近邻节点
            nearest_distance: 当前最近邻距离
            k: 要返回的最近邻节点的数量

        返回:
            最近邻节点和最近邻距离的列表
        """
        
        # 当前节点为空或者已经是根节点，返回
        if now_node is None or now_node.parent is None:
            return None
        print("new_point",new_point)
        print("now_node",now_node.dom_elt)
        print("nearest_node",nearest_node.dom_elt)
        print("nearest_distance",nearest_distance)

        
        distance = self.caculate_Euclidean_distance(now_node.dom_elt, new_point)
        self.nearest_distance.append(nearest_distance)
        self.nearest_point.append(nearest_node)
        print("distance",distance)
        if distance < nearest_distance:
            # print(1)
            # print(distance, now_node.dom_elt)
            # 为了避免重复输出父节点，此处不添加
            # 添加历史结点
            self.nearest_distance.append(distance)
            self.nearest_point.append(now_node)
            nearest_distance = distance
            nearest_node = now_node
            if now_node.left is not None:
                
                left_nearest_node, left_nearest_distance = self.search(new_point,now_node.left, nearest_node, nearest_distance, k)
                # print(1.1)
                # print(left_nearest_distance, left_nearest_node.dom_elt)

                # 添加历史结点
                self.nearest_distance.append(left_nearest_distance)
                self.nearest_point.append(left_nearest_node)
                if left_nearest_distance < nearest_distance:

                    nearest_distance = left_nearest_distance
                    nearest_node = left_nearest_node
            if now_node.right is not None:
                # print(1.2)
                # print(right_nearest_distance, right_nearest_node.dom_elt)
                right_nearest_node, right_nearest_distance = self.search(new_point, now_node.right, nearest_node, nearest_distance, k)
                # 添加历史结点
                self.nearest_distance.append(right_nearest_distance)
                self.nearest_point.append(right_nearest_node)
                if right_nearest_distance < nearest_distance:

                    nearest_distance = right_nearest_distance
                    nearest_node = right_nearest_node
            # 左右子树为空，返回父节点，回溯
            else:
                # print(1.3)
                return self.search(new_point,now_node.parent, nearest_node, nearest_distance, k)
        
        # print(2)
        # print(distance, now_node.dom_elt)
        
        else:
            self.nearest_distance.append(distance)
            self.nearest_point.append(now_node)
            self.search(new_point,now_node.parent, nearest_node, nearest_distance, k)
        return nearest_node, nearest_distance
    
    
    def predict(self,k):
         # kd子树只搜索logn个结点，故当节点比较少的时候会出现需求的查找最近k个节点超出搜索过的节点数的情况，此时我们不需考虑k，而尊重kd树的结果，唯一可能造成误差的是根节点的另一节点距离，但一般不会影响大局
        k=min(k,len(self.nearest_point))
        
        # 对数组去重，原因见下方markdown文件
        # 如果涉及到训练集中有重复的点，后期考虑在构建kd树时加入一个计数器，在搜索时对计数器进行判断，若计数器大于1，则加入历史结点，不大于1则不加入，如此下方的去重操作可以不执行
        # 使用dict.fromkeys()保持顺序地去重
        def remove_duplicates(lst):
            return list(dict.fromkeys(lst))
        self.nearest_distance = remove_duplicates(self.nearest_distance)
        self.nearest_point = remove_duplicates(self.nearest_point)
        print(1111)
        print('nearest distance',self.nearest_distance)
        print('nearest point',self.nearest_point)

        # 使用zip将节点和距离配对  
        paired = zip(self.nearest_point,self.nearest_distance)  
        
        # 根据距离对配对后的列表进行排序（默认是升序）  
        sorted_paired = sorted(paired, key=lambda x: x[1] )  
        
        # 提取排序后的节点列表  
        sorted_nodes = [node for node, _ in sorted_paired]  
        
        # 如果你只需要前n个（例如最小的前3个），可以使用切片  
        # n = 3  # 假设你想取前3个  

        # 这里倘若列表没有三个元素，可能会报错
        sorted_nodes_top_n = sorted_nodes[:k] 
        
        val_predict=[]
        for i in range(len(sorted_nodes_top_n)):
            print("node append",sorted_nodes_top_n[i].dom_elt)
            val_predict.append(sorted_nodes_top_n[i].val)
            
        # # 取前k个结点
        # re1 = map(self.nearest_distance.index, heapq.nsmallest(k, self.nearest_distance)) #求ditance最小的k个索引    nsmallest与nlargest相反，求最小
        # val_predict=[]
        # self.temp.append(re1)
        # # # 取前k个结点
        # re1 = map(self.nearest_distance.index, heapq.nsmallest(k, self.nearest_distance)) #求ditance最小的k个索引    nsmallest与nlargest相反，求最小
        # val_predict=[]
        # self.temp.append(re1)
        # for i in re1:
        #     val_predict.append(self.nearest_point[i].val)

        
        # 使用 Counter 计算每个元素的出现次数  
        counter = Counter(val_predict)  
        print("typesss",type(counter))
        # 找出出现次数最多的元素及其次数,当数组中的元素没有严格意义上“最多”的重复次数（即多个元素具有相同的最高重复次数）时，collections.Counter 的 most_common() 方法会按照元素在数组中首次出现的顺序返回这些元素中的一个。具体来说，most_common(n) 方法会返回一个列表，其中包含前 n 个最常见元素及其计数，按计数降序排列；如果计数相同，则按元素在输入中的首次出现顺序排列。这是符合我们的需求的  
        most_common_element, count = counter.most_common(1)[0]       
        # print(f"重复次数最多的元素是: {most_common_element}，出现了 {count} 次")
        return most_common_element
def predicts(kd,X_test,k):
    searcher=Searcher(X_test,kd)
    # print(searcher.temp)
    nearest_points,nearest_distances=searcher.find_pre_nearest_points(kd.root)
    print(list(nearest_points),list(nearest_distances))
    predict_result=[]
    # k之后处理，在search中还未写进，放在了predict之中
    for i in range(len(X_test)):
        searcher.search(X_test[i],nearest_points[i].parent,nearest_points[i],nearest_distances[i],k)
        
        pred_temp=searcher.predict(k)
        predict_result.append(pred_temp)
        print(pred_temp)
        print("over")
        print(searcher.nearest_distance,searcher.nearest_point)
        # 恢复状态
        searcher.nearest_distance=[]
        searcher.nearest_point=[]
    # searcher=Searcher(X_test,kd)
    # nearest_points,nearest_distances=searcher.find_pre_nearest_points(kd.root)
    # print(list(nearest_points),list(nearest_distances))
    # predict_result=[]
    # # k之后处理，在search中还未写进，放在了predict之中
    # for i in range(len(X_test)):
    #     searcher.search(X_test[i],nearest_points[i].parent,nearest_points[i],nearest_distances[i],k)
    #     pred_temp=searcher.predict(k)
    #     predict_result.append(pred_temp)
    #     # 恢复状态
    #     searcher.nearest_distance=[]
    #     searcher.nearest_point=[]
    return predict_result
            

In [None]:
new_point = [[7,1,7],[5,4,5],[3,3,3]]
a=predicts(kd,new_point,3)
print(a)

这里是在调试predicts函数时的代码............10月16号早上应该解决，由于初始查找的最近叶子节点没有被放进最近节点的列表中，导致很多时候最近点找寻错误。

In [None]:
X_test = X[195:200]
k=1
searcher=Searcher(X_test,kd)
# print(searcher.temp)
nearest_points,nearest_distances=searcher.find_pre_nearest_points(kd.root)
print(list(nearest_points),list(nearest_distances))
predict_result=[]
# k之后处理，在search中还未写进，放在了predict之中
for i in range(len(X_test)):
    searcher.search(X_test[i],nearest_points[i].parent,nearest_points[i],nearest_distances[i],k)
    
    pred_temp=searcher.predict(k)
    predict_result.append(pred_temp)
    print(pred_temp)
    print("over")
    print(searcher.nearest_distance,searcher.nearest_point)
    # 恢复状态
    searcher.nearest_distance=[]
    searcher.nearest_point=[]
print('result',predict_result)
print('true',y[195:200])   

In [None]:
print('result',predict_result)

In [None]:
searcher.temp

In [None]:
print(list(searcher.temp[2]))

In [None]:
new_point = [[7,1,7],[5,4,5],[3,3,3]]

searcher=Searcher(new_point,kd)
a,b=searcher.find_pre_nearest_points(kd.root)
# nearest_point,nearest_distance =searcher.search(a.parent,a,b,2)
print(list(a))
print(list(b))

In [None]:

for point in a:
    print(point.dom_elt)
for point in b:
    print(point)

In [4]:
from package_py import KnnBaseKdTreee3
import pandas as pd  
from sklearn.model_selection import KFold, cross_val_score  
from sklearn.preprocessing import StandardScaler  # 可选，用于标准化特征  
from sklearn.base import BaseEstimator, ClassifierMixin  # 假设我们使用的是分类器  
  
# 假设您的数据保存在一个名为'data.txt'的文本文件中，每行是一个样本，列之间用空格分隔  
# 如果数据是在Excel文件中，您应该使用pd.read_excel('file_path.xlsx')来读取 
file_path = "data\\bal.xls"  # 请替换为您的实际文件路径  
data = pd.read_excel(file_path, header=None)  
  
# 将数据分为特征和标签  
X = data.iloc[:, :-1]  # 前4列是特征  
y = data.iloc[:, -1]   # 最后一列是分类标签  
# 可选：对特征进行标准化  
scaler = StandardScaler()  
X_scaled = scaler.fit_transform(X)    
# X = [[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5],[6,6,6],[7,7,7],[8,8,8],[9,9,9],[10,10,10]]
# y=[0,1,1,2,3,3,3,3,3,3]


In [None]:
from sklearn.neighbors import KNeighborsClassifier 
# 初始化KNN分类器，并设置K值（这里设为3）  
knn = KNeighborsClassifier(n_neighbors=21)  
# 设置十折交叉验证  
kf = KFold(n_splits=10, shuffle=True, random_state=42)  # shuffle=True表示在分割前打乱数据  
scores = []
# 遍历交叉验证的每一折  
for fold, (train_index, test_index) in enumerate(kf.split(X)):  
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]  
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]  
    # 训练模型  
    knn.fit(X_train, y_train)  

    # 对测试集进行预测  
    y_pred = knn.predict(X_test)  
    # 评估模型  
    accuracy = accuracy_score(y_test, y_pred)  
    print(f'Accuracy: {accuracy:.2f}') 

新的风暴已经出现，准确度怎会如此之低

In [8]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
# 一次十折验证
def evaluate_classifier(classifier, X, y,k):
    # 设置十折交叉验证  
    kf = KFold(n_splits=10, shuffle=True, random_state=42)  # shuffle=True表示在分割前打乱数据  
    scores = []
    # 遍历交叉验证的每一折  
    for fold, (train_index, test_index) in enumerate(kf.split(X)):  
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]  
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        # 首先将pandas读取的数据转化为array# 然后转化为list形式
        X_train = np.array(X_train).tolist()
        y_train = np.array(y_train).tolist()
        X_test = np.array(X_test).tolist()
        y_test = np.array(y_test).tolist()

        kd=classifier.KdTree(X_train, y_train)
        
        # # 训练分类器
        # classifier.fit(kd)

       
        y_pred = classifier.predicts(kd,X_test,k)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')
        scores.append(accuracy)
        # print(X_test)
        # print(y_test)
        # print(y_pred)
        # print("/n")

    print(scores)
    mean_accuracy = np.mean(scores)
    std_accuracy = np.std(scores)
    return mean_accuracy, std_accuracy,f1


In [12]:
from package_py import KnnBaseKdTreee3
import pandas as pd  
import numpy as np
from sklearn.preprocessing import StandardScaler  # 可选，用于标准化特征  


# 假设您的数据保存在一个名为'data.txt'的文本文件中，每行是一个样本，列之间用空格分隔  
# 如果数据是在Excel文件中，您应该使用pd.read_excel('file_path.xlsx')来读取 
file_path = "data\\bal.xls"  # 请替换为您的实际文件路径  
data = pd.read_excel(file_path, header=None)  
  
# 将数据分为特征和标签  
X = data.iloc[:, :-1]  # 前4列是特征  
y = data.iloc[:, -1]   # 最后一列是分类标签  
# 可选：对特征进行标准化  
scaler = StandardScaler()  
X_scaled = scaler.fit_transform(X)    
# X = [[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5],[6,6,6],[7,7,7],[8,8,8],[9,9,9],[10,10,10]]
# y=[0,1,1,2,3,3,3,3,3,3]
mean_accuracy,std_accuracy,f1=evaluate_classifier(KnnBaseKdTreee3,X,y,2)

[<package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB7AEF0>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB50208>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB50048>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB50048>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB50550>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150DE382E8>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EABB160>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EABB160>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EABB2E8>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EABB2E8>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EABB9E8>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EABB898>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB50400>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB50668>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB50518>, <package_

In [13]:
print(mean_accuracy,std_accuracy,f1)

0.40803891449052737 0.025383556565494642 0.3052086506244289


In [6]:
from package_py import KnnBaseKdTreee3
import pandas as pd  
import numpy as np
from sklearn.preprocessing import StandardScaler  # 可选，用于标准化特征  


# 假设您的数据保存在一个名为'data.txt'的文本文件中，每行是一个样本，列之间用空格分隔  
# 如果数据是在Excel文件中，您应该使用pd.read_excel('file_path.xlsx')来读取 
file_path = "data\\iri.xls"  # 请替换为您的实际文件路径  
data = pd.read_excel(file_path, header=None)  
  
# 将数据分为特征和标签  
X = data.iloc[:, :-1]  # 前4列是特征  
y = data.iloc[:, -1]   # 最后一列是分类标签  
# 可选：对特征进行标准化  
scaler = StandardScaler()  
X_scaled = scaler.fit_transform(X)    
# X = [[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5],[6,6,6],[7,7,7],[8,8,8],[9,9,9],[10,10,10]]
# y=[0,1,1,2,3,3,3,3,3,3]
mean_accuracy,std_accuracy=evaluate_classifier(KnnBaseKdTreee3,X,y,1)

[<package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB512E8>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150DE38940>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB51DA0>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB51DA0>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB51DA0>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB51128>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150DE38E48>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB51E48>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB51F98>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB51940>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB519B0>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB51898>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB51208>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB51CF8>, <package_py.KnnBaseKdTreee3.KdNode object at 0x000002150EB51208>] [0.264575

In [7]:
print(mean_accuracy,std_accuracy)

0.5066666666666666 0.20912516188477495
