### 云杉松树

In [1]:
import os
import pandas as pd
from PIL import Image
from tqdm import tqdm
import numpy as np
from collections import defaultdict
import xml.etree.ElementTree as ET

# Root directory of the dataset and the category sub-folders that are scanned.
dataset_root = "./data/Data_Set_Spruce_Bark_Beetle"
categories = ["vertical", "oblique", "damage"]

# Per-category results; filled in by the scan loop below.
dataset_stats = defaultdict(lambda: defaultdict(dict))


def summarize_dimensions(dimensions):
    """Summarise a list of ``(width, height)`` tuples.

    Returns a dict with the most frequent size ("主要尺寸"), the number of
    images per size ("尺寸分布") and the mean width / height. An empty list
    yields the placeholder values ``"未知"``, ``{}``, ``0`` and ``0``.
    """
    if not dimensions:
        return {"主要尺寸": "未知", "尺寸分布": {}, "平均宽度": 0, "平均高度": 0}

    # One pass over the list instead of one list.count() call per unique size.
    dims_count = defaultdict(int)
    for width, height in dimensions:
        dims_count[f"{width}x{height}"] += 1

    return {
        "主要尺寸": max(dims_count.items(), key=lambda item: item[1])[0],
        "尺寸分布": dict(dims_count),
        "平均宽度": sum(width for width, _ in dimensions) / len(dimensions),
        "平均高度": sum(height for _, height in dimensions) / len(dimensions),
    }


def build_site_record(site_name, num_images, num_annotations, dimensions,
                      total_trees, tree_types, damage_types, damage_by_tree):
    """Assemble the statistics dict stored for one survey folder.

    ``num_images`` is always reported as counted on disk, even when none of
    the sampled images could be opened (``dimensions`` empty), so the per-site
    counts stay consistent with the category total.
    """
    record = {
        "地点": site_name,
        "图片数量": num_images,
        "标注文件数量": num_annotations,
    }
    record.update(summarize_dimensions(dimensions))
    record.update({
        "总树木数": total_trees,
        "树种统计": dict(tree_types),
        "损害统计": dict(damage_types),
        # Convert the nested defaultdicts to plain dicts as well.
        "树种-损害统计": {tree: dict(damages) for tree, damages in damage_by_tree.items()},
    })
    return record


def tally_objects(root, tree_types, damage_types, damage_by_tree):
    """Update the counters from every ``<object>`` child of ``root``.

    ``tree_types`` and ``damage_types`` map label -> count; ``damage_by_tree``
    maps tree label -> damage label -> count and is only updated for objects
    that carry both a ``<tree>`` and a ``<damage>`` element.

    Returns the number of ``<object>`` elements seen.
    """
    num_objects = 0
    for obj in root.findall('object'):
        num_objects += 1
        tree_elem = obj.find('tree')
        damage_elem = obj.find('damage')

        if tree_elem is not None:
            tree_types[tree_elem.text] += 1

        if damage_elem is not None:
            damage_types[damage_elem.text] += 1
            if tree_elem is not None:
                damage_by_tree[tree_elem.text][damage_elem.text] += 1
    return num_objects


# Walk every category folder.
for category in categories:
    category_path = os.path.join(dataset_root, category)

    if not os.path.exists(category_path):
        print(f"目录不存在: {category_path}")
        continue

    print(f"正在处理 {category} 类别...")

    # Each sub-directory is one survey folder.
    site_folders = [f for f in os.listdir(category_path) if os.path.isdir(os.path.join(category_path, f))]

    total_images = 0
    site_stats = {}

    for site_folder in site_folders:
        # Folder names look like "<site>_<date>[...]"; keep the part before the first "_".
        site_name = site_folder.split('_')[0]

        images_folder = os.path.join(category_path, site_folder, "Images")
        annotations_folder = os.path.join(category_path, site_folder, "Annotations")

        if not os.path.exists(images_folder):
            print(f"图像目录不存在: {images_folder}")
            continue

        if not os.path.exists(annotations_folder):
            print(f"标注目录不存在: {annotations_folder}")
            continue

        image_files = [f for f in os.listdir(images_folder) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
        num_images = len(image_files)
        total_images += num_images

        # Counters for tree species, damage labels and their combination.
        tree_types = defaultdict(int)
        damage_types = defaultdict(int)
        damage_by_tree = defaultdict(lambda: defaultdict(int))

        # Inspect at most 100 randomly chosen images to keep the size check fast.
        sample_size = min(100, num_images)
        sampled_images = np.random.choice(image_files, sample_size, replace=False) if num_images > 0 else []

        dimensions = []
        for img_file in tqdm(sampled_images, desc=f"{site_folder} - 尺寸分析", leave=False):
            img_path = os.path.join(images_folder, img_file)
            try:
                with Image.open(img_path) as img:
                    dimensions.append(img.size)
            except Exception as e:
                # Best effort: report the unreadable image and keep scanning.
                print(f"无法读取图片 {img_path}: {e}")

        # Parse every XML annotation file of this survey folder.
        annotation_files = [f for f in os.listdir(annotations_folder) if f.lower().endswith('.xml')]
        print(f"分析 {site_folder} 的标注信息...")

        total_trees = 0
        for xml_file in tqdm(annotation_files, desc=f"{site_folder} - 标注分析", leave=False):
            xml_path = os.path.join(annotations_folder, xml_file)
            try:
                root = ET.parse(xml_path).getroot()
                total_trees += tally_objects(root, tree_types, damage_types, damage_by_tree)
            except Exception as e:
                # Best effort: report the broken annotation file and keep scanning.
                print(f"解析XML文件出错 {xml_path}: {e}")

        site_stats[site_folder] = build_site_record(
            site_name,
            num_images,
            len(annotation_files),
            dimensions,
            total_trees,
            tree_types,
            damage_types,
            damage_by_tree,
        )

    # Store the results for this category.
    dataset_stats[category] = {
        "总图片数": total_images,
        "地点统计": site_stats
    }

def print_sorted_table(rows, sort_by, ascending, empty_message):
    """Print ``rows`` (a list of dicts) as a sorted table.

    The rows are turned into a DataFrame, sorted with
    ``sort_values(by=sort_by, ascending=ascending)`` and printed without the
    index. When ``rows`` is empty, ``empty_message`` is printed instead.
    """
    if not rows:
        print(empty_message)
        return
    table = pd.DataFrame(rows).sort_values(by=sort_by, ascending=ascending)
    print(table.to_string(index=False))


# Print the report for every category that was scanned.
for category, stats in dataset_stats.items():
    print(f"\n{'='*20} {category} 类别统计 {'='*20}")
    print(f"总图片数量: {stats['总图片数']}")

    # One row per survey folder.
    site_data = [
        {
            "调查文件夹": site_folder,
            "地点": site_info["地点"],
            "图片数量": site_info["图片数量"],
            "标注文件数量": site_info["标注文件数量"],
            "主要尺寸": site_info["主要尺寸"],
            "平均宽度": round(site_info["平均宽度"], 2),
            "平均高度": round(site_info["平均高度"], 2),
            "总树木数": site_info["总树木数"],
        }
        for site_folder, site_info in stats["地点统计"].items()
    ]

    # An empty DataFrame has no columns, so sort_values/groupby on the named
    # columns below would raise KeyError; report the empty category instead.
    if not site_data:
        print("无地点数据")
        continue

    # Sort by site name, then by image count (largest first).
    site_df = pd.DataFrame(site_data)
    site_df = site_df.sort_values(by=["地点", "图片数量"], ascending=[True, False])
    print(site_df.to_string(index=False))

    # Totals per site across all of its survey folders.
    site_summary = site_df.groupby("地点").agg({
        "图片数量": "sum",
        "总树木数": "sum"
    }).reset_index()

    print(f"\n{category} 类别地点汇总:")
    print(site_summary.to_string(index=False))

    # Tree-species counts per survey folder.
    print(f"\n{category} 类别各树种分布:")
    for site_folder, site_info in stats["地点统计"].items():
        print(f"\n调查文件夹: {site_folder}")
        print_sorted_table(
            [{"树种": k, "数量": v} for k, v in site_info["树种统计"].items()],
            "数量",
            False,
            "无树种数据",
        )

    # Damage-label counts per survey folder.
    print(f"\n{category} 类别各损害类型分布:")
    for site_folder, site_info in stats["地点统计"].items():
        print(f"\n调查文件夹: {site_folder}")
        print_sorted_table(
            [{"损害类型": k, "数量": v} for k, v in site_info["损害统计"].items()],
            "数量",
            False,
            "无损害数据",
        )

    # Species x damage counts per survey folder (nested dict flattened to rows).
    print(f"\n{category} 类别树种-损害关系:")
    for site_folder, site_info in stats["地点统计"].items():
        print(f"\n调查文件夹: {site_folder}")
        print_sorted_table(
            [
                {"树种": tree_type, "损害类型": damage_type, "数量": count}
                for tree_type, damages in site_info["树种-损害统计"].items()
                for damage_type, count in damages.items()
            ],
            ["树种", "数量"],
            [True, False],
            "无树种-损害关系数据",
        )

# Dataset-wide totals: flatten the per-site records of every category once,
# then derive the image / tree totals and the merged label counters from them.
all_site_records = [
    record
    for category_stats in dataset_stats.values()
    for record in category_stats["地点统计"].values()
]

total_images = sum(category_stats["总图片数"] for category_stats in dataset_stats.values())
total_trees = sum(record["总树木数"] for record in all_site_records)

# Merge the per-site species and damage counters into dataset-wide counters.
all_tree_types = defaultdict(int)
all_damage_types = defaultdict(int)
for record in all_site_records:
    for label, amount in record["树种统计"].items():
        all_tree_types[label] += amount
    for label, amount in record["损害统计"].items():
        all_damage_types[label] += amount

print(f"\n{'='*20} 整个数据集统计 {'='*20}")
print(f"总图片数量: {total_images}")
print(f"总标注树木数: {total_trees}")

# Species table; the share is relative to the total number of annotated trees.
print("\n树种分布:")
if all_tree_types:
    tree_rows = []
    for label, amount in all_tree_types.items():
        tree_rows.append({"树种": label, "数量": amount, "占比": format(amount / total_trees, ".2%")})
    all_tree_df = pd.DataFrame(tree_rows).sort_values(by="数量", ascending=False)
    print(all_tree_df.to_string(index=False))

# Damage table; the share is likewise relative to the total number of annotated trees.
print("\n损害分布:")
if all_damage_types:
    damage_rows = []
    for label, amount in all_damage_types.items():
        damage_rows.append({"损害类型": label, "数量": amount, "占比": format(amount / total_trees, ".2%")})
    all_damage_df = pd.DataFrame(damage_rows).sort_values(by="数量", ascending=False)
    print(all_damage_df.to_string(index=False))

正在处理 vertical 类别...


                                                                               

分析 Backsjon_20201012 的标注信息...


                                                                                

分析 Lidhem_20200701 的标注信息...


                                                                             

分析 Lidhem_20200709 的标注信息...


                                                                   

分析 Lidhem_20200828 的标注信息...


                                                                   

分析 Lidhem_20201001 的标注信息...


                                                                              

分析 Viken_20180918 的标注信息...


                                                                             

正在处理 oblique 类别...


                                                                             

分析 Backsjon_20201014_oblique 的标注信息...


                                                                                        

分析 Backsjon_20201016_oblique 的标注信息...


                                                                                        

分析 Lidhem_20200527_oblique 的标注信息...


                                                                                      

分析 Lidhem_20200614_oblique 的标注信息...


                                                                                      

分析 Lidhem_20200930_oblique 的标注信息...


                                                                                      

正在处理 damage 类别...


                                                                           

分析 Backsjon_20201012_damage 的标注信息...


                                                                                 

分析 Lidhem_20200527_oblique_damage 的标注信息...


                                                                                 

分析 Lidhem_20200614_oblique_damage 的标注信息...


                                                                                 

分析 Lidhem_20201001_damage 的标注信息...


                                                                                    

分析 Viken_20180918_damage 的标注信息...


                                                                                   


总图片数量: 1478
            调查文件夹       地点  图片数量  标注文件数量      主要尺寸   平均宽度   平均高度  总树木数
Backsjon_20201012 Backsjon   200     200 1536x1536 1536.0 1536.0 16510
  Lidhem_20201001   Lidhem   417     417 1536x1536 1536.0 1536.0 29259
  Lidhem_20200701   Lidhem   131     131 1536x1536 1536.0 1536.0  4478
  Lidhem_20200709   Lidhem   114     114 1536x1536 1536.0 1536.0  2973
  Lidhem_20200828   Lidhem   113     113 1536x1536 1536.0 1536.0  3025
   Viken_20180918    Viken   503     503 1536x1536 1536.0 1536.0 43460

vertical 类别地点汇总:
      地点  图片数量  总树木数
Backsjon   200 16510
  Lidhem   775 39735
   Viken   503 43460

vertical 类别各树种分布:

调查文件夹: Backsjon_20201012
    树种    数量
  Pine 11569
Spruce  3833
 Birch  1091
 Aspen    13
 Other     4

调查文件夹: Lidhem_20200701
    树种   数量
Spruce 2799
  Pine 1386
 Birch  144
 Other  134
 Aspen   15

调查文件夹: Lidhem_20200709
    树种   数量
Spruce 1620
  Pine 1021
 Birch  248
 Other   79
 Aspen    5

调查文件夹: Lidhem_20200828
    树种   数量
Spruce 1664
  Pine 1040
 Birch  230
 