In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np



In [None]:


def load_coco_data(json_file):
    """Read a COCO-style annotation file and return the parsed JSON object.

    Args:
        json_file: Path to the JSON annotation file.

    Returns:
        Whatever ``json.load`` produces for the file (the analysis functions
        in this notebook index it like a dict with 'images', 'annotations'
        and 'categories' keys).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, so non-ASCII category names load the same way everywhere.
    with open(json_file, 'r', encoding='utf-8') as file:
        return json.load(file)

def analyze_image_resolutions(data):
    """Scatter-plot how many images share each (width, height) resolution.

    Args:
        data: Parsed annotation dict; every entry of ``data['images']`` must
            provide 'id', 'width' and 'height'.

    Each distinct resolution becomes one marker whose area grows with the
    number of images that have it. The figure is shown with ``plt.show()``;
    nothing is returned.
    """
    # Index by image id first, so an id that appears more than once is only
    # counted once (the last entry for that id wins).
    size_by_image_id = {}
    for entry in data['images']:
        resolution = (entry['width'], entry['height'])
        size_by_image_id[entry['id']] = resolution

    # Tally how many images share each resolution.
    tally = {}
    for resolution in size_by_image_id.values():
        tally[resolution] = tally.get(resolution, 0) + 1

    image_total = sum(tally.values())

    fig, ax = plt.subplots(figsize=(12, 9))
    for (img_width, img_height), occurrences in tally.items():
        # One scatter call per resolution, marker area proportional to count.
        ax.scatter(img_width, img_height, s=occurrences*5)

    ax.set_xlabel('Width', fontsize=16)
    ax.set_ylabel('Height', fontsize=16)
    ax.set_title('Real Set Image Resolution Analysis', fontsize=16, fontweight='bold')
    plt.figtext(0.99, 0.01, f'Total images: {image_total}', horizontalalignment='right', fontsize=12, fontweight='bold')

    # Tick label font size.
    ax.tick_params(axis='both', which='major', labelsize=14)

    # Hide the top and right border lines.
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    plt.show()

def analyze_object_sizes(data, max_samples=1000):
    """Scatter-plot the bounding-box sizes of the first ``max_samples`` annotations.

    Args:
        data: Parsed annotation dict; every entry of ``data['annotations']``
            must provide a 'bbox' whose elements 2 and 3 are the box width
            and height.
        max_samples: Upper bound on the number of annotations inspected.
            Annotations are taken in file order (no random sampling).

    Each distinct (width, height) pair becomes one marker whose area grows
    with the number of sampled boxes of that size. The figure is shown with
    ``plt.show()``; nothing is returned.
    """
    object_dimensions = {}
    sample_count = 0
    for annotation in data['annotations']:
        if sample_count >= max_samples:
            break
        bbox = annotation['bbox']
        dimensions = (bbox[2], bbox[3])
        object_dimensions[dimensions] = object_dimensions.get(dimensions, 0) + 1
        sample_count += 1

    fig, ax = plt.subplots(figsize=(12, 9))
    for (box_width, box_height), count in object_dimensions.items():
        ax.scatter(box_width, box_height, s=count*3)

    ax.set_xlabel('Object Width', fontsize=16)
    ax.set_ylabel('Object Height', fontsize=16)
    ax.set_title('Sampled Object Size Distribution in Real Annotations', fontsize=16, fontweight='bold')
    # Report the number of annotations actually sampled; it is smaller than
    # max_samples whenever the dataset holds fewer annotations than the cap.
    plt.figtext(0.99, 0.01, f'Sampled objects: {sample_count}', horizontalalignment='right', fontsize=12, fontweight='bold')

    # Tick label font size.
    ax.tick_params(axis='both', which='major', labelsize=14)

    # Hide the top and right border lines.
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    plt.show()

def analyze_category_distribution(data, title='Distribution of Test Object Categories'):
    """Draw a donut chart of how many annotations belong to each category.

    Args:
        data: Parsed annotation dict. Every entry of ``data['annotations']``
            must provide 'category_id'; every entry of ``data['categories']``
            must provide 'id' and 'name'.
        title: Chart title. The default keeps the original hard-coded text;
            pass a different string when plotting another split.

    Category ids that are not listed in ``data['categories']`` are labelled
    'Unknown' instead of raising ``KeyError``. The figure is shown with
    ``plt.show()``; nothing is returned.
    """
    # Count occurrences of each category id (first-seen order is kept).
    category_counts = {}
    for annotation in data['annotations']:
        category_id = annotation['category_id']
        category_counts[category_id] = category_counts.get(category_id, 0) + 1

    # Map category ids to their names.
    category_names = {category['id']: category['name'] for category in data['categories']}

    # Prepare data for the pie chart.
    labels = [category_names.get(category_id, 'Unknown') for category_id in category_counts]
    sizes = list(category_counts.values())
    total = sum(sizes)

    def format_slice(pct):
        # pct is a floating-point percentage, so pct * total / 100 can land a
        # hair below the true count (e.g. 0.99999996); round instead of
        # truncating so the displayed count is not off by one.
        return f'{pct:.1f}%\n({round(pct * total / 100)})'

    fig, ax = plt.subplots(figsize=(10, 12))
    ax.pie(sizes, labels=labels, autopct=format_slice,
           startangle=90, pctdistance=0.85, textprops={'fontsize': 16})

    # A white circle over the centre turns the pie into a donut.
    centre_circle = plt.Circle((0, 0), 0.70, fc='white')
    ax.add_artist(centre_circle)

    # Equal aspect ratio ensures that the pie is drawn as a circle.
    ax.axis('equal')
    ax.set_title(title, fontsize=16, fontweight='bold')

    plt.show()

In [None]:
# Load the annotation data.
# NOTE(review): this is an absolute, machine-specific path — adjust it for your environment.
data = load_coco_data('/opt/data/private/fcf/mmdetection/data/HazyDet-365k/Real_Haze/train/train_coco.json')

# Run the analyses; each call below draws and shows one figure.
# NOTE(review): the pie chart's built-in title says 'Test' although the file
# loaded above is the Real_Haze train split.
analyze_image_resolutions(data)
analyze_object_sizes(data, max_samples=1000)  # Analyze only 1000 samples for performance reasons
analyze_category_distribution(data)  # Donut chart of annotation counts per category

In [None]:
### Compute the bounding-box area of the objects in each category and show the distributions as histograms
import json
import os
import matplotlib.pyplot as plt
from collections import defaultdict
import numpy as np

# Color list so that different categories can be drawn in different colors
# (the histogram loop cycles through it by category index).
colors = ['b', 'r', 'g', 'c', 'm', 'y', 'k']

def plot_histograms_for_categories(json_file_path, num_bins=30, output_dir='output'):
    """Plot and save one log-area histogram per object category.

    Args:
        json_file_path: Path to a COCO-style JSON annotation file.
        num_bins: Number of histogram bins.
        output_dir: Directory that receives one
            '<category>_size_distribution.png' per category; created if it
            does not exist.

    The area of an object is bbox[2] * bbox[3] (width * height). Objects with
    a non-positive area are skipped because their logarithm is undefined. All
    histograms share the same x-range (global min/max of log-area) so they
    can be compared directly. Annotations whose category id is not listed in
    the file are grouped under 'Unknown'.

    If no annotation has a positive area, a message is printed and nothing
    is plotted. Nothing is returned.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Decode explicitly as UTF-8 rather than with the locale's default.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        coco_data = json.load(f)

    # Category id -> category name.
    categories = {category['id']: category['name'] for category in coco_data.get('categories', [])}

    # Category name -> list of bounding-box areas.
    category_sizes = defaultdict(list)
    for annotation in coco_data.get('annotations', []):
        category_name = categories.get(annotation['category_id'], 'Unknown')
        bbox = annotation['bbox']
        category_sizes[category_name].append(bbox[2] * bbox[3])  # width * height

    # The shared histogram range needs at least one positive area; without
    # one there is nothing meaningful to draw (and min/max would fail).
    positive_areas = [area for sizes in category_sizes.values() for area in sizes if area > 0]
    if not positive_areas:
        print('No annotations with a positive bounding-box area; nothing to plot.')
        return

    log_all_areas = np.log(np.asarray(positive_areas, dtype=float))
    min_log_area = log_all_areas.min()
    max_log_area = log_all_areas.max()

    # One figure per category, all on the same log-area axis.
    for idx, (category, sizes) in enumerate(category_sizes.items()):
        log_sizes = np.log(np.asarray([size for size in sizes if size > 0], dtype=float))

        fig, ax = plt.subplots()
        ax.hist(log_sizes, bins=num_bins, range=(min_log_area, max_log_area),
                edgecolor='black', color=colors[idx % len(colors)])
        ax.set_title(f'Distribution of {category} sizes (Log scale)')
        ax.set_xlabel('Log(Area)')
        ax.set_ylabel('Count')
        ax.grid(True)
        output_path = os.path.join(output_dir, f'{category}_size_distribution.png')
        fig.savefig(output_path)
        plt.show()
        # Release the figure so looping over many categories does not pile
        # up open figures in memory.
        plt.close(fig)

# Example annotation file path.
# NOTE(review): absolute, machine-specific path — adjust it for your environment.
json_file_path = '/opt/data/private/fcf/mmdetection/data/HazyDet-365k/test/test_coco.json'

# Call the function to draw and save the histograms (written to the default 'output' directory).
plot_histograms_for_categories(json_file_path)

In [None]:
## Count the number of objects in each category

import json
from collections import defaultdict

def count_categories_in_coco(json_file_path):
    """Count how many annotations each category has in a COCO-style file.

    Args:
        json_file_path: Path to the JSON annotation file.

    Returns:
        A ``defaultdict(int)`` mapping category name to annotation count, in
        order of first appearance. Annotations whose category id is not
        listed under 'categories' are counted as 'Unknown'.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, so non-ASCII category names load the same way everywhere.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        coco_data = json.load(f)

    # Category id -> category name.
    categories = {category['id']: category['name'] for category in coco_data.get('categories', [])}

    # Tally annotations per category name.
    category_count = defaultdict(int)
    for annotation in coco_data.get('annotations', []):
        category_name = categories.get(annotation['category_id'], 'Unknown')
        category_count[category_name] += 1

    return category_count

# Example annotation file path (this rebinds json_file_path from the previous cell).
# NOTE(review): absolute, machine-specific path — adjust it for your environment.
json_file_path = '/opt/data/private/fcf/mmdetection/data/HazyDet-365k/Real_Haze/train/train_coco.json'

# Call the function and print one line per category.
category_count = count_categories_in_coco(json_file_path)
for category, count in category_count.items():
    print(f"Category: {category}, Count: {count}")