In [3]:
from collections import defaultdict
import pandas as pd
from pathlib import Path
import os
import xml.etree.ElementTree as ET
from concurrent.futures import ProcessPoolExecutor

# Root directory of the dataset; count_categories() resolves each target
# directory relative to it.
work_path = Path("/mnt/disks/data1/aihub/Training")
# Work from the dataset root so the relative CSV name below resolves there.
os.chdir(work_path)

# Category names tallied per target; each one is a column of the summary table.
categories = ['선박', '어망부표', '등대', '기타부유물', '부표', '해상풍력']
df = pd.DataFrame(columns=['target_dir'] + categories + ['average_ship_size', 'file_count'])

filename = "category_counts.csv"
# On a first run the summary CSV does not exist yet. Fall back to the empty
# schema frame (no known targets) instead of failing inside read_csv.
existing_df = pd.read_csv(filename) if os.path.isfile(filename) else df.copy()
targets = existing_df['target_dir'].tolist()

def _bbox_area(bbox_text):
    """Return the product of the 3rd and 4th numbers of a bbox string, or None.

    Square brackets are removed and the remainder is split on commas, so the
    text is expected to look like ``"[a, b, c, d]"``. ``None`` is returned when
    the text is missing or empty, has fewer than four fields, or one of the two
    fields used is not numeric.
    """
    if not bbox_text:
        return None
    fields = bbox_text.replace("[", "").replace("]", "").split(",")
    try:
        # NOTE(review): fields 2 and 3 are presumably width and height --
        # confirm against the annotation format.
        return float(fields[2]) * float(fields[3])
    except (IndexError, ValueError):
        return None


def count_categories(target, base_dir=None, category_names=None):
    """Tally annotated objects per category for one dataset directory.

    Every ``*.xml`` file below ``<base_dir>/<target>`` is parsed, and each
    ``object`` element that is a direct child of the document root is counted
    under the text of its ``property/category`` element. For objects whose
    category is ``"선박"`` the bbox area (see ``_bbox_area``) is collected to
    compute an average.

    Args:
        target: Directory name, relative to ``base_dir``.
        base_dir: Root directory containing ``target``. Defaults to the
            module-level ``work_path``.
        category_names: Categories that must always appear in the result (with
            a count of 0 when unseen). Defaults to the module-level
            ``categories``.

    Returns:
        A ``defaultdict(int)`` with one key per category (listed ones first,
        then any other category found in the files), followed by
        ``"average_ship_size"`` (mean bbox area truncated to int, 0 if none),
        ``"file_count"`` (number of ``*.jpg`` files below the directory) and
        ``"target_dir"`` (the ``target`` argument).

    Raises:
        xml.etree.ElementTree.ParseError: If an XML file is not well-formed.

    Objects without a category are skipped. A ``"선박"`` object without a usable
    bbox is still counted but does not contribute to the average.
    """
    dataset_path = Path(work_path if base_dir is None else base_dir) / target
    expected = categories if category_names is None else category_names

    category_counts = defaultdict(int)
    # Seed the expected categories so they are reported even when absent.
    for name in expected:
        category_counts[name] = 0

    ship_areas = []
    for xml_file in dataset_path.glob("**/*.xml"):
        root = ET.parse(xml_file).getroot()
        for obj in root.findall('object'):
            # findtext yields None when the element is missing and "" when it
            # has no text; neither can be attributed to a category.
            category = obj.findtext('property/category')
            if not category:
                continue
            category_counts[category] += 1
            if category == "선박":
                area = _bbox_area(obj.findtext('bbox'))
                if area is not None:
                    ship_areas.append(area)

    category_counts["average_ship_size"] = int(sum(ship_areas) / len(ship_areas)) if ship_areas else 0
    category_counts["file_count"] = sum(1 for _ in dataset_path.glob("**/*.jpg"))
    category_counts['target_dir'] = target
    return category_counts


def mk_dataframe(targets: list, to_csv=True, filename: str = "category_counts.csv", overwrite=False, sort_by='target_dir', ascending=True):
    """Build or extend the per-target category summary table.

    When ``filename`` exists and ``overwrite`` is false, the CSV is loaded and
    only targets missing from its ``target_dir`` column are counted. Otherwise
    the table is rebuilt from scratch for the given targets. Counting is done
    by ``count_categories`` in a process pool.

    Args:
        targets: Target directory names to include. Repeated names are
            processed once.
        to_csv: Write the resulting table to ``filename`` (without the index).
        filename: CSV used both as the existing summary and as the output.
        overwrite: Ignore an existing CSV and recompute every target.
        sort_by: Column (or columns) to sort by; falsy disables sorting.
        ascending: Sort direction passed to ``sort_values``.

    Returns:
        The summary DataFrame. It is also stored in the module-level ``df`` so
        cells that read ``df`` after a call keep working.
    """
    # The global is only written, never read. The starting point is always the
    # CSV or an empty schema frame, so stale rows from an earlier call cannot
    # shadow freshly computed counts.
    global df

    if os.path.isfile(filename) and not overwrite:
        base = pd.read_csv(filename)
        known_targets = set(base["target_dir"])
    else:
        base = pd.DataFrame(columns=['target_dir'] + categories + ['average_ship_size', 'file_count'])
        known_targets = set()

    # dict.fromkeys removes repeated targets while keeping their order.
    requested = list(dict.fromkeys(targets))
    pending = [target for target in requested if target not in known_targets]

    # Only scan directories whose counts are not already available.
    fresh = {}
    if pending:
        with ProcessPoolExecutor() as executor:  # Multi-processing
            fresh = dict(zip(pending, executor.map(count_categories, pending)))

    new_rows = []
    for target in requested:
        if target in known_targets:
            print(f"{target} already exists in the dataframe")
            continue
        new_rows.append(dict(fresh[target]))
        print(f"{target} is added to the dataframe")

    result = base
    if new_rows:
        additions = pd.DataFrame(new_rows)
        # Keep the existing column order first, then any columns only the new rows have.
        column_order = list(base.columns) + [col for col in additions.columns if col not in base.columns]
        # Empty frames are left out of the concat so they cannot affect result dtypes.
        frames = [frame for frame in (base, additions) if not frame.empty]
        result = (
            pd.concat(frames, ignore_index=True)
            .reindex(columns=column_order)
            .drop_duplicates(subset='target_dir', keep='first')
            .reset_index(drop=True)
        )

    # Drops every column that contains at least one missing value, e.g. columns
    # that only some of the rows provide.
    result = result.dropna(axis=1)

    if sort_by:
        result = result.sort_values(by=sort_by, ascending=ascending).reset_index(drop=True)

    if to_csv:
        result.to_csv(filename, index=False)

    df = result
    return df
 

In [32]:
os.chdir('/mnt/disks/data1/aihub/Training')
# Register the new target only if it is not queued yet, so re-running this cell
# does not append the same directory again.
new_target = '남해_여수항_3구역_BOX'
if new_target not in targets:
    targets.append(new_target)
# overwrite=True: the saved CSV is not loaded first; to_csv=True rewrites it
# with the result.
df = mk_dataframe(targets, to_csv=True, filename="category_counts.csv", sort_by="target_dir",overwrite=True)

남해_여수항_1구역_BOX is added to the dataframe
남해_여수항_2구역_BOX is added to the dataframe
남해_여수항_4구역_BOX is added to the dataframe
남해_여수항_5구역_BOX is added to the dataframe
남해_여수항_6구역_BOX is added to the dataframe
남해_여수항_7구역_BOX is added to the dataframe
남해_여수항_8구역_BOX is added to the dataframe
동해_묵호항_1구역_BOX is added to the dataframe
서해_군산항_1구역_BOX is added to the dataframe
서해_군산항_2구역_BOX is added to the dataframe
서해_군산항_3구역_BOX is added to the dataframe
서해_군산항_4구역_BOX is added to the dataframe
서해_대천항_1구역_BOX is added to the dataframe
서해_대천항_2구역_BOX is added to the dataframe
서해_대천항_3구역_BOX is added to the dataframe
서해_장항_1구역_BOX is added to the dataframe
서해_군산항_5구역_BOX is added to the dataframe


In [4]:
# Show the current contents of `df` as this cell's output.
df

Unnamed: 0,target_dir,선박,어망부표,등대,기타부유물,부표,해상풍력,average_ship_size,file_count
