In [1]:
import matplotlib.pyplot as plt
import cv2
import matplotlib.image as image
import json
import numpy as np
import os
import pandas as pd
import plotly.express as px 

In [2]:
with open("data/annotations/instances_train.json","r") as f:
    data = json.load(f)

In [3]:
data.keys()

dict_keys(['licenses', 'info', 'categories', 'images', 'annotations'])

# EDA

In [4]:
for k, v in data.items():
    
    if isinstance(v, list):
        print(f"\nType: {type(v).__name__}")
        print(f"{k}:{len(v)} items")
        if len(v)>0:
            print("first-element:",v[0])
            
    elif isinstance(v, dict):
        print(f"\nType: {type(v).__name__}")
        print(f"{k}: {len(v)}")
        print(f"Dict with {len(v.keys())}keys:{v}")
        
    else:
        print()


Type: list
licenses:1 items
first-element: {'name': 'CarDD', 'id': 0, 'url': 'https://cardd-ustc.github.io/'}

Type: dict
info: 5
Dict with 5keys:{'contributor': 'Xinkuang Wang, Wenjing Li, Zhongcheng Wu', 'date_created': '2022.05.01', 'description': 'CarDD for car damage detection in COCO Format.', 'url': 'https://cardd-ustc.github.io/', 'version': 'v1.0'}

Type: list
categories:6 items
first-element: {'id': 1, 'name': 'dent'}

Type: list
images:2816 items
first-element: {'id': 1, 'width': 1000, 'height': 750, 'file_name': '000001.jpg', 'license': 0}

Type: list
annotations:6211 items
first-element: {'id': 1, 'image_id': 1, 'category_id': 2, 'segmentation': [[233.35, 46.65, 217.25, 58.24, 210.82, 65.97, 204.38, 78.2, 197.3, 93.0, 192.15, 104.59, 187.0, 112.96, 180.56, 123.91, 174.12, 135.49, 169.61, 144.51, 167.04, 154.16, 167.04, 165.11, 175.41, 170.9, 185.71, 171.55, 196.01, 167.68, 205.02, 161.24, 214.03, 154.81, 224.33, 148.37, 233.99, 141.93, 244.29, 134.21, 256.52, 128.41, 267.

## **Conclusion**
The dataset contains five main keys: **licenses**, **info**, **categories**, **images**, and **annotations**.  
For car damage detection, the **important components** are `images`, `annotations`, and `categories`.  
- `annotations` provide bounding boxes (`bbox`) and category IDs — essential for training.  
- `segmentation` and `attributes` inside `annotations` are **not required** unless we do segmentation tasks.  
- `licenses` and `info` are **metadata only**, not used for model training.

Additional dataset characteristics:
- Image resolutions are generally consistent (e.g., around **1000×750**), providing enough detail for detection.  
- With **2,816 images** and **6,211 annotations**, each image contains on average **~2 damage instances**.

In [5]:
def clean_annotations(data):
    
    #path for input annotation-file
    input_path = "data/annotations/instances_train.json"
    #path for cleaned annotation-folder
    cleaned_dir = "data/annotations/cleaned/"
    clean_file_name = "annotations_train_cleaned.json"
    #Create folder if it doesn’t exist
    os.makedirs(cleaned_dir,exist_ok=True)
    #check if we already had the cleaned annotation file
    if os.path.exists(os.path.join(cleaned_dir,clean_file_name)):
        print("The cleaned file already exist.")
        
        with open(os.path.join(cleaned_dir,clean_file_name),"r") as f:
            data= json.load(f)
    
    else:
        with open(input_path,"r") as f:
            data = json.load(f)
        
        #Remove unnecessary meta-data
        keys = ["licenses","info"]
        for key in keys:
            if key in data:
                del data[key]
                print(f"{key} successfully deleted!")
                
        #Remove unnecessary annotation-field
        un_field = ["segmentation","attributes"]
        for u_ in un_field:
            deleted = False
            for anno in data["annotations"]:
                if u_ in anno:
                    anno.pop(u_,None)
                    deleted = True
            if(deleted):
                print(f"{u_} successfully deleted!")
                
        with open(os.path.join(cleaned_dir,clean_file_name),"w") as f:
            json.dump(data,f,indent = 2)
            
        print(f"Cleaned file saved at {os.path.join(cleaned_dir,clean_file_name)}")
                    
    return data

In [6]:
data = clean_annotations(data)

The cleaned file already exist.


In [7]:
with open("annotations_cleaned.json","w") as f:
    json.dump(data,f,indent=4)

In [8]:
df_cat = pd.DataFrame(data["categories"])
df_ann = pd.DataFrame(data["annotations"])
counts = df_ann["category_id"].value_counts().reset_index()
merged_df = pd.merge(df_cat, counts, left_on='id', right_on='category_id', how='inner')

In [9]:
merged_df

Unnamed: 0,id,name,category_id,count
0,1,dent,1,1806
1,2,scratch,2,2560
2,3,crack,3,651
3,4,glass shatter,4,475
4,5,lamp broken,5,494
5,6,tire flat,6,225


In [10]:
fig = px.pie(
    merged_df,
    names="name",
    values="count",
    title="Image Class Distribution (Donut Chart)",
    hole=0.4  # set 0 < hole < 1 to create a donut
)

fig.show()

In [11]:
len(merged_df)

6

## Class Distribution Conclusion

The dataset displays a **clear class imbalance** across the six damage categories.  
The classes **scratch (2560 samples)** and **dent (1806 samples)** are significantly more frequent than the others.  
In contrast, **tire flat (225 samples)**, **glass shatter (475 samples)**, and **lamp broken (494 samples)** have much fewer samples.

This imbalance may cause the model to **bias predictions toward the common classes**, leading to **lower recall and poor mAP** for the minority classes.

### Recommended Measures
We will address this imbalance by applying **class-weighted loss** in the **classification head of Faster R-CNN**.  
This approach increases the contribution of minority classes during training, helping the model learn to recognize them more effectively without altering the dataset itself.

### Evaluation Focus
- Monitor **per-class mAP**, not only overall mAP, to ensure performance is balanced across all damage categories.

In [12]:
df_img = pd.DataFrame(data['images'])
df_img.head(10)

Unnamed: 0,id,width,height,file_name,license
0,1,1000,750,000001.jpg,0
1,2,1000,667,000002.jpg,0
2,3,1000,667,000003.jpg,0
3,4,1000,667,000004.jpg,0
4,5,1000,667,000005.jpg,0
5,6,1000,667,000006.jpg,0
6,7,1000,667,000007.jpg,0
7,8,1000,678,000008.jpg,0
8,9,1000,667,000009.jpg,0
9,10,1000,667,000010.jpg,0


In [32]:
unique_with_all_columns = df_ann.drop_duplicates(subset=['category_id'])

In [35]:
unique_img_per_class = df_ann.groupby('category_id').sample(n=1)

In [36]:
unique_with_all_columns

Unnamed: 0,id,image_id,category_id,area,bbox,iscrowd
4803,4804,2978,1,74358.0,"[261.0, 207.3, 314.28, 322.25]",0
4104,4105,2549,2,22445.0,"[369.4, 481.02, 169.67, 179.72]",0
4745,4746,2942,3,3643.0,"[332.77, 560.62, 67.31, 93.24]",0
282,283,178,4,549347.0,"[0.0, 71.56, 1000.0, 582.44]",0
5303,5304,3391,5,186211.0,"[0.0, 106.74, 667.76, 446.62]",0
1088,1089,728,6,314669.0,"[181.8, 70.35, 646.9, 584.89]",0


In [14]:
def show_presentated_image_eachclass(img_dir,ammount_of_class):
    img_dir = "data/train"
    
    row = math.ceil(class_/3)
    fig, axes = plt.subplots(row,3,figsize=(10,10))
    fig.tight_layout()
    fig.subplots_adjust(hspace=.30)
    

    for i, ax in enumerate(axes.flat):
        if i < len(img):
            ax.imshow(img[i])
        ax.axis('off')

In [15]:
show_persentated_image_eachclass(6)

NameError: name 'show_persentated_image_eachclass' is not defined