# Data Visualisation


The following code visualises the data and shows the distribution of the classes. It was written before the dataset was converted to COCO format, so that we could understand what the raw data actually contains prior to any preprocessing or cleaning.

## Import all the necessary libraries

In [None]:
from __future__ import annotations

#  Standard library 
import importlib
import subprocess
import sys

#  Helper function to ensure packages 
def ensure_package(pkg: str, import_name: str | None = None, pip_name: str | None = None):
    """Import a module, installing it with pip first if the import fails.

    Args:
        pkg: Default name, used both as the module to import and as the pip
            distribution when the more specific arguments are omitted.
        import_name: Dotted module name to import (e.g. ``"PIL"``). Falls
            back to ``pkg``.
        pip_name: Distribution name passed to ``pip install``
            (e.g. ``"pillow"``). Falls back to ``pkg``.

    Returns:
        The imported module object.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits non-zero.
        ImportError: If the module still cannot be imported after installing.
    """
    module_name = import_name or pkg
    try:
        return importlib.import_module(module_name)
    except ImportError:
        pip_target = pip_name or pkg
        print(f"Installing missing package: {pip_target}")
        # Use the running interpreter's pip so the package lands in the same
        # environment the notebook kernel imports from.
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", pip_target],
            stdout=subprocess.DEVNULL,
        )
        # The import system caches directory listings; without invalidating
        # them a package installed a moment ago may not be found on retry.
        importlib.invalidate_caches()
        return importlib.import_module(module_name)

#  Ensure third-party packages 
# Each call binds the imported module object to the usual alias; when the
# import fails, ensure_package pip-installs the distribution and retries.
# Arguments are (fallback name, importable module name, pip distribution name).
pd = ensure_package("pandas")
IPython_display = ensure_package("IPython.display", "IPython.display", "ipython")
plt = ensure_package("matplotlib.pyplot", "matplotlib.pyplot", "matplotlib")
patches = ensure_package("matplotlib.patches", "matplotlib.patches", "matplotlib")
# The distribution is published as "pillow" but imported as "PIL".
PIL = ensure_package("Pillow", "PIL", "pillow")
np = ensure_package("numpy")

import os
import json

#  Explicit imports for clarity / IDE support 
from IPython.display import display
from PIL import Image, ImageOps

#  Custom/local module 
try:
    from stefania_livori_utils import unzip_folder
except ImportError as err:
    # The helper module ships alongside the notebook rather than via pip, so
    # it cannot be auto-installed; fail with an actionable message and keep
    # the original error attached as the cause.
    raise ImportError(
        "❌ 'stefania_livori_utils' not found.\n"
        "Make sure 'stefania_livori_utils.py' is in the working directory "
        "or on PYTHONPATH."
    ) from err

print("Environment ready: all dependencies installed and imported.")


## Data Distribution

The code below defines a helper function that loads every annotation in the dataset at once, together with all of its properties. The resulting table is used to inspect how the different attributes are distributed across the sign types available in the dataset, and it also supports the image visualisations further down. This helper is therefore used throughout the whole notebook.

In [None]:
# Column order of the DataFrame returned by load_labelstudio_annotations.
_ANNOTATION_COLUMNS = [
    "sign_type",
    "view_angle",
    "mounting_type",
    "sign_condition",
    "sign_shape",
    "image_path",
    "bbox",
]


def _first_choice(region, field, default="Unknown"):
    """Return the first selected choice stored under *field*, or *default*."""
    # `or` also covers a present-but-empty "choices" list, which would
    # otherwise raise IndexError on the [0] lookup.
    choices = (region.get(field) or {}).get("choices") or [default]
    return choices[0]


def load_labelstudio_annotations(json_path):
    """Flatten a Label Studio JSON export into one DataFrame row per sign.

    The file is expected to hold a list of tasks. For every annotation of a
    task, result entries sharing the same ``id`` are merged into one region,
    keyed by their ``from_name``. Only regions that carry a ``sign_type``
    rectangle label produce a row.

    Args:
        json_path: Path to the UTF-8 encoded JSON file.

    Returns:
        pandas.DataFrame with the columns ``sign_type``, ``view_angle``,
        ``mounting_type``, ``sign_condition``, ``sign_shape``, ``image_path``
        (base name of the task image) and ``bbox`` (``[x, y, width, height]``
        as stored on the rectangle; any missing value is ``None``).
        Attributes without a selected choice are reported as ``"Unknown"``.
        The columns are present even when no sign was found, so downstream
        ``groupby`` calls keep working on an empty export.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        tasks = json.load(f)

    records = []
    for task in tasks:
        # Keep only the file name so local paths and upload URLs match the
        # extracted image folder alike.
        image_url = (task.get("data") or {}).get("image") or ""
        image_filename = os.path.basename(image_url)

        for ann in task.get("annotations") or []:
            regions = {}
            for result in ann.get("result") or []:
                # Entries without a control name or value (e.g. relation
                # links between regions) describe no attribute; skip them
                # instead of failing with KeyError.
                if "from_name" not in result or "value" not in result:
                    continue
                region = regions.setdefault(
                    result.get("id"), {"image_path": image_filename}
                )
                region[result["from_name"]] = result["value"]

            for region in regions.values():
                if "sign_type" not in region:
                    continue
                box = region["sign_type"]
                records.append({
                    "sign_type": box["rectanglelabels"][0],
                    "view_angle": _first_choice(region, "view_angle"),
                    "mounting_type": _first_choice(region, "mounting"),
                    "sign_condition": _first_choice(region, "condition"),
                    "sign_shape": _first_choice(region, "sign_shape"),
                    "image_path": region["image_path"],
                    "bbox": [box.get(key) for key in ("x", "y", "width", "height")],
                })

    return pd.DataFrame(records, columns=_ANNOTATION_COLUMNS)


### Load Annotations

In [None]:
# Parse the merged annotation export into one DataFrame row per labelled sign.
df = load_labelstudio_annotations("merged_input.json")
# Last expression of the cell: shows how many sign annotations were loaded.
len(df)

### Sign Type vs Viewing  Angle

In [None]:
# Count annotations per (sign type, viewing angle) pair, then pivot the
# viewing angles into columns; combinations that never occur are shown as 0.
summary_angle = (
    df.groupby(["sign_type", "view_angle"])
    .size()
    .unstack(fill_value=0)
)
display(summary_angle)

# Dataset-wide totals per viewing angle, irrespective of sign type.
print("Overall:", df["view_angle"].value_counts(), sep="\n")

Note that some sign types do not have equal numbers of front, back and side views. This is because some images contained additional signs in the background, which we decided to label as well, so that the resulting model is more reliable and can serve as a good prototype.

### Sign Type vs Mounting Type

In [None]:

# Count annotations per (sign type, mounting type) pair, then pivot the
# mounting types into columns; combinations that never occur are shown as 0.
summary_mounting = (
    df.groupby(["sign_type", "mounting_type"])
    .size()
    .unstack(fill_value=0)
)
display(summary_mounting)

# Dataset-wide totals per mounting type, irrespective of sign type.
print("Overall:", df["mounting_type"].value_counts(), sep="\n")

### Sign Type vs Sign Condition

In [None]:

# Count annotations per (sign type, sign condition) pair, then pivot the
# conditions into columns; combinations that never occur are shown as 0.
summary_condition = (
    df.groupby(["sign_type", "sign_condition"])
    .size()
    .unstack(fill_value=0)
)
display(summary_condition)

# Dataset-wide totals per sign condition, irrespective of sign type.
print("Overall:", df["sign_condition"].value_counts(), sep="\n")

### Sign Type vs Sign Shape Type

In [None]:
# Count annotations per (sign type, sign shape) pair, then pivot the shapes
# into columns; combinations that never occur are shown as 0.
summary_shape = (
    df.groupby(["sign_type", "sign_shape"])
    .size()
    .unstack(fill_value=0)
)
display(summary_shape)

# Dataset-wide totals per sign shape, irrespective of sign type.
print("Overall:", df["sign_shape"].value_counts(), sep="\n")

## Distribution of sign types and viewing angles

In [None]:
# Stacked bar chart: one bar per sign type, split into one segment per
# viewing angle.
counts = df.groupby(["sign_type", "view_angle"]).size().unstack(fill_value=0)

counts.plot(kind="bar", stacked=True, figsize=(12, 6))

# Axis labelling and legend for the figure pandas just created.
plt.xlabel("Traffic Sign Type")
plt.ylabel("Number of Instances")
plt.title("Viewing Angle Distribution per Traffic Sign Type")
plt.legend(title="Viewing Angle")

# Slant the sign-type names so long labels do not overlap.
plt.xticks(rotation=45, ha="right")

plt.tight_layout()
plt.show()


## Visualisation of Annotated Images

### Extract image file

In [None]:
# Archive holding the dataset images, and the folder it is extracted into.
# extract_dir matches the default image folder of the visualisation functions
# defined later in this notebook.
zip_path = "merged_images.zip"
extract_dir = "merged_images"

# Extract all files using the project helper from stefania_livori_utils
# (presumably unpacks zip_path into extract_dir — verify against the helper).
unzip_folder(zip_path, extract_dir)

The code block below provides a function to visualise the sign views. For each sign type, one randomly sampled example is shown for each available view: front, back and side.

In [None]:


# Numeric class id -> sign-type label. The label text is what gets compared
# against the `sign_type` column, so it must match the annotation labels
# exactly. (The ids presumably mirror the category ids of the later COCO
# export — TODO confirm.)
CLASS_NAMES = {
    1: "Stop",
    2: "No Entry (One Way)",
    3: "Pedestrian Crossing",
    4: "Roundabout Ahead",
    5: "No Through Road (T-Sign)",
    6: "Blind-Spot Mirror (Convex)"
}

def visualize_sign_views(df, image_dir="merged_images"):
    """Plot one randomly sampled example per viewing angle for each sign type.

    For every label in ``CLASS_NAMES`` a single-row figure is shown with up to
    three panels (front, back, side; only the views present in ``df``). Each
    panel displays the image with the annotated bounding box outlined in red.
    Sign types without any matching row are skipped.

    Args:
        df: DataFrame with ``sign_type``, ``view_angle``, ``image_path`` and
            ``bbox`` columns, where ``bbox`` is ``[x, y, width, height]``
            expressed as percentages of the image size.
        image_dir: Folder containing the image files; only the base name of
            ``image_path`` is looked up inside it.
    """
    views_needed = ('front', 'back', 'side')

    for sign_name in CLASS_NAMES.values():
        # Rows belonging to this sign type.
        sign_df = df[df['sign_type'] == sign_name]

        # Collect one random sample per available view, matching the view
        # label case-insensitively.
        samples = []
        for view in views_needed:
            view_df = sign_df[sign_df['view_angle'].str.lower() == view]
            if not view_df.empty:
                samples.append(view_df.sample(1).iloc[0])

        if not samples:
            continue  # No annotated instance of this sign type

        # squeeze=False always yields a 2-D axes array, so a single panel
        # needs no special casing.
        fig, axes_grid = plt.subplots(
            1, len(samples), figsize=(20, 10), squeeze=False
        )
        axes = axes_grid[0]

        for ax, row in zip(axes, samples):
            img_path = os.path.join(image_dir, os.path.basename(row['image_path']))

            if os.path.exists(img_path):
                # Context manager closes the file handle once the pixels have
                # been copied into the array. exif_transpose applies the
                # camera orientation tag so the box lines up with the image.
                with Image.open(img_path) as img_raw:
                    img = np.array(ImageOps.exif_transpose(img_raw))
                ax.imshow(img)

                # Draw the box only when all four values are present.
                bbox = row['bbox']
                if bbox is not None and None not in bbox:
                    h, w = img.shape[:2]
                    x, y, bw, bh = bbox

                    # Convert percentage coordinates to pixels.
                    rect = patches.Rectangle(
                        (x * w / 100, y * h / 100),
                        bw * w / 100,
                        bh * h / 100,
                        linewidth=3,
                        edgecolor='r',
                        facecolor='none',
                    )
                    ax.add_patch(rect)

                ax.set_title(f"{row['sign_type']} ({row['view_angle']})")
            else:
                ax.text(0.5, 0.5, f"Image not found:\n{row['image_path']}", ha='center')

            ax.axis('off')

        plt.tight_layout()
        plt.show()


# print(df)  # uncomment to inspect the raw annotation table before plotting
# Show one sampled front/back/side example for every sign type in CLASS_NAMES.
visualize_sign_views(df)

### Visualise all the available images

The code below helps us check that all the images were annotated correctly in Label Studio. Each plotted image shows exactly one annotation; therefore, if an image contains more than one sign, it appears more than once in the plot.

In [None]:
def visualize_all_signs_grouped_with_bbox(df, img_dir="merged_images", cols=5):
    """Plot every annotated sign, one figure per sign type, with its box.

    Each row of ``df`` gets its own panel showing the image with the annotated
    bounding box outlined in red and the viewing angle as the panel title.
    Images that cannot be found are replaced by a text placeholder, and their
    paths are printed once all figures have been shown.

    Args:
        df: DataFrame with ``sign_type``, ``image_path``, ``bbox`` and
            (optionally) ``view_angle`` columns, where ``bbox`` is
            ``[x, y, width, height]`` expressed as percentages of the image
            size.
        img_dir: Folder containing the image files; only the base name of
            ``image_path`` is looked up inside it.
        cols: Number of panels per grid row.

    Raises:
        ValueError: If ``cols`` is smaller than 1.
    """
    if cols < 1:
        raise ValueError(f"cols must be a positive integer, got {cols!r}")

    img_path_not_found = []
    for sign_type, group in df.groupby('sign_type'):
        num_images = len(group)
        rows = -(-num_images // cols)  # ceiling division without floats

        # squeeze=False always returns a 2-D array, so flattening is uniform
        # regardless of the grid shape.
        fig, axes_grid = plt.subplots(
            rows,
            cols,
            figsize=(cols * 4, rows * 4),
            squeeze=False,
        )
        axes = axes_grid.ravel()

        # Figure-level title
        fig.suptitle(sign_type, fontsize=18)

        for ax, (_, row) in zip(axes, group.iterrows()):
            img_path = os.path.join(img_dir, os.path.basename(row['image_path']))

            if os.path.exists(img_path):
                # Close each file as soon as its pixels are copied; this
                # function opens every image of the dataset, so leaked
                # handles would accumulate quickly.
                with Image.open(img_path) as img:
                    img_np = np.array(ImageOps.exif_transpose(img))
                ax.imshow(img_np)

                # Draw the bounding box only when all four values are present.
                bbox = row.get('bbox', None)
                if bbox is not None and None not in bbox:
                    h, w = img_np.shape[:2]
                    x, y, bw, bh = bbox

                    # Convert percentage coordinates to pixels.
                    rect = patches.Rectangle(
                        (x * w / 100, y * h / 100),
                        bw * w / 100,
                        bh * h / 100,
                        linewidth=2,
                        edgecolor='r',
                        facecolor='none',
                    )
                    ax.add_patch(rect)
            else:
                ax.text(
                    0.5, 0.5,
                    f"Image {img_path} not found",
                    ha='center',
                    va='center'
                )
                img_path_not_found.append(img_path)

            ax.set_title(row.get('view_angle', ''), fontsize=10)
            ax.axis('off')

        # Hide the unused panels of the last grid row.
        for ax in axes[num_images:]:
            ax.axis('off')

        # Leave room at the top for the figure title.
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.show()
    print("Images not found:", img_path_not_found)

# Render every annotation (default grid of 5 columns, images read from
# "merged_images") and finish by printing the image paths that were missing.
visualize_all_signs_grouped_with_bbox(df)
