# **DATASET**

**Install packages**

In [None]:
# !pip install roboflow==1.1.49

**Import libraries**

In [1]:
# Import libraries
import glob
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
import roboflow
import seaborn as sns
import yaml

from PIL import Image

**Directory structure**

In [None]:
# The run counts as local when the working directory is the "notebooks" folder
PROJ_ROOT = os.getcwd()
is_local = os.path.basename(PROJ_ROOT) == "notebooks"

# Choose the data folder relative to the working directory
if is_local:
  data_relative_path = "../data/processed"
else:
  # Non-local runs keep the data in a "data" folder next to the notebook
  data_relative_path = "data"
  os.makedirs(data_relative_path, exist_ok=True)

PROCESSED_DATA_DIR = os.path.join(PROJ_ROOT, data_relative_path)

# Report where the processed data lives (ANSI escape codes colour the message)
print(f"\033[34m INFO: Processed data in {PROCESSED_DATA_DIR}! \033[0m")

**Configuration variables**

In [None]:
# Configuration class
class CFG:
  """Notebook-wide settings: detection class names and Roboflow dataset location."""

  # Classes; the position in this list is used as the class id when the
  # label files are counted in the statistics cell
  CLASSES = [
    "hard_hat",
    "no_hard_hat",
    "no_safety_harness",
    "no_safety_vest",
    "person",
    "safety_harness",
    "safety_vest",
  ]

  # Roboflow coordinates of the dataset
  DATASET_WORKSPACE = "deeplearning-cwudo"
  DATASET_PROJECT = "yolo_ppe_detection"
  DATASET_FORMAT = "yolov8"
  DATASET_VERSION = 5

  # Local folder of the downloaded dataset and its yaml descriptor
  DATASET_NAME = f"ppe_dataset_v{DATASET_VERSION}"
  DATASET_PATH = os.path.join(PROCESSED_DATA_DIR, DATASET_NAME)
  DATASET_YAML_PATH = os.path.join(DATASET_PATH, "data.yaml")

# Summarise the dataset settings, one "label: value" line each
print("\nDataset information (Roboflow)")
for label, value in (
  ("Dataset project", CFG.DATASET_PROJECT),
  ("Dataset version", CFG.DATASET_VERSION),
  ("Dataset path", CFG.DATASET_PATH),
):
  print(f"{label}: {value}")

**Import dataset**

In [None]:
# The presence of data.yaml is used as the marker of a finished download
if os.path.isfile(CFG.DATASET_YAML_PATH):
  print("The dataset exists.")
else:
  # Log in to Roboflow before creating the client
  roboflow.login()
  rf = roboflow.Roboflow()

  # Resolve workspace -> project -> version, then download into DATASET_PATH
  workspace = rf.workspace(CFG.DATASET_WORKSPACE)
  project = workspace.project(CFG.DATASET_PROJECT)
  version = project.version(CFG.DATASET_VERSION)
  dataset = version.download(
    model_format=CFG.DATASET_FORMAT,
    location=CFG.DATASET_PATH,
    overwrite=True,
  )

In [None]:
# Modify dataset yaml file
def modify_yaml_file(file_path):
  """Point the train/val/test entries of a dataset YAML file at CFG.DATASET_PATH.

  The file is parsed and the new content is serialized before anything is
  written, so a parsing or serialization problem leaves the file untouched.
  The YAML text that is written is also printed.

  Args:
    file_path: Path to an existing YAML file whose top level is a mapping.

  Raises:
    TypeError: If the top level of the YAML document is not a mapping
      (for example when the file is empty).
  """
  try:
    with open(file_path, "r") as file:
      data = yaml.safe_load(file)

    if not isinstance(data, dict):
      raise TypeError(
        f"Expected a mapping at the top level of {file_path}, got {type(data).__name__}"
      )

    # Absolute locations of the image folders of each split
    data["train"] = os.path.join(CFG.DATASET_PATH, "train/images")
    data["val"] = os.path.join(CFG.DATASET_PATH, "valid/images")
    data["test"] = os.path.join(CFG.DATASET_PATH, "test/images")

    # Serialize before opening the file for writing
    yaml_data = yaml.dump(data)

  except yaml.YAMLError as e:
    # Best effort: report the YAML problem and leave the file as it is
    print("Error reading YAML: ", e)
    return

  with open(file_path, "w") as file:
    file.write(yaml_data)

  print(yaml_data)

# Rewrite the train/val/test paths in the dataset yaml file and print the result
modify_yaml_file(CFG.DATASET_YAML_PATH)

**Dataset image visualizations**

In [None]:
# Display image function
def display_image(image, print_info = True, hide_axis = False):
  """Show an image with matplotlib and optionally print its type and shape.

  Args:
    image: Path to an image file (str or os.PathLike) or a NumPy array.
      Arrays with 3 or 4 channels on the last axis are treated as BGR / BGRA
      (presumably OpenCV output; confirm against callers) and converted to
      RGB / RGBA. Other arrays, such as 2-D grayscale, are shown unchanged.
    print_info: When True, print the type of the PIL image and its array shape.
    hide_axis: When True, hide the plot axes.

  Raises:
    ValueError: If image is neither a path nor a NumPy array.
  """
  if isinstance(image, (str, os.PathLike)): # File path
    img = Image.open(image)
  elif isinstance(image, np.ndarray): # NumPy array
    if image.ndim == 3 and image.shape[-1] == 3:
      image = image[..., ::-1] # BGR to RGB
    elif image.ndim == 3 and image.shape[-1] == 4:
      image = image[..., [2, 1, 0, 3]] # BGRA to RGBA, alpha stays last
    # A 2-D array has no channel axis; reversing its last axis would mirror it
    img = Image.fromarray(image)
  else:
    raise ValueError("Unsupported image format")

  plt.imshow(img)

  if print_info:
    print("Type: ", type(img), "\n")
    print("Shape: ", np.array(img).shape, "\n")

  if hide_axis:
    plt.axis("off")

  # Plot image
  plt.show()

In [None]:
# Select one valid subset: train, test or valid
subset = "train"
images_path = os.path.join(CFG.DATASET_PATH, subset, "images")
images_list = os.listdir(images_path)

# randrange excludes the upper bound, so the index always falls inside the list
# (randint(0, len(...)) could return len(...) and raise IndexError)
image_index = random.randrange(len(images_list))

# Random image path
example_image_path = os.path.join(images_path, images_list[image_index])

# Plot example image
display_image(example_image_path)

**Visualization of many images**

In [None]:
# Display many images
def plot_random_images_from_folder(folder_path, num_images = 20, seed = 0):
  """Plot a random selection of images from a folder in a grid with 5 columns.

  Args:
    folder_path: Folder that contains the image files.
    num_images: Number of images to draw; must be at least 1.
    seed: Seed of the private random generator used for the selection.

  Raises:
    ValueError: If num_images is smaller than 1 or the folder holds fewer
      than num_images files with a .jpg, .png, .jpeg or .gif extension.
  """
  if num_images < 1:
    raise ValueError("num_images must be at least 1")

  # Sort the names: os.listdir order is arbitrary, and the same seed should
  # pick the same files on every run. Extensions are matched case-insensitively.
  image_extensions = (".jpg", ".png", ".jpeg", ".gif")
  image_files = sorted(
    f for f in os.listdir(folder_path) if f.lower().endswith(image_extensions)
  )

  # Make sure that we have at least num_images files to choose from
  if len(image_files) < num_images:
    raise ValueError("Not enough images in the folder")

  # A private generator keeps the global random state untouched
  rng = random.Random(seed)
  selected_files = rng.sample(image_files, num_images)

  # Create a subplot grid; squeeze=False always returns a 2-D array of axes
  num_cols = 5
  num_rows = (num_images + num_cols - 1) // num_cols
  fig, axes = plt.subplots(num_rows, num_cols, figsize=(12,10), squeeze=False)
  flat_axes = axes.ravel()

  for ax, file_name in zip(flat_axes, selected_files):
    # Open and display the image using PIL
    with Image.open(os.path.join(folder_path, file_name)) as img:
      ax.imshow(img)

  # Remove empty subplots
  for ax in flat_axes[num_images:]:
    fig.delaxes(ax)

  plt.tight_layout()
  plt.show()

In [None]:
# Select one valid subset: train, test or valid
subset = "train"
images_path = os.path.join(CFG.DATASET_PATH, subset, "images")
# Show a grid of 20 randomly chosen images from the selected subset
plot_random_images_from_folder(images_path, num_images=20)

**Dataset statistics**

In [None]:
# Map the class id found in the label files (as text) to the class name
class_idx = {str(i): name for i, name in enumerate(CFG.CLASSES)}
class_stat = {}
data_len = {}
class_info = []

for subset in ["train", "valid", "test"]:
  class_count = {name: 0 for name in CFG.CLASSES}

  labels_path = os.path.join(CFG.DATASET_PATH, subset, "labels")
  label_files = os.listdir(labels_path)

  for file in label_files:
    with open(os.path.join(labels_path, file)) as f:
      for line in f:
        fields = line.split()
        if not fields:
          # Blank line: no annotation to count
          continue
        # The class id is the whole first field, not just its first
        # character, so ids with more than one digit are counted correctly
        class_count[class_idx[fields[0]]] += 1

  data_len[subset] = len(label_files)
  class_stat[subset] = class_count

  class_info.append({"Subset": subset, **class_count, "Data_Volume": data_len[subset]})

# Convert class info list to pandas dataframe
dataset_stats_df = pd.DataFrame(class_info)
dataset_stats_df

In [None]:
# One panel per subset, laid out side by side
fig, axes = plt.subplots(1, 3, figsize=(15,5))

for i, mode in enumerate(["train", "valid", "test"]):
  ax = axes[i]

  # Row of the current subset without its name column
  # NOTE(review): the "Data_Volume" column is still part of this frame, so it
  # is drawn as one more bar next to the classes - confirm this is intended
  is_current_subset = dataset_stats_df["Subset"] == mode
  subset_counts = dataset_stats_df[is_current_subset].drop(columns="Subset")

  sns.barplot(data=subset_counts, orient="v", ax=ax, palette="Set2")

  ax.set_title(f"{mode.capitalize()} Class Statistics")
  ax.set_xlabel("Classes")
  ax.set_ylabel("Count")
  ax.tick_params(axis="x", rotation=90)

  # Write the value of each bar just above it
  for bar in ax.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
      f"{int(bar_top)}",
      (bar_center, bar_top),
      ha="center",
      va="center",
      fontsize=8,
      color="black",
      xytext=(0,5),
      textcoords="offset points",
    )

plt.tight_layout()
plt.show()

**Image size**

In [None]:
# List every distinct image size found in each subset
for mode in ["train", "valid", "test"]:
  print(f"\nImage size in {mode} set:")

  # Sizes already reported for this subset, so each one is printed once
  seen_sizes = set()

  # Sorted so the order of the printed sizes is the same on every run
  for file in sorted(glob.glob(os.path.join(CFG.DATASET_PATH, mode, "images", "*"))):

    # Close each image as soon as its size has been read
    with Image.open(file) as image:
      img_size = image.size

    if img_size not in seen_sizes:
      print(f"{img_size}")
      seen_sizes.add(img_size)