## Data preparation of RAVV dataset to fine tune the Meta Segment-Anything model

We convert the annotations found in the .json file (either bounding boxes or polygons) to arrays (either zipped numpy arrays .npz or .jpg binary masks). 

First convert labeled ground-truth segmentations (polygons) into pixel maps and store them as .jpg files

In [1]:
import json
import os
import numpy as np
from PIL import Image, ImageDraw

# Specify the directory containing the images
image_directory = '/Users/capelo/Desktop/constructor/final_project/fine_tune_sam/UAVVaste/branch_UAVVaste/UAVVaste/images/'

# Specify the directory for storing the masks
#mask_directory = '/Users/capelo/Desktop/constructor/final_project/fine_tune_sam/UAVVaste/branch_UAVVaste/UAVVaste/masks/pixel_masks'
mask_directory = '/Volumes/Samsung_USB/pixel_masks_rgb2'
# Specify the directory containing the JSON file
json_directory = '/Users/capelo/Desktop/constructor/final_project/fine_tune_sam/UAVVaste/UAVVaste/annotations'
json_filename = 'annotations.json'

# Create the mask directory if it doesn't exist
os.makedirs(mask_directory, exist_ok=True)

# Load the JSON file
with open(os.path.join(json_directory, json_filename)) as json_file:
    data = json.load(json_file)

# Iterate over each image in the directory
for image_info in data.get("images", []):
    image_filename = image_info["file_name"]
    image_width = image_info["width"]
    image_height = image_info["height"]

    # Create a blank binary mask with the same dimensions as the image
    #binary_mask = Image.new('L', (image_width, image_height), 0)  # 'L' for 8-bit pixels, black and white
    binary_mask = Image.new('RGB', (image_width, image_height), (0, 0, 0))  # 'RGB' for 3-channel color image

    # Find the corresponding annotations for the image
    image_id = image_info["id"]
    annotations = [ann for ann in data.get("annotations", []) if ann["image_id"] == image_id]


    # Draw polygons on the mask image
    for annotation in annotations:
        segmentation = annotation.get("segmentation")
        if segmentation is not None:
            for segment in segmentation:
                # Flatten the segment coordinates into a 1D list
                flattened_segment = [int(coord) for coord in segment]
                # Reshape the flattened segment into pairs of (x, y) coordinates
                #coordinates = [(flattened_segment[i], flattened_segment[i + 1]) for i in range(0, len(flattened_segment), 2)]
                coordinates = [(flattened_segment[i + 1], flattened_segment[i]) for i in range(0, len(flattened_segment), 2)]
                draw = ImageDraw.Draw(binary_mask)
                #draw.polygon(coordinates, outline=255, fill=255)
                draw.polygon(coordinates, outline=(255, 255, 255), fill=(255, 255, 255))


    # Invert the binary mask (invert black and white regions)
    inverted_mask = Image.eval(binary_mask, lambda x: 255 - x)

    # Save the binary mask as JPEG in the mask directory
    mask_filename = os.path.splitext(image_filename)[0] + '.jpg'
    mask_path = os.path.join(mask_directory, mask_filename)
    inverted_mask.save(mask_path)
    
   
# Save the binary mask as JPEG in the mask directory
#image_filename_lower = image_filename.lower()
if image_filename.endswith('.JPG') or image_filename.endswith('.JPEG') or image_filename.endswith('.PNG'):
    mask_filename = os.path.splitext(image_filename)[0] + '.JPG'
else:
    mask_filename = os.path.splitext(image_filename)[0] + '.jpg'

mask_path = os.path.join(mask_directory, mask_filename)
inverted_mask.save(mask_path)




Same thing for bounding boxes

In [None]:
import json
import os
import numpy as np
from PIL import Image, ImageDraw

# Specify the directory containing the images
image_directory = '/Users/capelo/Desktop/constructor/final_project/fine_tune_sam/UAVVaste/UAVVaste/images'

# Specify the directory for storing the masks
mask_directory = '/Users/capelo/Desktop/constructor/final_project/fine_tune_sam/UAVVaste/UAVVaste/bb_masks'

# Specify the directory containing the JSON file
json_directory = '/Users/capelo/Desktop/constructor/final_project/fine_tune_sam/UAVVaste/UAVVaste/annotations'
json_filename = 'annotations.json'

# Create the mask directory if it doesn't exist
#os.makedirs(mask_directory, exist_ok=True)

# Load the JSON file
with open(os.path.join(json_directory, json_filename)) as json_file:
    data = json.load(json_file)

# Iterate over each image in the directory
for image_info in data.get("images", []):
    image_filename = image_info["file_name"]
    image_width = image_info["width"]
    image_height = image_info["height"]

    # Create a blank binary mask with the same dimensions as the image
    binary_mask = Image.new('L', (image_width, image_height), 0)  # 'L' for 8-bit pixels, black and white

    # Find the corresponding annotations for the image
    image_id = image_info["id"]
    annotations = [ann for ann in data.get("annotations", []) if ann["image_id"] == image_id]

    # Draw bounding boxes on the mask image
    for annotation in annotations:
        bbox = annotation.get("bbox")
        if bbox is not None and len(bbox) == 4:
            x, y, w, h = bbox
            x1, y1, x2, y2 = int(x), int(y), int(x + w), int(y + h)
            draw = ImageDraw.Draw(binary_mask)
            draw.rectangle([(x1, y1), (x2, y2)], fill=255)

    # Invert the binary mask (invert black and white regions)
    inverted_mask = Image.eval(binary_mask, lambda x: 255 - x)

    # Save the inverted binary mask as JPEG in the mask directory
    mask_filename = os.path.splitext(image_filename)[0] + '_mask_bw.jpg'
    mask_path = os.path.join(mask_directory, mask_filename)
    inverted_mask.save(mask_path)

The .jpg images can be used to inspect the masks and compare them to the locations in the images. However, we caution against using all images and masks in the format provided for two reasons:

First: there are a few instances, especially in the files 'GOPRO..'' where the saved image is either inverted or rotated with respect to the coordinates of the mask. 

Second: this method of reading in the bounding boxes into the program to fine tune sam is only valid for images where the bounding boxes do not overlap -- in other words where there are relatively few and well-separated annotated objects. 

We provide a log file where we have indicated whether the image is rotated or mirrored (First issue) or crowded (second issue). 

To avoid overlapping bounding boxes, we provide the masks as zipped numpy arrays. 

In [1]:
import json
import os
import numpy as np
from PIL import Image, ImageDraw

# Specify the directory containing the images
image_directory = '/Users/capelo/Desktop/constructor/final_project/fine_tune_sam/UAVVaste/branch_UAVVaste/UAVVaste/images'

# Specify the directory for storing the masks
mask_directory = '/Users/capelo/Desktop/constructor/final_project/fine_tune_sam/UAVVaste/branch_UAVVaste/UAVVaste/masks/pixel_zipped'

# Specify the directory containing the JSON file
json_directory = '/Users/capelo/Desktop/constructor/final_project/fine_tune_sam/UAVVaste/UAVVaste/annotations'
json_filename = 'annotations.json'

# Load the JSON file
with open(os.path.join(json_directory, json_filename)) as json_file:
    data = json.load(json_file)

# Iterate over each image in the directory
for image_info in data.get("images", []):
    image_filename = image_info["file_name"]
    image_width = image_info["width"]
    image_height = image_info["height"]

    # Find the corresponding annotations for the image
    image_id = image_info["id"]
    annotations = [ann for ann in data.get("annotations", []) if ann["image_id"] == image_id]

    # Create an empty array to store the masks
    number_masks = len(annotations)
    masks = np.zeros((number_masks, 1, image_height, image_width), dtype=bool)

    # Draw polygons on the mask image and store them in the masks array
    for idx, annotation in enumerate(annotations):
        segmentation = annotation.get("segmentation")
        if segmentation is not None:
            mask = Image.new('L', (image_width, image_height), 0)
            draw = ImageDraw.Draw(mask)
            for segment in segmentation:
                flattened_segment = [int(coord) for coord in segment]
                coordinates = [(flattened_segment[i + 1], flattened_segment[i]) for i in range(0, len(flattened_segment), 2)]
                draw.polygon(coordinates, outline=1, fill=1)
            masks[idx, 0, :, :] = np.array(mask)

    # Create the output file path for the current image
    file_out = os.path.join(mask_directory, os.path.splitext(image_filename)[0] + '.npz')

    # Save the masks as a single .npz file
    np.savez_compressed(file_out, masks=masks)


And the same for the bounding boxes

Here we can check the influence of data type on size. We check this because the fine-tuning algorithm is limited by the required large data set and resultant memory needed. We can check if boolean arrays will help allow us to include more images. 

In [18]:
import numpy as np

arr = np.zeros((100, 100), dtype=np.uint8)
file_path_uint8 = "array_uint8.npy"
np.save(file_path_uint8, arr)
print(f"File size with dtype=np.uint8: {os.path.getsize(file_path_uint8)} bytes")

arr = np.zeros((100, 100), dtype=np.uint16)
file_path_uint16 = "array_uint16.npy"
np.save(file_path_uint16, arr)
print(f"File size with dtype=np.uint16: {os.path.getsize(file_path_uint16)} bytes")

arr = np.zeros((100, 100), dtype=np.float32)
file_path_float32 = "array_float32.npy"
np.save(file_path_float32, arr)
print(f"File size with dtype=np.float32: {os.path.getsize(file_path_float32)} bytes")


arr = np.zeros((100, 100), dtype=np.bool)
print(f"Memory size with dtype=np.bool: {arr.nbytes} bytes")


File size with dtype=np.uint8: 10128 bytes
File size with dtype=np.uint16: 20128 bytes
File size with dtype=np.float32: 40128 bytes
Memory size with dtype=np.bool: 10000 bytes


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  arr = np.zeros((100, 100), dtype=np.bool)


In [None]:
#code to convert the .json files to .jpgs for the bounding boxes
import json
import os
import numpy as np
from PIL import Image, ImageDraw

# Specify the directory containing the images
image_directory = '/Users/capelo/Desktop/constructor/final_project/fine_tune_sam/UAVVaste/UAVVaste/images'

# Specify the directory for storing the masks
mask_directory = '/Users/capelo/Desktop/constructor/final_project/fine_tune_sam/UAVVaste/UAVVaste/bb_masks'

# Specify the directory containing the JSON file
json_directory = '/Users/capelo/Desktop/constructor/final_project/fine_tune_sam/UAVVaste/UAVVaste/annotations'
json_filename = 'annotations.json'

# Create the mask directory if it doesn't exist
#os.makedirs(mask_directory, exist_ok=True)

# Load the JSON file
with open(os.path.join(json_directory, json_filename)) as json_file:
    data = json.load(json_file)

# Iterate over each image in the directory
for image_info in data.get("images", []):
    image_filename = image_info["file_name"]
    image_width = image_info["width"]
    image_height = image_info["height"]

    # Create a blank binary mask with the same dimensions as the image
    binary_mask = Image.new('L', (image_width, image_height), 0)  # 'L' for 8-bit pixels, black and white

    # Find the corresponding annotations for the image
    image_id = image_info["id"]
    annotations = [ann for ann in data.get("annotations", []) if ann["image_id"] == image_id]

    # Draw bounding boxes on the mask image
    for annotation in annotations:
        bbox = annotation.get("bbox")
        if bbox is not None and len(bbox) == 4:
            x, y, w, h = bbox
            x1, y1, x2, y2 = int(x), int(y), int(x + w), int(y + h)
            draw = ImageDraw.Draw(binary_mask)
            draw.rectangle([(x1, y1), (x2, y2)], fill=255)

    # Invert the binary mask (invert black and white regions)
    inverted_mask = Image.eval(binary_mask, lambda x: 255 - x)

    # Save the inverted binary mask as JPEG in the mask directory
    mask_filename = os.path.splitext(image_filename)[0] + '_mask_bw.jpg'
    mask_path = os.path.join(mask_directory, mask_filename)
    inverted_mask.save(mask_path)



From the visual inspection of the files created, we can see that some of the images are either crowded with overlapping bounding boxes, or else need to be inverted. 

In [11]:
import json
import os
import numpy as np
import cv2

# Specify the directory containing the images
image_directory = '/Users/capelo/Desktop/constructor/final_project/fine_tune_sam/UAVVaste/branch_UAVVaste/UAVVaste/images'

# Specify the directory for storing the masks and bounding boxes
output_directory = '/Users/capelo/Desktop/constructor/final_project/fine_tune_sam/UAVVaste/branch_UAVVaste/UAVVaste/masks/bb_zipped'

# Specify the directory containing the JSON file
json_directory = '/Users/capelo/Desktop/constructor/final_project/fine_tune_sam/UAVVaste/UAVVaste/annotations'
json_filename = 'annotations.json'

# Load the JSON file
with open(os.path.join(json_directory, json_filename)) as json_file:
    data = json.load(json_file)

# Iterate over each image in the directory
for image_info in data.get("images", []):
    image_filename = image_info["file_name"]
    image_width = image_info["width"]
    image_height = image_info["height"]

    # Find the corresponding annotations for the image
    image_id = image_info["id"]
    annotations = [ann for ann in data.get("annotations", []) if ann["image_id"] == image_id]

    # Create an empty array to store the bounding boxes
    number_boxes = len(annotations)
    boxes = np.zeros((number_boxes, 1, image_height, image_width))

    # Draw bounding boxes on the mask image and store them in the boxes array
    for idx, annotation in enumerate(annotations):
        bbox = annotation.get("bbox")
        if bbox is not None and len(bbox) == 4:
            x, y, w, h = bbox
            x1, y1, x2, y2 = int(x), int(y), int(x + w), int(y + h)
            mask = np.zeros((image_height, image_width), dtype=np.uint8)
            mask[y1:y2, x1:x2] = 255
            boxes[idx, 0, :, :] = mask

    # Create the output file path for the current image
    file_out = os.path.join(output_directory, os.path.splitext(image_filename)[0] + '.npz')

    # Save the bounding boxes as a single .npz file
    np.savez_compressed(file_out, boxes=boxes)


In [57]:
import csv

# Read the CSV file into a list of lists
my_file='/Users/capelo/Desktop/constructor/final_project/fine_tune_sam/UAVVaste/UAVVaste/logs/uavwaste_notes.csv'
data = []
with open(my_file, 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        data.append(row)

# Access the values in the list of lists
file_list =[row[0] for row in data]
status = [row[1] if len(row) > 1 else None for row in data]


In [68]:
filtered_files1 = [file for file, stat in zip(file_list, status) if stat == 'XY']
#filtered_filess

filtered_files2= [file for file, stat in zip(file_list, status) if stat == 'crowded']
#filtered_files2

filtered_files3= [file for file, stat in zip(file_list, status) if stat == 'L90']
filtered_files3

#we can use this information to decide which images to use and how to transform them. if XY, we will transpose the original image file in x and y. If crowded, we will not work with them as binary masks in jpg format. If L90, we will rotate by 90 degrees to the right. .  

['camera_img_0.jpg', 'camera_img_1.jpg', 'camera_img_2.jpg']

In [69]:
filtered_files1 = [file for file, stat in zip(file_list, status) if stat == 'XY']
filtered_files

['GOPR0021.JPG',
 'GOPR0022.JPG',
 'GOPR0023.JPG',
 'GOPR0026.JPG',
 'GOPR0027.JPG',
 'GOPR0028.JPG',
 'GOPR0030.JPG',
 'GOPR0032.JPG',
 'GOPR0034.JPG',
 'GOPR0035.JPG',
 'GOPR0036.JPG',
 'GOPR0037.JPG',
 'GOPR0038.JPG',
 'GOPR0039.JPG',
 'GOPR0043.JPG',
 'GOPR0044.JPG',
 'GOPR0046.JPG',
 'GOPR0049.JPG',
 'GOPR0050.JPG',
 'GOPR0051.JPG',
 'GOPR0053.JPG',
 'GOPR0054.JPG',
 'GOPR0055.JPG']

In [56]:
import json

# Load the JSON file
fname = '/Users/capelo/Desktop/constructor/final_project/fine_tune_sam/UAVVaste/UAVVaste/annotations/annotations.json'
with open(fname, 'r') as file:
    data = json.load(file)

# Initialize counters
num_images = len(data["images"])
num_annotations = len(data["annotations"])

# Count unique category IDs
unique_categories = set()
for annotation in data["annotations"]:
    unique_categories.add(annotation["category_id"])
num_categories = len(unique_categories)

# Count non-crowd annotations
num_non_crowd_annotations = sum(1 for annotation in data["annotations"] if annotation["iscrowd"] == 0)

# Print the basic properties
print("Number of Images:", num_images)
print("Number of Annotations:", num_annotations)
print("Number of Unique Categories:", num_categories)
print("Number of Non-Crowd Annotations:", num_non_crowd_annotations)


Number of Images: 772
Number of Annotations: 3718
Number of Unique Categories: 1
Number of Non-Crowd Annotations: 3718


apparently they have not stored any litter type labels in the json annotation files. Moreover, they do not consider any of the images crowded. 

In [76]:
import json
import os
import numpy as np
from PIL import Image, ImageDraw
from IPython.display import display

# Specify the directory containing the images
image_directory = '/Users/capelo/Desktop/constructor/final_project/fine_tune_sam/UAVVaste/UAVVaste/images'

# Get the filenames of the filtered files
filtered_files1 = [file for file, stat in zip(file_list, status) if stat == 'XY']

# Iterate over each image filename in the filtered_files
for image_filename in filtered_files:
    # Read the original image
    image_path = os.path.join(image_directory, image_filename)
    image = Image.open(image_path)

    # Flip the image along the X and Y axes
    flipped_image = image.transpose(Image.FLIP_LEFT_RIGHT).transpose(Image.FLIP_TOP_BOTTOM)
    

    # Save the flipped image as a new JPEG
    #flipped_filename = os.path.splitext(image_filename)[0] + '_flipped.jpg'
    #flipped_path = os.path.join(image_directory, flipped_filename)
    #flipped_image.save(flipped_path)
    
       
#display(flipped_image)


  flipped_image = image.transpose(Image.FLIP_LEFT_RIGHT).transpose(Image.FLIP_TOP_BOTTOM)
  flipped_image = image.transpose(Image.FLIP_LEFT_RIGHT).transpose(Image.FLIP_TOP_BOTTOM)


In [2]:
# Load the JSON file
fname = '/Users/capelo/Desktop/constructor/final_project/fine_tune_sam/UAVVaste/UAVVaste/annotations/annotations.json'
with open(fname, 'r') as file:
    data = json.load(file)

# Initialize counters
num_images = len(data["images"])
num_annotations = len(data["annotations"])

# Count unique category IDs
unique_categories = set()
for annotation in data["annotations"]:
    unique_categories.add(annotation["category_id"])
num_categories = len(unique_categories)

# Count non-crowd annotations
num_non_crowd_annotations = sum(1 for annotation in data["annotations"] if annotation["iscrowd"] == 0)

# Check if number of bounding boxes matches number of segmentations
num_bounding_boxes = sum("bbox" in annotation for annotation in data["annotations"])
num_segmentations = sum("segmentation" in annotation for annotation in data["annotations"])
matching_bbox_segmentation = num_bounding_boxes == num_segmentations

# Print the basic properties
print("Number of Images:", num_images)
print("Number of Annotations:", num_annotations)
print("Number of Unique Categories:", num_categories)
print("Number of Non-Crowd Annotations:", num_non_crowd_annotations)
print("Number of Bounding Boxes:", num_bounding_boxes)
print("Number of Segmentations:", num_segmentations)
print("Bounding Boxes and Segmentations Match:", matching_bbox_segmentation)


The history saving thread hit an unexpected error (OperationalError('unable to open database file')).History will not be written to the database.
Number of Images: 772
Number of Annotations: 3718
Number of Unique Categories: 1
Number of Non-Crowd Annotations: 3718
Number of Bounding Boxes: 3718
Number of Segmentations: 3718
Bounding Boxes and Segmentations Match: True
