### This notebook collects all images from the folder `images_new` and compiles them together. All images end up in a single folder named `data`, along with a cleaned master list file, `master_file.csv`, that specifies the bounding-box coordinates.

# Collect all images in one folder

In [57]:
import os
import shutil
import pandas as pd

def collect_images(source_path, destination_folder):
    """Merge per-class image folders into one folder and build a raw master list.

    Each sub-directory of ``source_path`` must contain a ``train`` directory
    holding image files and at least one header-less annotation CSV with the
    columns ``filepath, xmin, ymin, xmax, ymax, label``.

    All images are copied into ``<destination_folder>/data`` (wiped and
    recreated on every call).  When two classes contain an image with the same
    file name, the later copy is stored as ``<name>_<n><ext>`` and the matching
    annotation rows are updated to that new name.  The concatenated annotations
    are written to ``<destination_folder>/raw_master_list.csv`` with two
    derived columns: ``label`` (the class folder name without a trailing
    ``-2``) and ``folder_name`` (the class folder name as found on disk).

    Args:
        source_path: Directory containing one sub-directory per class.
        destination_folder: Parent directory for the ``data`` folder and the
            master list CSV.

    Returns:
        None.

    Raises:
        ValueError: If a class folder has no ``train`` directory, has no
            annotation CSV, has a different number of annotation rows than
            images, or if ``source_path`` contains no class folders at all.
    """
    image_exts = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    version_suffix = "-2"

    raw_folder = destination_folder
    destination_folder = os.path.join(destination_folder, "data")
    # Always start from an empty output folder so reruns do not pile up copies.
    if os.path.exists(destination_folder):
        shutil.rmtree(destination_folder)
    os.makedirs(destination_folder)

    all_csv_files = []

    # Sorted so that the choice of which duplicate gets renamed is reproducible.
    for folder in sorted(os.listdir(source_path)):
        folder_path = os.path.join(source_path, folder)
        # Only directories are class folders; skip '.DS_Store' and other stray files.
        if folder == '.DS_Store' or not os.path.isdir(folder_path):
            continue

        # check if train folder is available:
        if 'train' not in os.listdir(folder_path):
            print(folder)
            print(os.listdir(folder_path))
            raise ValueError("folder structure wrong")

        tmp_folder = os.path.join(folder_path, 'train')
        train_files = sorted(os.listdir(tmp_folder))
        image_files = [f for f in train_files if f.lower().endswith(image_exts)]
        csv_files = [f for f in train_files if f.endswith('.csv')]

        # Without a CSV there is nothing to validate the images against.
        if not csv_files:
            print(tmp_folder)
            raise ValueError("no annotation csv found")

        # Use the folder name as the label, dropping only an exact trailing "-2".
        # (str.rstrip("-2") would strip every trailing '-' and '2' character,
        # turning e.g. "Rice_12-2" into "Rice_1".)
        if folder.endswith(version_suffix):
            label = folder[:-len(version_suffix)]
        else:
            label = folder

        # Copy the images, remembering any that had to be renamed.
        renamed = {}
        for file in image_files:
            name, ext = os.path.splitext(file)
            new_name = file
            counter = 1
            while os.path.exists(os.path.join(destination_folder, new_name)):
                new_name = f"{name}_{counter}{ext}"
                counter += 1
            shutil.copy(os.path.join(tmp_folder, file),
                        os.path.join(destination_folder, new_name))
            if new_name != file:
                renamed[file] = new_name

        # Read the annotations and keep them in sync with the copied file names.
        num_rows = 0
        for file in csv_files:
            df = pd.read_csv(os.path.join(tmp_folder, file), header=None,
                             names=['filepath', 'xmin', 'ymin', 'xmax', 'ymax', 'label'])
            if renamed:
                # Assumes 'filepath' holds the bare image file name; values
                # that do not match a renamed file are left untouched.
                df['filepath'] = df['filepath'].map(lambda p: renamed.get(p, p))
            df['label'] = label
            df['folder_name'] = folder
            all_csv_files.append(df)
            num_rows += df.shape[0]

        # check if # imgs in the folder match the # of annotation rows
        # (i.e. exactly one annotation row per image is expected)
        if num_rows != len(image_files):
            print(tmp_folder)
            raise ValueError("folder img # mismatch")

    if not all_csv_files:
        raise ValueError("no class folders found in source_path")

    # Concatenate all the dataframes
    annotations = pd.concat(all_csv_files, ignore_index=True)
    annotations.to_csv(os.path.join(raw_folder, "raw_master_list.csv"), index=False)
    


    
    
# Root of the local project checkout -- the only machine-specific value in
# this cell; edit it here when running the notebook elsewhere.
PROJECT_ROOT = '/Users/liupeng/Documents/GitHub/object_detection_using_tensorflow'

# source folders containing images (one sub-folder per class)
source_path = os.path.join(PROJECT_ROOT, 'images_new')

# Destination folder where all images will be collected
destination_folder = os.path.join(PROJECT_ROOT, 'images_new_v2')

# Collect images
collect_images(source_path, destination_folder)


In [56]:
# summary data: how many files ended up in the merged "data" folder
merged_files = os.listdir(os.path.join(destination_folder, "data"))
print("Total # images: ", len(merged_files))

Total # images:  2448


# Generate XML files to show bounding boxes

In [58]:
import xml.etree.ElementTree as ET
from PIL import Image

def create_xml(destination_folder, source_folder, filename, xmin, ymin, xmax, ymax,
               label="object_name"):
    """Write a Pascal VOC-style XML annotation with one bounding box for an image.

    The image ``<source_folder>/<filename>`` is opened only to read its width
    and height.  The annotation is written to
    ``<destination_folder>/<filename>.xml``.

    The destination folder is created when missing but never emptied: the
    function is called once per image, so clearing the folder would erase the
    XML files written by earlier calls, and would delete the image itself
    before it is read whenever ``destination_folder`` equals ``source_folder``.

    Args:
        destination_folder: Directory that receives the XML file.
        source_folder: Directory containing the image.
        filename: Image file name (with extension) inside ``source_folder``.
        xmin, ymin, xmax, ymax: Bounding-box coordinates, written verbatim.
        label: Text of the object's ``<name>`` element.

    Returns:
        None.
    """
    # Create the destination folder if it doesn't exist (existing content is kept)
    os.makedirs(destination_folder, exist_ok=True)

    # Fetch image dimensions
    with Image.open(os.path.join(source_folder, filename)) as img:
        width, height = img.size

    annotation = ET.Element("annotation")
    ET.SubElement(annotation, "folder").text = "bounding_box"
    ET.SubElement(annotation, "filename").text = filename

    # Size (depth is always written as 3)
    size = ET.SubElement(annotation, "size")
    for tag, value in (("width", width), ("height", height), ("depth", 3)):
        ET.SubElement(size, tag).text = str(value)

    # Object
    obj = ET.SubElement(annotation, "object")
    for tag, value in (("name", label), ("pose", "Unspecified"),
                       ("truncated", "0"), ("difficult", "0")):
        ET.SubElement(obj, tag).text = value

    # Bounding Box
    bndbox = ET.SubElement(obj, "bndbox")
    for tag, value in (("xmin", xmin), ("ymin", ymin), ("xmax", xmax), ("ymax", ymax)):
        ET.SubElement(bndbox, tag).text = str(value)

    # Create the XML file
    tree = ET.ElementTree(annotation)
    with open(os.path.join(destination_folder, f"{filename}.xml"), "wb") as fh:
        tree.write(fh)

# Create XML for one sample image; the image is read from, and the XML is
# written to, the same folder.
source_folder = "/Users/liupeng/Documents/GitHub/object_detection_using_tensorflow/images_new_v2/data_edit"
destination_folder = "/Users/liupeng/Documents/GitHub/object_detection_using_tensorflow/images_new_v2/data_edit"
filename = "2023_8_11_11_22_30_553933_png.rf.59102374c2520696d8fe64158b5ccb75.jpg"

# Bounding-box corners for the sample image
xmin = 151
ymin = 42
xmax = 497
ymax = 591

create_xml(
    destination_folder=destination_folder,
    source_folder=source_folder,
    filename=filename,
    xmin=xmin,
    ymin=ymin,
    xmax=xmax,
    ymax=ymax,
)


In [38]:
# Recursively deletes the directory that `destination_folder` currently names.
# NOTE(review): `destination_folder` is reassigned in several cells, so the
# directory removed here depends on which cell ran last -- confirm before running.
shutil.rmtree(destination_folder)

9

In [6]:
# Scratch cell: rebinds the global `source_path` to one class's train folder,
# lists it, and displays the first three entries.
# NOTE(review): the saved output shows class-folder names rather than files,
# so it appears to come from a run where `source_path` pointed at `images_new`
# -- re-run to confirm.
source_path = '/Users/liupeng/Documents/GitHub/object_detection_using_tensorflow/images_new/Sugar_400-499g_NonHalal_NonHealthy-2/train'
source_folders = os.listdir(source_path)
source_folders[:3]

['Sugar_400-499g_NonHalal_NonHealthy-2',
 'OtherNoodles_700-799g_Halal_NonHealthy-2',
 'OtherNoodles_400-499g_Halal_NonHealthy-2']

In [16]:
# Scratch check: build the path of the 'train' directory inside the first listed folder.
os.path.join(source_path, source_folders[0], 'train')

'/Users/liupeng/Documents/GitHub/object_detection_using_tensorflow/images_new/Sugar_400-499g_NonHalal_NonHealthy-2/train'

In [20]:
# Scratch check: list the 'train' directory of the first listed folder
# (the saved output shows image files plus '_annotations.csv').
os.listdir(os.path.join(source_path, source_folders[0], 'train'))

['IMG_20230428_123704_jpg.rf.5fc2415d06061ea102ef125a37bbc88c.jpg',
 'IMG_20230428_123521_jpg.rf.1069b402272252862ec686589ee0dc3c.jpg',
 'IMG_20230428_123522_jpg.rf.204ff37f497f2dce442c7cc3d291ef78.jpg',
 'IMG_20230428_123529_jpg.rf.c757a35ff8715805953ae034409a23cd.jpg',
 '_annotations.csv',
 'IMG_20230428_123520_jpg.rf.57a9c7c075e605c7c691efb1dcf227d8.jpg',
 'IMG_20230428_123659_jpg.rf.5e1b6c4caabe48cf36003cb4184ff380.jpg',
 'IMG_20230428_123528_jpg.rf.5687b7b914f6d9aa98cadf060d1e3b00.jpg',
 'IMG_20230428_123703_jpg.rf.6a9c54175f59238cdc83999cdee6dad4.jpg',
 'IMG_20230428_123527_jpg.rf.8dd407b0ee5203fd4a46ddf1e061a37f.jpg',
 'IMG_20230428_123708_jpg.rf.141ecd0cefaea75c0b7a5f281475dd6b.jpg']

In [21]:
# Scratch check: os.path.splitext separates only the final extension, even
# when the file name contains several dots.
os.path.splitext('IMG_20230428_123708_jpg.rf.141ecd0cefaea75c0b7a5f281475dd6b.jpg')

('IMG_20230428_123708_jpg.rf.141ecd0cefaea75c0b7a5f281475dd6b', '.jpg')

In [23]:
# Scratch check: display the current value of `destination_folder`.
destination_folder

'data'

In [45]:
# Cleanup: recursively deletes the whole `images_new_v2` directory tree at
# this hard-coded path. This cannot be undone.
shutil.rmtree('/Users/liupeng/Documents/GitHub/object_detection_using_tensorflow/images_new_v2')