### This notebook collects all images from the folder images_new and compile them together. All images will be in one folder under `data`, along with a cleaned master list file `master_file.csv` specifying the coordinates of the bounding box. 

# Collect all images in one folder

In [60]:
import os
import shutil
import pandas as pd

def collect_images(source_path, destination_folder):
    raw_folder = destination_folder
    destination_folder = os.path.join(destination_folder, "data")
    # Create the destination folder if it doesn't exist
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)
    else:
        shutil.rmtree(destination_folder)
        os.makedirs(destination_folder)
        
    source_folders = os.listdir(source_path)
    
    all_csv_files = []

    for folder in source_folders:
        if folder == '.DS_Store':
            continue
        
        tmp_files = os.listdir(os.path.join(source_path, folder))
        
        if '.DS_Store' in tmp_files:
            tmp_files.remove('.DS_Store')
        
        # check if train folder is available:
        if 'train' not in tmp_files:
            print(folder)
            print(os.listdir(os.path.join(source_path, folder)))
            raise ValueError("folder structure wrong")
        
        # current folder path
        tmp_folder = os.path.join(source_path, folder, 'train')
        
        # move all files in the train folder to the master data folder
        for file in os.listdir(tmp_folder):
            tmp_source = os.path.join(tmp_folder, file)
            
            # Check if the file is an image; move all imgs
            if file.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
                # Create a new name for the file in case of duplicates
                counter = 1
                new_name = file
                while os.path.exists(os.path.join(destination_folder, new_name)):
                    name, ext = os.path.splitext(file)
                    new_name = f"{name}_{counter}{ext}"
                    counter += 1
                
                destination = os.path.join(destination_folder, new_name)
                
                # Copy the image to the destination folder
                shutil.copy(tmp_source, destination)
            
            # read csv annotations
            if file.endswith('.csv'):
                df = pd.read_csv(tmp_source, header=None, names=['filepath', 'xmin', 'ymin', 'xmax', 'ymax', 'label'])
                # use the folder name as the img label
                df['label'] = folder.rstrip("-2")
                df['folder_name'] = folder
                all_csv_files.append(df)

                num_files = df.shape[0]
                
        # check if # imgs in the folder match the # in the csv file
        tmp_num_img = [file for file in os.listdir(tmp_folder) if file.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif'))]
        if num_files != len(tmp_num_img):
            print(tmp_folder)   
            raise ValueError("folder img # mismatch")
        
        # check if there is duplicate file name
        all_imgs = os.listdir(destination_folder)
        
        if len(all_imgs) != len(set(all_imgs)):
            raise ValueError("got duplicate img file name")
    
    # Concatenate all the dataframes
    annotations = pd.concat(all_csv_files, ignore_index=True)
    annotations.to_csv(os.path.join(raw_folder,"raw_master_list.csv"), index=False)
    


    
    
# source folders containing images
source_path = '/Users/liupeng/Documents/GitHub/object_detection_using_tensorflow/images_separate'

# Destination folder where all images will be collected
destination_folder = '/Users/liupeng/Documents/GitHub/object_detection_using_tensorflow/images_combined' 

# Collect images
collect_images(source_path, destination_folder)


In [61]:
# summary data
print("Total # images: ", len(os.listdir(os.path.join(destination_folder,"data"))))

Total # images:  2448


# Generate xml files to show bounding boxes

In [95]:
import xml.etree.ElementTree as ET
from PIL import Image

def create_xml(file_path, filename, xmin, ymin, xmax, ymax):
    
    annotation = ET.Element("annotation")
    
    # Folder
    folder = ET.SubElement(annotation, "folder")
    folder.text = "data_edit"

    # Filename
    filename_xml = ET.SubElement(annotation, "filename")
    filename_xml.text = filename
    
    # get file size
    # Open an image file
    with Image.open(os.path.join(file_path, "data_edit", filename)) as img:
        # Fetch image dimensions
        width, height = img.size

    # print(f"Width: {width}, Height: {height}")

    # Size
    size = ET.SubElement(annotation, "size")
    width_xml = ET.SubElement(size, "width")
    width_xml.text = str(width)
    height_xml = ET.SubElement(size, "height")
    height_xml.text = str(height)
    depth = ET.SubElement(size, "depth")
    depth.text = "3"

    # Object
    obj = ET.SubElement(annotation, "object")
    name = ET.SubElement(obj, "name")
    name.text = "object_name"
    pose = ET.SubElement(obj, "pose")
    pose.text = "Unspecified"
    truncated = ET.SubElement(obj, "truncated")
    truncated.text = "0"
    difficult = ET.SubElement(obj, "difficult")
    difficult.text = "0"

    # Bounding Box
    bndbox = ET.SubElement(obj, "bndbox")
    xmin_xml = ET.SubElement(bndbox, "xmin")
    xmin_xml.text = str(xmin)
    ymin_xml = ET.SubElement(bndbox, "ymin")
    ymin_xml.text = str(ymin)
    xmax_xml = ET.SubElement(bndbox, "xmax")
    xmax_xml.text = str(xmax)
    ymax_xml = ET.SubElement(bndbox, "ymax")
    ymax_xml.text = str(ymax)

    # Create the XML file
    tree = ET.ElementTree(annotation)
    # remove .jpg and other appendix
    with open(os.path.join(file_path, "data_edit", f"{filename[:-4]}.xml"), "wb") as fh:
        tree.write(fh)

# Create XML
file_path = "/Users/liupeng/Documents/GitHub/object_detection_using_tensorflow/images_combined"
# filename = "2023_8_11_11_22_30_553933_png.rf.59102374c2520696d8fe64158b5ccb75.jpg"
# xmin, ymin, xmax, ymax = 31, 179, 319, 562
# create_xml(folder, source_folder, filename, xmin, ymin, xmax, ymax)

annotations = pd.read_csv(os.path.join(folder,"raw_master_list.csv"))
for i in range(len(annotations)):
    tmp_row = annotations.iloc[i,]
    filename = tmp_row['filepath']
    xmin = tmp_row['xmin']
    ymin = tmp_row['ymin']
    xmax = tmp_row['xmax']
    ymax = tmp_row['ymax']
    create_xml(file_path, filename, xmin, ymin, xmax, ymax)

# Process annotations for multi-label multi-class classification

In [97]:
annotations.columns

Index(['filepath', 'xmin', 'ymin', 'xmax', 'ymax', 'label', 'folder_name'], dtype='object')

In [98]:
label_df = pd.DataFrame([label.split('_') for label in annotations.label], columns=['ProductType', 'Weight', 'HalalStatus', 'HealthStatus'])
label_df.head()

Unnamed: 0,ProductType,Weight,HalalStatus,HealthStatus
0,Sugar,400-499g,NonHalal,NonHealthy
1,Sugar,400-499g,NonHalal,NonHealthy
2,Sugar,400-499g,NonHalal,NonHealthy
3,Sugar,400-499g,NonHalal,NonHealthy
4,Sugar,400-499g,NonHalal,NonHealthy


In [100]:
label_df.loc[label_df.ProductType == "Nonfood",].head()

Unnamed: 0,ProductType,Weight,HalalStatus,HealthStatus
2188,Nonfood,,,
2189,Nonfood,,,
2190,Nonfood,,,
2191,Nonfood,,,
2192,Nonfood,,,


In [101]:
# Replace None with "Nonfood"
label_df.replace({None: 'Nonfood'}, inplace=True)
label_df.loc[label_df.ProductType == "Nonfood",].head()

Unnamed: 0,ProductType,Weight,HalalStatus,HealthStatus
2188,Nonfood,Nonfood,Nonfood,Nonfood
2189,Nonfood,Nonfood,Nonfood,Nonfood
2190,Nonfood,Nonfood,Nonfood,Nonfood
2191,Nonfood,Nonfood,Nonfood,Nonfood
2192,Nonfood,Nonfood,Nonfood,Nonfood


In [99]:
label_df.ProductType.unique()

array(['Sugar', 'OtherNoodles', 'Oil', 'Crackers', 'Nuts',
       'SweetsChocolatesOthers', 'HoneyOtherSpreads',
       'BeehoonVermicelliMeesua', 'Milo-powder-',
       'RolledOatsInstantOatmeal', 'OtherSauceDressingSoupbasePaste',
       'BiscuitsCrackersCookies', 'Breakfast cereals -cornflakes-',
       'Coffee', 'InstantNoodles', 'CerealBeverage-powder-',
       'OtherBakingNeeds', 'Cookies', 'Babymilk-powder-',
       'MaternalMilkPowder', 'Tea-powder-leaves-', 'Pasta',
       'FlavoredMilk', 'Sardines', 'Flour', 'Babyfood',
       'PotatochipsKeropok', 'Salt', 'Peanutbutter', 'ChilliSauce',
       'NutellaChocolate', 'InstantMeals', 'Kaya', 'Nonfood'],
      dtype=object)

In [102]:
print("Number of product types: ", label_df.ProductType.nunique())

Number of product types:  34


In [103]:
label_df.Weight.unique()

array(['400-499g', '700-799g', '500-599g', '200-299g', '100-199g',
       '1-99g', '300-399g', '600-699g', '800-899g', '1000-1999g',
       '900-999g', '3000-3999g', 'Nonfood'], dtype=object)

In [104]:
print("Number of weight types: ", label_df.Weight.nunique())

Number of weight types:  13


In [105]:
label_df.HalalStatus.unique()

array(['NonHalal', 'Halal', 'Nonfood'], dtype=object)

In [106]:
label_df.HealthStatus.unique()

array(['NonHealthy', 'Healthy', 'Nonfood'], dtype=object)

In [109]:
# label_df_encoded = pd.get_dummies(label_df, columns=['ProductType', 'Weight', 'HalalStatus', 'HealthStatus'])
# label_df_encoded.head()

In [112]:
file_path = "/Users/liupeng/Documents/GitHub/object_detection_using_tensorflow/images_combined"

annotations2 = pd.concat([annotations, label_df], axis=1)
annotations2.to_csv(os.path.join(file_path,"raw_master_list2.csv"), index=False)