## 1 - Remove duplicate images from data set

In [None]:
## PLEASE NOTE:
## The code in these cells all depends on the structure of the data set partitions
## being [train/test/validate]dir -> [examplefruit]dir -> [fruit].jpg/png.

## For example, the "test" directory in our data set contains 36 subdirectories
## each named after a fruit ("apple," "banana," "orange"), and each of those directories
## contains only image files ("apple1.jpg," "banana35.png").

## You will have to run this code three times, each time changing the value of imgfolders
## to point at a different partition (train, test, validate).

import os
import hashlib
import numpy as np
import h5py
import random
from PIL import Image
from numpy import asarray
from numpy import ndarray

# calculate_image_hash uses an algorithm called "md5" to create a
# hexadecimal string based off of the contents of a file. Two byte-identical
# files, no matter their name, return the same hash string. Images that differ
# at all (even by one pixel, or only in metadata/compression) produce different
# hashes, so near-duplicates are NOT detected by this.
def calculate_image_hash(image_path):
    """Return the MD5 hex digest of the raw bytes of the file at image_path.

    The file is read in fixed-size chunks so that large files are never held
    in memory all at once.

    Raises OSError (e.g. FileNotFoundError) if the file cannot be opened.
    """
    md5 = hashlib.md5()
    with open(image_path, 'rb') as f:               # Open image in binary form
        # iter() with a sentinel keeps calling f.read until it returns b'' (EOF)
        for chunk in iter(lambda: f.read(65536), b''):
            md5.update(chunk)
    return md5.hexdigest()                          # Hash as a hexadecimal string

imgfolders = 'Path\\To\\train_or_validate_or_test'
image_extensions = ('.jpg', '.jpeg', '.png')    # Compared case-insensitively below
image_hashes = {}   # Maps each image hash to "[fruitdir]\[file]" of the first file seen with it
removecount = 0     # Keep track of amount of removed photos

# Sorting makes the run deterministic: of several identical files, the one that
# comes first alphabetically (by folder, then by file name) is the one kept.
for dirname in sorted(os.listdir(imgfolders)):          #Iterate through [train,test,validate]
    image_path = os.path.join(imgfolders, dirname)

    # Stray files next to the fruit folders would make os.listdir fail; skip them.
    if not os.path.isdir(image_path):
        continue

    for filename in sorted(os.listdir(image_path)):     #Iterate through [train,test,validate]\\[fruitdir]

        # Only image files are hashed; lower() so that ".JPG"/".PNG" are included too.
        if not filename.lower().endswith(image_extensions):
            continue

        image_path_2 = os.path.join(image_path, filename)
        image_hash = calculate_image_hash(image_path_2)

        # Check if the hash already exists in the dictionary, then remove if successful
        if image_hash in image_hashes:
            # The original may live in a different fruit folder than the duplicate,
            # so the message reports the folder of each file separately.
            print(filename + ' (' + dirname + ') is a duplicate of ' + str(image_hashes[image_hash]) + '.')
            os.remove(image_path_2)
            removecount += 1
        else:
            # Otherwise, create new entry in dictionary
            image_hashes[image_hash] = os.path.join(dirname, filename)

print(str(removecount) + " images removed.")
print("Duplicate check completed.")

## 2 - Augment data

In [None]:
directory = 'C:\\Users\\vange\\OneDrive - Tennessee Tech University\\Desktop\\Fall 23\\4240\\proj\\validation'

def exploreDir(directory):
    """Recursively create one randomly augmented copy of every image under directory.

    For each .jpg/.jpeg/.png file (extension matched case-insensitively) one of
    three augmentations is picked at random and saved next to the original:
      * "<name>_bw<ext>"      - greyscale copy
      * "<name>_blurred<ext>" - copy with pixels randomly spread (distance 30)
      * "<name>_rotated<ext>" - copy rotated by a random 135-270 degrees

    Images that are not already in RGB mode (e.g. RGBA) are converted to RGB
    first, and that RGB version is also saved as "<name>.jpg". The augmentation
    is applied to the RGB version.

    Sub-directories are explored recursively. Files written by this function
    are not re-processed within the same call, because the directory listing
    is taken before any file is written.
    """
    for file in os.listdir(directory):
        f = os.path.join(directory, file)

        # If the item is a directory, explore it until we find a file, recursive call
        if os.path.isdir(f):
            exploreDir(f)
            continue

        # str.endswith takes a tuple of suffixes; `'.jpg' or '.png'` would only
        # ever test for '.jpg'.
        if not (os.path.isfile(f) and f.lower().endswith(('.jpg', '.jpeg', '.png'))):
            continue

        randNum = random.randint(0, 2)

        # Base name and extension are taken from the path itself, so the code
        # works no matter how deeply the data set is nested. ext keeps its dot.
        stem, ext = os.path.splitext(os.path.basename(f))

        # convert() returns a new, fully loaded image (a plain copy when the
        # image already is RGB), so the source file can be closed right away.
        with Image.open(f) as source:
            was_rgb = source.mode == 'RGB'
            img = source.convert('RGB')

        # Some photos are RGBA, whose transparency cannot be written as JPEG;
        # store their RGB version as a .jpg.
        if not was_rgb:
            img.save(os.path.join(directory, "{}.jpg".format(stem)))

        if randNum == 0:
            augmented = img.convert(mode='L')
            suffix = 'bw'
        elif randNum == 1:
            augmented = img.effect_spread(30)
            suffix = 'blurred'
        else:
            augmented = img.rotate(random.randint(135, 270))
            suffix = 'rotated'

        # ext already contains the leading dot, so no extra "." is inserted.
        augmented.save(os.path.join(directory, "{}_{}{}".format(stem, suffix, ext)))


exploreDir(directory)

## 3 - Run a count of total images in one data partition (train, validate, test)

In [None]:
# Purely optional: Gives a count of total photos in each fruit subdirectory of a data partition.

# Additionally, contains an optional downsizing variable that allows
# you to only count every x photos for downsizefactor = x.
# Leave downsizefactor at 1 to count all photos.

directory = 'Path\\To\\train_or_validate_or_test'
downsizefactor = 1
total_imgcount = 0

for filename in sorted(os.listdir(directory)):
    imgfolder = os.path.join(directory, filename)

    # Only fruit folders are counted; a stray file here would make os.listdir fail.
    if not os.path.isdir(imgfolder):
        continue

    # The slice keeps entries 0, x, 2x, ... i.e. every [downsizefactor]th photo.
    # Every entry in the folder is counted, whatever its file type.
    category_imgcount = len(os.listdir(imgfolder)[::downsizefactor])

    # Ex. "Orange contains 100 photos."
    print(filename + " contains " + str(category_imgcount) + " photos.")
    total_imgcount += category_imgcount

# Ex. "Path\\To\\train contains 3000 photos."
print(directory + ' contains ' + str(total_imgcount) + ' photos.')

## 4 - Generate HDF5 dataset from photo directories

In [None]:
## PLEASE NOTE:
## You will have to run this code three times, each time changing the names of
## directory and hfile to generate a separate dataset for each data partition.
## (i.e., train.h5, validate.h5, test.h5)

# Increments each time we move to the next fruit folder.
category = 0
directory = 'Path\\To\\fruits_directory'

# Set to x to add only every x-th photo of each fruit folder; 1 adds every photo.
downsizefactor = 1

allimages = []      # Will become a 4D array which stores all images no matter the category
alltags = []        # Will store the value of "category" for each image, creating a parallel list that
                    # effectively labels each photo with a fruit id
allnames = []       # Fruit names; the name at index i belongs to category number i.

# os.listdir returns entries in arbitrary order. Sorting the fruit folders makes
# the category numbers reproducible, so partitions that contain the same fruit
# folders (train, validate, test) get the same number for the same fruit.
for filename in sorted(os.listdir(directory)):      # For all directories in [train,validate,test]
    imgfolder = os.path.join(directory, filename)

    # Skip stray files next to the fruit folders; they are not a category.
    if not os.path.isdir(imgfolder):
        continue

    # The slice keeps images 0, x, 2x, ... for downsizefactor = x.
    for imagename in sorted(os.listdir(imgfolder))[::downsizefactor]:   # For images in [train,validate,test]\\[fruit]

        imgaddress = os.path.join(imgfolder, imagename)

        # Open the image in RGB format, resize it to 100x100x3 (length, width, pixel RGB vals),
        # then transform it into the form of a numpy array. The with-block closes the
        # image file as soon as its pixels have been copied out.
        with Image.open(imgaddress) as img:
            imgpixelarray = asarray(img.convert("RGB").resize((100, 100)))

        # Add the image to allimages and add the category it belongs to to alltags
        allimages.append(imgpixelarray)
        alltags.append(category)

    # After iterating through all photos of a particular fruit, add the fruit's name
    # to allnames
    allnames.append(filename)

    print(filename + " added")
    category += 1

# Package the three arrays into one h5 file.
# If a file with the same name exists it will be overwritten. It is only opened
# once all images were read successfully, and the with-block guarantees it is
# closed even if writing fails.
with h5py.File('newdataset.h5', 'w') as hfile:
    hfile.create_dataset('fruits_photos', data=allimages)
    hfile.create_dataset('numeric_labels_photos', data=alltags)
    # A plain list of str becomes a NumPy unicode ("U") array, which HDF5 cannot
    # store; request variable-length UTF-8 strings explicitly instead.
    hfile.create_dataset('fruit_names_legend', data=allnames, dtype=h5py.string_dtype())