In [4]:
import os

#function to rename files
def file_rename():
    #use counter to count up + 1 for every image
    count = 10
    #for loop that iterates over each image in images
    for image in os.listdir('images'):
       #rename image file to img_(file number)
       
        os.rename(f'./images/{image}', f'./images/img_{count}.jpg')
        count += 1
file_rename()
        

In [6]:

from exif import Image
from PIL import Image
from PIL.ExifTags import TAGS
from csv import DictWriter
import pandas as pd
import os
import numpy as np
import glob
import hashlib
import datetime

def img_filepaths():
    """
    Returns a list of all the file filepaths in our directory variable.
    """
    directory = "./images"
    filepaths = []
    for image in os.listdir(directory):
        i = os.path.join(directory, image)
        if os.path.isfile(i):
            filepaths.append(i)
    return filepaths

def dict_convert():
    """
    Returns a list of dictionaries of all of the metadata for a list of items in our file path from img_filepaths().
    """
    dict_list = []
    image_names = img_filepaths()
    # assign image names as our img_filepaths function
    for image in image_names:
    # Loop through each image in our directory
        try:
            image_file = Image.open(image)
            # assign image_file as the image file being open.
            exifdata = image_file.getexif()
            # assign pillow metadata tags onto id fields
            file_dict = {}
            for tag_id in exifdata:
                # loop through the tag_ids (metadata tags)
                tag = TAGS.get(tag_id, tag_id)
                # acquire the tags and convert them into human readable metadata tags
                data = exifdata.get(tag_id)
                # get the value attributed to the tags in the metadata
                if isinstance(data, bytes):
                # check if our data is readable data
                    data = data.decode()
                # if not readable, decode it.
                file_dict[tag] = data
                # create a dictionary key-value pair with {metadata tag: data from photo}
            name = {'File Path': image}
            file_dict.update(name)
            dict_list.append(file_dict)
            # append the dictionary to our empty list "dict_list"
        except:
        # If the file is not readable by our function, instead of raising a value error. Pass it on through and leave it's dictionary empty.
            pass   
    return dict_list
    # returns our list of dictionaries for each photo.

dict_list = dict_convert()
# assign our dict_list to our function(dict_convert()) output
field_names = ['TileWidth', 'TileLength', 'GPSInfo','ResolutionUnit', 'ExifOffset', 'Make', 'Model', 'Software', 'Orientation', 'DateTime', 'XResolution', 'YResolution', 'HostComputer', 'File Path']
# assign our header column names to the second photo in our directory (The first one is Blank [0])
with open("./data/meta_data.csv", 'w',newline='') as csvfile:
# write to meta_data.csv
    writer = DictWriter(csvfile, fieldnames=field_names, extrasaction='ignore')
    # to the csv assign the fieldnames as 'field_names' and ignore any data that doesnt fit in our columns(field_names)
    writer.writeheader()
    # write the header with "field_names"
    writer.writerows(dict_list)
    # write the rows with our dict_list which was the output of our dict_convert() function.

In [7]:


#read csv file and export to pandas df
meta_file = './data/meta_data.csv'
meta_df = pd.read_csv(meta_file, header=0)

# set a hash id value to each image
def md5_hash():
    def calculate_hash_val(path, block_size=''):
        # calculate hash value on a presecified path
        image = open(path, 'rb')
        # save the variable image as and opened file read in binairy
        hasher = hashlib.md5()
        # assign a variable hasher with the hash values
        data = image.read()
        while len(data) > 0:
            # set data as reading the open file and checking to see if there is any data in the file
            hasher.update(data)
            # if there is, update the opened file with a hash id
            data = image.read()
        image.close()
        return hasher.hexdigest()
    #run calculate_hash_val func over file path column and add to df as 'md5 hash'
    meta_df['MD5 Hash'] = meta_df['File Path'].map(calculate_hash_val)
    #drop duplicate columns using Md5 Hash
    meta_df.drop_duplicates(keep='first', subset='MD5 Hash', inplace = True)

md5_hash()

# function that finds rows missing meta data and adds them to new dataframe
def reject_rows():
    reject_df = meta_df[meta_df[['Make', 'Model', 'DateTime']].isna().all(axis=1)]
    #writes reject_df to csv file
    reject_df.to_csv('data/reject.csv', encoding='utf-8', index=False)

reject_rows()

# find and drop all null values in rows and rewriting over the existing df
def drop_na():
    meta_df.dropna(axis=0, how='all', subset=['Make', 'Model', 'DateTime'], inplace= True)
    return meta_df

drop_na()

def date_format():
    def remove_time(value):
    # Define inner function to act on the DateTime column
        date = value
        # assign the DateTime column value to date
        try:
            date = datetime.datetime.strptime(str(value), '%Y:%m:%d %H:%M:%S').date()
            # format column date with only the date returned, with hours, minutes, seconds removed
        except:
            pass
        return date
    meta_df["DateTime"] = meta_df["DateTime"].map(remove_time)
    # use .map to call remove_time() on all DateTime columns
    date_sorted_df = meta_df.sort_values(["DateTime"])
    # assign variable to sorted dates from earliest to newest
    date_sorted_df.to_csv("data/sorted.csv", encoding="utf-8", index=False)
    # write sorted rows to a new dataframe
    return date_sorted_df

date_format()
    

Unnamed: 0,TileWidth,TileLength,GPSInfo,ResolutionUnit,ExifOffset,Make,Model,Software,Orientation,DateTime,XResolution,YResolution,HostComputer,File Path,MD5 Hash
4,,,,2.0,226.0,samsung,SM-G970U,G970USQS2CSL1,6.0,2020-02-10,72.0,72.0,,./images/img_65.jpg,6e6daa9df7e85905f6c5cc88090e39cb
33,,,735.0,2.0,202.0,LGE,LM-V405,,6.0,2020-05-11,72.0,72.0,,./images/img_107.jpg,96ef4c9584d926b2e15fe175eb2d36e1
30,,,,2.0,226.0,samsung,SM-G970U,G970USQU4ETH7,1.0,2020-10-16,72.0,72.0,,./images/img_104.jpg,b8d07bd71e80e0c9e8b5f100134d0269
58,,,,2.0,170.0,Apple,iPhone XR,14.0.1,,2020-11-22,72.0,72.0,,./images/img_121.jpg,1c96c7c9e1c1ee1ebb080fb705355cf0
42,,,,2.0,226.0,samsung,SM-G970U,G970USQU4FTLN,6.0,2021-01-27,72.0,72.0,,./images/img_94.jpg,42c4eb188a0ee5c59ae0d46bf3a79a82
22,,,,2.0,110.0,,,,,2021-01-29,72.0,72.0,,./images/img_119.jpg,070985c96778ce6cb8a0f9bf6c26794f
56,,,,2.0,228.0,samsung,SM-G991U1,G991U1UEU1ATLI,6.0,2021-02-04,72.0,72.0,,./images/img_122.jpg,60dd53eba0a1b29cc1fa66b9674a597b
41,,,,2.0,202.0,Apple,iPhone XR,14.4,,2021-03-29,72.0,72.0,iPhone XR,./images/img_111.jpg,b5ed7b6b36ab862ba023de4c3c5a0a1e
54,,,,2.0,228.0,samsung,SM-G991U1,G991U1UEU2AUC8,6.0,2021-04-26,72.0,72.0,,./images/img_115.jpg,45ed2b8bc764c3ce31b1404b12eb20bc
14,,,735.0,2.0,202.0,LGE,LM-V405,,6.0,2021-05-04,72.0,72.0,,./images/img_125.jpg,8e594904b8551c13dc48241dffbca7fe


In [9]:

def thumbs_n_nails():
    size = 100, 100
    # define size variable with value 100, 100 to later be used
    os.mkdir('./images/thumbnails')
    # make a new directory in images called thumbnails
    try:
        for in_file in glob.glob("./images/*.jpg"):
            # loop through each file in images with .jpg extension
            new = os.path.split(in_file)
            # access and split the current files filepath to manipulate
            new_filepath = os.path.join(new[0], "thumbnails", "thumbnail_" + new[1])
            # make new filepath
            with Image.open(in_file) as img:
                # open the current file image
                img.thumbnail(size)
                # resize current file image as a thumbnail
                img.save(new_filepath)
                # and save to the new path
    except:
        pass

thumbs_n_nails()

FileExistsError: [Errno 17] File exists: './images/thumbnails'