In [None]:
# The variables below describes if the notebook is no test mode or not,
# when "test", it means the data can be fewer than the original data.
# Result is not important, but the speed to test if it the notebook work is.


ENV_TYPE = "test" # "test" or "production"

#Image (JSON) Save/Load functions 

This code implements:

* Function load_all_json_files(folder_path) $\rightarrow$ look for every folder, subfolder, zipfile, folder and subfolder inside zip file looking for json files. Will check every of the json objects found to see if it is image and will return a list containing all the images found.

  When an image is loaded, a property "json_file_location" is added, containing where it has been loaded relative to the place of the code execution.

* Function load_json_files_in_zip(zip_path) $\rightarrow$ will do the same process, but only inside a zip. It is used by the first function.

* Function extract_images_from_json_tuple(dictionaryTuple, image_definition) $\rightarrow$ will loop through every JSON object checking if it is an image. An image is defined by an object that has the property image_definition. In the previous examples, an image is something that has a clip_model.

* Function save_to_json $→$ takes a tuple of dictionaries (to put in json) and has filename="output.json" default parameter.

###Images are loaded and saved in tuples of dictionaries.

In [3]:
import os
import fnmatch
import zipfile
import hashlib
import json
import sys

def load_json_files_in_zip(zip_path):
    zip_rel_loc = '/' + os.path.relpath(zip_path, os.getcwd())
    image_files_in_zip = []
    with zipfile.ZipFile(zip_path, 'r') as zip_file:
        for file in zip_file.namelist():
            if file.endswith('.json'):
                jsonpath = zip_path + file
                with zip_file.open(file) as file_opened:
                    data = json.load(file_opened)
                    data = extract_images_from_json_tuple(data, 'clip_model') #remove this code to import every object
                    #this code imports only objects that has 'clip model' (images)
                    for index, x in enumerate(data):

                        data[index]['json_file_location'] = zip_rel_loc + '/' + file
                image_files_in_zip.extend(data)


    return image_files_in_zip

def extract_images_from_json_tuple(dictionaryTuple, image_definition):
    #image_definition: a object that every image in json files contain. we're using clip_model.
    imagesFromJSON = ()
    for jsonObject in dictionaryTuple:
        if image_definition in jsonObject:
            imagesFromJSON = imagesFromJSON + (jsonObject,)
    return imagesFromJSON

def load_all_json_files(folder_path):

    image_files = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if fnmatch.fnmatch(file, '*.json'): #if the found file is a json, import it.
                jsonpath = os.path.join(root, file)
                with open(jsonpath, 'r') as f:
                    data = json.load(f)
                    file_path = os.path.relpath(jsonpath, folder_path)

                data = extract_images_from_json_tuple(data, 'clip_model') #remove this code to import every object
                #this code imports only objects that has 'clip model' (images)

                for index, x in enumerate(data):

                    data[index]['json_file_location'] = '/' + file_path

                image_files.extend(data)

            if fnmatch.fnmatch(file, '*.zip'): #if the found file is a zip, look for json inside.
                image_files.extend(load_json_files_in_zip(os.path.join(root, file)))
    return image_files

def save_to_json(tuple_of_dictionaries, filename="output.json"):
    with open(filename, "w") as f:
        json.dump(tuple_of_dictionaries, f)


#This function takes parameter folder path.
#will look for JSON files in the folder path.
#if found a folder, will open that folder and look for json folders inside that folder.
#if find zip, will look for json folders inside the zip and its subfolders as well. 
#will return a tuple containing all loaded information. 

#example usage
folder_path = os.getcwd() #set to function on current os folder.
data = load_all_json_files(folder_path)

if len(data): #to ensure you have some image json files
        print('file_archive: ', data[0]['file_archive'])
        print('file_name: ', data[0]['file_name'])
        print('file_path: ', data[0]['file_path'])
        print('file_hash: ', data[0]['file_hash'])
        print('clip_model: ', data[0]['clip_model'])
        #print('clip_vector', data[0]['clip_vector']) #commented out because vector is too large

        #this code also creates specific "json_file_location" in every dictionary containing the location of where
        #the json file was loaded from. 
        print('json_file_location: ', data[0]['json_file_location'])
        save_to_json(data, "example1.json") #save the loaded information into another file (example 1)




file_archive:  pixel-art-pinterest-000.zip
file_name:  images-0004/Vegi_Brokeller_pixel-art-inspiration/https___i.pinimg.com_originals_64_66_bc_6466bc83e810675b0317d705c90766d0.jpg
file_path:  /input/image-scraper-pixel-art-result/pixel-art-pinterest-000.zip/images-0004/Vegi_Brokeller_pixel-art-inspiration
file_hash:  f4353ab3c192f89ec9c30d3c606293162b44a2fdf4482ddd943b7538ab9756a6
clip_model:  ViT-L/14
json_file_location:  /example1.json
