# Image processing

Convert each image to a thumbnail, blur it, compute a hash, and then generate a base64 data URI.

In [13]:
import base64
import io
import os
from PIL import Image, ImageFilter
from datetime import datetime
from elasticsearch import Elasticsearch
import hashlib
import sys


In [2]:
# Directory that is scanned for input images.
IMAGE_DIR = "./JPEGImages"
# Format name handed to PIL when encoding; also used in the data URI MIME type.
IMAGE_FORMAT = "webp"

# Elasticsearch endpoint, target index, and the shared client instance.
ES_HOST = "http://localhost:9200"
ES_INDEX = "images"
ES = Elasticsearch(ES_HOST)

In [3]:
def image_to_base64(img, image_format=None):
    """Encode an image and return the result as base64 text.

    Args:
        img: Object exposing ``save(fp, format=...)``, e.g. a PIL image.
        image_format: Format name passed to ``img.save``. When omitted, the
            notebook-level ``IMAGE_FORMAT`` constant is used.

    Returns:
        str: Base64 encoding of the serialized image bytes (no data URI prefix).
    """
    if image_format is None:
        image_format = IMAGE_FORMAT
    # Serialize into memory; the buffer is released as soon as the bytes are encoded.
    with io.BytesIO() as buffer:
        img.save(buffer, format=image_format)
        encoded = base64.b64encode(buffer.getvalue())
    return encoded.decode("utf-8")

In [4]:
def generate_image_hash(image_data):
    """Return the SHA-256 hex digest of ``image_data``.

    Args:
        image_data: Bytes-like payload to hash.

    Returns:
        str: Hexadecimal SHA-256 digest of the full payload.

    Raises:
        ValueError: If ``image_data`` is empty or otherwise falsy.
    """
    # The complete payload is hashed; limiting the input to its first 32 KiB
    # to reduce RAM usage was considered but is not applied.
    if not image_data:
        raise ValueError("No image data provided, hence no hash could be generated")
    return hashlib.sha256(image_data).hexdigest()

In [22]:
def process_images(do_yield=True):
    """Thumbnail, blur, hash and base64-encode the images found in ``IMAGE_DIR``.

    This is a generator function: nothing runs until the returned generator is
    iterated. That also holds for ``do_yield=False``, where the generator
    yields no items but still has to be consumed (e.g. ``list(process_images(
    do_yield=False))``) for the processing and the final summary to happen.

    Files are visited in sorted name order so that the sequential ``id`` refers
    to the same file on every run. At most 1000 images are processed. Files
    that cannot be opened or decoded are reported and skipped.

    Args:
        do_yield: When true, yield one document per processed image.

    Yields:
        dict: Keys ``id`` (1-based counter), ``original_image_hash`` (SHA-256
        of the thumbnail's pixel bytes), ``image_hash`` (SHA-256 of the blurred
        thumbnail's pixel bytes), ``path``, ``data_uri`` (blurred thumbnail as
        a base64 data URI), ``timestamp`` and ``bytes`` (length of ``data_uri``).
    """
    processed_images = 0
    total_size = 0
    # os.listdir returns entries in arbitrary order; sorting keeps ids stable.
    for file in sorted(os.listdir(IMAGE_DIR)):
        # Limit to 1000 images
        if processed_images == 1000:
            break
        path = f"{IMAGE_DIR}/{file}"
        try:
            # Opening happens inside the try so that non-image or unreadable
            # files are skipped instead of aborting the whole run.
            with Image.open(path) as img:
                img.thumbnail((1000, 240))  # In-place resize; max height 240
                img_boxblur = img.filter(ImageFilter.BoxBlur(14))
                img_gaussblur = img_boxblur.filter(ImageFilter.GaussianBlur(12))
                original_image_hash = generate_image_hash(img.tobytes())
                image_hash = generate_image_hash(img_gaussblur.tobytes())
                img_b64 = f'data:image/{IMAGE_FORMAT};base64,{image_to_base64(img_gaussblur)}'
        except OSError:
            print(f"Error: Could not process file {file}")
            continue
        except KeyboardInterrupt:
            print("Halting processing")
            break
        total_size += len(img_b64)
        processed_images += 1
        if do_yield:
            # Yield after the file has been closed so the handle is not held
            # open while the consumer works on the document.
            yield {
                "id": processed_images,
                "original_image_hash": original_image_hash,
                "image_hash": image_hash,
                "path": path,
                "data_uri": img_b64,
                "timestamp": datetime.now().timestamp(),
                "bytes": len(img_b64),
            }
    print(f"Processed {processed_images} files with total size of {total_size}.")

In [None]:
# Sanity check: run the pipeline and print only every 1000th document.
for obj in process_images():
    if obj["id"] % 1000 != 0:
        continue
    print(obj)
# Note: a bare process_images(do_yield=False) call only creates the generator;
# it has to be iterated (e.g. wrapped in list(...)) before any image is processed.

## Test: indexing images in Elasticsearch

Index the base64-encoded images in Elasticsearch.


In [74]:
def es_insert(doc):
    """Insert ``doc`` into ``ES_INDEX``, or merge it into the existing document.

    The document id is taken from ``doc["id"]``. A single update request with
    ``doc_as_upsert=True`` is used: when the id already exists the fields of
    ``doc`` are merged into the stored document, otherwise ``doc`` is stored as
    a new document. This avoids the separate existence check, which cost an
    extra round trip and could race with a concurrent writer.

    Args:
        doc: Mapping with at least an ``"id"`` key.
    """
    ES.update(index=ES_INDEX, id=doc["id"], doc=doc, doc_as_upsert=True)

In [None]:
# Push every processed image document into Elasticsearch.
for obj in process_images():
    es_insert(obj)

Check the size of the blurred base64 data URIs stored in Elasticsearch; the goal is to keep them as small as possible.

In [81]:
# Count the documents whose data URI is at least 3000 characters long.
ES.count(
    index=ES_INDEX,
    query={"range": {"bytes": {"gte": 3000}}},
)

ObjectApiResponse({'count': 95, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

In [None]:
# Fetch up to 50 indexed documents for inspection.
ES.search(
    index=ES_INDEX,
    query={"match_all": {}},
    size=50,
)