In [1]:
from hashlib import md5

In [3]:
from pathlib import Path

In [34]:
from collections import namedtuple

In [24]:
import json

In [5]:
from typing import List

In [2]:
def md5sum(content: bytes) -> str:
    m = md5()
    m.update(content)
    return m.hexdigest()

In [46]:
def is_image(path: Path):
    return path.suffix[1:].upper() in ["JPG", "JPEG", "CR2", "PNG", "AAE", "MOV", "MP4"]

class ImagePaths:
    def __init__(self, path: str):
        self._image_dir = path
    
    def __iter__(self):
        image_dir = Path(self._image_dir)
        for item_path in image_dir.iterdir():
            if item_path.is_file():
                if is_image(item_path):
                    yield item_path.relative_to(self._image_dir)
            elif item_path.is_dir():
                for sub_item_path in ImagePaths(str(item_path)):
                    yield item_path / sub_item_path 
            continue

In [52]:
def get_images_in_bytes(parent: str, paths: ImagePaths):
    return {
        path: (parent / path).read_bytes() for path in paths
    }

In [50]:
def get_md5sums(dir_path: str) -> List:
    paths = ImagePaths(dir_path)
    images = get_images_in_bytes(dir_path, paths)
    return {
        path: md5sum(img_bytes)
        for path, img_bytes in images.items()
    }

In [59]:
class CacheItem:
    def __init__(self, path: Path):
        self._path = path
        self._stat = Stat.from_path(path)
        self._md5sum = None
        
    @property
    def stat(self):
        return self._stat
    
    @property
    def md5sum(self):
        if not self._md5sum:
            self._md5sum = md5sum(self._path.read_bytes())
        return self._md5sum

In [60]:
class Stat:
    """
    stat info from filesystem
    """
    __slots__ = "size", "mtime"
    def __init__(self, size, mtime):
        self.size = size
        self.mtime = mtime
    
    def __eq__(self, instance):
        return self.size == instance.size and self.mtime == instance.mtime
    
    @classmethod
    def loads(cls, value_str):
        size, mtime = value_str.split(",")
        return cls(size, mtime)
    @classmethod
    def from_path(cls, path: Path):
        stat = path.stat()
        return cls(stat.st_size, stat.st_mtime)
    
    def dumps(self):
        return f"{self.size},{self.mtime}"

In [61]:
class Indexes:
    def __init__(self, image_dir: str):
        self._image_dir = image_dir
        self._dir_path = Path(image_dir)
        if not self._dir_path.exists():
            raise ValueError(f"image dir {image_dir} does not exist")
        self._cache_file = self._dir_path / ".indexes.json"
        if self._cache_file.exists():
            self.load(self._cache_file.open("r"))
        else:
            self._cache = dict()
            self._regenerate()
            
    def _regenerate(self):
        md5_dict = get_md5sums(self._image_dir)
        for path, md5sum in md5_dict.items():
            self._cache[str(path)] = CacheItem(
                stat=Stat.from_path(self._dir_path / path),
                md5sum=md5sum
            )
            
    def dump(self):
        with self._cache_file.open("w") as f:
            result = dict()
            for key, cacheItem in self._cache.items():
                result[key] = cacheItem.stat.dumps(), cacheItem.md5sum
            json.dump(fp, result)
    
    def load(self, fp):
        cached_dict = json.load(fp)
        for key, value in cached_dict.items():
            stat_pickle, md5sum = value
            stat = Stat.loads(stat_pickle)
            self._cache[key] = CacheItem(stat=stat, md5sum=md5sum)
    
    def __delete__(self):
        self.dump()
        super().__delete__()


In [56]:
indexes = Indexes("/home/huangyu/100APPLE/")

In [58]:
indexes._cache

{'ADCT5449.jpg': CacheItem(stat=<__main__.Stat object at 0x7fb87843cef0>, md5sum='d354fe5a552d6a2e7d81937751b97b20'),
 'BOZB1599.jpg': CacheItem(stat=<__main__.Stat object at 0x7fb873da7da0>, md5sum='3b1a7a4c2b6c8c7418dd06e2c69592ac'),
 'CHPG7112.jpg': CacheItem(stat=<__main__.Stat object at 0x7fb873da7d68>, md5sum='e6bd69f02a08b9518b7531eecaf11ce1'),
 'CHYS0653.jpg': CacheItem(stat=<__main__.Stat object at 0x7fb873da7c50>, md5sum='099e604a72938a71a45e96caf759cf0b'),
 'CIAY5248.jpg': CacheItem(stat=<__main__.Stat object at 0x7fb873da7dd8>, md5sum='50a9cda9aff74759260a7b1b392200c2'),
 'DNRQ2071.jpg': CacheItem(stat=<__main__.Stat object at 0x7fb873da7f98>, md5sum='f84a555b9f29116c64471f7b47c796cd'),
 'EHMO9873.jpg': CacheItem(stat=<__main__.Stat object at 0x7fb873da7fd0>, md5sum='699809c33349163b439fd3c3ad21c1f1'),
 'EQHC3146.jpg': CacheItem(stat=<__main__.Stat object at 0x7fb873daab00>, md5sum='a6ca85769ed004d06632c96dde89d759'),
 'ERHH0591.jpg': CacheItem(stat=<__main__.Stat object at