In [58]:
from pathlib import Path
import re
from tqdm import tqdm
from slugify import slugify
import os, json, datetime


In [122]:
POTENTIAL_PATHS = [
    # zero means always reload
    ("/Users/kallewesterling/Desktop/", 1),
 
    # 7 means once per week
    ("/Volumes/GoogleDrive/My Drive/Ongoing Projects/Dissertation - Archive/- My own clippings and photos/", 7),

    # 14 means once every two weeks
    ("/Volumes/GoogleDrive/My Drive/Ongoing Projects/Dissertation - Archive/- newspapers.com", 14),
    
    # 60 means once every two months
    ("/Volumes/External", 60),
]

POTENTIAL_FILE_ENDINGS = [".png"]



In [123]:
NEWSPAPER_ID = re.compile(r"(\d{8,9})")
newspaper_id_files = {}
found_in_dirs = []


In [127]:
class DirectoryCache():
    directory = None
    expiry_days = None
    filepath = None
    file_endings = None
    content = {}
    
    def __init__(self, directory, expiry_days, file_endings):
        self.directory = directory
        self.expiry_days = expiry_days
        self.file_endings = file_endings
        self.filepath = self.setup_filepath()
        if not Path(self.filepath).exists():
            print(f'no cache file exists yet...')
        else:
            if self.check_expiry() == True:
                print(f'cache expired... removing {self.filepath} and reloading data')
                self.drop()
            else:
                self.content = self.read()
            
    def setup_filepath(self):
        if not Path('.DirectoryCache').exists():
            Path('.DirectoryCache').mkdir(parents=True)
            
        return Path(f'.DirectoryCache/{slugify("-".join(self.file_endings) + "-" + self.directory)}.json')
        
    def check_expiry(self):
        min_created = datetime.datetime.now() - datetime.timedelta(days=self.expiry_days)
        date_created = datetime.datetime.fromtimestamp(Path(self.filepath).stat().st_mtime)
        if date_created > min_created:
            return False
        else:
            return True
        
    def drop(self):
        if self.filepath.exists():
            self.filepath.unlink()
        
    def save(self):
        print('---> saving cache data')
        Path(self.filepath).write_text(json.dumps(self.content))
        
    def read(self):
        return json.loads(Path(self.filepath).read_text())
    
    @property
    def newspaper_ids(self):
        return list(self.content.keys())

In [131]:
for PATH, EXPIRY_DAYS in POTENTIAL_PATHS:
    cache = DirectoryCache(PATH, EXPIRY_DAYS, POTENTIAL_FILE_ENDINGS)
    if not cache.content:
        found_files = {}
        for subdir, dirs, files in os.walk(PATH):
            for file in files:
                filepath = subdir + os.sep + file

                if "." + file.split(".")[-1] in POTENTIAL_FILE_ENDINGS:
                    newspaper_id = None
                    has_newspaper_id = NEWSPAPER_ID.search(str(file))

                    if has_newspaper_id:
                        newspaper_id = has_newspaper_id.groups()[0]

                    if newspaper_id and not newspaper_id in found_files:
                        found_files[newspaper_id] = []

                    if newspaper_id:
                        found_files[newspaper_id].append(filepath)
                        found_in_dirs.append(subdir)
                        found_in_dirs = list(set(found_in_dirs))
        cache.content = found_files
        cache.save()
    
    cache = DirectoryCache(PATH, EXPIRY_DAYS, POTENTIAL_FILE_ENDINGS)
    # cache.directory, cache.newspaper_ids

In [9]:
from pprint import pprint

# pprint(newspaper_id_files)
pprint(found_in_dirs)

['/Volumes/GoogleDrive/My Drive/Ongoing Projects/Dissertation - Archive/- My '
 'own clippings and photos/Stroud, Frankie [performer]',
 '/Volumes/GoogleDrive/My Drive/Ongoing Projects/Dissertation - Archive/- My '
 'own clippings and photos/Warren, Roni [performer]/Roni Warren '
 '(newspapers.com)',
 '/Volumes/GoogleDrive/My Drive/Ongoing Projects/Dissertation - Archive/- My '
 'own clippings and photos/Beryl, Billie [performer]',
 '/Volumes/GoogleDrive/My Drive/Ongoing Projects/Dissertation - Archive/- My '
 'own clippings and photos/Fredericks, Leon [performer]',
 '/Volumes/GoogleDrive/My Drive/Ongoing Projects/Dissertation - Archive/- My '
 'own clippings and photos/Jewel Box Revue [group]',
 '/Users/kallewesterling/Desktop/New Folder With Items 2',
 '/Volumes/GoogleDrive/My Drive/Ongoing Projects/Dissertation - Archive/- My '
 'own clippings and photos/Renault, Francis [performer]/newspapers.com',
 '/Volumes/GoogleDrive/My Drive/Ongoing Projects/Dissertation - Archive/- My '
 "own