In [1]:
import datetime
import glob
import hashlib
import os
import psycopg2
import time

# Switch read by debug() below: when True, debug() forwards its arguments to print().
DEBUG = False # Think _hard_ before enabling DEBUG
# Earlier SQLite connections, kept commented out. NOTE(review): sqlite3 is not
# among the imports visible in this cell, so re-enabling these needs an import.
#conn = sqlite3.connect('/Users/sholden/Desktop/filestore.sqlite')
#conn = sqlite3.connect('/Users/sholden/Desktop/test.sqlite')
# Active connection: PostgreSQL database "filescan". conn and curs are shared,
# module-level handles; every helper defined below executes through curs.
conn = psycopg2.connect(database="filescan", user="sholden")
curs = conn.cursor()
# One-off schema setup, kept commented out. Uncommenting it drops and recreates
# the location table, discarding every stored row.
# curs.execute("DROP TABLE IF EXISTS location")
# curs.execute("""CREATE TABLE location (
#                     id SERIAL,
#                     filename VARCHAR,
#                     dirpath VARCHAR,
#                     modified TIMESTAMP,
#                     checksum CHAR(64),
#                     length INTEGER,
#                     seen BOOLEAN)
# """)

# Root directories whose trees are walked by the scan cell.
base_dirs = ['/Users/sholden/Projects/Python/filescan']

def debug(*args, **kwargs):
    """Print the given arguments, but only while the DEBUG flag is truthy.

    All positional and keyword arguments are passed straight through to
    print(); when DEBUG is false the call does nothing.
    """
    if not DEBUG:
        return
    print(*args, **kwargs)

def clear_seen_bits():
    """Set seen to FALSE on every row of the location table."""
    reset_sql = 'UPDATE LOCATION SET seen=FALSE'
    curs.execute(reset_sql)

def id_mod_seen(dir_path, file_path):
    """Fetch the stored (id, modified, seen) row for one file.

    dir_path is matched against the dirpath column and file_path against
    the filename column. Returns whatever curs.fetchone() yields for the
    query: one matching row, or None when nothing matches.
    """
    lookup_sql = '''
    SELECT id, modified, seen FROM location
    WHERE dirpath=%s AND filename=%s'''
    lookup_key = (dir_path, file_path)
    curs.execute(lookup_sql, lookup_key)
    row = curs.fetchone()
    return row

def update_modified_hash_seen(id, modified, hash, seen=True):
    """Store a new modification time, checksum and seen flag on row `id`.

    Called for a file whose content has changed. Such a file needs to be
    re-indexed, unless a file with the same hash has already been indexed
    (that check is not performed here).
    """
    # The backslashes inside the literal join its lines into one SQL statement.
    update_sql = '''
    UPDATE location \
    SET modified=%s, checksum=%s, seen=%s \
            WHERE id=%s'''
    values = (modified, hash, seen, id)
    curs.execute(update_sql, values)

def update_seen(id):
    """Set seen=TRUE on the location row whose id equals `id`."""
    mark_sql = '''
    UPDATE location SET seen=TRUE
            WHERE id=%s'''
    params = (id, )
    curs.execute(mark_sql, params)

def db_insert_location(file_path, dir_path, disk_modified, hash):
    """Insert a new location row for a file, with seen set to true.

    file_path goes to the filename column, dir_path to dirpath,
    disk_modified to modified and hash to checksum.
    """
    insert_sql = '''
    INSERT INTO location (filename, dirpath, modified, checksum, seen) \
    VALUES (%s, %s, %s, %s, true)'''
    row_values = (file_path, dir_path, disk_modified, hash)
    curs.execute(insert_sql, row_values)

def all_file_count():
    """Return the number of rows in the location table."""
    curs.execute("SELECT count(*) FROM location")
    (total, *_rest) = curs.fetchone()
    return total

def count_not_seen():
    """Return how many location rows currently have seen set to false."""
    unseen_sql = '''SELECT count(*) FROM location WHERE NOT seen'''
    curs.execute(unseen_sql)
    count_row = curs.fetchone()
    return count_row[0]

def dir_files_not_seen():
    """Return (dirpath, filename) for every location row not marked seen.

    Returns:
        A list of 2-tuples, one per unseen row; empty when there are none.
    """
    curs.execute('''SELECT dirpath, filename FROM location WHERE NOT seen''')
    # fetchall(), not fetchmany(): with no size argument fetchmany() returns
    # only cursor.arraysize rows (1 by default), which silently truncated
    # the list of unseen files to a single entry.
    return curs.fetchall()

def delete_not_seen():
    """Delete every location row whose seen flag is false.

    TODO: before removing a row, token entries for its checksum should
    also be deleted when no other file shares that checksum. This
    function does not do that yet.
    """
    purge_sql = '''DELETE from location WHERE NOT seen'''
    curs.execute(purge_sql)

def test_id_mod_seen():
    """Placeholder for a test of id_mod_seen(); currently does nothing."""
    return None

In [2]:
# Scan every base directory and reconcile the files on disk with the
# location table: insert new files, refresh changed ones, mark unchanged
# ones as seen, then delete rows for files that were never seen.
clear_seen_bits()
file_count = known_files = updated_files = unchanged_files = new_files = deleted_files = 0


def _file_checksum(path, chunk_size=1 << 20):
    """Return the SHA-256 hex digest of the file at path.

    The file is read in chunk_size pieces so large files are not loaded
    into memory whole, and the handle is always closed. If the file cannot
    be found (e.g. a dangling symlink) the marker "UNHASHABLE" is returned.
    """
    digest = hashlib.sha256()
    try:
        with open(path, "rb") as stream:
            for chunk in iter(lambda: stream.read(chunk_size), b""):
                digest.update(chunk)
    except FileNotFoundError:
        return "UNHASHABLE"
    return digest.hexdigest()


for base_dir in base_dirs:
    for dir_path, dirnames, filenames in os.walk(base_dir):
        # TODO: skip dir_path when it matches any gitignore rule.
        for file_path in filenames:
            # TODO: skip file_path when it matches any gitignore rule.
            file_count += 1
            thisfile = os.path.join(dir_path, file_path)
            # follow_symlinks=False: stat the link itself, not its target.
            stat_result = os.stat(thisfile, follow_symlinks=False)
            disk_modified = datetime.datetime.fromtimestamp(stat_result.st_mtime, None)
            known = id_mod_seen(dir_path, file_path)
            if known: # Known file
                # Local names avoid rebinding the builtins id() and hash()
                # in the notebook's global namespace.
                row_id, db_modified, _seen = known
                known_files += 1
                if disk_modified != db_modified: # Changed since last scan
                    debug(f"Modified was {db_modified}({type(db_modified)}) is now {disk_modified}({type(disk_modified)})")
                    updated_files += 1
                    checksum = _file_checksum(thisfile)
                    # Store the on-disk timestamp; writing the old database
                    # value back would flag the file as changed on every scan.
                    update_modified_hash_seen(row_id, disk_modified, checksum)
                    debug("*CHANGED*", thisfile)
                else:
                    debug("*REMAINS*", thisfile)
                    unchanged_files += 1
                    update_seen(row_id)
            else:   # New file
                new_files += 1
                checksum = _file_checksum(thisfile)
                db_insert_location(file_path, dir_path, disk_modified, checksum)
                debug("*CREATED*", thisfile)
ct = all_file_count()
debug("Row count:", ct)
# Rows still unseen after the walk belong to files no longer on disk.
deleted_files = count_not_seen()
for dirname, filepath in dir_files_not_seen():
    debug("*DELETED*", os.path.join(dirname, filepath))
delete_not_seen()
conn.commit()
print(f"""
Known:     {known_files:8,d}
New:       {new_files:8,d}
Deleted:   {deleted_files:8,d}
Updated:   {updated_files:8,d}
Unchanged: {unchanged_files:8,d}

Total:     {file_count:8,d}""")


Known:       11,910
New:              0
Deleted:          0
Updated:          2
Unchanged:   11,908

Total:       11,910


In [3]:
# Scratch inspection: display the current values of file_path and dir_path.
# NOTE(review): neither name is assigned in this cell — presumably they are
# the loop variables left over from the scan cell; confirm before relying on it.
file_path, dir_path

('master', '/Users/sholden/Projects/Python/filescan/.git/refs/tags')

In [4]:
# Scratch check: print each (dirpath, dirnames, filenames) tuple os.walk yields.
# NOTE(review): base_dir is not assigned in this cell — presumably the loop
# variable left over from the scan cell; confirm before relying on it.
for rec in os.walk(base_dir):
    print(rec)

('/Users/sholden/Projects/Python/filescan', ['py3.11', '__pycache__', '.ipynb_checkpoints', '.git'], ['api_server_stub.py', 'filescan.ipynb', 'Production.ipynb', '.DS_Store', 'api_client_stub.py', 'Untitled.ipynb', 'sqlite_store.py', 'test_storage.py', 'basic.py', 'postgresql_store.py', 'filescan.wpr', 'mongo_store.py', 'filescan.wpu', 'test.sqlite', 'load_tokens.py', 'filescan.ipy'])
('/Users/sholden/Projects/Python/filescan/py3.11', ['bin', 'include', 'etc', 'lib', 'share'], ['pyvenv.cfg'])
('/Users/sholden/Projects/Python/filescan/py3.11/bin', [], ['jupyter-run', 'pyjson5', 'jupyter-labextension', 'Activate.ps1', 'python3', 'pybabel', 'jlpm', 'python', 'pip3', 'ipython', 'jupyter-notebook', 'activate.fish', 'send2trash', 'python3.11', 'jupyter-labhub', 'jupyter-server', 'jupyter-dejavu', 'ipython3', 'pip', 'jupyter-nbconvert', 'jupyter-lab', 'jsonschema', 'wsdump', 'pip3.11', 'jupyter-troubleshoot', 'pygmentize', 'jupyter-migrate', 'activate', 'jupyter-console', 'jupyter-events', 'n