Skip to content

Commit

Permalink
.chkbitignore support for subdirectories #8
Browse files Browse the repository at this point in the history
  • Loading branch information
laktak committed Jan 3, 2024
1 parent 0f55a94 commit df44bc7
Show file tree
Hide file tree
Showing 11 changed files with 183 additions and 81 deletions.
19 changes: 15 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ chkbit will
Run `chkbit PATH` to verify only.

```
usage: chkbit [-h] [-u] [--algo ALGO] [-f] [-s] [--index-name NAME] [--ignore-name NAME] [-w N] [--plain] [-q] [-v] [PATH ...]
usage: chkbit [-h] [-u] [--show-ignored-only] [--algo ALGO] [-f] [-s] [--index-name NAME] [--ignore-name NAME] [-w N] [--plain] [-q] [-v] [PATH ...]
Checks the data integrity of your files. See https://github.com/laktak/chkbit-py
Expand All @@ -84,6 +84,7 @@ positional arguments:
options:
-h, --help show this help message and exit
-u, --update update indices (without this chkbit will verify files in readonly mode)
--show-ignored-only only show ignored files
--algo ALGO hash algorithm: md5, sha512, blake3 (default: blake3)
-f, --force force update of damaged items
-s, --skip-symlinks do not follow symlinks
Expand All @@ -94,14 +95,20 @@ options:
-q, --quiet quiet, don't show progress/information
-v, --verbose verbose output
.chkbitignore rules:
each line should contain exactly one name
you may use Unix shell-style wildcards (see README)
lines starting with `#` are skipped
lines starting with `/` are only applied to the current directory
Status codes:
DMG: error, data damage detected
EIX: error, index damaged
old: warning, file replaced by an older version
new: new file
upd: file updated
ok : check ok
skp: skipped (see .chkbitignore)
ign: ignored (see .chkbitignore)
EXC: internal exception
```

Expand All @@ -123,9 +130,13 @@ You should
Add a `.chkbitignore` file containing the names of the files/directories you wish to ignore

- each line should contain exactly one name
- you may use [Unix shell-style wildcards](https://docs.python.org/3/library/fnmatch.html)
- `*` matches everything
- `?` matches any single character
- `[seq]` matches any character in seq
- `[!seq]` matches any character not in seq
- lines starting with `#` are skipped
- you may use [Unix shell-style wildcards](https://docs.python.org/3.8/library/fnmatch.html)
- at the moment does not allow to match files in subdirectories (PR welcome)
- lines starting with `/` are only applied to the current directory

## FAQ

Expand Down
2 changes: 2 additions & 0 deletions chkbit/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from chkbit.status import Status
from chkbit.ignore import Ignore
from chkbit.input_item import InputItem
from chkbit.context import Context
from chkbit.hashfile import hashfile, hashtext
from chkbit.index import Index
Expand Down
19 changes: 17 additions & 2 deletions chkbit/context.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from __future__ import annotations
import queue
from chkbit import Status
import chkbit
from typing import Optional
from chkbit import InputItem


class Context:
Expand All @@ -9,6 +12,7 @@ def __init__(
num_workers=5,
force=False,
update=False,
show_ignored_only=False,
hash_algo="blake3",
skip_symlinks=False,
index_filename=".chkbit",
Expand All @@ -17,19 +21,30 @@ def __init__(
self.num_workers = num_workers
self.force = force
self.update = update
self.show_ignored_only = show_ignored_only
self.hash_algo = hash_algo
self.skip_symlinks = skip_symlinks
self.index_filename = index_filename
self.ignore_filename = ignore_filename

# the input queue is used to distribute the work
# to the index threads
self.input_queue = queue.Queue()

self.result_queue = queue.Queue()
self.hit_queue = queue.Queue()

if hash_algo not in ["md5", "sha512", "blake3"]:
raise Exception(f"{hash_algo} is unknown.")

def log(self, stat: Status, path: str):
def log(self, stat: chkbit.Status, path: str):
self.result_queue.put((0, stat, path))

def hit(self, *, cfiles: int = 0, cbytes: int = 0):
self.result_queue.put((1, cfiles, cbytes))

def add_input(self, path: str, *, ignore: Optional[chkbit.Ignore] = None):
    """Wrap `path` and its optional ignore rules in an InputItem and queue it.

    The item is put on the input queue, which distributes the work to the
    index threads.
    """
    item = InputItem(path, ignore=ignore)
    self.input_queue.put(item)

def end_input(self):
self.input_queue.put(None)
2 changes: 1 addition & 1 deletion chkbit/hashfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def hashfile(path: str, hash_algo: str, *, hit: Callable[[str], None]):
return h.hexdigest()


def hashtext(text):
def hashtext(text: str):
    """Return the hexadecimal MD5 digest of `text` encoded as UTF-8."""
    encoded = text.encode("utf-8")
    return hashlib.md5(encoded).hexdigest()
56 changes: 56 additions & 0 deletions chkbit/ignore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from __future__ import annotations
import fnmatch
import os
import sys
import chkbit
from enum import Enum
from typing import Optional


class Ignore:
    """Ignore rules of one directory, chained to the rules of its parents.

    The rules are read from the ignore file (named by
    `context.ignore_filename`) inside `path`. A lookup first consults the
    rules of this directory and then walks up the `parent_ignore` chain.
    """

    def __init__(
        self,
        context: chkbit.Context,
        path: str,
        *,
        parent_ignore: Optional[chkbit.Ignore],
    ):
        """Load the ignore rules found in `path`.

        Args:
            context: supplies `ignore_filename`.
            path: directory whose ignore file is loaded.
            parent_ignore: rules of the parent directory, or None for a root.
        """
        self.parent_ignore = parent_ignore
        self.context = context
        self.path = path
        # directory name with a trailing slash; it is prefixed to a name
        # when a lookup is handed on to the parent rules
        self.name = os.path.basename(path) + "/"
        self.ignore = []
        self.load_ignore()

    @property
    def ignore_filepath(self):
        """Path of the ignore file inside this directory."""
        return os.path.join(self.path, self.context.ignore_filename)

    def load_ignore(self):
        """Read the patterns from the ignore file, if there is one.

        Empty and whitespace-only lines and lines starting with `#` are
        dropped; every other line is kept verbatim as a pattern.
        """
        if not os.path.exists(self.ignore_filepath):
            return
        with open(self.ignore_filepath, "r", encoding="utf-8") as f:
            text = f.read()

        self.ignore = [
            line
            for line in text.splitlines()
            if line and line[0] != "#" and line.strip()
        ]

    def should_ignore(self, name: str, *, fullname: Optional[str] = None):
        """Return True if `name` matches a rule of this directory or a parent.

        Args:
            name: entry name as seen from the calling directory. When the
                lookup was delegated by a child this may already be a
                `/`-separated relative path.
            fullname: `name` prefixed with the delegating child's directory
                name; only set on delegated lookups.

        Patterns starting with `/` apply to direct entries of this directory
        only, so they are skipped on delegated lookups.
        """
        # Delegated lookups replace `name` with a relative path, so the plain
        # entry name would no longer be tested from the second parent level
        # upwards. Always test the last path component as well so that a
        # plain pattern applies at any depth, not just one level down.
        candidates = [name]
        basename = name.rpartition("/")[2]
        if basename != name:
            candidates.append(basename)
        if fullname:
            candidates.append(fullname)

        for pattern in self.ignore:
            if pattern.startswith("/"):
                # anchored rule: current directory only
                if not fullname and fnmatch.fnmatch(name, pattern[1:]):
                    return True
                continue
            if any(fnmatch.fnmatch(candidate, pattern) for candidate in candidates):
                return True

        if self.parent_ignore:
            relative = fullname or name
            return self.parent_ignore.should_ignore(
                relative, fullname=self.name + relative
            )
        return False
53 changes: 22 additions & 31 deletions chkbit/index.py
Original file line number Diff line number Diff line change
@@ -1,51 +1,49 @@
from __future__ import annotations
import fnmatch
import os
import subprocess
import sys
import json
import chkbit
from chkbit import hashfile, hashtext, Status
from typing import Optional

VERSION = 2 # index version


class Index:
def __init__(self, context, path, files, *, readonly=False):
def __init__(
self,
context: chkbit.Context,
path: str,
files: list[str],
*,
readonly: bool = False,
):
self.context = context
self.path = path
self.files = files
self.old = {}
self.new = {}
self.ignore = []
self.load_ignore()
self.updates = []
self.modified = None
self.readonly = readonly

@property
def ignore_filepath(self):
return os.path.join(self.path, self.context.ignore_filename)

@property
def index_filepath(self):
return os.path.join(self.path, self.context.index_filename)

def should_ignore(self, name):
for ignore in self.ignore:
if fnmatch.fnmatch(name, ignore):
return True
return False

def _setmod(self, value=True):
self.modified = value

def _log(self, stat: Status, name: str):
self.context.log(stat, os.path.join(self.path, name))

# calc new hashes for this index
def update(self):
def calc_hashes(self, *, ignore: Optional[chkbit.Ignore] = None):
for name in self.files:
if self.should_ignore(name):
self._log(Status.SKIP, name)
if ignore and ignore.should_ignore(name):
self._log(Status.IGNORE, name)
continue

a = self.context.hash_algo
Expand All @@ -65,8 +63,13 @@ def update(self):
else:
self.new[name] = self._calc_file(name, a)

def show_ignored_only(self, ignore: chkbit.Ignore):
    """Log each file of this index that `ignore` excludes with Status.IGNORE."""
    for ignored_name in filter(ignore.should_ignore, self.files):
        self._log(Status.IGNORE, ignored_name)

# check/update the index (old vs new)
def check_fix(self, force):
def check_fix(self, force: bool):
for name in self.new.keys():
if not name in self.old:
self._log(Status.NEW, name)
Expand Down Expand Up @@ -101,15 +104,15 @@ def check_fix(self, force):
self._log(Status.WARN_OLD, name)
self._setmod()

def _list_file(self, name, a):
def _list_file(self, name: str, a: str):
# produce a dummy entry for new files when the index is not updated
return {
"mod": None,
"a": a,
"h": None,
}

def _calc_file(self, name, a):
def _calc_file(self, name: str, a: str):
path = os.path.join(self.path, name)
info = os.stat(path)
mtime = int(info.st_mtime * 1000)
Expand Down Expand Up @@ -158,15 +161,3 @@ def load(self):
self._setmod()
self._log(Status.ERR_IDX, self.index_filepath)
return True

def load_ignore(self):
if not os.path.exists(self.ignore_filepath):
return
with open(self.ignore_filepath, "r", encoding="utf-8") as f:
text = f.read()

self.ignore = list(
filter(
lambda x: x and x[0] != "#" and len(x.strip()) > 0, text.splitlines()
)
)
52 changes: 30 additions & 22 deletions chkbit/index_thread.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,29 @@
from __future__ import annotations
import os
import sys
import time
import threading
from chkbit import Index, Status
import chkbit
from chkbit import Index, Status, Ignore


class IndexThread:
def __init__(self, thread_no, context, input_queue):
def __init__(self, thread_no: int, context: chkbit.Context):
self.thread_no = thread_no
self.update = context.update
self.context = context
self.input_queue = input_queue
self.input_queue = context.input_queue
self.t = threading.Thread(target=self._run)
self.t.daemon = True
self.t.start()

def _process_root(self, parent):
def _process_root(self, iitem: chkbit.InputItem):
files = []
dirs = []

# load files and subdirs
for name in os.listdir(path=parent):
path = os.path.join(parent, name)
for name in os.listdir(path=iitem.path):
path = os.path.join(iitem.path, name)
if name[0] == ".":
continue
if os.path.isdir(path):
Expand All @@ -33,36 +35,42 @@ def _process_root(self, parent):
files.append(name)

# load index
index = Index(self.context, parent, files, readonly=not self.update)
index = Index(self.context, iitem.path, files, readonly=not self.update)
index.load()

# calc the new hashes
index.update()
# load ignore
ignore = Ignore(self.context, iitem.path, parent_ignore=iitem.ignore)

# compare
index.check_fix(self.context.force)
if self.context.show_ignored_only:
index.show_ignored_only(ignore)
else:
# calc the new hashes
index.calc_hashes(ignore=ignore)

# save if update is set
if self.update:
if index.save():
self.context.log(Status.UPDATE_INDEX, "")
# compare
index.check_fix(self.context.force)

# save if update is set
if self.update:
if index.save():
self.context.log(Status.UPDATE_INDEX, "")

# process subdirs
for name in dirs:
if not index.should_ignore(name):
self.input_queue.put(os.path.join(parent, name))
if not ignore.should_ignore(name):
self.context.add_input(os.path.join(iitem.path, name), ignore=ignore)
else:
self.context.log(Status.SKIP, name + "/")
self.context.log(Status.IGNORE, name + "/")

def _run(self):
while True:
parent = self.input_queue.get()
if parent is None:
iitem = self.input_queue.get()
if iitem is None:
break
try:
self._process_root(parent)
self._process_root(iitem)
except Exception as e:
self.context.log(Status.INTERNALEXCEPTION, f"{parent}: {e}")
self.context.log(Status.INTERNALEXCEPTION, f"{iitem.path}: {e}")
self.input_queue.task_done()

def join(self):
Expand Down
9 changes: 9 additions & 0 deletions chkbit/input_item.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from __future__ import annotations
from typing import Optional
import chkbit


class InputItem:
    """A directory path paired with the ignore rules handed to it, if any."""

    def __init__(self, path: str, *, ignore: Optional[chkbit.Ignore] = None):
        """Store `path` and the optional `ignore` object unchanged."""
        self.path, self.ignore = path, ignore

0 comments on commit df44bc7

Please sign in to comment.