# DEV - Filtering

Filtering file paths and folders the way `git` does with a `.gitignore` file.

Flow diagram:

```text
START
╚═➤ filter_paths: given a base directory and a filter file, builds the included and ignored file lists
    ╚═➤ parse_filter_file: loads and parses the file with the exclusion/inclusion directives
    ╚═➤ is_ignored: applied to each path to determine if it is to be included/excluded
        ║            depending on the filtering criteria
        ╚═➤ match_pattern: compares the path against the filtering criteria
```      

```
╠═
╚═
║
```

In [1]:
import os
import fnmatch
# import re

In [25]:
dirname = 'a'

# Remove the directory only when it exists and is empty. The isdir guard keeps
# the cell re-runnable: os.listdir raises FileNotFoundError once the directory
# has been removed by a previous run.
if os.path.isdir(dirname) and not os.listdir(dirname):
    os.rmdir(dirname)
    print(f"{dirname} removed!")


a removed!


In [None]:
def parse_filter_file(filepath:str) -> list:
    """
    Read a filter file and return its rules as (pattern, negation) tuples.

    Each line is stripped of surrounding whitespace; blank lines and lines
    starting with '#' are skipped. A leading '!' is removed from the pattern
    and recorded in the negation flag:
    negation=True  -> file/folder to be included
    negation=False -> file/folder to be excluded

    Returns an empty list when filepath does not exist.
    """
    if not os.path.exists(filepath):
        return []

    with open(filepath, 'r', encoding='utf-8') as f:
        stripped_lines = (raw.strip() for raw in f)
        # Built eagerly inside the `with` so the file is still open while read.
        rules = [
            (entry[1:], True) if entry.startswith('!') else (entry, False)
            for entry in stripped_lines
            if entry and not entry.startswith('#')
        ]
    return rules


def _match_components(path_parts, pattern_parts, dir_only):
    """
    Tell whether pattern_parts match the leading components of path_parts.

    path_parts    -> path split on '/', relative to the base directory
    pattern_parts -> pattern split on '/', each part an fnmatch glob or '**'
    dir_only      -> True when the pattern ended with '/', i.e. it may only
                     match a directory (something that has components below it)
    """
    if not pattern_parts:
        # Whole pattern consumed. Leftover components mean the pattern matched
        # an ancestor directory, which covers everything below it. With nothing
        # left the pattern matched the path itself, which a directory-only
        # pattern must not accept.
        return bool(path_parts) or not dir_only

    head, rest = pattern_parts[0], pattern_parts[1:]
    if head == '**':
        # '**' spans zero or more directories; as the last component it must
        # consume at least one so 'dir/**' only matches what is inside 'dir'.
        first = 0 if rest else 1
        return any(
            _match_components(path_parts[i:], rest, dir_only)
            for i in range(first, len(path_parts) + 1)
        )

    if not path_parts:
        return False
    return (fnmatch.fnmatch(path_parts[0], head)
            and _match_components(path_parts[1:], rest, dir_only))


def match_pattern(path, pattern, base_dir):
    """
    Match a path to a .gitignore-style pattern.

    path     -> path to test; it is made relative to base_dir before matching
    pattern  -> a single pattern, without the leading '!' of negated rules
    base_dir -> directory the patterns are relative to

    Rules applied:
    - a pattern with a leading or inner '/' is anchored at base_dir;
      otherwise it may match at any depth
    - a pattern with a trailing '/' only matches directories
    - '*', '?' and '[...]' are matched per path component (never across '/')
    - '**' as a whole component matches any number of directories
    - a pattern matching a directory also matches everything below it

    The path is never checked on disk: its last component is treated as a
    file, every other component as a directory.
    Returns True when the path matches the pattern.
    """
    # Normalize
    path = path.replace(os.sep, '/')
    base_dir = base_dir.replace(os.sep, '/')
    pattern = pattern.replace(os.sep, '/')

    # Always match on the path relative to base_dir, so that prefixes such as
    # './' or an absolute base directory do not defeat the patterns.
    try:
        path_rel = os.path.relpath(path, base_dir).replace(os.sep, '/')
    except ValueError:
        # relpath cannot relate the two (e.g. different drives on Windows):
        # fall back to the path as given.
        path_rel = path
    path_parts = [part for part in path_rel.split('/') if part not in ('', '.')]

    dir_only = pattern.endswith('/')
    pattern_parts = [part for part in pattern.split('/') if part]
    if not pattern_parts:
        return False

    # A separator at the start or in the middle ties the pattern to base_dir.
    anchored = pattern.startswith('/') or len(pattern_parts) > 1
    if anchored:
        return _match_components(path_parts, pattern_parts, dir_only)

    # Un-anchored: the pattern may start at any component of the path.
    return any(
        _match_components(path_parts[i:], pattern_parts, dir_only)
        for i in range(len(path_parts))
    )


def is_ignored(path, patterns, base_dir):
    """
    Decide whether a path is ignored according to the parsed patterns.

    patterns is an iterable of (pattern, negated) tuples. Every pattern is
    tried in order and the last one that matches wins: a plain pattern marks
    the path as ignored, a negated one (!pattern) marks it as not ignored.
    Returns False when no pattern matches.
    """
    verdict = False
    for rule in patterns:
        glob, is_negation = rule
        if not match_pattern(path, glob, base_dir):
            continue
        # Later rules override earlier ones.
        verdict = not is_negation
    return verdict


def iter_files(base_dir):
    """ Yield the joined path of every file that os.walk finds under base_dir. """
    for current_dir, _subdirs, filenames in os.walk(base_dir):
        yield from (os.path.join(current_dir, name) for name in filenames)


def filter_paths(base_dir, filter_filepath):
    """
    Split the files under base_dir into included and ignored ones.

    The rules are loaded from filter_filepath with parse_filter_file, and
    every file yielded by iter_files(base_dir) is tested with is_ignored.
    Returns (included_files, ignored_files): two sorted lists of paths
    relative to base_dir.
    """
    rules = parse_filter_file(filter_filepath)
    kept = []
    dropped = []

    for full_path in iter_files(base_dir):
        skip = is_ignored(full_path, rules, base_dir)
        bucket = dropped if skip else kept
        bucket.append(os.path.relpath(full_path, base_dir))

    return sorted(kept), sorted(dropped)

In [21]:
from scipy.interpolate import interp2d

In [19]:
FILTER_FILE = '.libignore'

# Make sure the filter file exists so the following cells can read it.
if not os.path.exists(FILTER_FILE):
    # Opening in write mode is enough to create an empty file.
    with open(FILTER_FILE, 'w') as f:
        pass

In [20]:
# Show the raw filter file next to its parsed form.
# Read it as UTF-8, the same encoding parse_filter_file uses, so both views
# agree regardless of the platform default encoding.
with open(FILTER_FILE, 'r', encoding='utf-8') as f:
    filter_file_content = f.read()

print("File Content >>\n", filter_file_content)

print("File Parsed:", parse_filter_file(FILTER_FILE))

File Content >>
 # Ignore logs
*.log
*.txt

# Ignore Python cache folders
__pycache__/

# Except important.log
!important.log

File Parsed: [('*.log', False), ('*.txt', False), ('__pycache__/', False), ('important.log', True)]


In [None]:
# Run the filter over the current directory and list both result sets.
base_dir = "."
filter_filepath = os.path.join(base_dir, FILTER_FILE)

included, ignored = filter_paths(base_dir, filter_filepath)

for header, paths in (
    ("=== INCLUDED FILES ===", included),
    ("\n=== IGNORED FILES ===", ignored),
):
    print(header)
    for f in paths:
        print(f)

=== INCLUDED FILES ===
dev_filtering.ipynb
dev_librarian.ipynb
filter_file

=== IGNORED FILES ===
test_ext_folder\readme.txt
test_int_folder\readme.txt
