In [32]:
import os
from pathlib import Path

def _apply_filters(file_path: str, filters: dict) -> bool:
    file = Path(file_path).resolve()
    if filters.get('exclude_str') and \
            set(os.path.normpath(str(file)).split(os.path.sep)) & set(filters['exclude_str']):
        return False
    if filters.get('include_str') and \
            not set(os.path.normpath(str(file)).split(os.path.sep)) & set(filters['include_str']):
        return False
    if filters.get('extensions') and file.suffix not in filters['extensions']:
        return False
    if filters.get('exclude_extensions') and file.suffix in filters['exclude_extensions']:
        return False
    if filters.get('min_size') and file.stat().st_size < filters['min_size']:
        return False
    if filters.get('max_size') and file.stat().st_size > filters['max_size']:
        return False

    return True  # Passes all filter conditions

In [86]:
import os
import pandas as pd
from tqdm import tqdm
from file_processing import File

def process(file_path, report_path, batch_size=1000, breakpoint=0, filters=None, recovery_mode=False):
    """Walk ``file_path`` in batches and write one CSV row per file to ``report_path``.

    Each row is the ``processor.__dict__`` of a ``File`` object produced by
    ``loader``. The report is rewritten after every batch, so an interrupted
    run leaves all completed batches on disk.

    Args:
        file_path: Root directory handed to ``loader``.
        report_path: CSV file to create or, when resuming, to extend.
        batch_size: Number of files per batch, forwarded to ``loader``. With
            ``0`` the loader yields single files; each is treated as a batch
            of one.
        breakpoint: Number of files ``loader`` should skip before yielding.
            Overridden when a run is resumed (see ``recovery_mode``).
        filters: Optional filter mapping forwarded to ``loader``.
        recovery_mode: When True and ``report_path`` already exists, the rows
            already in the report are kept, ``breakpoint`` is set to their
            count, and new rows are appended after them. When the report does
            not exist the run starts from scratch.
    """
    # Crash recovery: auto-computing breakpoint to start processing
    resuming = recovery_mode and os.path.isfile(report_path)
    if resuming:
        breakpoint = len(pd.read_csv(report_path))

    # True once report_path holds rows that later batches must extend. A fresh
    # run starts False so that its first batch replaces any stale report.
    report_started = resuming

    with tqdm(desc='Processing batches', unit=' batches completed') as pbar:
        for batch in loader(file_path, batch_size=batch_size, breakpoint=breakpoint, filters=filters):
            if not isinstance(batch, list):
                # loader yields bare File objects when batch_size == 0.
                batch = [batch]
            data = pd.DataFrame([file.processor.__dict__ for file in batch])

            if report_started:
                # Re-read and concat rather than append in place: concat aligns
                # columns by name, in case batches carry different attribute sets.
                existing = pd.read_csv(report_path)
                data = pd.concat([existing, data], ignore_index=True)

            data.to_csv(report_path, index=False)
            report_started = True

            pbar.update(1)


def loader(file_path, batch_size=0, breakpoint=0, filters=None):
    """Walk ``file_path`` recursively and yield ``File`` objects, singly or in batches.

    Args:
        file_path: Root directory to walk with ``os.walk``.
        batch_size: ``0`` yields one ``File`` at a time; a positive value
            yields lists of exactly that many files, with a shorter final list
            for any remainder.
        breakpoint: Number of accepted files (files that pass ``filters``) to
            skip before anything is yielded. Without filters this equals the
            number of files walked.
        filters: Optional mapping understood by ``_apply_filters``; files that
            fail it are neither yielded nor counted towards ``breakpoint``.

    Yields:
        ``File`` objects (``batch_size == 0``) or lists of them.

    Raises:
        ValueError: If ``batch_size`` is negative.
    """
    if batch_size < 0:
        raise ValueError('batch_size must be 0 (no batching) or a positive integer')

    batch = []
    accepted = 0  # files that passed the filters so far, including skipped ones

    with tqdm(desc='Processing files', unit=' files completed') as pbar:
        for dirpath, _, filenames in os.walk(file_path):
            for filename in filenames:
                # The progress bar counts every file seen, filtered or not.
                pbar.update(1)
                current_path = os.path.join(dirpath, filename)

                if filters and not _apply_filters(current_path, filters):
                    continue

                # Count accepted files, not walked files, so a breakpoint taken
                # from the number of report rows lines up when filters are set.
                accepted += 1
                if accepted <= breakpoint:
                    continue

                # NOTE(review): open_file=False presumably defers reading the
                # file contents - confirm against file_processing.File.
                file_obj = File(current_path, open_file=False)

                if batch_size == 0:
                    yield file_obj
                    continue

                batch.append(file_obj)
                # Flush on the batch's own length so it never exceeds batch_size.
                if len(batch) >= batch_size:
                    yield batch
                    batch = []

        if batch:
            yield batch

# Build the metadata report for ./tests, skipping any path that contains a
# '.venv' or '.git' component.
process(
    './tests',
    report_path='metadata.csv',
    batch_size=1000,
    recovery_mode=False,
    filters={'exclude_str': ['.venv', '.git']},
)

Processing files: 186 files completed [00:01, 104.31 files completed/s]ed]
Processing batches: 1 batches completed [00:01,  1.79s/ batches completed]
