In [None]:
import os
from pathlib import Path

# Root directory under which the whole dummy tree is created
root = Path("/app/data/dev/hyfs/test/fs")

# Create the root directory (no error if it already exists)
root.mkdir(parents=True, exist_ok=True)

# Define the filesystem structure with lots of variety and edge cases.
# Keys are paths relative to `root`; values are the file contents:
# `str` values are written as text files, `bytes` values as binary files.
filesystem = {
    # Regular files at root
    "README.md": "# Test Project\n",
    "config.json": '{"version": "1.0"}\n',
    ".gitignore": "*.pyc\n__pycache__/\n",
    ".env": "SECRET_KEY=test123\n",
    "requirements.txt": "numpy==1.24.0\npandas>=2.0.0\n",
    
    # Source code directory
    "src/main.py": "def main():\n    pass\n",
    "src/utils.py": "# Utilities\n",
    "src/__init__.py": "",
    "src/models/user.py": "class User:\n    pass\n",
    "src/models/__init__.py": "",
    "src/models/product.py": "class Product:\n    pass\n",
    
    # Tests directory
    "tests/test_main.py": "def test_main():\n    assert True\n",
    "tests/__init__.py": "",
    "tests/fixtures/data.json": '{"test": "data"}\n',
    "tests/fixtures/sample.csv": "id,name,value\n1,test,100\n",
    
    # Data directory with various file types
    "data/raw/dataset_2023.csv": "col1,col2,col3\n1,2,3\n",
    "data/raw/dataset_2024.csv": "col1,col2,col3\n4,5,6\n",
    "data/processed/cleaned_data.parquet": b"fake parquet data",
    "data/processed/features.pkl": b"fake pickle data",
    "data/images/photo1.jpg": b"fake jpg data",
    "data/images/photo2.png": b"fake png data",
    "data/images/thumbnails/thumb1.jpg": b"fake thumbnail",
    
    # Documentation
    "docs/index.html": "<html><body>Docs</body></html>\n",
    "docs/api/endpoints.md": "# API Endpoints\n",
    "docs/api/authentication.md": "# Auth\n",
    "docs/guides/getting-started.pdf": b"fake pdf data",
    
    # Configuration files
    "config/development.yaml": "debug: true\n",
    "config/production.yaml": "debug: false\n",
    "config/database.ini": "[database]\nhost=localhost\n",
    
    # Build artifacts
    "build/output.js": "console.log('built');\n",
    "build/styles.css": "body { margin: 0; }\n",
    "dist/bundle.min.js": "!function(){console.log('minified')}();\n",
    
    # Edge cases: spaces, punctuation, multiple dots, and letter case in names
    "files with spaces/document 1.txt": "Content with spaces\n",
    "files with spaces/my file (copy).docx": b"fake docx",
    "special-chars/file@2024.txt": "File with @ symbol\n",
    "special-chars/data#1.csv": "test,data\n",
    "special-chars/report_v2.1.pdf": b"fake pdf",
    "multiple.dots.in.name.txt": "Multiple dots\n",
    "UPPERCASE.TXT": "UPPERCASE FILE\n",
    "MixedCase.TxT": "Mixed case extension\n",
    
    # Hidden files and directories
    ".hidden/secret.txt": "Hidden content\n",
    ".hidden/.config": "hidden config\n",
    ".cache/temp1.tmp": "cache data\n",
    
    # Placeholder-only directory: holds nothing but an empty `.keep` file
    # (directories with no entries at all are created separately, after the loop)
    "empty_dir/.keep": "",
    
    # Deep nesting
    "a/b/c/d/e/deep_file.txt": "Very nested\n",
    
    # Various extensions
    "scripts/deploy.sh": "#!/bin/bash\necho 'deploying'\n",
    "scripts/backup.bat": "@echo off\necho backing up\n",
    "notebooks/analysis.ipynb": '{"cells": []}\n',
    "media/video.mp4": b"fake video data",
    "media/audio.mp3": b"fake audio data",
    "archives/backup.zip": b"fake zip data",
    "archives/old_data.tar.gz": b"fake tar.gz data",
    
    # Files with no extension
    "LICENSE": "MIT License\n",
    "Makefile": "all:\n\techo 'building'\n",
    "Dockerfile": "FROM python:3.11\n",
    
    # Very long filename
    "long_filename_that_goes_on_and_on_and_on_to_test_length_limits.txt": "Long name\n",
    
    # Numeric filenames
    "logs/2024-01-01.log": "[INFO] Log entry\n",
    "logs/2024-01-02.log": "[ERROR] Error entry\n",
    "reports/001_report.txt": "Report 1\n",
    "reports/002_report.txt": "Report 2\n",
}

# Create all files and directories
for filepath, content in filesystem.items():
    full_path = root / filepath
    # Nested entries need their parent directories to exist before the write
    full_path.parent.mkdir(parents=True, exist_ok=True)
    
    # bytes values are written verbatim; str values become text files
    if isinstance(content, bytes):
        full_path.write_bytes(content)
    else:
        # Pin the encoding so the fixture does not depend on the host's locale default
        full_path.write_text(content, encoding="utf-8")

# Create a truly empty directory
(root / "truly_empty").mkdir(exist_ok=True)

# Create another empty nested directory
(root / "temp/cache/empty").mkdir(parents=True, exist_ok=True)

print(f"✓ Created dummy filesystem at {root}")
print(f"✓ Total files created: {len(filesystem)}")
# Plain string: there is nothing to interpolate on this line
print("✓ Includes edge cases: spaces, special chars, hidden files, deep nesting, various extensions")

✓ Created dummy filesystem at /app/data/dev/hyfs/test/fs
✓ Total files created: 60
✓ Includes edge cases: spaces, special chars, hidden files, deep nesting, various extensions


In [None]:
# Show everything under the test directory as an indented tree: `find` lists every path,
# then `sed` (1) strips the common /app/data/dev/hyfs/test prefix, (2) replaces each
# leading "component/" with "|  ", and (3) rewrites the "|  " directly in front of the
# entry name as "├─ ".
!find /app/data/dev/hyfs/test -print | sed -e "s;/app/data/dev/hyfs/test;;" -e "s;[^/]*/;|  ;g" -e "s;|  \([^|]\);├─ \1;"


├─ fs
|  ├─ config.json
|  ├─ .hidden
|  |  ├─ secret.txt
|  |  ├─ .config
|  ├─ src
|  |  ├─ models
|  |  |  ├─ product.py
|  |  |  ├─ __init__.py
|  |  |  ├─ user.py
|  |  ├─ utils.py
|  |  ├─ __init__.py
|  |  ├─ main.py
|  ├─ dist
|  |  ├─ bundle.min.js
|  ├─ tests
|  |  ├─ test_main.py
|  |  ├─ __init__.py
|  |  ├─ fixtures
|  |  |  ├─ data.json
|  |  |  ├─ sample.csv
|  ├─ build
|  |  ├─ styles.css
|  |  ├─ output.js
|  ├─ reports
|  |  ├─ 002_report.txt
|  |  ├─ 001_report.txt
|  ├─ config
|  |  ├─ development.yaml
|  |  ├─ production.yaml
|  |  ├─ database.ini
|  ├─ .gitignore
|  ├─ requirements.txt
|  ├─ media
|  |  ├─ video.mp4
|  |  ├─ audio.mp3
|  ├─ docs
|  |  ├─ api
|  |  |  ├─ endpoints.md
|  |  |  ├─ authentication.md
|  |  ├─ index.html
|  |  ├─ guides
|  |  |  ├─ getting-started.pdf
|  ├─ files with spaces
|  |  ├─ document 1.txt
|  |  ├─ my file (copy).docx
|  ├─ UPPERCASE.TXT
|  ├─ MixedCase.TxT
|  ├─ README.md
|  ├

# HyFS
> A Hyper FileSystem

Architecture phase: exploration (2)

Current code:

In [None]:
import uuid
import os
import errno
from hashlib import sha256
from pathlib import Path
from fastcore.basics import AttrDict, patch
from fastcore.foundation import L
from fastcore.xtras import dict2obj
from fnmatch import fnmatch

class FSNode(AttrDict):
    "Tree node (an `AttrDict`) whose attribute lookup first resolves properties declared directly on its class."
    def __getattribute__(self, key):
        # Reach the class through object.__getattribute__ so this lookup does not re-enter the override
        declared = vars(object.__getattribute__(self, '__class__')).get(key)
        if isinstance(declared, property): return declared.fget(self)
        return super().__getattribute__(key)

@patch
def show(self:FSNode, indent=0):
    "Print this node's name indented four spaces per level, then do the same for the children of a 'dir' node."
    pad = '    ' * indent
    print(pad + self.path.name)
    if self.type != 'dir': return
    for kid in self.children: kid.show(indent + 1)

@patch
def filter(self:FSNode, pred):
    "Return a flat `L` of this node and every descendant for which `pred(node)` is truthy, in depth-first pre-order."
    hits = L()
    if pred(self): hits.append(self)
    # Only 'dir' nodes carry children; any other node ends the recursion here
    if self.type != 'dir': return hits
    for kid in self.children: hits += kid.filter(pred)
    return hits

@patch
def find(self:FSNode, pattern):
    "Return the nodes from `filter` whose file name (`path.name`) matches the `fnmatch` glob `pattern`."
    def name_matches(node): return fnmatch(node.path.name, pattern)
    return self.filter(name_matches)

@property
def eid(self:FSNode):
    """Return a UUID-formatted entity ID string for this node.

    Resolution order:
    1. Return the ID already stored in the `user.fsxp.uuid` extended attribute, if readable.
    2. Otherwise generate a random UUID4, try to store it in that xattr, and return it.
    3. If xattrs cannot be used (the platform's `os` module has no xattr functions, or storing
       fails with ENOTSUP/EPERM/EACCES), return a SHA-256 based ID derived from
       `st_dev`, `st_ino` and `st_mtime`.

    Raises `OSError` for any other failure while storing the xattr, or from `stat()`.
    """
    path_str = str(self.path)
    xattr_key = 'user.fsxp.uuid'
    # os.getxattr/os.setxattr are not available on every platform (the Python docs list Linux only);
    # look them up defensively so other platforms reach the hash fallback instead of AttributeError.
    getxattr, setxattr = getattr(os, 'getxattr', None), getattr(os, 'setxattr', None)
    
    if getxattr is not None:
        try:
            return getxattr(path_str, xattr_key).decode()
        except OSError:
            pass  # attribute missing or unreadable: fall through and try to create one
    
    if setxattr is not None:
        new_uuid = str(uuid.uuid4())
        try:
            setxattr(path_str, xattr_key, new_uuid.encode())
            return new_uuid
        except OSError as e:
            # Only "xattrs unsupported / not permitted" falls back; anything else is a real error
            if e.errno not in (errno.ENOTSUP, errno.EPERM, errno.EACCES): raise
    
    # Fallback ID, laid out like a UUID (8-4-4-4-12 hex digits).
    # NOTE: st_mtime is part of the hash input, so this ID changes when the mtime changes.
    s = self.path.stat()
    data = f"{s.st_dev}:{s.st_ino}:{s.st_mtime}".encode()
    hash_hex = sha256(data).hexdigest()
    return f"{hash_hex[:8]}-{hash_hex[8:12]}-{hash_hex[12:16]}-{hash_hex[16:20]}-{hash_hex[20:32]}"

# Attach manually as a class attribute so `node.eid` resolves through FSNode's property lookup
FSNode.eid = eid

def build_tree(path):
    """Recursively describe `path` as nested plain dicts.

    Returns `{'path': Path, 'type': 'file'}` for anything that is not a directory and
    `{'path': Path, 'type': 'dir', 'children': [...]}` for a directory, with children
    sorted by path so repeated builds of the same tree give the same result.

    Raises `FileNotFoundError` if `path` does not exist at all.
    """
    p = Path(path)
    if not p.is_dir():
        # Entries that exist but cannot be listed (dangling symlinks, sockets, FIFOs, ...) are
        # leaves too, so one odd entry no longer aborts the scan; a missing path is still an error.
        if not (p.is_symlink() or p.exists()): raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), str(p))
        return {'path': p, 'type': 'file'}
    # NOTE: is_dir() follows symlinks, so symlinked directories are descended into (cycles are not guarded).
    # iterdir() yields entries in arbitrary order; sort for a deterministic tree.
    children = [build_tree(child) for child in sorted(p.iterdir())]
    return {'path': p, 'type': 'dir', 'children': children}

# Test it!
fstree = dict2obj(build_tree('/app/data/dev/hyfs/test/'), dict_func=FSNode)
print("Root eid:", fstree.eid)
# The root may have no entries yet; indexing children[0] unconditionally raises IndexError then
if len(fstree.children): print("First child eid:", fstree.children[0].eid)
else: print("First child eid: (root has no children)")
print("\nAll .ipynb files:")
for node in fstree.find('*.ipynb'):
    print(f"  {node.path.name}: {node.eid}")

Root eid: 068d27c0-a6d0-457e-9449-bdd1da937502


IndexError: list index out of range

# HyFS Design Document
> formerly called fsxp

## Overview
HyFS (Hyper FileSystem; formerly fsxp, the "FileSystem eXPlorer") is a Python-based filesystem management tool built on fastcore principles. It provides a tree-based representation of filesystem structures with stable entity identification, designed for interactive exploration, filtering, and eventual manipulation.

## Core Philosophy

### Principle of Lean Information Form (LIF)
Information must be expressed in its meaningful form, preserving integrity without requiring decoders. We store semantic structure directly, then decide display independently. This means:
- Organize as objects, lists, or indented structures—always fully meaningful
- No ASCII art for tree branches (`├──`, `└──`)—these are display concerns, not data
- When we `pathlib.Path()` the filesystem, we store what we find properly, then display separately

### LIF Lemma 1: Separation of Tagging and Nesting
Item tagging (atomic, category-based) and tree nesting (path-based hierarchy) are separate concerns:
- Don't store tags inside tree structure—keep them separate (dict mapping paths to tag sets, or metadata layer)
- Tags are many-to-many; trees are one-to-many
- Decide early: filesystem metadata (xattrs) or application-level (separate file/db)
- xattrs are portable but platform-dependent; app-level is consistent but not universal

### The fastcore Way
Methods return transformed data when possible, enabling chaining. `filter()` returns a new structure (or `L` of nodes), not print output. This separates data transformation from presentation.

### Make Side Effects Explicit and Deferrable
Inspired by Git's staging area, ZFS transactions, and the Command Pattern:
- **Read operations**: Immediate (work directly on tree snapshot)
- **Write operations**: Return a Plan/Transaction object that can be inspected, then executed
- Example: `plan = node.rename('newname')` → `plan.preview()` → `plan.execute()`
- This provides safety, composability, and clear boundaries between observation and mutation

## Architecture Decisions

### Data Structure: AttrDict + Path Composition
**Choice**: FSNode as AttrDict subclass, with Path objects as values
```python
{'path': Path('/app/data'), 'type': 'dir', 'children': [...]}
```

**Why**:
- Clean separation: AttrDict handles tree structure, Path handles filesystem operations
- Dual access: `node.path` (attribute) and `node['path']` (dict) both work
- Composable: Leverage both APIs fully without fighting immutability
- REPL-friendly: Tab completion works on attributes

**Rejected alternatives**:
- Subclassing Path: Fights Path's immutability, adds complexity
- Plain dicts: Loses ergonomic attribute access
- Custom tree node classes: More ceremony, less flexibility

### Tree Building: Recursive (Structure Eager, Metadata Lazy)
```python
def build_tree(path):
    p = Path(path)
    if p.is_file():
        return {'path': p, 'type': 'file'}
    children = [build_tree(child) for child in p.iterdir()]
    return {'path': p, 'type': 'dir', 'children': children}
```

**Why**:
- Simple recursion mirrors filesystem structure naturally
- Returns plain dicts, converted to FSNode via `dict2obj(build_tree(path), dict_func=FSNode)`
- Fast: No metadata collection upfront, only structure
- Extensible: Easy to add fields later

### Metadata: Lazy Properties
**Choice**: Properties like `eid` (implemented) and the planned `size` and `mtime` are computed on demand via `@property`

**Why**:
- Pay-as-you-go: Don't stat() 100K files if you only need 50
- Scales better: Building tree with metadata upfront would add 20+ seconds for 1M files
- Most use cases filter first, then access metadata on subset
- Can add `@cached_property` later if repeated access becomes bottleneck

**Threshold analysis**:
- 10K files: Upfront metadata = 200ms (negligible), but lazy still better for partial queries
- 100K files: Upfront = 2s (noticeable), lazy = instant build + selective stats
- 1M files: Upfront = 20s+ (painful), lazy = <1s build

### Entity Identification: UUID with xattr Storage

**Concept**: Every node (file or directory) gets a stable `eid` (Entity ID)

**Why `eid` not `fid`/`nid`**:
- Directories are entities too—structure has semantic meaning before files exist
- `fid` (file ID) excludes directories
- `nid` (node ID) too bound to filesystem concept (inode)
- `eid` sits at perfect abstraction: generic enough for any representation, specific enough to be meaningful
- Conceptual hierarchy: `cid` (content) → `eid` (entity/metadata) → `nid` (filesystem-specific)

**Storage strategy**:
1. Try to read UUID from xattr `user.fsxp.uuid` (the key still carries the project's former name, fsxp)
2. If missing, generate UUID v4 (v7 not yet in Python stdlib)
3. Try to store in xattr
4. If xattr fails (unsupported fs, permissions), fall back to deterministic hash of `(st_dev, st_ino, st_mtime)`

**xattr tradeoffs**:
- **Pros**: Atomic with file, survives renames within filesystem, standard POSIX
- **Cons**: Lost on cloud sync, zip, basic copy; not supported on FAT32/exFAT
- **Acceptable**: For SolveIT use case (Linux containers, modern fs), works 95% of time

**Why not inode-only**:
- Inodes change across filesystems (USB, network, backups)
- Need identity to persist across hosts for multi-instance SolveIT usage
- UUID provides stable identity even when filesystem metadata changes

### The AttrDict Property Problem

**Challenge**: AttrDict's `__getattr__` intercepts attribute access, checking dict keys before class properties. This breaks `@property` decorators.

**Solution**: Override `__getattribute__` to check class properties first:
```python
class FSNode(AttrDict):
    def __getattribute__(self, key):
        cls = object.__getattribute__(self, '__class__')
        if key in cls.__dict__ and isinstance(cls.__dict__[key], property):
            return cls.__dict__[key].fget(self)
        return super().__getattribute__(key)
```

**Critical detail**: Use `@property` + manual attachment (`FSNode.eid = eid`), not `@patch(as_prop=True)`. The latter doesn't work with our `__getattribute__` override.

**Naming conflicts avoided**:
- `id` → conflicts with Python builtin
- `uuid` → conflicts with imported module
- `uid` → conflicts with User ID concept
- Final choice: `eid` (mnemonic: D→E←F, between directories and files)

## Method Design

### `show(indent=0)`: Tree Display
Recursive print with indentation. Simple, effective MVP. Future: add depth limiting, icons, colors.

### `filter(pred)`: Flat List of Matches
```python
@patch
def filter(self:FSNode, pred):
    matches = L()
    if pred(self): matches.append(self)
    if self.type == 'dir':
        for child in self.children:
            matches += child.filter(pred)
    return matches
```

**Returns**: Flat `L` (fastcore list) of nodes matching predicate

**Why flat, not tree**:
- Unix philosophy: `find` outputs paths, not trees
- Composable: Easy to operate on results
- Separate concern from `prune()` (future: tree with only matching branches)

### `find(pattern)`: Glob Pattern Matching
```python
@patch
def find(self:FSNode, pattern):
    return self.filter(lambda n: fnmatch(n.path.name, pattern))
```

One-liner convenience wrapper. Uses `fnmatch` for shell-style globs (`*.py`, `test_*`).

## Development Principles

### Vertical Space Efficiency
- Favor one-liners where clarity isn't sacrificed
- Imports at top (no lazy imports unless heavy deps)
- `@patch` for adding methods to classes

### Fastcore Alignment
- Use `L` for lists (chainable, better defaults)
- Use `AttrDict` for dict-with-attributes
- Use `@patch` to extend classes
- Leverage `dict2obj` for recursive AttrDict conversion

### Jeremy Howard's Design Process
- Start simple, iterate toward elegance
- REPL-driven development: optimize for tab completion, exploration
- Composability over monolithic features
- "Do one thing well" (Unix philosophy)

### What We Avoid
- Premature optimization (measure first)
- Mixing concerns (filter ≠ display)
- ASCII art in data structures
- Schema-heavy approaches (dataclasses for dynamic data)
- Ceremony (favor terse, clear code)

## Code Style

### Naming
- Short where unambiguous: `eid`, `L`, `pred`
- Explicit where needed: `build_tree`, `dict2obj`
- Unix-inspired: `find`, `filter`, `show`

### Structure
1. Imports
2. Class definitions
3. Methods (via `@patch`)
4. Functions
5. Usage/tests

### Comments
- Docstrings for public methods
- Inline comments for non-obvious logic
- No redundant comments explaining obvious code

## Future Directions

### Immediate Next Steps
1. Add `prune()` for structural filtering (tree with only matching branches + ancestors)
2. Add metadata properties: `size`, `mtime`, `permissions`
3. Add navigation: `parent`, `find_by_path`, depth limiting

### Medium Term
1. Write operations: `rename()`, `move()`, `copy()` returning Plan objects
2. Content operations: `read()`, `write()` with sed-like transforms
3. Tagging system (separate from tree structure)

### Long Term
1. Distributed filesystem index (multi-host UUID tracking)
2. Semantic relationships (parent/child beyond filesystem hierarchy)
3. Integration with nbdev, SolveIT dialog management
4. CLI tool with FastHTML web interface

## Lessons Learned

### AttrDict + Properties
AttrDict wasn't designed for class properties. Our `__getattribute__` override works but is a workaround. For future: consider if fastcore could add property support, or if we should use a different base class.

### UUID Version Drama
Python 3.12 doesn't have uuid7 yet (still in discussion). uuid4 is fine for our needs, easy to migrate later.

### xattr Portability
xattrs work great on modern Linux/macOS but fail on many consumer scenarios (cloud sync, FAT32). Deterministic fallback is essential. Future: consider sidecar metadata files for persistent tracking.

### Debugging Strategy
When stuck: strip to vanilla Python, verify concept works, then incrementally add complexity. Our AttrDict property issue was solved by testing with `SimpleNode` first.

## Meta: How We Work

- **Incremental understanding**: Build simple examples to grasp concepts before implementing
- **Question assumptions**: "Why doesn't this exist?" often reveals antipatterns or limitations
- **Book-quality prose**: Dense paragraphs over blog-style bullet points for deep insights
- **Design before code**: Understand tradeoffs, then implement decisively
- **Prototype as we design**: PoC validates decisions immediately

This is a living document. Update as HyFS evolves.