# Pathlib and OS Operations

**Chapter 8 - Learning Python, 5th Edition**

The `pathlib` module (Python 3.4+) provides an object-oriented interface for filesystem
paths, replacing the older `os.path` string-based approach. Combined with `shutil` for
high-level operations and `tempfile` for temporary storage, these modules form the
foundation of modern Python filesystem programming.

## Section 1: `pathlib.Path` Basics

`Path` objects represent filesystem paths as structured objects rather than raw strings.
They provide attributes for accessing path components and methods for common operations.

In [None]:
from pathlib import Path, PurePosixPath, PureWindowsPath

# Constructing paths
# Path.home() and Path.cwd() are classmethods returning the current user's
# home directory and the process's current working directory.
home: Path = Path.home()
cwd: Path = Path.cwd()
print(f"Home directory: {home}")
print(f"Current working directory: {cwd}")

# Path from string
# Path() instantiates the concrete class for the running OS, so the type name
# printed below is PosixPath on Unix-like systems and WindowsPath on Windows.
config_path: Path = Path("/etc/app/config.yaml")
print(f"\nPath from string: {config_path}")
print(f"  type: {type(config_path).__name__}")

# Path components
# These attributes are computed from the path text alone; nothing here touches
# the filesystem, so the path does not need to exist.
# The expected values in the trailing comments assume a POSIX system.
example: Path = Path("/home/user/projects/app/main.py")
print(f"\nPath: {example}")
print(f"  .name:    {example.name}")        # 'main.py'
print(f"  .stem:    {example.stem}")        # 'main'
print(f"  .suffix:  {example.suffix}")      # '.py'
print(f"  .parent:  {example.parent}")      # /home/user/projects/app
print(f"  .anchor:  {example.anchor!r}")    # '/'
print(f"  .parts:   {example.parts}")       # ('/', 'home', 'user', ...)

# Multiple suffixes (e.g., .tar.gz)
# .suffix and .stem split only at the LAST dot; .suffixes lists every suffix.
archive: Path = Path("data/backup.tar.gz")
print(f"\nArchive: {archive}")
print(f"  .suffix:   {archive.suffix}")     # '.gz'
print(f"  .suffixes: {archive.suffixes}")   # ['.tar', '.gz']
print(f"  .stem:     {archive.stem}")       # 'backup.tar'

# Parent chain
# .parents yields each ancestor in turn, from the immediate parent up to the
# anchor of the path.
print(f"\nParent chain of {example}:")
for parent in example.parents:
    print(f"  {parent}")

## Section 2: Path Operations - Joining, Testing, and Resolving

The `/` operator provides an intuitive way to join path components. Path objects
also offer methods for testing existence and resolving symbolic links.

In [None]:
import tempfile

# Create a temporary directory for our experiments
# mkdtemp() leaves cleanup to the caller. work_dir is reused by the following
# cells and is removed by the shutil.rmtree(work_dir) call in the last cell.
work_dir = Path(tempfile.mkdtemp(prefix="ch08_pathlib_"))
print(f"Working directory: {work_dir}")

# Path joining with / operator (preferred over os.path.join)
# Joining only builds a new Path object; nothing is created on disk yet.
project: Path = work_dir / "myproject"
src: Path = project / "src" / "main.py"
print(f"\nJoined path: {src}")

# joinpath() method (equivalent to / operator)
test_path: Path = project.joinpath("tests", "test_main.py")
print(f"joinpath():  {test_path}")

# Create the directory structure
# parents=True also creates the not-yet-existing "myproject" directory;
# exist_ok=True keeps the cell re-runnable.
project_src = project / "src"
project_tests = project / "tests"
project_src.mkdir(parents=True, exist_ok=True)
project_tests.mkdir(parents=True, exist_ok=True)

# Create some files
# write_text() opens, writes, and closes the file in one call.
(project_src / "main.py").write_text("print('hello')\n", encoding="utf-8")
(project_src / "utils.py").write_text("# utilities\n", encoding="utf-8")
(project_tests / "test_main.py").write_text("# tests\n", encoding="utf-8")
(project / "README.md").write_text("# My Project\n", encoding="utf-8")
(project / "setup.py").write_text("# setup\n", encoding="utf-8")

# Existence and type testing
# A directory: exists and is_dir are True, is_file is False.
print(f"\n{project_src} exists:     {project_src.exists()}")
print(f"{project_src} is_dir:     {project_src.is_dir()}")
print(f"{project_src} is_file:    {project_src.is_file()}")

# A regular file: exists and is_file are True, is_dir is False.
main_py = project_src / "main.py"
print(f"\n{main_py.name} exists:    {main_py.exists()}")
print(f"{main_py.name} is_file:   {main_py.is_file()}")
print(f"{main_py.name} is_dir:    {main_py.is_dir()}")

# A path that was never created: exists() simply returns False, no exception.
ghost = project / "nonexistent.txt"
print(f"\n{ghost.name} exists: {ghost.exists()}")

# Resolving paths (absolute, no symlinks)
# resolve() returns a NEW absolute path based on the current working directory;
# the target does not have to exist. `relative` itself is unchanged, which is
# why is_absolute() on it still reports False below.
relative: Path = Path(".") / "some" / "relative" / "path"
print(f"\nRelative path:  {relative}")
print(f"Resolved:       {relative.resolve()}")
print(f"Is absolute:    {relative.is_absolute()}")
print(f"work_dir absolute: {work_dir.is_absolute()}")

# with_name() and with_suffix() create modified copies
# Both are pure path computations: no file is renamed on disk.
original: Path = Path("data/report_v1.txt")
renamed: Path = original.with_name("report_v2.txt")
different_ext: Path = original.with_suffix(".csv")
print(f"\nOriginal:      {original}")
print(f"with_name():   {renamed}")
print(f"with_suffix(): {different_ext}")

## Section 3: Directory Operations

Path objects provide methods for creating, listing, and searching directories.
`iterdir()` lists immediate contents, `glob()` matches a pattern relative to the
directory (recursing only when the pattern contains `**`), and `rglob()` matches
the pattern in the directory and all of its subdirectories.

In [None]:
# mkdir() - create directories
nested: Path = work_dir / "a" / "b" / "c"

# parents=True creates intermediate directories (like mkdir -p)
# exist_ok=True suppresses FileExistsError
nested.mkdir(parents=True, exist_ok=True)
print(f"Created nested directory: {nested}")

# Without parents=True, intermediate dirs must exist
# work_dir/x/y was never created, so this mkdir() is expected to raise and the
# except branch prints the error message.
try:
    (work_dir / "x" / "y" / "z").mkdir()
except FileNotFoundError as e:
    print(f"Without parents=True: {e}")

# iterdir() - list directory contents
# iterdir() yields entries in arbitrary order, so sort them for stable output.
print(f"\nContents of {project.name}/src/:")
for item in sorted(project_src.iterdir()):
    kind = "dir" if item.is_dir() else "file"
    print(f"  [{kind:4s}] {item.name}")

# Separate files and directories
print(f"\nContents of {project.name}/:")
dirs: list[Path] = []
files: list[Path] = []
for item in sorted(project.iterdir()):
    # The conditional expression picks the target list; append() then adds to it.
    (dirs if item.is_dir() else files).append(item)

for d in dirs:
    print(f"  [dir ] {d.name}/")
for f in files:
    print(f"  [file] {f.name}")

# glob() - pattern matching in one directory level
# '*.py' has no '**' component, so only direct children are matched.
print(f"\nPython files in src/: {[p.name for p in project_src.glob('*.py')]}")
print(f"Markdown files:       {[p.name for p in project.glob('*.md')]}")

# rglob() - recursive pattern matching (all subdirectories)
print(f"\nAll .py files (recursive):")
for py_file in sorted(project.rglob("*.py")):
    # Show path relative to project root
    print(f"  {py_file.relative_to(project)}")

# glob with ** for explicit recursive matching
# '**/*' matches files AND directories at every depth, hence the is_dir() check.
all_files: list[Path] = sorted(project.glob("**/*"))
print(f"\nAll items (glob **/*):")
for item in all_files:
    rel = item.relative_to(project)
    prefix = "dir " if item.is_dir() else "file"
    print(f"  [{prefix}] {rel}")

## Section 4: File Metadata

The `stat()` method returns file metadata including size, timestamps, and permissions.
Path objects also provide convenience methods such as `is_file()` and `is_dir()` for common queries.

In [None]:
from datetime import datetime
import stat as stat_module

# Write a file with some content for metadata inspection
meta_file: Path = work_dir / "metadata_demo.txt"
meta_file.write_text("Sample content for metadata demonstration.\n" * 10,
                     encoding="utf-8")

# stat() returns an os.stat_result object
file_stat = meta_file.stat()
print(f"File: {meta_file.name}")
print(f"  Size:          {file_stat.st_size} bytes")
print(f"  Mode (octal):  {oct(file_stat.st_mode)}")
print(f"  Is regular:    {stat_module.S_ISREG(file_stat.st_mode)}")

# Timestamps (as Unix epoch floats)
# st_ctime is NOT the creation time on Unix: it is the time of the last
# metadata (inode) change, so it is labelled "Changed" below. The real
# creation ("birth") time is exposed as st_birthtime only on platforms that
# report it, so read it defensively instead of assuming it exists.
birth_ts: float | None = getattr(file_stat, "st_birthtime", None)
created: datetime | None = (
    datetime.fromtimestamp(birth_ts) if birth_ts is not None else None
)
changed = datetime.fromtimestamp(file_stat.st_ctime)
modified = datetime.fromtimestamp(file_stat.st_mtime)
accessed = datetime.fromtimestamp(file_stat.st_atime)
if created is not None:
    print(f"  Created:       {created:%Y-%m-%d %H:%M:%S}")
else:
    print("  Created:       (not reported on this platform)")
print(f"  Changed:       {changed:%Y-%m-%d %H:%M:%S}")
print(f"  Modified:      {modified:%Y-%m-%d %H:%M:%S}")
print(f"  Accessed:      {accessed:%Y-%m-%d %H:%M:%S}")

# Practical: summarize directory contents
def directory_summary(path: Path) -> dict[str, int | float]:
    """Summarize file counts and total size of a directory."""
    total_size: int = 0
    file_count: int = 0
    dir_count: int = 0

    for item in path.rglob("*"):
        if item.is_file():
            file_count += 1
            total_size += item.stat().st_size
        elif item.is_dir():
            dir_count += 1

    return {
        "files": file_count,
        "directories": dir_count,
        "total_size_bytes": total_size,
        "total_size_kb": round(total_size / 1024, 2),
    }


# Summarize the project tree built in the earlier cells and print every entry.
summary = directory_summary(project)
print(f"\nProject summary:")
for key, value in summary.items():
    print(f"  {key}: {value}")

# File size by extension
from collections import defaultdict

# defaultdict(int) starts every missing key at 0, so "+=" works on first use.
size_by_ext: dict[str, int] = defaultdict(int)
count_by_ext: dict[str, int] = defaultdict(int)

for f in project.rglob("*"):
    if f.is_file():
        # Files without a suffix are grouped under a readable placeholder key.
        ext = f.suffix or "(no ext)"
        size_by_ext[ext] += f.stat().st_size
        count_by_ext[ext] += 1

# Both dicts receive the same keys in the loop above, so iterating one is enough.
print(f"\nFiles by extension:")
for ext in sorted(count_by_ext):
    print(f"  {ext:10s}: {count_by_ext[ext]} files, {size_by_ext[ext]} bytes")

## Section 5: The `tempfile` Module

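The `tempfile` module creates temporary files and directories. When used as context
managers, `TemporaryDirectory` and (by default) `NamedTemporaryFile` clean up
automatically; `mkdtemp()` and `delete=False` leave cleanup to you. This is essential
for testing, intermediate processing, and any situation where you need scratch space.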

In [None]:
import tempfile

# NamedTemporaryFile - by default the file is deleted when it is closed.
# delete=False is passed here on purpose so the file survives the with block
# and can be inspected (and then removed by hand) below.
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", prefix="data_",
                                  delete=False, encoding="utf-8") as tmp:
    tmp.write("Temporary data for processing.\n")
    # tmp.name is the file's path as a string; wrap it for pathlib access.
    tmp_path = Path(tmp.name)
    print(f"NamedTemporaryFile: {tmp_path}")
    print(f"  name:   {tmp_path.name}")
    print(f"  suffix: {tmp_path.suffix}")
    print(f"  exists: {tmp_path.exists()}")

# File persists because delete=False; clean it up manually
print(f"  exists after close: {tmp_path.exists()}")
tmp_path.unlink()
print(f"  exists after unlink: {tmp_path.exists()}")

# TemporaryDirectory - auto-deleted when context exits
# The context manager yields the directory path as a string, not a Path.
with tempfile.TemporaryDirectory(prefix="scratch_") as tmp_dir_str:
    tmp_dir = Path(tmp_dir_str)
    print(f"\nTemporaryDirectory: {tmp_dir}")

    # Create files inside the temp directory
    for i in range(3):
        (tmp_dir / f"file_{i}.txt").write_text(f"content {i}\n",
                                                encoding="utf-8")

    contents = list(tmp_dir.iterdir())
    print(f"  Contents: {[p.name for p in sorted(contents)]}")
    print(f"  exists: {tmp_dir.exists()}")

# Directory and all contents are gone after the with block
# (the tmp_dir Path object is still usable; only the directory on disk is gone).
print(f"  exists after context: {tmp_dir.exists()}")

# mkdtemp() for manual management (when you need control over lifetime)
manual_tmp: str = tempfile.mkdtemp(prefix="manual_")
manual_path = Path(manual_tmp)
print(f"\nmkdtemp: {manual_path}")
print(f"  exists: {manual_path.exists()}")

# Must clean up manually
import shutil
shutil.rmtree(manual_path)
print(f"  exists after rmtree: {manual_path.exists()}")

# SpooledTemporaryFile - stays in memory until max_size exceeded
# Only a few bytes are written here, far below max_size=1024.
with tempfile.SpooledTemporaryFile(max_size=1024, mode="w+",
                                    encoding="utf-8") as spool:
    spool.write("Small data stays in memory.\n")
    # Rewind before reading back what was just written.
    spool.seek(0)
    content = spool.read()
    print(f"\nSpooledTemporaryFile content: {content.rstrip()!r}")
    # NOTE(review): _rolled is an underscore-prefixed, undocumented attribute;
    # it is used here for demonstration only and may change between versions.
    print(f"  Rolled to disk: {spool._rolled}")

## Section 6: `shutil` - High-Level File Operations

The `shutil` module provides high-level operations like copying, moving, and
removing directory trees. It handles the details that `Path` methods do not cover.

In [None]:
import shutil

# Create a fresh workspace
# mkdtemp() does not clean up after itself; the rmtree() at the end of this
# cell removes the whole workspace.
shutil_dir = Path(tempfile.mkdtemp(prefix="ch08_shutil_"))

# Create source files
src_dir = shutil_dir / "source"
src_dir.mkdir()
(src_dir / "data.txt").write_text("Important data.\n", encoding="utf-8")
(src_dir / "config.ini").write_text("[settings]\nkey=value\n", encoding="utf-8")
sub = src_dir / "subdir"
sub.mkdir()
(sub / "nested.txt").write_text("Nested file.\n", encoding="utf-8")

# --- shutil.copy() - copy file (preserves permissions) ---
# copy() copies the file data and permission bits, but not the timestamps.
dst_file = shutil_dir / "data_copy.txt"
shutil.copy(src_dir / "data.txt", dst_file)
print(f"copy(): {dst_file.name} exists={dst_file.exists()}")

# --- shutil.copy2() - copy file + metadata (timestamps) ---
# Compare the modification times of source and copy to show they were carried over.
dst_file2 = shutil_dir / "data_copy2.txt"
shutil.copy2(src_dir / "data.txt", dst_file2)
orig_stat = (src_dir / "data.txt").stat()
copy_stat = dst_file2.stat()
print(f"copy2(): preserved mtime={orig_stat.st_mtime == copy_stat.st_mtime}")

# --- shutil.copytree() - copy entire directory tree ---
# With default arguments the destination must not exist yet; copytree creates it.
tree_copy = shutil_dir / "source_backup"
shutil.copytree(src_dir, tree_copy)
print(f"\ncopytree() result:")
for item in sorted(tree_copy.rglob("*")):
    rel = item.relative_to(tree_copy)
    prefix = "dir " if item.is_dir() else "file"
    print(f"  [{prefix}] {rel}")

# --- shutil.move() - move/rename files or directories ---
# The str() conversions are only needed on older Python versions; since
# Python 3.9 shutil.move() accepts path-like objects directly.
moved_file = shutil_dir / "moved_data.txt"
shutil.move(str(dst_file), str(moved_file))
print(f"\nmove(): original exists={dst_file.exists()}, moved exists={moved_file.exists()}")

# --- shutil.rmtree() - remove entire directory tree ---
print(f"\nBefore rmtree: {tree_copy.name}/ exists={tree_copy.exists()}")
shutil.rmtree(tree_copy)
print(f"After rmtree:  {tree_copy.name}/ exists={tree_copy.exists()}")

# --- shutil.disk_usage() - check disk space ---
# The figures describe the whole filesystem that contains shutil_dir, not the
# size of the directory itself. Values are in bytes; divide by 1024**3 for GiB.
usage = shutil.disk_usage(shutil_dir)
print(f"\nDisk usage for {shutil_dir}:")
print(f"  Total: {usage.total / (1024**3):.1f} GB")
print(f"  Used:  {usage.used / (1024**3):.1f} GB")
print(f"  Free:  {usage.free / (1024**3):.1f} GB")

# Cleanup
shutil.rmtree(shutil_dir)
print(f"\nCleaned up {shutil_dir}")

## Section 7: Practical Pattern - Project File Scanner

Combining `pathlib` operations into a reusable utility that scans a project
directory, filters by patterns, and reports statistics.

In [None]:
from dataclasses import dataclass, field
from datetime import datetime


@dataclass
class FileInfo:
    """Metadata about a single file.

    A plain record with no behavior of its own: ``scan_directory`` creates one
    per matched file and ``ScanResult`` aggregates them.
    """
    # Path of the file as yielded by the scan.
    path: Path
    # File size in bytes (scan_directory fills this from st_size).
    size: int
    # Last-modification time (scan_directory derives it from st_mtime via
    # datetime.fromtimestamp, i.e. a naive local-time datetime).
    modified: datetime
    # File suffix including the leading dot; scan_directory stores "(none)"
    # when the file has no suffix.
    extension: str


@dataclass
class ScanResult:
    """Outcome of one directory scan: the scanned root plus the files found."""
    root: Path
    files: list[FileInfo] = field(default_factory=list)
    skipped_dirs: list[Path] = field(default_factory=list)

    @property
    def total_size(self) -> int:
        """Combined size in bytes of every collected file."""
        byte_total = 0
        for info in self.files:
            byte_total += info.size
        return byte_total

    @property
    def file_count(self) -> int:
        """Number of collected files."""
        return len(self.files)

    def by_extension(self) -> dict[str, list[FileInfo]]:
        """Bucket the collected files by their ``extension`` value.

        Keys appear in first-seen order; each bucket keeps scan order.
        """
        grouped: dict[str, list[FileInfo]] = {}
        for info in self.files:
            bucket = grouped.get(info.extension)
            if bucket is None:
                bucket = grouped[info.extension] = []
            bucket.append(info)
        return grouped

    def largest(self, n: int = 5) -> list[FileInfo]:
        """Return the first *n* files when ordered from biggest to smallest."""
        # Sort a copy so the stored scan order is left untouched.
        ranked = list(self.files)
        ranked.sort(key=lambda info: info.size, reverse=True)
        return ranked[:n]


def scan_directory(
    root: Path,
    patterns: list[str] | None = None,
    exclude_dirs: set[str] | None = None,
) -> ScanResult:
    """Scan a directory tree, optionally filtering by glob patterns.

    Args:
        root: Directory to search recursively.
        patterns: Glob patterns handed to ``root.rglob()``. Defaults to
            ``["*"]``. A file that matches several patterns is reported once.
        exclude_dirs: Path component names to skip. Defaults to
            ``{".git", "__pycache__", ".venv", "node_modules"}``. Only
            components *below* ``root`` are checked, so scanning a root that
            itself lives inside e.g. ``.venv`` still yields results.

    Returns:
        A ``ScanResult`` whose ``files`` holds one ``FileInfo`` per matched
        regular file, in first-seen order. ``skipped_dirs`` lists each
        excluded directory that a matched path was, or was located under;
        an excluded directory that no pattern reached is not listed.
    """
    if exclude_dirs is None:
        exclude_dirs = {".git", "__pycache__", ".venv", "node_modules"}
    if patterns is None:
        patterns = ["*"]

    result = ScanResult(root=root)
    seen_files: set[Path] = set()    # guards against overlapping patterns
    seen_skipped: set[Path] = set()  # keeps skipped_dirs free of duplicates

    for pattern in patterns:
        for path in root.rglob(pattern):
            # Test only the part of the path below root; ancestors of root
            # must not influence the exclusion decision.
            rel_parts = path.relative_to(root).parts
            hit = next(
                (i for i, part in enumerate(rel_parts) if part in exclude_dirs),
                None,
            )
            if hit is not None:
                # A component that has children is necessarily a directory;
                # only a match on the final component needs a real check.
                if hit < len(rel_parts) - 1 or path.is_dir():
                    skipped = root.joinpath(*rel_parts[: hit + 1])
                    if skipped not in seen_skipped:
                        seen_skipped.add(skipped)
                        result.skipped_dirs.append(skipped)
                continue

            if path in seen_files or not path.is_file():
                continue
            seen_files.add(path)

            # Named "info" rather than "stat" to avoid shadowing the stat module.
            info = path.stat()
            result.files.append(FileInfo(
                path=path,
                size=info.st_size,
                modified=datetime.fromtimestamp(info.st_mtime),
                extension=path.suffix or "(none)",
            ))

    return result


# Demo: scan a temporary project
demo_dir = Path(tempfile.mkdtemp(prefix="ch08_scanner_"))
for subdir in ["src", "tests", "docs", "__pycache__"]:
    (demo_dir / subdir).mkdir()

# Files of different sizes so the "largest files" ranking is meaningful.
(demo_dir / "src" / "app.py").write_text("# App code\n" * 50, encoding="utf-8")
(demo_dir / "src" / "models.py").write_text("# Models\n" * 30, encoding="utf-8")
(demo_dir / "src" / "utils.py").write_text("# Utils\n", encoding="utf-8")
(demo_dir / "tests" / "test_app.py").write_text("# Tests\n" * 20, encoding="utf-8")
(demo_dir / "docs" / "guide.md").write_text("# Guide\n" * 15, encoding="utf-8")
(demo_dir / "README.md").write_text("# README\n", encoding="utf-8")
(demo_dir / "__pycache__" / "app.cpython-312.pyc").write_bytes(b"\x00" * 100)

scan = scan_directory(demo_dir, patterns=["*.py", "*.md"])

print(f"Scan results for {demo_dir.name}/")
print(f"  Total files: {scan.file_count}")
print(f"  Total size:  {scan.total_size} bytes")

print(f"\nBy extension:")
for ext, files in sorted(scan.by_extension().items()):
    total = sum(f.size for f in files)
    print(f"  {ext}: {len(files)} files, {total} bytes")

print(f"\nLargest files:")
for fi in scan.largest(3):
    rel = fi.path.relative_to(demo_dir)
    print(f"  {rel}: {fi.size} bytes (modified {fi.modified:%H:%M:%S})")

# Note: __pycache__ files excluded by default
# The scan above only asked for *.py and *.md, so it could never return the
# .pyc file whether or not exclusion works. Re-scan with the default "*"
# pattern, which does match the .pyc, so the message reports a measured result.
pyc_hits = [fi for fi in scan_directory(demo_dir).files if fi.extension == ".pyc"]
pyc_status = "no" if not pyc_hits else f"UNEXPECTED {len(pyc_hits)}"
print(f"\n__pycache__ excluded: {pyc_status} .pyc files in results")

# Cleanup
# demo_dir belongs to this cell; work_dir was created in Section 2 and shared
# by the cells in between.
shutil.rmtree(demo_dir)
shutil.rmtree(work_dir)
print(f"Cleaned up temporary directories")

## Summary

### `pathlib.Path`
- Use `/` operator for joining: `Path('a') / 'b' / 'c'`
- Access components: `.name`, `.stem`, `.suffix`, `.parent`, `.parts`
- Test: `.exists()`, `.is_file()`, `.is_dir()`
- Modify: `.with_name()`, `.with_suffix()`
- Read/write shortcuts: `.read_text()`, `.write_text()`, `.read_bytes()`

### Directory Operations
- `.mkdir(parents=True, exist_ok=True)` for safe creation
- `.iterdir()` for listing, `.glob()` / `.rglob()` for pattern matching
- `.relative_to()` for computing relative paths

### `tempfile`
- `TemporaryDirectory()` for auto-cleaned scratch space
- `NamedTemporaryFile()` for temporary files with names
- Always prefer context managers for automatic cleanup

### `shutil`
- `copy()` / `copy2()` for file copying (`copy()` keeps permission bits only; `copy2()` also preserves timestamps and other metadata)
- `copytree()` / `rmtree()` for directory trees
- `move()` for moving or renaming files and directories (also works across filesystems)

### Best Practices
1. Prefer `pathlib.Path` over `os.path` for new code
2. Use `tempfile` for scratch files and directories
3. Always handle `FileNotFoundError` and `PermissionError`
4. Use `rglob()` with exclusion sets to skip irrelevant directories
5. Use `shutil.rmtree()` with caution - it deletes recursively with no undo