# File Operations

**Chapter 8 - Learning Python, 5th Edition**

Python provides a comprehensive set of built-in functions and methods for file I/O.
Understanding file modes, encoding, context managers, and the distinction between
text and binary modes is essential for reading and writing data reliably across platforms.

## Section 1: The `open()` Function and File Modes

The built-in `open()` function is the gateway to all file I/O. Its `mode` parameter
controls how the file is opened:

| Mode | Description |
|------|-------------|
| `'r'` | Read (default) - file must exist |
| `'w'` | Write - creates or truncates |
| `'a'` | Append - creates or appends |
| `'x'` | Exclusive create - fails if file exists |
| `'b'` | Binary mode (combine: `'rb'`, `'wb'`) |
| `'t'` | Text mode (default, combine: `'rt'`, `'wt'`) |
| `'+'` | Read and write (combine: `'r+'`, `'w+'`) |

In [None]:
import tempfile
import os
from pathlib import Path

# Create a temporary directory for all our examples
work_dir = Path(tempfile.mkdtemp(prefix="ch08_files_"))
print(f"Working directory: {work_dir}")

# Every text-mode open() below passes encoding="utf-8" explicitly, so the
# bytes written to disk do not depend on the machine's locale settings.

# --- 'w' mode: write (creates or truncates) ---
sample_file = work_dir / "sample.txt"
f = open(sample_file, "w", encoding="utf-8")
f.write("Hello, World!\n")
f.write("Python file I/O is straightforward.\n")
f.close()  # Must close manually without context manager
print(f"Created {sample_file.name}, size: {sample_file.stat().st_size} bytes")

# --- 'r' mode: read (default) ---
f = open(sample_file, "r", encoding="utf-8")
content = f.read()
f.close()
print(f"\nRead content:\n{content}")

# --- 'a' mode: append ---
f = open(sample_file, "a", encoding="utf-8")
f.write("This line was appended.\n")
f.close()

# --- 'x' mode: exclusive create (fails if file exists) ---
new_file = work_dir / "exclusive.txt"
f = open(new_file, "x", encoding="utf-8")
f.write("Created exclusively.\n")
f.close()

try:
    f = open(new_file, "x", encoding="utf-8")
except FileExistsError as e:
    print(f"'x' mode prevents overwrite: {e}")
else:
    # Not expected here, but never leave a handle open if the open succeeds.
    f.close()

## Section 2: Context Managers (`with` Statement)

Always use `with` for file operations. The context manager guarantees the file is
closed even if an exception occurs, preventing resource leaks and data corruption.

In [None]:
# The with statement handles open/close automatically
# (encoding is given explicitly so the result is the same on every platform)
log_file = work_dir / "app.log"

with open(log_file, "w", encoding="utf-8") as f:
    f.write("2024-01-15 10:00:00 INFO  Application started\n")
    f.write("2024-01-15 10:00:01 DEBUG Loading configuration\n")
    f.write("2024-01-15 10:00:02 WARN  Deprecated API used\n")
    f.write("2024-01-15 10:00:03 ERROR Connection timeout\n")
    f.write("2024-01-15 10:00:05 INFO  Retrying connection\n")
    # File is automatically closed when exiting the with block

# The name `f` is still bound after the block; only the file is closed.
print(f"File closed after with block: {f.closed}")

# Even exceptions don't prevent cleanup
error_file = work_dir / "error_test.txt"
try:
    with open(error_file, "w", encoding="utf-8") as f:
        f.write("some data\n")
        raise ValueError("Simulated error during write")
except ValueError:
    # Swallowed on purpose: the point is to inspect f.closed afterwards.
    pass

print(f"File closed despite exception: {f.closed}")

# Multiple files in one with statement
source = work_dir / "source.txt"
dest = work_dir / "dest.txt"

with open(source, "w", encoding="utf-8") as f:
    f.write("Line 1\nLine 2\nLine 3\n")

with open(source, "r", encoding="utf-8") as src, open(dest, "w", encoding="utf-8") as dst:
    for line in src:
        dst.write(line.upper())

with open(dest, "r", encoding="utf-8") as f:
    print(f"\nUppercased copy:\n{f.read()}")

## Section 3: Reading Methods

Python offers several ways to read file content, each suited to different use cases:

- `read()` - entire file as a single string
- `read(n)` - up to `n` characters (text) or bytes (binary)
- `readline()` - one line at a time
- `readlines()` - all lines as a list
- **Iteration** - memory-efficient line-by-line processing

In [None]:
# Create a multi-line file for demonstration
# (all opens pass encoding="utf-8" so reads decode exactly what was written)
data_file = work_dir / "data.txt"
with open(data_file, "w", encoding="utf-8") as f:
    for i in range(1, 6):
        f.write(f"Line {i}: data point {i * 10}\n")

# read() - entire file as string
with open(data_file, "r", encoding="utf-8") as f:
    full_content: str = f.read()
print(f"read() returns {type(full_content).__name__}, length={len(full_content)}")
print(f"Content:\n{full_content}")

# read(n) - partial read; the second call continues where the first stopped
with open(data_file, "r", encoding="utf-8") as f:
    first_20: str = f.read(20)
    next_20: str = f.read(20)
print(f"First 20 chars: {first_20!r}")
print(f"Next 20 chars:  {next_20!r}")

# readline() - one line at a time (the trailing newline is kept)
print("\nreadline() calls:")
with open(data_file, "r", encoding="utf-8") as f:
    line1: str = f.readline()
    line2: str = f.readline()
    print(f"  Line 1: {line1!r}")
    print(f"  Line 2: {line2!r}")

# readlines() - all lines as a list
with open(data_file, "r", encoding="utf-8") as f:
    all_lines: list[str] = f.readlines()
print(f"\nreadlines() returns {len(all_lines)} lines:")
for line in all_lines:
    print(f"  {line!r}")

# Iteration - preferred for large files (memory efficient)
print("\nIteration (recommended):")
with open(data_file, "r", encoding="utf-8") as f:
    for line_num, line in enumerate(f, start=1):
        print(f"  [{line_num}] {line.rstrip()}")

## Section 4: Writing Methods

- `write(s)` - write a string, returns number of characters written
- `writelines(lines)` - write an iterable of strings (no newlines added)
- `print(..., file=f)` - redirect print output to a file

In [None]:
# write() returns the number of characters written
output_file = work_dir / "output.txt"
with open(output_file, "w", encoding="utf-8") as f:
    chars_written: int = f.write("Hello, World!\n")
    print(f"write() returned: {chars_written} characters")

    chars_written = f.write("Second line.\n")
    print(f"write() returned: {chars_written} characters")

# writelines() - writes an iterable of strings (no automatic newlines!)
lines_file = work_dir / "lines.txt"
lines: list[str] = [
    "alpha\n",
    "bravo\n",
    "charlie\n",
    "delta\n",
]

with open(lines_file, "w", encoding="utf-8") as f:
    f.writelines(lines)

with open(lines_file, "r", encoding="utf-8") as f:
    print(f"\nwritelines() result:\n{f.read()}")

# Using print() with file= parameter
report_file = work_dir / "report.txt"
monthly_revenue: list[tuple[str, int]] = [("Jan", 1200), ("Feb", 1350), ("Mar", 980)]
with open(report_file, "w", encoding="utf-8") as f:
    print("=" * 40, file=f)
    print("Monthly Report", file=f)
    print("=" * 40, file=f)
    for month, revenue in monthly_revenue:
        print(f"  {month}: ${revenue:,}", file=f)
    # Derive the total from the same data so the two can never disagree.
    total = sum(revenue for _, revenue in monthly_revenue)
    print(f"\n  Total: ${total:,}", file=f)

with open(report_file, "r", encoding="utf-8") as f:
    print(f"print(file=f) result:\n{f.read()}")

# Writing with a generator (memory efficient for large data)
gen_file = work_dir / "generated.txt"
with open(gen_file, "w", encoding="utf-8") as f:
    f.writelines(f"item_{i:04d}\n" for i in range(5))

with open(gen_file, "r", encoding="utf-8") as f:
    print(f"Generator writelines:\n{f.read()}")

## Section 5: Text vs Binary Mode

Text mode (`'t'`, the default) handles encoding/decoding and normalizes line endings.
Binary mode (`'b'`) reads and writes raw bytes without any transformation.

- **Text mode**: `str` objects, automatic newline translation (`\n` <-> OS-specific)
- **Binary mode**: `bytes` objects, no translation, required for non-text data (images, etc.)

In [None]:
# Text mode: works with str
text_file = work_dir / "text_mode.txt"
with open(text_file, "w", encoding="utf-8") as f:  # 'w' is shorthand for 'wt'
    f.write("Text mode uses str objects.\n")
    f.write("Newlines are translated automatically.\n")

with open(text_file, "r", encoding="utf-8") as f:
    text_data: str = f.read()
    print(f"Text mode type: {type(text_data).__name__}")
    print(f"Content: {text_data!r}")

# Binary mode: works with bytes (no encoding argument - there is nothing to decode)
bin_file = work_dir / "binary_mode.bin"
with open(bin_file, "wb") as f:
    f.write(b"\x89PNG\r\n\x1a\n")  # PNG file header bytes
    f.write(b"\x00\x01\x02\x03\x04")
    f.write(bytes([0xFF, 0xFE, 0xFD]))

with open(bin_file, "rb") as f:
    bin_data: bytes = f.read()
    print(f"\nBinary mode type: {type(bin_data).__name__}")
    print(f"Raw bytes: {bin_data!r}")
    print(f"Hex: {bin_data.hex(' ')}")
    print(f"Length: {len(bin_data)} bytes")

# Mixing modes raises TypeError
# A scratch file is used because opening with "wb" truncates the target
# before write() is ever called; bin_file above must stay intact.
scratch_file = work_dir / "type_error_demo.bin"
try:
    with open(scratch_file, "wb") as f:
        f.write("text string")  # Cannot write str in binary mode
except TypeError as e:
    print(f"\nTypeError: {e}")

# Reading a text file in binary mode shows raw bytes
with open(text_file, "rb") as f:
    raw: bytes = f.read()
    print(f"\nText file read as binary: {raw!r}")

## Section 6: File Encoding

Text files must be decoded from bytes to strings. The `encoding` parameter tells
Python which codec to use. Always specify encoding explicitly for portability.

Common encodings:
- `'utf-8'` - Universal, supports all Unicode (recommended default)
- `'ascii'` - 7-bit, English only
- `'latin-1'` (ISO 8859-1) - Western European, maps bytes 0-255 directly
- `'utf-16'` - Variable-width (2 or 4 bytes per character); Python's `'utf-16'` codec writes a BOM

In [None]:
import locale
import sys

# Two different "defaults" exist and are easy to confuse:
#  - sys.getdefaultencoding() is what str.encode()/bytes.decode() use when no
#    codec is given.
#  - open() in text mode without encoding= takes its default from the locale,
#    which is what actually varies between machines.
print(f"str.encode()/bytes.decode() default: {sys.getdefaultencoding()}")
print(f"open() text-mode default (locale): {locale.getpreferredencoding(False)}")

# Write UTF-8 text with international characters
utf8_file = work_dir / "utf8.txt"
with open(utf8_file, "w", encoding="utf-8") as f:
    f.write("English: Hello, World!\n")
    f.write("French: Bonjour le monde!\n")
    f.write("Japanese: \u3053\u3093\u306b\u3061\u306f\u4e16\u754c\n")
    f.write("Emoji: \U0001F40D Python\n")

with open(utf8_file, "r", encoding="utf-8") as f:
    print(f"UTF-8 content:\n{f.read()}")

# Show byte-level differences between encodings
test_string = "caf\u00e9"  # cafe with accent
print(f"String: {test_string!r}")

for enc in ["utf-8", "latin-1", "ascii"]:
    try:
        encoded: bytes = test_string.encode(enc)
        print(f"  {enc:10s}: {encoded!r} ({len(encoded)} bytes)")
    except UnicodeEncodeError as e:
        # ASCII cannot represent the accented character.
        print(f"  {enc:10s}: FAILED - {e}")

# The errors parameter controls how encoding/decoding failures are handled
problem_bytes = b"caf\xe9"  # Latin-1 encoded

for errors_mode in ["strict", "replace", "ignore", "backslashreplace"]:
    try:
        decoded = problem_bytes.decode("utf-8", errors=errors_mode)
        print(f"\nerrors={errors_mode!r:20s} -> {decoded!r}")
    except UnicodeDecodeError as e:
        # Only "strict" raises; the other handlers substitute or drop the byte.
        print(f"\nerrors={errors_mode!r:20s} -> UnicodeDecodeError: {e}")
# Practical pattern: read with fallback encoding
def read_text_file(path: Path, encodings: list[str] | None = None) -> str:
    """Try multiple encodings until one works."""
    if encodings is None:
        encodings = ["utf-8", "latin-1", "cp1252"]
    for enc in encodings:
        try:
            with open(path, "r", encoding=enc) as f:
                return f.read()
        except (UnicodeDecodeError, LookupError):
            continue
    raise ValueError(f"Could not decode {path} with any of {encodings}")

# Exercise the fallback reader with bytes that are valid Latin-1 but are
# not valid UTF-8, so the first candidate encoding has to be skipped.
latin_file = work_dir / "latin1.txt"
latin_file.write_bytes(b"caf\xe9\n")

result = read_text_file(latin_file)
print(f"\nFallback reader result: {result!r}")

## Section 7: File Position with `seek()` and `tell()`

Every open file has a position cursor. `tell()` returns the current position,
and `seek(offset, whence)` moves it.

In **text mode**, positions are opaque values: pass `seek()` only `0` or a value
previously returned by `tell()`. In **binary mode**, positions are plain byte offsets.

| `whence` | Meaning |
|----------|---------|
| `0` | Start of file (default) |
| `1` | Current position (binary mode; text mode accepts only offset `0`) |
| `2` | End of file (binary mode; text mode accepts only offset `0`) |

In [None]:
# Binary mode: positions are byte offsets, and every whence value works.
seek_file = work_dir / "seek_demo.bin"
seek_file.write_bytes(b"ABCDEFGHIJKLMNOPQRSTUVWXYZ")

with seek_file.open("rb") as f:
    start = f.tell()
    print(f"Initial position: {start}")

    # Consume the first 5 bytes; the cursor advances by the amount read.
    data = f.read(5)
    print(f"Read 5 bytes: {data!r}, position now: {f.tell()}")

    # Absolute jump, measured from the start of the file.
    f.seek(10, os.SEEK_SET)
    here, chunk = f.tell(), f.read(3)
    print(f"After seek(10): position={here}, next={chunk!r}")

    # Relative jump, measured from wherever the cursor is now.
    f.seek(2, os.SEEK_CUR)
    here, chunk = f.tell(), f.read(3)
    print(f"After seek(2, 1): position={here}, next={chunk!r}")

    # Negative offset from the end: the last five bytes.
    f.seek(-5, os.SEEK_END)
    here, chunk = f.tell(), f.read()
    print(f"After seek(-5, 2): position={here}, rest={chunk!r}")

    # Back to the beginning.
    f.seek(0, os.SEEK_SET)
    here, chunk = f.tell(), f.read()
    print(f"After seek(0): position={here}, full={chunk!r}")

# Text mode: treat tell() results as opaque bookmarks.
text_seek = work_dir / "seek_text.txt"
text_seek.write_text("Line 1\nLine 2\nLine 3\n", encoding="utf-8")

with text_seek.open("r", encoding="utf-8") as f:
    pos_0 = f.tell()
    line1 = f.readline()
    pos_after_line1 = f.tell()  # bookmark taken between line 1 and line 2
    line2 = f.readline()  # move past the bookmark so seeking back is visible

    print("\nText mode positions:")
    print(f"  Start: {pos_0}")
    print(f"  After line 1 ({line1.rstrip()!r}): {pos_after_line1}")

    # Only 0 or a value that tell() handed out earlier is a safe seek target.
    f.seek(pos_after_line1)
    re_read = f.readline()
    print(f"  Re-read from saved position: {re_read.rstrip()!r}")

## Section 8: Practical Pattern - Line-Oriented Processing

A common real-world pattern: process a file line by line, filtering, transforming,
or aggregating data. This approach is memory-efficient for large files.

In [None]:
from dataclasses import dataclass
from collections import Counter


@dataclass
class LogEntry:
    """One structured record extracted from a line of the log."""

    # Date and time of the event as a single string.
    timestamp: str
    # Severity token, e.g. "INFO" or "ERROR".
    level: str
    # Free-form text of the event.
    message: str


def parse_log_line(line: str) -> LogEntry | None:
    """Convert one raw log line into a ``LogEntry``.

    The line is expected to hold four whitespace-separated fields:
    date, time, level, and the message (which may itself contain spaces).

    Returns:
        The parsed entry, or ``None`` when the line has fewer than four fields.
    """
    # maxsplit=3 keeps everything after the level together as the message.
    fields = line.strip().split(maxsplit=3)
    try:
        date_part, time_part, level, message = fields
    except ValueError:
        # Fewer than four fields: not a well-formed entry.
        return None
    return LogEntry(
        timestamp=f"{date_part} {time_part}",
        level=level,
        message=message,
    )


def analyze_log(path: Path) -> tuple[dict[str, int], list[str]]:
    """Summarise a log file by level and collect its error messages.

    The file is streamed line by line, so memory use does not grow with
    file size (apart from the collected error messages).

    Args:
        path: UTF-8 encoded log file to analyse.

    Returns:
        A ``(counts, errors)`` pair: ``counts`` maps each log level to the
        number of well-formed lines at that level, and ``errors`` lists the
        messages of all ``ERROR`` entries in file order.
    """
    level_counts: Counter[str] = Counter()
    error_messages: list[str] = []

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            entry = parse_log_line(line)
            if entry is None:
                # Malformed lines are skipped rather than counted.
                continue
            level_counts[entry.level] += 1
            if entry.level == "ERROR":
                error_messages.append(entry.message)

    # Hand back a plain dict so callers do not depend on Counter semantics.
    return dict(level_counts), error_messages


# Use the log file we created earlier
counts, errors = analyze_log(log_file)
print(f"Log level counts: {counts}")
print(f"Error messages: {errors}")

# Practical: filter log to a new file
error_log = work_dir / "errors_only.log"
lines_written: int = 0

with (
    open(log_file, "r", encoding="utf-8") as src,
    open(error_log, "w", encoding="utf-8") as dst,
):
    for line in src:
        entry = parse_log_line(line)
        if entry is not None and entry.level in ("ERROR", "WARN"):
            # Copy the original line unchanged so the output keeps its format.
            dst.write(line)
            lines_written += 1

print(f"\nFiltered {lines_written} warning/error lines to {error_log.name}")
# Read back with the same encoding the file was written with.
with open(error_log, "r", encoding="utf-8") as f:
    print(f.read())

# Cleanup temporary directory
import shutil
shutil.rmtree(work_dir)
print(f"Cleaned up {work_dir}")

## Summary

### File Modes
- `'r'` read, `'w'` write (truncate), `'a'` append, `'x'` exclusive create
- `'b'` binary, `'t'` text (default), `'+'` read-write

### Reading
- `read()` for small files, iteration for large files
- `readline()` for one line, `readlines()` for all lines as list

### Writing
- `write()` returns character count, `writelines()` takes an iterable
- `print(..., file=f)` for formatted output

### Best Practices
1. **Always** use `with` statements for file handling
2. **Always** specify `encoding='utf-8'` explicitly when opening files in text mode
3. Use iteration (not `read()`) for large files
4. Use `'x'` mode when you need to prevent accidental overwrites
5. Use binary mode for non-text data (images, archives, protocols)
6. Use `seek()`/`tell()` in binary mode for random access; avoid in text mode