# Chapter 1: String Handling and Unicode

This notebook covers string operations, modern formatting techniques, Unicode handling, and string immutability in professional Python code.

## Section 1: Modern String Formatting

Python 3.6+ offers convenient f-strings that are preferred over older formatting methods.

In [None]:
# f-strings (preferred): anything written inside {braces} is evaluated and
# interpolated into the surrounding text.
name, age, height = "Alice", 30, 5.8

# Plain variable interpolation, one output line per value.
print(
    f"Name: {name}",
    f"Age: {age}",
    f"Height: {height} feet",
    sep="\n",
)

# Arbitrary expressions are allowed inside the braces, not just bare names.
print(
    f"\nNext year: {age + 1}",
    f"Age doubled: {age * 2}",
    sep="\n",
)

# Method calls are expressions too.
text = "hello"
print(
    f"Uppercase: {text.upper()}",
    f"Title case: {text.title()}",
    sep="\n",
)

In [None]:
# Number formatting: the text after ':' inside the braces is a format spec.
# NOTE: `name` and `age` are not assigned in this cell; they are reused from
# the previous cell, so that cell must be run first.
price = 19.99
percentage = 0.8567
large_number = 1234567

# Fixed decimal places
print(f"Price: ${price:.2f}")

# Percentage formatting ('%' scales by 100 and appends a percent sign)
print(f"Success rate: {percentage:.1%}")

# Thousand separators
print(f"Population: {large_number:,}")

# Alignment within a 10-character field
print(f"Left align:   '{name:<10}'")
print(f"Right align:  '{name:>10}'")
print(f"Center:       '{name:^10}'")

# Zero padding: '05d' fills with leading zeros (on the left) up to width 5
print(f"Zero padded:  '{age:05d}'")

In [None]:
# Older formatting methods (shown for reference only; prefer f-strings)
name = "Bob"
age = 25

# % formatting (old): printf-style placeholders filled from a tuple
old_style = "Name: %s, Age: %d" % (name, age)
print(f"Old style: {old_style}")

# .format() method: positional {} placeholders
format_style = "Name: {}, Age: {}".format(name, age)
print(f"Format: {format_style}")

# f-string (modern, preferred): values are named right where they appear
f_style = f"Name: {name}, Age: {age}"
print(f"F-string: {f_style}")

# All three approaches produce the same text; only readability differs.
print("\n✅ Use f-strings in new code!")

## Section 2: String Methods and Operations

In [None]:
# Sample text padded with two spaces on each side, so the effect of the
# strip methods is visible between the quotes in the output.
text = "  Hello, Python World!  "

# Case conversions: every method returns a new string, `text` is unchanged.
print(
    f"Original: '{text}'",
    f"Upper:    '{text.upper()}'",
    f"Lower:    '{text.lower()}'",
    f"Title:    '{text.title()}'",
    f"Swapcase: '{text.swapcase()}'",
    sep="\n",
)

# Whitespace removal: both ends, left end only, right end only.
print(
    f"\nStripped: '{text.strip()}'",
    f"Left stripped: '{text.lstrip()}'",
    f"Right stripped: '{text.rstrip()}'",
    sep="\n",
)

In [None]:
# Searching within a string and querying basic facts about it.
text = "Hello World"

print("\n".join((
    f"Text length: {len(text)}",
    f"Index of 'World': {text.find('World')}",
    f"Contains 'Hello': {'Hello' in text}",
    f"Starts with 'Hello': {text.startswith('Hello')}",
    f"Ends with 'World': {text.endswith('World')}",
)))

# count() reports how many non-overlapping occurrences of a substring exist.
message = "one two three two one"
print("\n".join((
    f"\nCount 'two' in message: {message.count('two')}",
    f"Count 'one' in message: {message.count('one')}",
)))

In [None]:
# Replace operations: replace() returns a new string; an optional third
# argument caps how many occurrences are replaced.
text = "one two three two one"

print(f"Original: {text}")
print(f"Replace 'two' with '2': {text.replace('two', '2')}")
print(f"Replace only first 'two': {text.replace('two', '2', 1)}")

# Partition: split once around a separator into (before, separator, after).
text = "The quick brown fox"
before, sep, after = text.partition(" brown ")
# Plain string here: there is nothing to interpolate, so no f-prefix is needed.
print("\nPartition on ' brown ':")
print(f"  Before: '{before}'")
print(f"  Sep:    '{sep}'")
print(f"  After:  '{after}'")

In [None]:
# Character-class predicates: each is*() method answers a yes/no question
# about the whole string.
test_strings = ["123", "hello", "hello123", "   ", "HELLO", "12.34"]

for s in test_strings:
    # One report per sample: a header line followed by one line per predicate.
    print(
        f"\n'{s}':",
        f"  isdigit: {s.isdigit()}",
        f"  isalpha: {s.isalpha()}",
        f"  isalnum: {s.isalnum()}",
        f"  isspace: {s.isspace()}",
        f"  isupper: {s.isupper()}",
        f"  islower: {s.islower()}",
        sep="\n",
    )

## Section 3: String Splitting and Joining

In [None]:
# Splitting on an explicit delimiter yields a list of the pieces.
csv_line = "apple,banana,cherry"
items = csv_line.split(",")
print(f"CSV: {csv_line}", f"Items: {items}", sep="\n")

# Capping the number of splits: at most one cut, so at most two parts.
limited = csv_line.split(",", maxsplit=1)
print(f"Limited split: {limited}")

# With no arguments, split() cuts on runs of whitespace.
sentence = "The quick brown fox"
words = sentence.split()
print(f"\nWords: {words}")

# Cutting at the LAST slash separates the directory from the file name.
path = "/home/user/documents/file.txt"
directory, _, filename = path.rpartition("/")
print(
    f"\nPath: {path}",
    f"Directory: {directory}",
    f"Filename: {filename}",
    sep="\n",
)

In [None]:
# Joining (more efficient than concatenation)
import time

items = ["apple", "banana", "cherry"]

# Inefficient: string concatenation in a loop
# result = ""
# for item in items:
#     result += item + ", "  # Creates new string each time

# Efficient: use join
result = ", ".join(items)
print(f"Joined: '{result}'")

# Different separators
print(f"Space separated: '{' '.join(items)}'")
print(f"Dash separated: '{'-'.join(items)}'")

# Build the newline-joined text OUTSIDE the f-string. A backslash inside an
# f-string expression is a SyntaxError before Python 3.12, and writing '\\n'
# there on 3.12+ joins with a literal backslash-n instead of a line break.
newline_joined = "\n".join(items)
print(f"Newline separated:\n{newline_joined}")

# Timing join on a larger input
large_list = [str(i) for i in range(10000)]

# perf_counter() is read before and after the join; the difference is the
# elapsed time in seconds, reported below in milliseconds.
start = time.perf_counter()
result = "".join(large_list)
elapsed = time.perf_counter() - start
print(f"\nJoining 10,000 strings: {elapsed*1000:.3f}ms")

## Section 4: Unicode and String Encoding

In [None]:
# Python 3 strings are Unicode by default, so text in any script (and emoji)
# can be written directly in a string literal.
emoji_text = "Hello 👋 World 🌍"
greek = "Ελληνικά"
russian = "Привет мир"
arabic = "مرحبا بالعالم"

print(f"Emoji: {emoji_text}")
print(f"Greek: {greek}")
print(f"Russian: {russian}")
print(f"Arabic: {arabic}")

# len() counts Unicode code points, not bytes: each emoji above is a single
# code point but takes four bytes in UTF-8, so the two numbers differ.
print(f"\n'{emoji_text}' has {len(emoji_text)} characters")
print(f"Bytes when encoded: {len(emoji_text.encode('utf-8'))} bytes")

In [None]:
import unicodedata

# Unicode normalization
# Some characters can be represented in multiple ways. The two spellings are
# written with explicit escapes so the difference survives copy/paste and
# re-encoding (typed literally, they look identical on screen).
text_composed = "caf\u00e9"     # é as ONE code point: U+00E9
text_decomposed = "cafe\u0301"  # 'e' followed by U+0301 COMBINING ACUTE ACCENT

print(f"Composed:   '{text_composed}'")
print(f"Decomposed: '{text_decomposed}'")
# False: the strings render alike but hold different code point sequences.
print(f"Equal: {text_composed == text_decomposed}")

# Normalize both to NFC (composed form)
nfc_composed = unicodedata.normalize("NFC", text_composed)
nfc_decomposed = unicodedata.normalize("NFC", text_decomposed)
print(f"\nAfter NFC normalization: {nfc_composed == nfc_decomposed}")

# Normalize to NFD (decomposed form)
nfd_composed = unicodedata.normalize("NFD", text_composed)
nfd_decomposed = unicodedata.normalize("NFD", text_decomposed)
print(f"After NFD normalization: {nfd_composed == nfd_decomposed}")

In [None]:
# Character information
# Every entry must be exactly ONE code point: unicodedata.category() and
# unicodedata.name() reject longer strings with a TypeError.
# NOTE: `unicodedata` is imported in the previous cell; run that cell first.
test_chars = ["A", "a", "5", "!", "é", "中", "😀"]

for char in test_chars:
    category = unicodedata.category(char)
    # Passing a default avoids the ValueError that name() raises for code
    # points without a name. `char_name` is used instead of `name` so the
    # `name` variable from the formatting cells is not overwritten.
    char_name = unicodedata.name(char, None)
    if char_name is None:
        print(f"'{char}': no Unicode name")
    else:
        print(f"'{char}': category={category}, name={char_name}")

## Section 5: String Immutability

In [None]:
# A str object cannot be changed once it has been created.
original = "Hello"

# Item assignment is rejected with a TypeError, which is caught and shown.
try:
    original[0] = "J"
except TypeError as e:
    print(f"Error: {e}")

# The way to "change" a string is to build a new one from pieces of the old.
modified = f"J{original[1:]}"
print(
    f"\nOriginal: '{original}' (id: {id(original)})",
    f"Modified: '{modified}' (id: {id(modified)})",
    f"Different objects: {id(original) != id(modified)}",
    sep="\n",
)

In [None]:
# Three ways to build "Hello World" from the same pieces; each one produces
# a brand-new string object.
s1, s2, s3 = "Hello", " ", "World"
parts = [s1, s2, s3]

# '+' concatenation: builds an intermediate string for every '+'.
result1 = s1 + s2 + s3
# f-string: the pieces are interpolated in a single expression.
result2 = f"{s1}{s2}{s3}"
# join over a list: the preferred form when there are many pieces.
result3 = "".join(parts)

print(
    f"Concatenation: '{result1}'",
    f"F-string: '{result2}'",
    f"Join: '{result3}'",
    f"\nAll equal: {result1 == result2 == result3}",
    sep="\n",
)

In [None]:
# Raw strings: with the r prefix a backslash is kept literally, which suits
# Windows paths and regular-expression patterns.
import re

# The same Windows path written twice: with doubled backslashes in a regular
# literal, and as-is in a raw literal.
regular = "C:\\Users\\Alice\\Documents"
raw = r"C:\Users\Alice\Documents"
print(
    f"Regular string: {regular}",
    f"Raw string: {raw}",
    f"Same: {regular == raw}",
    sep="\n",
)

# The same "one or more digits" regex pattern in both spellings.
pattern1 = "\\d+"  # regular literal: the backslash must be escaped
pattern2 = r"\d+"  # raw literal: written exactly as the regex reads
print(
    f"\nPattern 1: {pattern1}",
    f"Pattern 2: {pattern2}",
    f"Same: {pattern1 == pattern2}",
    sep="\n",
)

## Section 6: Multiline Strings

In [None]:
# Triple-quoted strings for multiple lines
# Everything between the opening and closing triple quotes is part of the
# string, including the newline right after the opening quotes and the
# four-space indentation at the start of each line below.
doc = """
    This is a multiline string.
    It spans multiple lines.
    Useful for docstrings and long text.
    """

print(doc)

# Preserves newlines
# The template starts and ends with a newline, so print() emits a blank line
# before <html> and after </html>.
html_template = """
<html>
  <head>
    <title>Page</title>
  </head>
  <body>
    <h1>Hello</h1>
  </body>
</html>
"""

print(html_template)

## Summary

### String Formatting
- **f-strings** (preferred): `f"{value:.2f}"`
- `.format()`: `"{0:.2f}".format(value)`
- `%` operator (legacy): `"%d" % value`

### Common Methods
- **Case**: `.upper()`, `.lower()`, `.title()`, `.capitalize()`
- **Whitespace**: `.strip()`, `.lstrip()`, `.rstrip()`
- **Search**: `.find()`, `.startswith()`, `.endswith()`, `.count()`
- **Transform**: `.replace()`, `.split()`, `.join()`
- **Type check**: `.isdigit()`, `.isalpha()`, `.isspace()`

### Key Concepts
- Strings are **immutable**—operations create new strings
- **Unicode handling** is transparent in Python 3
- Use `join()` instead of `+` for multiple concatenations
- **Raw strings** (`r"..."`) useful for paths and regex
- **f-strings** are preferred for formatting clarity