<a href="https://colab.research.google.com/github/liljar2004-sudo/Kenjar_DTSC3020/blob/main/assignment_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pathlib import Path
import re
import csv

# File paths (relative, so they resolve against the current working directory)
input_path = Path("contacts_raw.txt")
output_path = Path("contacts_clean.csv")

# Regex for email validation: a local part made of letters, digits and ._%+-,
# then "@", then a domain made of letters, digits, dots and hyphens, ending in
# a dot plus an alphabetic suffix of at least two letters.
# NOTE: the domain class allows consecutive dots, so "a@b..com" is accepted.
email_pattern = re.compile(r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$")

def normalize_phone(raw: str) -> str:
    """Normalize a phone number to its last 10 digits.

    Every decimal digit in *raw* is kept, in order, and every other character
    (spaces, punctuation, letters, "+") is discarded. Non-ASCII decimal digits
    such as full-width or Arabic-Indic numerals are converted to their ASCII
    equivalents so the result is always made of the characters 0-9.

    Args:
        raw: Phone number text in any format.

    Returns:
        The last 10 digits as an ASCII string, or "" when *raw* contains
        fewer than 10 digits.
    """
    # str.isdecimal() is true for any Unicode decimal digit; int() maps each
    # such character to its numeric value, which str() renders as ASCII.
    digits = "".join(str(int(ch)) for ch in raw if ch.isdecimal())
    # Keeping the tail drops leading country codes / trunk prefixes.
    return digits[-10:] if len(digits) >= 10 else ""

# Load the raw contact list, keeping one cleaned row per valid email address
contacts = []
seen_emails = set()

try:
    with input_path.open("r", encoding="utf-8") as f:
        for line in f:
            parts = list(map(str.strip, line.strip().split(",")))

            # Rows need at least name, email and phone; extra columns are ignored
            if len(parts) < 3:
                continue
            name, email, phone = parts[:3]

            # Drop rows whose email does not match the validation pattern
            if email_pattern.fullmatch(email) is None:
                continue

            # First occurrence wins; later case-insensitive duplicates are dropped
            email_key = email.casefold()
            if email_key in seen_emails:
                continue
            seen_emails.add(email_key)

            # Store the phone in normalized form
            phone = normalize_phone(phone)
            contacts.append((name, email, phone))

except FileNotFoundError:
    print(f"⚠️ File not found: {input_path}. Please make sure the file exists.")
else:
    # Only reached when the input was read successfully: emit header + rows
    with output_path.open("w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows([["name", "email", "phone"], *contacts])

    print(f"✅ Cleaned contacts written to {output_path}")


In [None]:
import re
import io
import pytest
from pathlib import Path

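# === Import the helpers under test from the main script ===
# Assumes the main script (the previous cell) is saved as crm_cleanup.py.
# NOTE: importing it also runs its top-level code, which reads contacts_raw.txt
# and writes contacts_clean.csv (or prints a warning if the input is missing).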
from crm_cleanup import normalize_phone, email_pattern


# --- Helper function for parsing tests ---
def parse_contacts_from_string(data):
    """Parse comma-separated contact lines held in a string.

    Applies the same steps as the file-based cleanup without touching disk:
    rows with fewer than three fields or an email that fails ``email_pattern``
    are skipped, only the first row per case-insensitive email is kept, and
    the phone is passed through ``normalize_phone``.

    Returns a list of ``(name, email, phone)`` tuples in input order.
    """
    cleaned = []
    accepted_keys = set()

    for raw_line in io.StringIO(data):
        fields = [field.strip() for field in raw_line.strip().split(",")]
        if len(fields) >= 3:
            name, email, phone = fields[:3]
            key = email.casefold()
            # Keep the row only if the email is valid and not seen before
            if email_pattern.fullmatch(email) and key not in accepted_keys:
                accepted_keys.add(key)
                cleaned.append((name, email, normalize_phone(phone)))

    return cleaned


# --- Email validation tests ---
@pytest.mark.parametrize(
    "email,expected",
    [
        ("user@example.com", True),
        ("USER@Example.COM", True),
        ("user.name+tag@example.co.uk", True),
        ("user@", False),
        ("@example.com", False),
        ("userexample.com", False),
        ("user@.com", False),
    ],
)
def test_email_validation(email, expected):
    """Verify ``email_pattern`` accepts or rejects each sample as expected."""
    is_valid = email_pattern.fullmatch(email) is not None
    assert is_valid == expected


# --- Phone normalization tests ---
@pytest.mark.parametrize(
    "raw,expected",
    [
        ("1234567890", "1234567890"),
        ("(123) 456-7890", "1234567890"),
        ("+1 (234) 567-8901", "2345678901"),  # keep last 10 digits
        ("001-234-567-8901", "2345678901"),
        ("98765", ""),  # too short
        ("abc-123-4
