# Week 1 — Part 02: Data profiling script (CSV → JSON/Markdown)

**Estimated time:** 90–120 minutes

## Learning Objectives

- Treat real-world CSV data as untrusted input
- Build a deterministic profiling artifact (`profile.json` + `profile.md`)
- Fail fast with clear errors for missing/empty inputs
- Add optional schema/required-column checks


## Overview

In AI/ML/LLM projects, most pain starts with data issues:

- wrong column names
- unexpected types
- empty files
- missing values

A data profiling script makes these issues visible early.

---

## Output contract

Given the same input CSV, the script should always produce:

- `output/profile.json` (machine-readable)
- `output/profile.md` (human-readable)

And it should fail with clear errors for:

- missing file
- empty file
- missing required columns (optional extension)

Key reproducibility detail: keep outputs deterministic (for example, sorted JSON keys and a stable column order) so that diffs between runs reflect real changes in the data.

In [None]:
from __future__ import annotations

import json
from dataclasses import asdict, dataclass
from pathlib import Path


# pandas is treated as optional at import time: if importing it fails for any
# reason, the module still loads and the failure is remembered so load_csv can
# report it when it is actually needed.
try:
    import pandas as pd
except Exception as e:  # pragma: no cover
    pd = None  # sentinel checked by load_csv and the `if pd is not None` guards
    _pd_import_error = e  # only defined when the import above failed


# Directory that receives the generated artifacts (sample CSV, profile.json, profile.md).
OUTPUT_DIR = Path("output")
# Created as a side effect of running this cell, relative to the current
# working directory; an already-existing directory is not an error.
OUTPUT_DIR.mkdir(exist_ok=True)


@dataclass
class Profile:
    """Summary of a tabular dataset: shape, column names, dtypes, missing counts.

    Instances are plain data; ``dataclasses.asdict`` converts one to a dict
    suitable for JSON serialization.
    """

    rows: int  # number of rows
    cols: int  # number of columns
    columns: list[str]  # column labels, in the order the data presents them
    dtypes: dict[str, str]  # column label -> dtype rendered as a string
    missing_by_column: dict[str, int]  # column label -> count of missing values


def load_csv(path: Path):
    """Load a CSV file into a pandas DataFrame, failing fast on unusable input.

    Args:
        path: Location of the CSV file. A plain string is tolerated and
            converted to a ``Path``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        IsADirectoryError: If ``path`` is a directory.
        ValueError: If the file is zero bytes long, or pandas finds no columns
            to parse (for example, a file containing only blank lines).
        RuntimeError: If pandas could not be imported.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Input file not found: {path}")
    if path.is_dir():
        raise IsADirectoryError(f"Input path is a directory, not a file: {path}")
    if path.stat().st_size == 0:
        raise ValueError(f"Input file is empty: {path}")
    if pd is None:
        raise RuntimeError(f"pandas is required: {_pd_import_error}")
    try:
        return pd.read_csv(path)
    except pd.errors.EmptyDataError as err:
        # A non-zero-byte file can still hold no data (e.g. only newlines).
        # Report it with the same message as the zero-byte case; the original
        # pandas error stays attached as the cause.
        raise ValueError(f"Input file is empty: {path}") from err


def make_profile(df) -> Profile:
    """Summarize a DataFrame as a ``Profile``.

    Args:
        df: The DataFrame to describe.

    Returns:
        A ``Profile`` holding the row and column counts, the column labels,
        each column's dtype as a string, and each column's missing-value count.
    """
    n_rows, n_cols = df.shape
    null_counts = df.isna().sum()
    return Profile(
        rows=int(n_rows),
        cols=int(n_cols),
        columns=list(df.columns),
        dtypes={name: str(kind) for name, kind in df.dtypes.items()},
        # int() turns the pandas/numpy counts into plain Python integers.
        missing_by_column={name: int(count) for name, count in null_counts.items()},
    )


# Signals that the cell's definitions above executed without raising.
print("ready")

In [None]:
# Write a tiny demo dataset to disk so the profiler has something to read
# (non-verbatim example). Skipped entirely when pandas is unavailable.
if pd is not None:
    sample_path = OUTPUT_DIR.joinpath("sample_profile.csv")
    # "age" and "country" each contain one None so the profile reports
    # non-zero missing counts.
    df = pd.DataFrame(
        dict(
            user_id=[1, 2, 3, 4],
            age=[22, None, 35, 29],
            country=["US", "SG", None, "US"],
        )
    )
    df.to_csv(sample_path, index=False)
    print("wrote sample:", sample_path)

In [None]:
def _escape_md_cell(value) -> str:
    """Render *value* as text that is safe inside a single Markdown table cell."""
    text = str(value)
    # A line break would end the table row early, so flatten it to a space.
    for newline in ("\r\n", "\n", "\r"):
        text = text.replace(newline, " ")
    # An unescaped pipe would be read as a cell separator.
    return text.replace("|", "\\|")


def profile_to_markdown(p: Profile) -> str:
    """Render a profile as a Markdown report.

    The report has a title, row and column counts, and a table with one row
    per column (label, dtype, missing count). Column labels are escaped so
    that a ``|`` or a line break in a label cannot break the table layout.

    Args:
        p: The profile to render.

    Returns:
        The Markdown document as a string ending with a newline.
    """
    header = [
        "# Data Profile",
        "",
        f"- Rows: {p.rows}",
        f"- Columns: {p.cols}",
        "",
        "## Columns",
        "",
        "| column | dtype | missing |",
        "|---|---|---:|",
    ]
    # Lookups use the original label; only the rendered cell text is escaped.
    table_rows = [
        f"| {_escape_md_cell(col)} | {p.dtypes.get(col, '')} | {p.missing_by_column.get(col, 0)} |"
        for col in p.columns
    ]
    # The trailing empty entry makes the joined text end with a newline.
    return "\n".join([*header, *table_rows, ""])


# End-to-end run on the sample CSV: load it, profile it, and write both
# artifacts. Skipped when pandas is unavailable.
if pd is not None:
    df2 = load_csv(sample_path)
    p = make_profile(df2)
    # sort_keys=True with a fixed indent keeps the JSON text stable between runs.
    (OUTPUT_DIR / "profile.json").write_text(json.dumps(asdict(p), indent=2, sort_keys=True), encoding="utf-8")
    (OUTPUT_DIR / "profile.md").write_text(profile_to_markdown(p), encoding="utf-8")
    print("wrote:", OUTPUT_DIR / "profile.json")
    print("wrote:", OUTPUT_DIR / "profile.md")

In [None]:
def require_columns(df, required: list[str]) -> None:
    """Check that every required column is present in a DataFrame.

    Args:
        df: The DataFrame whose ``columns`` are inspected.
        required: Column labels that must be present.

    Raises:
        ValueError: If any required label is absent; the message lists the
            absent labels in the order they were requested.
    """
    available = df.columns
    absent = []
    for name in required:
        if name in available:
            continue
        absent.append(name)
    if not absent:
        return
    raise ValueError(f"Missing required columns: {absent}")


# Placeholder for follow-up work; this cell only prints a reminder.
print("TODO: extend with required columns + numeric summaries")