# Foundations Course — Week 6 Practice (Starter Notebook)

Starter Capstone pipeline skeleton: CSV -> profile -> sample/compress -> LLM placeholder -> report artifacts.

## What success looks like (end of practice)

- You produce `output/report.json` and `output/report.md`.
- You can inspect `output/capstone_sample.csv` and explain how sampling/compression works.
- You implement the TODO exercise and verify it writes an artifact under `output/`.

### Checkpoint

- Re-running the notebook produces the same `report.json` fields.
- `output/capstone_sample.csv` exists.

## References (docs)
- Pandas I/O (CSV): https://pandas.pydata.org/docs/user_guide/io.html
- Pandas sampling: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sample.html
- JSON Schema: https://json-schema.org/
- Python `json`: https://docs.python.org/3/library/json.html
- Twelve-Factor App: https://12factor.net/

## Setup

Runnable without an API key because the LLM call is a placeholder.


In [None]:
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional

import pandas as pd


In [None]:
# Directory that receives the artifacts written by the cells below.
from pathlib import Path
OUTPUT_DIR = Path('output')
# exist_ok=True keeps this cell re-runnable once the directory already exists.
OUTPUT_DIR.mkdir(exist_ok=True)
# Bare expression: shown as the cell's output.
OUTPUT_DIR


## Step 0: Load CSV (this starter builds a small DataFrame in memory and saves it to `output/capstone_sample.csv`)


In [None]:
# Small in-memory dataset; the None entries are the missing values in
# age, country and purchase_amount.
df = pd.DataFrame({
    'user_id': [1, 2, 3, 4, 5, 6, 7, 8],
    'age': [23, None, 31, 45, 29, 35, None, 41],
    'country': ['US', 'US', 'SG', None, 'CN', 'US', 'SG', 'CN'],
    'purchase_amount': [12.5, 0.0, 7.99, 103.2, None, 5.0, 1000.0, 8.2],
})
# Save the frame under OUTPUT_DIR; index=False leaves the row index out of the file.
csv_path = OUTPUT_DIR / 'capstone_sample.csv'
df.to_csv(csv_path, index=False)
# Bare expression: shown as the cell's output.
df


## Step 1: Profile


In [None]:
def profile_df(df: pd.DataFrame) -> Dict[str, Any]:
    """Summarise the shape, dtypes, missing values and numeric columns of `df`.

    Args:
        df: Table to profile.

    Returns:
        A dict with these keys:
            'n_rows', 'n_cols': table dimensions as plain ints.
            'dtypes': column name -> dtype rendered with str().
            'missing_by_col': column name -> number of missing cells (int).
            'duplicate_rows': number of rows flagged by `df.duplicated()`.
            'numeric_stats': `describe()` output for the numeric columns as
                nested dicts, or {} when `df` has no numeric columns.
    """
    numeric = df.select_dtypes(include='number')
    # describe() raises ValueError on a frame without columns, so a table
    # made only of text columns would otherwise crash the whole profile.
    numeric_stats = numeric.describe().to_dict() if numeric.shape[1] > 0 else {}
    return {
        'n_rows': int(df.shape[0]),
        'n_cols': int(df.shape[1]),
        'dtypes': {k: str(v) for k, v in df.dtypes.items()},
        # Explicit int() keeps the counts JSON-serialisable plain ints.
        'missing_by_col': {col: int(count) for col, count in df.isna().sum().items()},
        'duplicate_rows': int(df.duplicated().sum()),
        'numeric_stats': numeric_stats,
    }

# Profile the dataset once; later cells read this dict.
profile = profile_df(df)
# Bare expression: shown as the cell's output.
profile


## Step 2: Sample/compress input for the LLM


In [None]:
def make_llm_input(df: pd.DataFrame, profile: Dict[str, Any], sample_n: int = 5) -> Dict[str, Any]:
    """Build the compact payload for the LLM step: summary, sample rows, anomalies.

    Args:
        df: Source table.
        profile: Mapping that provides the 'n_rows', 'n_cols',
            'missing_by_col', 'dtypes' and 'numeric_stats' keys.
        sample_n: Maximum number of rows to sample; capped at `len(df)`.

    Returns:
        A dict with these keys:
            'profile_summary': the four shape/dtype/missing entries of `profile`.
            'numeric_stats': `profile['numeric_stats']` passed through.
            'sample_rows': sampled rows as a list of record dicts.
            'anomalies': one {'reason', 'row'} dict per row whose
                'purchase_amount' is above the 95th percentile of the
                non-missing amounts. Empty when the column is absent or
                holds no values.
    """
    # Fixed random_state so re-running yields the same sample.
    sample = df.sample(n=min(sample_n, len(df)), random_state=42).to_dict(orient='records')
    anomalies: List[Dict[str, Any]] = []
    if 'purchase_amount' in df.columns:
        amounts = df['purchase_amount']
        observed = amounts.dropna()
        if len(observed) > 0:
            threshold = float(observed.quantile(0.95))
            reason = 'purchase_amount above 95th percentile ({:.2f})'.format(threshold)
            # Fill missing amounts with the threshold itself so they can never
            # compare as "above" it. A fixed filler such as -1 would flag the
            # missing rows whenever the threshold is below that filler.
            flagged = df[amounts.fillna(threshold) > threshold]
            anomalies = [
                {'reason': reason, 'row': row.to_dict()}
                for _, row in flagged.iterrows()
            ]
    return {
        'profile_summary': {
            'n_rows': profile['n_rows'],
            'n_cols': profile['n_cols'],
            'missing_by_col': profile['missing_by_col'],
            'dtypes': profile['dtypes'],
        },
        'numeric_stats': profile['numeric_stats'],
        'sample_rows': sample,
        'anomalies': anomalies,
    }

# Build the payload for the LLM step from the dataset and its profile.
llm_input = make_llm_input(df, profile, sample_n=5)
# Bare expression: shown as the cell's output.
llm_input


## Step 3: LLM explanation (placeholder)


In [None]:
def llm_explain_placeholder(llm_input: Dict[str, Any]) -> Dict[str, Any]:
    """Return a canned explanation in place of a real LLM call.

    `llm_input` is accepted so the signature matches a real call, but it is
    not read: the same three lists are returned for every input.
    """
    insights = [
        'Missing values exist in age/country/purchase_amount; decide whether to impute or drop.',
        'High purchase_amount outliers may indicate VIP customers or data issues.',
    ]
    recommendations = [
        'Add input validation for required columns before analysis.',
        'Track missing value rates over time as a data quality metric.',
        'Investigate outliers with row-level drill-down and source validation.',
    ]
    risk_notes = [
        'Small samples may not represent the full dataset; confirm with stratified sampling if needed.',
    ]
    return dict(
        insights=insights,
        recommendations=recommendations,
        risk_notes=risk_notes,
    )

# Placeholder call: returns fixed text, so no API key is involved.
llm_explanation = llm_explain_placeholder(llm_input)
# Bare expression: shown as the cell's output.
llm_explanation


## Step 4: Write report artifacts


In [None]:
@dataclass
class ReportPaths:
    """Locations of the two report files, as returned by `write_report`."""
    # Path of the JSON report file.
    report_json: Path
    # Path of the Markdown report file.
    report_md: Path

def build_report(
    profile: Dict[str, Any],
    llm_explanation: Dict[str, Any],
    llm_input: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Assemble the report dict from the profile, the anomalies and the explanation.

    Args:
        profile: Mapping that provides 'n_rows', 'n_cols', 'dtypes' and
            'missing_by_col'.
        llm_explanation: Mapping that may provide 'insights',
            'recommendations' and 'risk_notes'; absent keys become [].
        llm_input: Mapping whose 'anomalies' entry is copied into the report.
            When omitted, a module-level `llm_input` variable is used if one
            exists; otherwise the report carries no anomalies.

    Returns:
        A dict with 'data_overview', 'anomalies', 'insights',
        'recommendations' and 'risk_notes' keys.
    """
    if llm_input is None:
        # Backward compatibility: two-argument callers relied on a
        # notebook-level `llm_input`. Use it when it is defined instead of
        # raising NameError when it is not.
        llm_input = globals().get('llm_input') or {}
    return {
        'data_overview': {
            'n_rows': profile['n_rows'],
            'n_cols': profile['n_cols'],
            'dtypes': profile['dtypes'],
            'missing_by_col': profile['missing_by_col'],
        },
        'anomalies': llm_input.get('anomalies', []),
        'insights': llm_explanation.get('insights', []),
        'recommendations': llm_explanation.get('recommendations', []),
        'risk_notes': llm_explanation.get('risk_notes', []),
    }

def _json_safe(value: Any) -> Any:
    """Return a copy of `value` with NaN/infinite floats replaced by None."""
    import math  # local import: the notebook's import cell does not load math

    if isinstance(value, float) and not math.isfinite(value):
        return None
    if isinstance(value, dict):
        return {key: _json_safe(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_json_safe(item) for item in value]
    return value


def write_report(report: Dict[str, Any], out_dir: Path) -> ReportPaths:
    """Write `report` as `report.json` and `report.md` inside `out_dir`.

    Args:
        report: Report mapping. 'data_overview' (with 'n_rows' and 'n_cols')
            is required; 'insights', 'recommendations' and 'risk_notes' are
            optional lists rendered as Markdown bullets.
        out_dir: Target directory; created, including parents, when missing.

    Returns:
        The paths of the two files that were written.

    Raises:
        KeyError: If `report` has no 'data_overview' entry, or that entry
            lacks 'n_rows' or 'n_cols'.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    report_json = out_dir / 'report.json'
    report_md = out_dir / 'report.md'

    # Rows with missing cells carry float NaN, which json.dumps would emit as
    # the bare token NaN. That is not valid JSON, so write null instead and
    # let allow_nan=False guard against any that slip through.
    report_json.write_text(
        json.dumps(_json_safe(report), ensure_ascii=False, indent=2, allow_nan=False),
        encoding='utf-8',
    )

    overview = report['data_overview']
    md_lines = [
        '# Report',
        '',
        '## Data Overview',
        'Rows: {}'.format(overview['n_rows']),
        'Cols: {}'.format(overview['n_cols']),
    ]
    sections = (
        ('Insights', 'insights'),
        ('Recommendations', 'recommendations'),
        ('Risk Notes', 'risk_notes'),
    )
    for title, key in sections:
        # A blank line separates each heading from the preceding content.
        md_lines.extend(['', '## ' + title])
        md_lines.extend('- ' + str(item) for item in report.get(key, []))
    report_md.write_text('\n'.join(md_lines), encoding='utf-8')
    return ReportPaths(report_json=report_json, report_md=report_md)

# Assemble the report dict, then write report.json and report.md under OUTPUT_DIR.
report = build_report(profile, llm_explanation)
paths = write_report(report, OUTPUT_DIR)
# Bare expression: shown as the cell's output.
paths


## Exercise (TODO)

Add a required-columns guard before running the pipeline.

Goal:

- Implement `assert_required_columns_todo(df, required)`.
- Save the check result under `output/required_columns.json`.

Checkpoint:

- Calling the function with a missing column raises a clear `ValueError`.

In [None]:
def assert_required_columns_todo(df: pd.DataFrame, required: List[str]) -> None:
    """Raise ValueError listing every name in `required` that is not a column of `df`.

    Returns None when all required columns are present.
    """
    absent = []
    for name in required:
        if name not in df.columns:
            absent.append(name)
    if not absent:
        return
    raise ValueError("missing required columns: %s" % absent)


# Columns the guard checks for; the call below raises ValueError if any is absent.
required_cols = ["user_id", "age", "country", "purchase_amount"]
assert_required_columns_todo(df, required_cols)

# Reached only when the guard passed: record the required and the actual columns.
(OUTPUT_DIR / "required_columns.json").write_text(
    json.dumps({"required": required_cols, "present": list(df.columns)}, indent=2),
    encoding="utf-8",
)
print("wrote:", OUTPUT_DIR / "required_columns.json")

## Appendix: Solutions (peek only after trying)

Reference implementation for `assert_required_columns_todo`.

In [None]:
def assert_required_columns_todo(df: pd.DataFrame, required: List[str]) -> None:
    """Check that `df` has every column named in `required`.

    Raises:
        ValueError: If one or more names are absent; the message lists them
            in the order they appear in `required`.
    """
    columns = df.columns
    missing = list(filter(lambda name: name not in columns, required))
    if len(missing) > 0:
        raise ValueError("missing required columns: %s" % missing)


# Failure-path demo: with 'country' dropped the guard raises ValueError,
# and the error message is saved as an artifact under OUTPUT_DIR.
try:
    assert_required_columns_todo(df.drop(columns=["country"]), required_cols)
except ValueError as e:
    (OUTPUT_DIR / "required_columns_failure.json").write_text(
        json.dumps({"error": str(e)}, indent=2),
        encoding="utf-8",
    )
    print("wrote:", OUTPUT_DIR / "required_columns_failure.json")