In [None]:
"""
DATA QUALITY CHECK SCRIPT
--------------------------------
Purpose:
- Load financial data
- Perform basic data quality validations
- Flag issues before reporting
- Export control report for FP&A / BI teams

Author: Karol Holda
"""

import pandas as pd
from pathlib import Path


# ============================
# CONFIGURATION
# ============================

    # Source extract read by the pipeline.
    INPUT_FILE = Path("data/finance_data.csv")

    # Columns the input extract is expected to provide.
    REQUIRED_COLUMNS = ["date", "account", "cost_center", "entity", "scenario", "amount"]

    # Month-over-month change ratio above which a month is flagged (0.3 == 30%).
    VARIANCE_THRESHOLD = 0.3

    # Report destination; the folder is created as soon as this code runs.
    OUTPUT_FOLDER = Path("output")
    OUTPUT_FOLDER.mkdir(exist_ok=True)
    REPORT_CSV = OUTPUT_FOLDER.joinpath("data_quality_report.csv")
    REPORT_XLSX = OUTPUT_FOLDER.joinpath("data_quality_report.xlsx")


# ============================
# LOAD DATA
# ============================

    def load_data(file_path: Path) -> pd.DataFrame:
        """
        Read the financial extract at ``file_path`` into a DataFrame.

        The ``date`` column is parsed into datetimes while the CSV is read.
        """
        return pd.read_csv(file_path, parse_dates=["date"])


# ============================
# VALIDATIONS
# ============================
    def check_schema(df: pd.DataFrame, required_columns: "list[str] | None" = None) -> pd.DataFrame:
        """
        Check whether the input DataFrame matches the expected schema.

        Args:
            df: Data to validate; only its column labels are inspected.
            required_columns: Column names that must be present. When omitted,
                the module-level ``REQUIRED_COLUMNS`` list is used.

        Returns:
            A DataFrame with one row per finding (``check_type``, ``severity``,
            ``description``, ``count``): an ERROR row listing missing required
            columns and/or a WARNING row listing unexpected extra columns.
            The frame is empty when the columns match exactly.
        """
        if required_columns is None:
            required_columns = REQUIRED_COLUMNS

        expected_cols = set(required_columns)
        actual_cols = set(df.columns)

        missing_cols = expected_cols - actual_cols
        extra_cols = actual_cols - expected_cols

        issues = []

        if missing_cols:
            # Labels are stringified so non-string column labels cannot break
            # sorting or joining.
            missing_names = ', '.join(sorted(map(str, missing_cols)))
            issues.append({
                "check_type": "SCHEMA_MISSING_COLUMNS",
                "severity": "ERROR",
                "description": f"Missing required columns: {missing_names}",
                "count": len(missing_cols)
            })

        if extra_cols:
            extra_names = ', '.join(sorted(map(str, extra_cols)))
            issues.append({
                "check_type": "SCHEMA_EXTRA_COLUMNS",
                "severity": "WARNING",
                "description": f"Unexpected extra columns detected: {extra_names}",
                "count": len(extra_cols)
            })

        return pd.DataFrame(issues)

    def check_missing_values(df: pd.DataFrame, required_columns: "list[str] | None" = None) -> pd.DataFrame:
        """
        Check for missing values in required columns.

        Args:
            df: Data to validate.
            required_columns: Column names to inspect. When omitted, the
                module-level ``REQUIRED_COLUMNS`` list is used.

        Returns:
            A DataFrame with one ERROR row per inspected column that contains
            at least one missing value; ``count`` holds the number of missing
            cells. The frame is empty when nothing is missing.
        """
        if required_columns is None:
            required_columns = REQUIRED_COLUMNS

        issues = []

        for col in required_columns:
            # A column that is absent altogether is a schema problem, not a
            # missing-value problem; skip it instead of failing with KeyError.
            if col not in df.columns:
                continue

            missing_count = int(df[col].isna().sum())
            if missing_count > 0:
                issues.append({
                    "check_type": "MISSING_VALUES",
                    "severity": "ERROR",
                    "description": f"Missing values in column '{col}'",
                    "count": missing_count
                })

        return pd.DataFrame(issues)


    def check_control_totals(df: pd.DataFrame) -> pd.DataFrame:
        """
        Flag every scenario whose summed ``amount`` is exactly zero.

        Returns a DataFrame with one ERROR row per such scenario; the frame
        is empty when no scenario total equals zero.
        """
        scenario_totals = df.groupby("scenario")["amount"].sum()
        zero_scenarios = scenario_totals[scenario_totals == 0].index

        findings = [
            {
                "check_type": "CONTROL_TOTAL",
                "severity": "ERROR",
                "description": f"Total amount for scenario '{name}' equals zero",
                "count": 1,
            }
            for name in zero_scenarios
        ]
        return pd.DataFrame(findings)


    def check_monthly_anomalies(df: pd.DataFrame, threshold: "float | None" = None) -> pd.DataFrame:
        """
        Detect strong Month-over-Month changes per scenario.

        Amounts are summed per calendar month and scenario, and each monthly
        total is compared with the previous available month of the same
        scenario.

        Args:
            df: Data with a datetime ``date`` column plus ``scenario`` and
                ``amount`` columns.
            threshold: Absolute relative change above which a month is
                flagged. When omitted, the module-level ``VARIANCE_THRESHOLD``
                is used.

        Returns:
            A DataFrame with one WARNING row per flagged month. Note that the
            ``count`` field carries the relative change rounded to two
            decimals, not a row count. The first month of a scenario is never
            flagged because it has no predecessor. A previous total of zero
            yields an infinite change and is flagged, unless the current total
            is zero too.
        """
        if threshold is None:
            threshold = VARIANCE_THRESHOLD

        df_sorted = df.sort_values("date")

        # Group on monthly periods rather than pd.Grouper(freq="M"): the "M"
        # offset alias is deprecated in recent pandas releases, while the "M"
        # period alias works across versions.
        monthly = (
            df_sorted.assign(date=df_sorted["date"].dt.to_period("M"))
            .groupby(["date", "scenario"])["amount"]
            .sum()
            .reset_index()
        )

        monthly["prev_amount"] = monthly.groupby("scenario")["amount"].shift(1)
        monthly["change_pct"] = (
            (monthly["amount"] - monthly["prev_amount"]) / monthly["prev_amount"].abs()
        )

        # NaN changes (no previous month, or 0/0) compare False and are dropped.
        anomalies = monthly[monthly["change_pct"].abs() > threshold]

        issues = [
            {
                "check_type": "MONTHLY_ANOMALY",
                "severity": "WARNING",
                "description": (
                    f"High MoM change in {scenario} "
                    f"for {period.strftime('%Y-%m')}"
                ),
                "count": round(float(change), 2)
            }
            for scenario, period, change in zip(
                anomalies["scenario"], anomalies["date"], anomalies["change_pct"]
            )
        ]

        return pd.DataFrame(issues)

# ============================
# MAIN PIPELINE
# ============================

    def run_data_quality_checks() -> pd.DataFrame:
        """
        Main pipeline:
        - load data
        - run schema validation
        - run content validations (only when all required columns exist)
        - combine results

        Returns:
            A DataFrame with one row per detected issue, or a single
            ``ALL_CHECKS`` / ``OK`` row when no check reported anything.
        """
        df = load_data(INPUT_FILE)

        issues_frames = [check_schema(df)]

        # The content checks index the required columns directly, so they can
        # only run when the schema is complete; otherwise the schema findings
        # alone are reported instead of failing with KeyError.
        if set(REQUIRED_COLUMNS).issubset(df.columns):
            issues_frames.extend([
                check_missing_values(df),
                check_control_totals(df),
                check_monthly_anomalies(df),
            ])

        # Drop empty frames before concatenating: pandas deprecates concat
        # with empty entries, and concat of an empty list raises ValueError.
        non_empty_frames = [frame for frame in issues_frames if not frame.empty]

        if not non_empty_frames:
            return pd.DataFrame([{
                "check_type": "ALL_CHECKS",
                "severity": "OK",
                "description": "No data quality issues detected",
                "count": 0
            }])

        return pd.concat(non_empty_frames, ignore_index=True)


# ============================
# EXPORT REPORT
# ============================

    def export_report(report_df: pd.DataFrame) -> None:
        """
        Write the data quality report to disk, first as CSV and then as Excel.

        Both files are written without the DataFrame index, to the
        ``REPORT_CSV`` and ``REPORT_XLSX`` paths respectively.
        """
        writers = (
            (report_df.to_csv, REPORT_CSV),
            (report_df.to_excel, REPORT_XLSX),
        )
        for write, destination in writers:
            write(destination, index=False)


# ============================
# ENTRY POINT
# ============================

    if __name__ == "__main__":
        # Run every check, persist the resulting report, then confirm on stdout.
        export_report(run_data_quality_checks())
        print("Data quality check completed successfully.")
