From 44ad494515f24403e895a7148e76d7dedf497e6e Mon Sep 17 00:00:00 2001
From: jedsonn <137447164+jedsonn@users.noreply.github.com>
Date: Wed, 8 Oct 2025 07:02:07 -0500
Subject: [PATCH] Add accounting analytics dashboard application
---
 README.md                           |  22 +++
 accounting_app/__init__.py          |  12 ++
 accounting_app/accounting_engine.py | 130 ++++++++++++++
 accounting_app/analytics.py         | 134 ++++++++++++++
 accounting_app/dashboard.py         | 203 +++++++++++++++++++++
 accounting_app/data_cleaner.py      |  78 +++++++++
 accounting_app/data_loader.py       | 108 ++++++++++++
 accounting_app/utils.py             | 263 ++++++++++++++++++++++++++++
 accounting_app/visualizations.py    | 139 +++++++++++++++
 requirements.txt                    |   9 +
 sample_data/dummy_budget.csv        |   7 +
 sample_data/dummy_transactions.csv  |  16 ++
 12 files changed, 1121 insertions(+)
 create mode 100644 accounting_app/__init__.py
 create mode 100644 accounting_app/accounting_engine.py
 create mode 100644 accounting_app/analytics.py
 create mode 100644 accounting_app/dashboard.py
 create mode 100644 accounting_app/data_cleaner.py
 create mode 100644 accounting_app/data_loader.py
 create mode 100644 accounting_app/utils.py
 create mode 100644 accounting_app/visualizations.py
 create mode 100644 requirements.txt
 create mode 100644 sample_data/dummy_budget.csv
 create mode 100644 sample_data/dummy_transactions.csv
diff --git a/README.md b/README.md
index 7736cd1..2be861d 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,28 @@ I think I've got things running smoothly and fixed some major bugs, but feel fre
 
 See more info at https://academicpages.github.io/
 
+## Accounting Analytics Dashboard
+
+This repository now includes a Python-based accounting analytics application located in `accounting_app/`. The tool ingests CSV or Excel exports from your accounting system, validates and cleans the data, generates financial statements, and exposes an interactive Streamlit dashboard complete with automated insights and export capabilities.
+
+### Features
+
+- Multi-file CSV/Excel ingestion with automatic sheet handling and column standardisation.
+- Data cleaning (duplicate removal, missing value handling) and validation checks (e.g., debit vs credit balancing, outlier detection).
+- Financial statement generation: Income Statement, Balance Sheet, Cash Flow Statement, and Trial Balance.
+- Analytical tooling for trend, variance, ratio, budget vs actual, and aging analysis.
+- Interactive dashboard with KPI cards, Plotly visualisations, drill-down filters, preset views by role, dark/light mode toggle, and PDF/CSV export.
+- Privacy-friendly operation: uploaded files are processed in temporary folders that are deleted after ingestion.
+
+### Getting Started
+
+1. Create and activate a virtual environment (optional but recommended).
+2. Install dependencies with `pip install -r requirements.txt`.
+3. Launch the dashboard using `streamlit run accounting_app/dashboard.py`.
+4. Upload one or more accounting files (see the templates in `sample_data/` for the expected format) and optionally a budget file to explore the reports.
+
+Sample datasets are provided in `sample_data/dummy_transactions.csv` and `sample_data/dummy_budget.csv` for quick experimentation.
+
 ## To run locally (not on GitHub Pages, to serve on your own computer)
 
 1. Clone the repository and made updates as detailed above
diff --git a/accounting_app/__init__.py b/accounting_app/__init__.py
new file mode 100644
index 0000000..1b8a233
--- /dev/null
+++ b/accounting_app/__init__.py
@@ -0,0 +1,12 @@
+"""Accounting analytics package exposing core utilities."""
+
+from . import analytics, data_cleaner, data_loader, visualizations
+from .accounting_engine import generate_statements
+
+__all__ = [
+    "analytics",
+    "data_cleaner",
+    "data_loader",
+    "visualizations",
+    "generate_statements",
+]
diff --git a/accounting_app/accounting_engine.py b/accounting_app/accounting_engine.py
new file mode 100644
index 0000000..a88c03d
--- /dev/null
+++ b/accounting_app/accounting_engine.py
@@ -0,0 +1,130 @@
+"""Core accounting calculations and statement generation."""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Dict, Optional
+
+import pandas as pd
+from pandas import DataFrame
+
+from . import utils
+
+
+@dataclass
+class StatementResult:
+    name: str  # statement title, e.g. "Income Statement"
+    data: DataFrame  # tabular statement body
+    metadata: Dict[str, str]  # auxiliary payloads keyed by name (may be empty)
+
+
+def _filter_period(df: DataFrame, start_date: Optional[datetime], end_date: Optional[datetime]) -> DataFrame:
+    """Restrict ``df`` to rows dated within the inclusive bounds; a falsy bound is not applied."""
+    window = df[df["date"] >= pd.Timestamp(start_date)] if start_date else df
+    if not end_date:
+        return window
+    return window[window["date"] <= pd.Timestamp(end_date)]
+
+
+def _category_totals(df: DataFrame) -> DataFrame:
+    """Sum ``amount`` per category, largest total first; an empty frame when there is no category column."""
+    if "category" in df.columns:
+        summed = df.groupby("category")["amount"].sum().reset_index()
+        return summed.sort_values(by="amount", ascending=False)
+    return pd.DataFrame(columns=["category", "amount"])
+
+
+def generate_income_statement(df: DataFrame, start_date: Optional[datetime] = None, end_date: Optional[datetime] = None) -> StatementResult:
+    """Summarise revenue, expense and net income (revenue minus expense) for the period.
+
+    Per-section category totals are stored as JSON records under ``metadata["detail"]``.
+    """
+    frame = _filter_period(df, start_date, end_date)
+    if "category" not in frame.columns:
+        # assign() builds a new frame, so the caller's DataFrame never gains a column.
+        frame = frame.assign(category=frame["account"].map(utils.infer_category))
+    revenue = frame[frame["category"].str.contains("Revenue", case=False, na=False)]
+    expense = frame[frame["category"].str.contains("Expense", case=False, na=False)]
+    other = frame[~frame.index.isin(revenue.index.union(expense.index))]
+    revenue_total, expense_total = revenue["amount"].sum(), expense["amount"].sum()
+    summary = pd.DataFrame(
+        {
+            "Category": ["Revenue", "Expense", "Net Income"],
+            "Amount": [revenue_total, expense_total, revenue_total - expense_total],
+        }
+    )
+    detail = pd.concat(
+        {"Revenue": _category_totals(revenue), "Expense": _category_totals(expense), "Other": _category_totals(other)},
+        names=["Section"],
+    ).reset_index(level=0)
+    return StatementResult(
+        name="Income Statement", data=summary, metadata={"detail": detail.to_json(orient="records")}
+    )
+
+
+def generate_balance_sheet(df: DataFrame, as_of: Optional[datetime] = None) -> StatementResult:
+    """Total the balances of asset, liability and equity categories, optionally up to ``as_of``."""
+    frame = df.copy()
+    if as_of:
+        frame = frame[frame["date"] <= pd.Timestamp(as_of)].copy()
+    if "balance" not in frame.columns:
+        # NOTE(review): this running total spans every row, not each account — confirm intent.
+        frame["balance"] = frame["amount"].cumsum()
+    pivot = frame.groupby("category")["balance"].sum()
+    assets = pivot.filter(regex="Asset", axis=0).sum()
+    liabilities = pivot.filter(regex="Liability", axis=0).sum()
+    equity = pivot.filter(regex="Equity", axis=0).sum()
+    sheet = pd.DataFrame(
+        {"Category": ["Assets", "Liabilities", "Equity"], "Amount": [assets, liabilities, equity]}
+    )
+    # Reassign instead of fillna(inplace=True) on a column selection, which can act on a temporary copy.
+    sheet["Amount"] = sheet["Amount"].fillna(0.0)
+    return StatementResult("Balance Sheet", sheet, metadata={})
+
+
+def generate_cash_flow(df: DataFrame, start_date: Optional[datetime] = None, end_date: Optional[datetime] = None) -> StatementResult:
+    """Group period amounts into operating, investing and financing totals by category keyword."""
+    frame = _filter_period(df, start_date, end_date)
+    if "category" not in frame.columns:
+        frame = frame.assign(category=frame["account"].map(utils.infer_category))  # never mutate the caller's df
+    frame = frame.sort_values(by="date")
+    operating = frame[frame["category"].str.contains("Expense|Revenue", case=False, na=False)]["amount"].sum()
+    investing = frame[frame["category"].str.contains("Asset", case=False, na=False)]["amount"].sum()
+    financing = frame[frame["category"].str.contains("Liability|Equity", case=False, na=False)]["amount"].sum()
+    cf = pd.DataFrame(
+        {
+            "Category": ["Operating Activities", "Investing Activities", "Financing Activities", "Net Change in Cash"],
+            "Amount": [operating, investing, financing, operating + investing + financing],
+        }
+    )
+    return StatementResult("Cash Flow Statement", cf, metadata={})
+
+
+def generate_trial_balance(df: DataFrame, as_of: Optional[datetime] = None) -> StatementResult:
+    """Per-account debit, credit and net (debit minus credit) totals, optionally up to ``as_of``."""
+    frame = df[df["date"] <= pd.Timestamp(as_of)] if as_of else df
+    # Amount-only uploads lack debit/credit; split amount by sign (amount = debit - credit) instead of failing.
+    absent = [side for side in ("debit", "credit") if side not in frame.columns]
+    frame = frame.assign(**{s: (frame["amount"] if s == "debit" else -frame["amount"]).clip(lower=0) for s in absent})
+    grouped = frame.groupby("account").agg({"debit": "sum", "credit": "sum"}).fillna(0.0)
+    return StatementResult("Trial Balance", grouped.assign(net=grouped["debit"] - grouped["credit"]).reset_index(), metadata={})
+
+
+def generate_statements(df: DataFrame, start_date: Optional[datetime] = None, end_date: Optional[datetime] = None) -> Dict[str, StatementResult]:
+    """Build all four statements; ``end_date`` doubles as the as-of date for the point-in-time ones."""
+    return {
+        "income_statement": generate_income_statement(df, start_date, end_date),
+        "balance_sheet": generate_balance_sheet(df, end_date),
+        "cash_flow": generate_cash_flow(df, start_date, end_date),
+        "trial_balance": generate_trial_balance(df, end_date),
+    }
+
+
+__all__ = [
+    "StatementResult",
+    "generate_income_statement",
+    "generate_balance_sheet",
+    "generate_cash_flow",
+    "generate_trial_balance",
+    "generate_statements",
+]
diff --git a/accounting_app/analytics.py b/accounting_app/analytics.py
new file mode 100644
index 0000000..45a2171
--- /dev/null
+++ b/accounting_app/analytics.py
@@ -0,0 +1,134 @@
+"""Analytics layer providing ratios and trend insights."""
+from __future__ import annotations
+
+from typing import Dict, Optional
+
+import numpy as np
+import pandas as pd
+
+from . import utils
+
+
+def _ensure_period(df: pd.DataFrame) -> pd.DataFrame:
+    """Return a copy of ``df`` with a ``period`` column holding the month start of each date."""
+    # assign() builds a new frame, leaving the caller's data untouched.
+    return df.assign(period=df["date"].dt.to_period("M").dt.to_timestamp())
+
+
+def revenue_vs_expense_trend(df: pd.DataFrame) -> pd.DataFrame:
+    """Monthly revenue, expense and net (revenue minus expense) totals, one row per period."""
+    monthly = _ensure_period(df).pivot_table(
+        values="amount", index="period", columns="category", aggfunc="sum", fill_value=0
+    )
+    sides = {label: monthly.filter(regex=label, axis=1).sum(axis=1) for label in ("Revenue", "Expense")}
+    trend = pd.DataFrame(sides)
+    return trend.assign(Net=trend["Revenue"] - trend["Expense"]).reset_index()
+
+
+def month_over_month(df: pd.DataFrame, column: str = "amount") -> pd.DataFrame:
+    """Growth of the monthly ``column`` totals as computed by ``utils.rolling_growth``."""
+    monthly_totals = _ensure_period(df).groupby("period")[column].sum()
+    growth = utils.rolling_growth(monthly_totals)
+    return growth.to_frame(name="mom_growth").reset_index()
+
+
+def year_over_year(df: pd.DataFrame, column: str = "amount") -> pd.DataFrame:
+    """Year-on-year fractional change of the annual ``column`` totals, keyed by the start of each year."""
+    yearly = df.groupby(df["date"].dt.to_period("Y")).agg({column: "sum"})
+    yoy = yearly.pct_change().rename(columns={column: "yoy_growth"})
+    yoy.index = yoy.index.to_timestamp()
+    return yoy.reset_index().rename(columns={"date": "period"})
+
+
+def ratio_analysis(df: pd.DataFrame) -> Dict[str, float]:  # liquidity, leverage and margin ratios
+    if "balance" not in df.columns:
+        df = df.sort_values("date").copy()
+        df["balance"] = df.groupby("account")["amount"].cumsum()  # running balance per account
+    latest_period = df["date"].max()
+    balance_sheet = df[df["date"] == latest_period]  # NOTE(review): only rows dated on the latest day count — confirm
+    assets = balance_sheet[balance_sheet["category"].str.contains("Asset", na=False)]["balance"].sum()
+    liabilities = balance_sheet[balance_sheet["category"].str.contains("Liability", na=False)]["balance"].sum()
+    equity = balance_sheet[balance_sheet["category"].str.contains("Equity", na=False)]["balance"].sum()
+    revenue = df[df["category"].str.contains("Revenue", na=False)]["amount"].sum()  # whole input, not just the latest day
+    expense = df[df["category"].str.contains("Expense", na=False)]["amount"].sum()
+    net_income = revenue - expense
+    return {
+        "current_ratio": utils.safe_divide(assets, liabilities),
+        "quick_ratio": utils.safe_divide(assets - balance_sheet[balance_sheet["account"].str.contains("Inventory", na=False)]["balance"].sum(), liabilities),
+        "debt_to_equity": utils.safe_divide(liabilities, equity),
+        "net_margin": utils.safe_divide(net_income, revenue),
+        "gross_margin": utils.safe_divide(
+            revenue - df[df["account"].str.contains("COGS|Cost of Goods", case=False, na=False)]["amount"].sum(),
+            revenue,
+        ),
+    }
+
+
+def top_expenses(df: pd.DataFrame, limit: int = 10) -> pd.DataFrame:
+    """Largest expense accounts by absolute summed amount, at most ``limit`` rows."""
+    totals = df[df["category"].str.contains("Expense", case=False, na=False)].groupby("account")["amount"].sum().abs()
+    return totals.sort_values(ascending=False).head(limit).reset_index().rename(columns={"amount": "total"})
+
+
+def top_revenue(df: pd.DataFrame, limit: int = 10) -> pd.DataFrame:
+    """Highest-earning revenue accounts by summed amount, at most ``limit`` rows."""
+    totals = df[df["category"].str.contains("Revenue", case=False, na=False)].groupby("account")["amount"].sum()
+    return totals.sort_values(ascending=False).head(limit).reset_index().rename(columns={"amount": "total"})
+
+
+def aging_analysis(df: pd.DataFrame, aging_column: str = "amount") -> pd.DataFrame:
+    """Total ``aging_column`` per bucket of days elapsed between each row's date and today."""
+    df = df.copy()
+    # A tz-naive "today": subtracting tz-naive dates from the tz-aware utcnow() raises TypeError.
+    df["days_outstanding"] = (pd.Timestamp.now().normalize() - df["date"]).dt.days
+    bins, labels = [0, 31, 61, 91, np.inf], ["0-30", "31-60", "61-90", "90+"]  # left-closed edges match the labels
+    df["aging_bucket"] = pd.cut(df["days_outstanding"], bins=bins, labels=labels, right=False)
+    return df.groupby("aging_bucket", observed=False)[aging_column].sum().reset_index()
+
+
+def budget_vs_actual(df: pd.DataFrame, budget_df: Optional[pd.DataFrame]) -> Optional[pd.DataFrame]:
+    """Merge monthly actual and budget totals with absolute and relative variance; ``None`` without a budget."""
+    if budget_df is None:
+        return None
+    actual_rows = _ensure_period(df)
+    if "date" not in budget_df.columns:
+        raise ValueError("Budget data must contain a date column")
+    budget = budget_df.copy()
+    budget["date"] = utils.parse_dates(budget["date"])
+    budget["period"] = budget["date"].dt.to_period("M").dt.to_timestamp()
+    actual = actual_rows.groupby("period")["amount"].sum().reset_index()
+    plan = budget.groupby("period")["amount"].sum().reset_index()
+    merged = pd.merge(actual, plan, on="period", how="outer", suffixes=("_actual", "_budget")).fillna(0)
+    merged["variance"] = merged["amount_actual"] - merged["amount_budget"]
+    merged["variance_pct"] = utils.safe_divide(merged["variance"], merged["amount_budget"].replace({0: np.nan}))
+    return merged
+
+
+def automated_insights(df: pd.DataFrame) -> Dict[str, str]:
+    """Compose a short narrative: latest net income, its change on the prior month, and the top expense."""
+    trend = revenue_vs_expense_trend(df)
+    if trend.empty:
+        return {"summary": "Insufficient data for insights."}
+    latest = trend.iloc[-1]
+    sentences = [f"Net income for {latest['period']:%B %Y} was {utils.currency_format(latest['Net'])}."]
+    previous = trend.iloc[-2] if len(trend) > 1 else None
+    if previous is not None and previous["Net"]:
+        change = utils.safe_divide(latest["Net"] - previous["Net"], previous["Net"])
+        sentences.append(f" This represents a {change:.1%} change from the prior month.")
+    leaders = top_expenses(df, limit=3)
+    if not leaders.empty:
+        sentences.append(f" Top expense category: {leaders.iloc[0]['account']} ({utils.currency_format(leaders.iloc[0]['total'])}).")
+    return {"summary": "".join(sentences)}
+
+
+__all__ = [
+    "revenue_vs_expense_trend",
+    "month_over_month",
+    "year_over_year",
+    "ratio_analysis",
+    "top_expenses",
+    "top_revenue",
+    "aging_analysis",
+    "budget_vs_actual",
+    "automated_insights",
+]
diff --git a/accounting_app/dashboard.py b/accounting_app/dashboard.py
new file mode 100644
index 0000000..e427ca2
--- /dev/null
+++ b/accounting_app/dashboard.py
@@ -0,0 +1,203 @@
+"""Streamlit dashboard for accounting analytics."""
+from __future__ import annotations
+
+import base64
+import shutil
+import tempfile
+from io import BytesIO
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import pandas as pd
+import plotly.graph_objects as go
+import streamlit as st
+
+from . import analytics, data_cleaner, data_loader, utils, visualizations
+from .accounting_engine import generate_statements
+
+st.set_page_config(page_title="Accounting Insights", layout="wide", initial_sidebar_state="expanded")
+
+
+@st.cache_data(show_spinner=False)
+def load_data(files: List[Any]) -> pd.DataFrame:
+    """Spool the uploads into a throwaway directory, load them, and always delete the directory."""
+    temp_dir = Path(tempfile.mkdtemp(prefix="acct-app-"))
+    temp_paths: List[str] = []
+    try:
+        for uploaded in files:
+            file_path = temp_dir / Path(uploaded.name).name  # basename only: the name must not escape temp_dir
+            file_path.write_bytes(uploaded.getvalue())
+            temp_paths.append(str(file_path))
+        return data_loader.load_sources(temp_paths)
+    finally:
+        shutil.rmtree(temp_dir, ignore_errors=True)
+
+
+@st.cache_data(show_spinner=False)
+def load_budget(file: Optional[Any]) -> Optional[pd.DataFrame]:
+    """Load the optional budget upload via a throwaway directory; ``None`` when nothing was uploaded."""
+    if file is None:
+        return None
+    temp_dir = Path(tempfile.mkdtemp(prefix="acct-budget-"))
+    temp_path = temp_dir / Path(file.name).name  # basename only: the name must not escape temp_dir
+    try:
+        temp_path.write_bytes(file.getvalue())
+        return data_loader.load_budget_file(str(temp_path))
+    finally:
+        shutil.rmtree(temp_dir, ignore_errors=True)
+
+
+def _export_csv(df: pd.DataFrame) -> bytes:  # UTF-8 CSV payload without the index column
+    return bytes(df.to_csv(index=False), "utf-8")
+
+
+def _export_pdf(figures: Dict[str, "go.Figure"]) -> bytes:
+    """Render each figure as a PNG on its own titled page and return the PDF as bytes."""
+    from fpdf import FPDF
+
+    pdf = FPDF()
+    pdf.set_auto_page_break(auto=True, margin=15)
+    for title, figure in figures.items():
+        pdf.add_page()
+        pdf.set_font("Arial", "B", 16)
+        pdf.cell(0, 10, title, ln=True)
+        pdf.image(BytesIO(figure.to_image(format="png")), w=180)
+    rendered = pdf.output(dest="S")  # str under legacy PyFPDF, bytearray under fpdf2
+    return rendered.encode("latin-1") if isinstance(rendered, str) else bytes(rendered)
+
+
+def sidebar_filters(df: pd.DataFrame) -> pd.DataFrame:
+    """Render the sidebar controls and return the rows of ``df`` matching the chosen filters."""
+    st.sidebar.header("Filters")
+    date_range = st.sidebar.date_input("Date Range", value=(df["date"].min().date(), df["date"].max().date()))
+    # Only date, account and amount are validated upstream, so the department column may be absent.
+    departments = sorted(df["department"].dropna().unique().tolist()) if "department" in df.columns else []
+    department = st.sidebar.selectbox("Department", options=["All"] + departments)
+    account = st.sidebar.selectbox("Account", options=["All"] + sorted(df["account"].dropna().unique().tolist()))
+    preset = st.sidebar.selectbox("Preset Views", options=list(utils.ROLE_PRESETS.keys()))
+    st.session_state["dark_theme"] = st.sidebar.toggle("Dark Theme", value=False)
+    filtered = df.copy()
+    # While the user is still picking the second date the widget returns a single-element tuple.
+    if isinstance(date_range, (tuple, list)) and len(date_range) == 2:
+        start, end = date_range
+        filtered = filtered[(filtered["date"] >= pd.Timestamp(start)) & (filtered["date"] <= pd.Timestamp(end))]
+    if department != "All":
+        filtered = filtered[filtered["department"] == department]
+    if account != "All":
+        filtered = filtered[filtered["account"] == account]
+    preset_config = utils.ROLE_PRESETS.get(preset)
+    if preset_config and preset_config.get("department") and "department" in filtered.columns:
+        filtered = filtered[filtered["department"] == preset_config["department"]]
+    return filtered
+
+
+def apply_theme() -> None:
+    """Inject dark-mode CSS when the ``dark_theme`` flag in session state is set."""
+    if st.session_state.get("dark_theme"):
+        # The injected markup used to be empty, so switching the toggle had no visible effect.
+        st.markdown(
+            "<style>.stApp { background-color: #0e1117; color: #fafafa; }</style>",
+            unsafe_allow_html=True,
+        )
+
+
+def main() -> None:
+    """Render the dashboard: upload, clean and validate data, then show statements, charts and exports."""
+    st.title("Accounting Analysis and Insights")
+    st.write("Upload accounting datasets to explore financial performance.")
+
+    uploaded_files = st.file_uploader("Upload CSV/Excel Files", type=["csv", "xlsx", "xls"], accept_multiple_files=True)
+    budget_file = st.file_uploader("Optional Budget File", type=["csv", "xlsx", "xls"], key="budget")
+
+    if not uploaded_files:
+        st.info("Please upload one or more accounting files to begin.")
+        return
+
+    with st.spinner("Loading data..."):
+        df = load_data(uploaded_files)
+        df = data_cleaner.clean_data(df)
+        data_cleaner.validate_required_columns(df, ["date", "account", "amount"])
+        imbalances = data_cleaner.validate_balances(df)
+        outliers = data_cleaner.flag_outliers(df)
+        budget_df = load_budget(budget_file)
+
+    filtered = sidebar_filters(df)
+    apply_theme()
+
+    statements = generate_statements(filtered)
+    ratios = analytics.ratio_analysis(filtered)
+    insights = analytics.automated_insights(filtered)
+
+    tabs = st.tabs(["Overview", "Income Statement", "Balance Sheet", "Cash Flow", "Detailed Analytics", "Data Explorer"])
+
+    with tabs[0]:
+        st.subheader("Key Metrics")
+        col1, col2, col3 = st.columns(3)
+        revenue_total = filtered[filtered["category"].str.contains("Revenue", na=False)]["amount"].sum()
+        expense_total = filtered[filtered["category"].str.contains("Expense", na=False)]["amount"].sum()
+        col1.metric("Total Revenue", utils.currency_format(revenue_total))
+        col2.metric("Total Expenses", utils.currency_format(expense_total))
+        # Net profit follows the income statement (revenue minus expense), not the sum of every ledger row.
+        col3.metric("Net Profit", utils.currency_format(revenue_total - expense_total))
+
+        st.plotly_chart(visualizations.revenue_vs_expense_chart(filtered), use_container_width=True)
+        st.plotly_chart(visualizations.expense_bar_chart(filtered), use_container_width=True)
+        st.plotly_chart(visualizations.revenue_breakdown_pie(filtered), use_container_width=True)
+        st.plotly_chart(visualizations.pnl_waterfall(filtered), use_container_width=True)
+        st.plotly_chart(visualizations.assets_vs_liabilities(filtered), use_container_width=True)
+        st.plotly_chart(visualizations.expense_heatmap(filtered), use_container_width=True)
+        st.plotly_chart(visualizations.kpi_gauges(ratios), use_container_width=True)
+        st.plotly_chart(visualizations.aging_bar_chart(filtered), use_container_width=True)
+        st.plotly_chart(visualizations.budget_vs_actual_chart(filtered, budget_df), use_container_width=True)
+
+        st.markdown(f"**Automated Insight:** {insights['summary']}")
+
+        if imbalances:
+            st.warning(f"Debit/Credit imbalance detected: {imbalances}")
+        if not outliers.empty:
+            st.warning("Potential outlier transactions:")
+            st.dataframe(outliers)
+
+    with tabs[1]:
+        st.subheader("Income Statement")
+        st.dataframe(statements["income_statement"].data)
+        st.download_button("Download Income Statement", data=_export_csv(statements["income_statement"].data), file_name="income_statement.csv", mime="text/csv")
+
+    with tabs[2]:
+        st.subheader("Balance Sheet")
+        st.dataframe(statements["balance_sheet"].data)
+
+    with tabs[3]:
+        st.subheader("Cash Flow Statement")
+        st.dataframe(statements["cash_flow"].data)
+
+    with tabs[4]:
+        st.subheader("Advanced Analytics")
+        st.dataframe(analytics.month_over_month(filtered))
+        st.dataframe(analytics.year_over_year(filtered))
+
+    with tabs[5]:
+        st.subheader("Data Explorer")
+        st.dataframe(filtered)
+        st.download_button("Export Filtered Data", data=_export_csv(filtered), file_name="filtered_data.csv", mime="text/csv")
+
+    if st.button("Download Dashboard Report"):
+        figures = {
+            "Revenue vs Expense": visualizations.revenue_vs_expense_chart(filtered),
+            "Top Expenses": visualizations.expense_bar_chart(filtered),
+            "Assets vs Liabilities": visualizations.assets_vs_liabilities(filtered),
+        }
+        pdf_bytes = _export_pdf(figures)
+        b64 = base64.b64encode(pdf_bytes).decode()
+        # Embed the PDF as a data URI so the rendered link actually downloads the report.
+        href = f'<a href="data:application/pdf;base64,{b64}" download="dashboard_report.pdf">Download Report</a>'
+        st.markdown(href, unsafe_allow_html=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/accounting_app/data_cleaner.py b/accounting_app/data_cleaner.py
new file mode 100644
index 0000000..a669c8e
--- /dev/null
+++ b/accounting_app/data_cleaner.py
@@ -0,0 +1,78 @@
+"""Data validation and cleaning utilities."""
+from __future__ import annotations
+
+from typing import Dict, List
+
+import pandas as pd
+from pandas import DataFrame
+
+from . import utils
+
+
+class DataValidationError(RuntimeError):
+    """Raised when the dataset fails validation."""
+
+
+def remove_duplicates(df: DataFrame) -> DataFrame:
+    """Drop rows repeating the same date/account/description/amount, using whichever of those columns exist."""
+    key_columns = [name for name in ("date", "account", "description", "amount") if name in df.columns]
+    # With none of the key columns present there is nothing to compare, so df is returned untouched.
+    return df.drop_duplicates(subset=key_columns) if key_columns else df
+
+
+def fill_missing_values(df: DataFrame) -> DataFrame:
+    """Fill gaps: object columns get their most frequent value, known numeric columns get 0.0."""
+    df = df.fillna(value={col: df[col].mode().iloc[0] for col in df.columns if df[col].dtype == "O" and not df[col].mode().empty})
+    for column in ("debit", "credit", "balance", "amount"):
+        if column in df.columns:
+            df[column] = df[column].fillna(0.0)
+    return df
+
+
+def validate_balances(df: DataFrame) -> Dict[str, float]:
+    """Check that debits equal credits per period.
+
+    Returns ``{month: debit - credit}`` for months whose gap exceeds 0.01 in
+    absolute value; empty when date, amount, debit or credit is missing.
+    """
+    needed = ("date", "amount", "debit", "credit")
+    if not all(name in df.columns for name in needed):
+        return {}
+    # assign() works on a copy, so the caller's frame never gains the helper column.
+    totals = df.assign(period=df["date"].dt.to_period("M")).groupby("period")[["debit", "credit"]].sum()
+    gaps = totals["debit"] - totals["credit"]
+    imbalances: Dict[str, float] = {}
+    for month, gap in gaps.items():
+        if abs(gap) > 0.01:
+            imbalances[str(month)] = float(gap)
+    return imbalances
+
+
+def flag_outliers(df: DataFrame) -> DataFrame:
+    """Rows whose absolute ``amount`` is reported by ``utils.summarise_outliers``; otherwise an empty frame."""
+    no_rows = pd.DataFrame(columns=df.columns)
+    if "amount" not in df.columns:
+        return no_rows
+    flagged = utils.summarise_outliers(df["amount"].abs())
+    return no_rows if flagged.empty else df.loc[flagged.index]
+
+
+def validate_required_columns(df: DataFrame, required: List[str]) -> None:  # raises DataValidationError
+    missing = [column for column in required if column not in df.columns]  # keeps the order of ``required``
+    if missing:
+        raise DataValidationError(f"Missing required columns: {', '.join(missing)}")
+
+
+def clean_data(df: DataFrame) -> DataFrame:
+    """Apply the cleaning pipeline: de-duplicate rows, then fill missing values."""
+    deduplicated = remove_duplicates(df)
+    return fill_missing_values(deduplicated)
+
+
+__all__ = [
+    "DataValidationError",
+    "clean_data",
+    "validate_required_columns",
+    "validate_balances",
+    "flag_outliers",
+]
diff --git a/accounting_app/data_loader.py b/accounting_app/data_loader.py
new file mode 100644
index 0000000..bd04b46
--- /dev/null
+++ b/accounting_app/data_loader.py
@@ -0,0 +1,108 @@
+"""Data loading utilities for the accounting analytics application."""
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Dict, Iterable, List, Optional, Tuple
+
+import pandas as pd
+from pandas import DataFrame
+
+from . import utils
+
+SUPPORTED_EXTENSIONS = {".csv", ".xlsx", ".xls"}
+
+
+class DataLoaderError(RuntimeError):
+    """Raised when the loader encounters an unrecoverable problem."""
+
+
+def _read_csv(path: Path) -> DataFrame:  # every cell is read as text; empty strings become missing values
+    return pd.read_csv(path, dtype=str).replace({"": None})
+
+
+def _read_excel(path: Path) -> Dict[str, DataFrame]:  # one text-typed frame per sheet, keyed by sheet name
+    excel = pd.read_excel(path, sheet_name=None, dtype=str)  # sheet_name=None loads every sheet
+    return {sheet: df.replace({"": None}) for sheet, df in excel.items()}
+
+
+def _coerce_types(df: DataFrame) -> DataFrame:
+    """Normalise column names, then parse the date column and convert the known money columns to numbers."""
+    df = utils.normalize_columns(df)
+    if "date" in df.columns:
+        df["date"] = utils.parse_dates(df["date"])
+    for name in [money for money in ("debit", "credit", "balance", "amount") if money in df.columns]:
+        df[name] = utils.ensure_numeric(df[name])
+    return df
+
+
+def _standardise(df: DataFrame) -> DataFrame:
+    """Coerce types, derive ``amount`` and ``category`` when absent, and drop rows lacking a date."""
+    df = _coerce_types(df)
+    debit, credit = df.get("debit"), df.get("credit")
+    if "amount" not in df.columns and not (debit is None and credit is None):
+        # A missing side counts as zero, so amount is debit minus credit.
+        inflow = 0 if debit is None else debit.fillna(0)
+        outflow = 0 if credit is None else credit.fillna(0)
+        df["amount"] = inflow - outflow
+    if "category" not in df.columns and "account" in df.columns:
+        df["category"] = df["account"].map(utils.infer_category)
+    if "date" not in df.columns:
+        return df
+    return df.dropna(subset=["date"])  # discard rows without a valid date
+
+
+def load_file(path: Path) -> List[Tuple[str, DataFrame]]:
+    """Load one file into ``(source_id, dataframe)`` pairs.
+
+    A CSV gives one pair named after the file stem; a workbook gives one ``stem:sheet`` pair per sheet.
+    """
+    suffix = path.suffix.lower()
+    if suffix not in SUPPORTED_EXTENSIONS:
+        raise DataLoaderError(f"Unsupported file format: {path.suffix}")
+    if suffix != ".csv":
+        # Every remaining supported extension is an Excel format.
+        workbook = _read_excel(path)
+        return [(f"{path.stem}:{sheet}", _standardise(frame)) for sheet, frame in workbook.items()]
+    return [(path.stem, _standardise(_read_csv(path)))]
+
+
+def load_sources(file_paths: Iterable[str]) -> DataFrame:
+    """Concatenate every frame loaded from ``file_paths``, tagging rows with their source id."""
+
+    frames: List[DataFrame] = []
+    for file_path in file_paths:
+        if not Path(file_path).exists():
+            raise DataLoaderError(f"File not found: {file_path}")
+        for source_id, frame in load_file(Path(file_path)):
+            frames.append(frame.assign(source=source_id))
+    if not frames:
+        raise DataLoaderError("No data was loaded from the provided sources")
+    combined = pd.concat(frames, ignore_index=True, sort=False)
+    # Chronological order is only possible when at least one source supplied a date column.
+    if "date" in combined:
+        combined = combined.sort_values(by="date")
+    return combined.reset_index(drop=True)
+
+
+def load_budget_file(path: Optional[str]) -> Optional[DataFrame]:
+    """Read the optional budget file; ``None`` when no path is given or nothing is loaded.
+
+    Raises ``DataLoaderError`` for a missing file or a budget lacking an ``amount`` column."""
+    if not path:
+        return None
+    if not Path(path).exists():
+        raise DataLoaderError(f"Budget file not found: {path}")
+    loaded = load_file(Path(path))
+    if not loaded:
+        return None
+    _, budget_df = loaded[0]  # only the first frame (the first sheet of a workbook) is used
+    if "amount" not in budget_df.columns:
+        raise DataLoaderError("Budget file must include an 'amount' column")
+    return budget_df
+
+
+__all__ = [
+    "DataLoaderError",
+    "load_sources",
+    "load_budget_file",
+]
diff --git a/accounting_app/utils.py b/accounting_app/utils.py
new file mode 100644
index 0000000..17c7d71
--- /dev/null
+++ b/accounting_app/utils.py
@@ -0,0 +1,263 @@
+"""Utility functions and constants for the accounting analytics app."""
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from datetime import datetime
+from functools import lru_cache
+from pathlib import Path
+from typing import Dict, Iterable, List, Mapping, Optional
+
+import numpy as np
+import pandas as pd
+
+# Canonical column name -> lower-case header aliases that may appear in source
+# data; detect_column() tries them in list order. Some aliases sit under two keys.
+COLUMN_ALIASES: Mapping[str, List[str]] = {
+    "date": [
+        "date",
+        "transaction date",
+        "posting date",
+        "journal date",
+        "entry date",
+    ],
+    "account": [
+        "account",
+        "account name",
+        "account title",
+        "gl account",
+        "account code",  # also an alias of "account_code" below
+    ],
+    "account_code": ["account code", "gl code", "code"],
+    "description": ["description", "memo", "details", "narration"],
+    "debit": ["debit", "debits", "debit amount"],
+    "credit": ["credit", "credits", "credit amount"],
+    "balance": ["balance", "running balance", "amount"],  # "amount" is also its own key below
+    "category": [
+        "category",
+        "account type",
+        "type",
+        "financial category",
+        "class",
+    ],
+    "reference": [
+        "reference",
+        "reference number",
+        "ref",
+        "document number",
+        "invoice number",
+    ],
+    "entity": [
+        "vendor",
+        "customer",
+        "entity",
+        "name",
+        "counterparty",
+        "vendor/customer",
+    ],
+    "department": ["department", "cost center", "location", "division"],
+    "amount": ["amount", "value", "net amount"],
+}
+
+STANDARD_COLUMNS: List[str] = [  # normalize_columns() resolves aliases in this order
+    "date",
+    "account",
+    "account_code",
+    "description",
+    "debit",
+    "credit",
+    "balance",
+    "amount",
+    "category",
+    "reference",
+    "entity",
+    "department",
+]
+
+# Keyword -> default category, used when the source data has no explicit
+# account type. infer_category() matches keywords as substrings and returns the
+# first hit in insertion order, so earlier entries take precedence over later ones.
+KEYWORD_CATEGORY_MAP: Mapping[str, str] = {
+    "cash": "Asset",
+    "bank": "Asset",
+    "receivable": "Asset",
+    "inventory": "Asset",
+    "prepaid": "Asset",
+    "asset": "Asset",
+    "payable": "Liability",
+    "loan": "Liability",
+    "tax": "Liability",
+    "liability": "Liability",
+    "equity": "Equity",
+    "capital": "Equity",
+    "revenue": "Revenue",
+    "sales": "Revenue",
+    "income": "Revenue",
+    "expense": "Expense",
+    "cost": "Expense",
+    "cogs": "Expense",
+}
+
+ROLE_PRESETS: Mapping[str, Dict[str, Optional[str]]] = {  # NOTE(review): presumably per-role dashboard defaults; None likely means "no department filter" — confirm
+    "CFO View": {"department": None, "focus_metric": "net_income"},
+    "Accountant View": {"department": None, "focus_metric": "trial_balance"},
+    "Department Manager": {"department": "Operations", "focus_metric": "expense"},
+}
+
+
+@dataclass
+class Period:
+    """Reporting window delimited by a start and an end datetime."""
+
+    start: datetime
+    end: datetime
+
+    @property
+    def label(self) -> str:
+        return " to ".join(bound.strftime("%Y-%m-%d") for bound in (self.start, self.end))
+
+
+def slugify(value: str) -> str:
+    """Lower-case ``value`` and join its alphanumeric runs with single hyphens."""
+
+    # Every non-alphanumeric character acts as a separator; splitting on the
+    # separators drops empty pieces, so no doubled or edge hyphens remain.
+    spaced = (ch if ch.isalnum() else " " for ch in value.lower())
+    return "-".join("".join(spaced).split())
+
+
+def ensure_directory(path: Path) -> None:
+    """Create ``path`` and any missing parents; a no-op if it already exists."""
+
+    path.mkdir(exist_ok=True, parents=True)
+
+
+def detect_column(columns: Iterable[str], target: str) -> Optional[str]:
+    """Return the source column matching the first alias of ``target``, else None."""
+
+    # str()/strip() tolerate numeric or padded headers instead of raising or missing.
+    lowered = {str(col).strip().lower(): col for col in columns}
+    for alias in COLUMN_ALIASES.get(target, []):
+        if alias in lowered:
+            return lowered[alias]  # the original, un-normalised column label
+    return None
+
+
+def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """Return ``df`` with every recognised header renamed to its standard name."""
+
+    # Pair each standard name with the source header it resolves to; the
+    # header is None when no alias of that standard name is present.
+    resolved = ((detect_column(df.columns, name), name) for name in STANDARD_COLUMNS)
+    # Built in STANDARD_COLUMNS order, so if two standards resolve to the same
+    # header the later one overwrites the earlier mapping.
+    renames = {header: name for header, name in resolved if header}
+    return df.rename(columns=renames)
+
+
+def parse_dates(series: pd.Series) -> pd.Series:
+    """Coerce ``series`` to datetimes; unparseable values become ``NaT``."""
+
+    if pd.api.types.is_datetime64_any_dtype(series):  # any resolution, tz-aware included
+        return series
+    return pd.to_datetime(series, errors="coerce", utc=False)  # pandas>=2 infers the format itself
+
+
+def ensure_numeric(series: pd.Series) -> pd.Series:
+    """Cast ``series`` to numbers; values that cannot be parsed become NaN."""
+
+    return pd.to_numeric(arg=series, errors="coerce")
+
+
+def infer_category(account: str) -> str:
+    """Infer a category from an account name by keyword; "Uncategorized" if none."""
+
+    if not isinstance(account, str):  # None / NaN / numeric codes: nothing to match on
+        return "Uncategorized"
+    account_lower = account.lower()
+    matches = (cat for key, cat in KEYWORD_CATEGORY_MAP.items() if key in account_lower)
+    return next(matches, "Uncategorized")  # first keyword in map order wins
+
+
+@lru_cache(maxsize=32)
+def load_json_config(path: str) -> Dict[str, str]:
+    """Parse the JSON file at ``path``, or return ``{}`` if it does not exist."""
+
+    # Memoised per path string: callers share one result object per path.
+    config_file = Path(path)
+    if config_file.exists():
+        return json.loads(config_file.read_text(encoding="utf-8"))
+    return {}
+
+
+def as_month_period(date: datetime) -> datetime:
+    """Map ``date`` to midnight on the first day of its month (naive datetime)."""
+
+    return datetime(year=date.year, month=date.month, day=1)
+
+
+def safe_divide(numerator: float, denominator: float) -> float:
+    """Divide, yielding ``np.nan`` for a missing, zero or near-zero denominator."""
+
+    # Floats are also checked with np.isclose so tiny non-zero values count as zero.
+    tiny = isinstance(denominator, float) and np.isclose(denominator, 0.0)
+    return np.nan if denominator in (0, None) or tiny else numerator / denominator
+
+
+def currency_format(value: float) -> str:
+    """Format ``value`` as dollars (e.g. ``-$1,234.50``); ``"-"`` when missing."""
+
+    if value is None or pd.isna(value):  # also covers pd.NA / NaT, not just float nan
+        return "-"
+    return f"-${-value:,.2f}" if value < 0 else f"${value:,.2f}"  # sign goes before the symbol
+
+
+def summarise_outliers(series: pd.Series, multiplier: float = 3.0) -> pd.Series:
+    """Return the values lying more than ``multiplier`` std devs from the mean."""
+
+    no_outliers = pd.Series(dtype=float)
+    if series.empty:
+        return no_outliers
+    spread = series.std(ddof=0)  # population standard deviation
+    if spread == 0:  # constant series: z-scores are undefined
+        return no_outliers
+    distance = np.abs((series - series.mean()) / spread)
+    return series[distance > multiplier]
+
+
+def to_excel(df_map: Mapping[str, pd.DataFrame], path: Path) -> None:
+    """Write each dataframe to its own sheet of the workbook at ``path``."""
+
+    path.parent.mkdir(parents=True, exist_ok=True)  # same effect as ensure_directory()
+    with pd.ExcelWriter(path, engine="openpyxl") as workbook:
+        for name, frame in df_map.items():
+            frame.to_excel(workbook, sheet_name=name[:31], index=False)  # sheet names are cut to 31 chars
+
+
+def rolling_growth(series: pd.Series) -> pd.Series:
+    """Relative change from the previous row; NaN where that row is 0 or absent."""
+
+    prior = series.shift(1)
+    return (series - prior).div(prior.replace({0: np.nan}))
+
+
+__all__ = [  # NOTE(review): KEYWORD_CATEGORY_MAP is not listed, unlike the other constants — confirm intent
+    "COLUMN_ALIASES",
+    "STANDARD_COLUMNS",
+    "ROLE_PRESETS",
+    "Period",
+    "slugify",
+    "ensure_directory",
+    "detect_column",
+    "normalize_columns",
+    "parse_dates",
+    "ensure_numeric",
+    "infer_category",
+    "load_json_config",
+    "as_month_period",
+    "safe_divide",
+    "currency_format",
+    "summarise_outliers",
+    "to_excel",
+    "rolling_growth",
+]
diff --git a/accounting_app/visualizations.py b/accounting_app/visualizations.py
new file mode 100644
index 0000000..9354f0f
--- /dev/null
+++ b/accounting_app/visualizations.py
@@ -0,0 +1,139 @@
+"""Plotly figure generation for the accounting dashboard."""
+from __future__ import annotations
+
+from typing import Dict, Optional
+
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+
+from . import analytics
+
+
+def kpi_card(title: str, value: float, delta: Optional[float] = None) -> go.Figure:
+    """Build a single-number KPI indicator, with a relative delta when given."""
+    has_delta = delta is not None
+    indicator = go.Indicator(
+        mode="number+delta" if has_delta else "number",
+        value=value or 0,  # falsy values (None, 0) are shown as 0
+        title={"text": title},
+        delta={"reference": delta, "relative": True} if has_delta else None,  # ``delta`` is the baseline
+        number={"valueformat": ".2f"},
+    )
+    card = go.Figure(indicator)
+    return card.update_layout(height=200, margin=dict(t=40, b=20, l=20, r=20))
+
+
+def revenue_vs_expense_chart(df: pd.DataFrame) -> go.Figure:
+    """Line chart of the Revenue and Expense series per period."""
+    series = analytics.revenue_vs_expense_trend(df)
+    chart = px.line(series, x="period", y=["Revenue", "Expense"], markers=True, title="Revenue vs Expense")
+    return chart.update_layout(hovermode="x unified")
+
+
+def expense_bar_chart(df: pd.DataFrame) -> go.Figure:
+    """Horizontal bar chart of ``analytics.top_expenses`` (limit 10) by account."""
+    top = analytics.top_expenses(df, limit=10)
+    bars = px.bar(top, x="total", y="account", orientation="h", title="Top Expenses")
+    return bars.update_layout(yaxis=dict(autorange="reversed"))  # flip the y axis direction
+
+
+def revenue_breakdown_pie(df: pd.DataFrame) -> go.Figure:
+    """Pie chart of ``analytics.top_revenue`` (limit 10), one slice per account."""
+    top = analytics.top_revenue(df, limit=10)
+    return px.pie(top, names="account", values="total", title="Revenue Breakdown")
+
+
+def pnl_waterfall(df: pd.DataFrame) -> go.Figure:
+    """Waterfall of Revenue, Expense and Net taken from the last row of the trend."""
+    trend = analytics.revenue_vs_expense_trend(df)
+    if trend.empty:
+        return go.Figure()
+    latest = trend.iloc[-1]
+    labels = ["Revenue", "Expense", "Net"]
+    amounts = [latest["Revenue"], -abs(latest["Expense"]), latest["Net"]]  # expense always drawn as a decrease
+    bars = go.Waterfall(
+        name="P&L",
+        orientation="v",
+        measure=["relative", "relative", "total"],
+        x=labels,
+        y=amounts,
+    )
+    return go.Figure(bars).update_layout(title="Monthly Profit and Loss", showlegend=False)
+
+
+def assets_vs_liabilities(df: pd.DataFrame) -> go.Figure:
+    """Stacked monthly bars of summed ``balance`` for Asset and Liability categories."""
+    monthly = df.assign(period=df["date"].dt.to_period("M").dt.to_timestamp())
+    pivot = monthly.pivot_table(index="period", values="balance", columns="category", aggfunc="sum", fill_value=0)
+    fig = go.Figure()
+    for label, pattern in (("Assets", "Asset"), ("Liabilities", "Liability")):
+        # Sum every category column whose name matches the pattern.
+        totals = pivot.filter(regex=pattern, axis=1).sum(axis=1)
+        fig.add_trace(go.Bar(name=label, x=totals.index, y=totals.values))
+    fig.update_layout(barmode="stack", title="Assets vs Liabilities")
+    return fig
+
+
+def expense_heatmap(df: pd.DataFrame) -> go.Figure:
+    """Heatmap of summed ``amount`` per account (rows) and month (columns) for expenses."""
+    is_expense = df["category"].str.contains("Expense", case=False, na=False)
+    if not is_expense.any():
+        return go.Figure()
+    expenses = df[is_expense].assign(month=lambda rows: rows["date"].dt.strftime("%Y-%m"))
+    pivot = expenses.pivot_table(index="account", columns="month", values="amount", aggfunc="sum", fill_value=0)
+    return px.imshow(pivot, aspect="auto", title="Expense Heatmap")
+
+
+def kpi_gauges(ratios: Dict[str, float]) -> go.Figure:
+    """Draw one gauge per ratio on a three-column grid (missing values show as 0)."""
+    fig = go.Figure()
+    titles = {
+        "current_ratio": "Current Ratio",
+        "quick_ratio": "Quick Ratio",
+        "debt_to_equity": "Debt to Equity",
+        "net_margin": "Net Margin",
+        "gross_margin": "Gross Margin",
+    }
+    for idx, (key, value) in enumerate(ratios.items()):
+        display_value = 0.0 if value is None or pd.isna(value) else float(value)
+        axis_range = [min(display_value * 2, 0.0), max(display_value * 2, 1.0)]  # keeps negative ratios on the axis
+        gauge = go.Indicator(
+            mode="gauge+number",
+            value=display_value,
+            domain={"row": idx // 3, "column": idx % 3},
+            title={"text": titles.get(key, key.replace("_", " ").title())},
+            gauge={"axis": {"range": axis_range}},
+        )
+        fig.add_trace(gauge)
+    rows = max(2, -(-len(ratios) // 3))  # ceil(len / 3): grow the grid beyond six gauges
+    return fig.update_layout(grid={'rows': rows, 'columns': 3, 'pattern': 'independent'}, height=600, title="Key Ratios")
+
+
+def aging_bar_chart(df: pd.DataFrame) -> go.Figure:
+    """Bar chart of ``amount`` per ``aging_bucket`` from ``analytics.aging_analysis``."""
+    buckets = analytics.aging_analysis(df)
+    return px.bar(buckets, x="aging_bucket", y="amount", title="Aging Analysis")
+
+
+def budget_vs_actual_chart(df: pd.DataFrame, budget_df: Optional[pd.DataFrame]) -> go.Figure:
+    """Grouped bars of actual vs budget amounts per period; empty figure if no comparison."""
+    comparison = analytics.budget_vs_actual(df, budget_df)
+    if comparison is None:
+        return go.Figure()
+    bars = px.bar(comparison, x="period", y=["amount_actual", "amount_budget"], barmode="group", title="Budget vs Actual")
+    return bars.update_layout(xaxis_title="Period", yaxis_title="Amount")
+
+
+__all__ = [  # every figure builder defined in this module
+    "kpi_card",
+    "revenue_vs_expense_chart",
+    "expense_bar_chart",
+    "revenue_breakdown_pie",
+    "pnl_waterfall",
+    "assets_vs_liabilities",
+    "expense_heatmap",
+    "kpi_gauges",
+    "aging_bar_chart",
+    "budget_vs_actual_chart",
+]
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8a51b99
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+pandas>=2.0
+numpy>=1.24
+plotly>=5.18
+streamlit>=1.28
+openpyxl>=3.1
+xlrd>=2.0
+kaleido>=0.2.1
+fpdf2>=2.7
+
diff --git a/sample_data/dummy_budget.csv b/sample_data/dummy_budget.csv
new file mode 100644
index 0000000..84ac61f
--- /dev/null
+++ b/sample_data/dummy_budget.csv
@@ -0,0 +1,7 @@
+Date,Account,Amount
+2024-01-01,Revenue,25000
+2024-02-01,Revenue,26000
+2024-03-01,Revenue,28000
+2024-01-01,Expense,15000
+2024-02-01,Expense,15500
+2024-03-01,Expense,16000
diff --git a/sample_data/dummy_transactions.csv b/sample_data/dummy_transactions.csv
new file mode 100644
index 0000000..90eb869
--- /dev/null
+++ b/sample_data/dummy_transactions.csv
@@ -0,0 +1,16 @@
+Date,Account Name,Account Type,Description,Debit,Credit,Balance,Department,Vendor/Customer
+2024-01-05,Sales Revenue,Revenue,Online sales,0,12000,12000,Sales,Customer A
+2024-01-06,Cash,Asset,Customer payment,12000,0,24000,Sales,Customer A
+2024-01-07,Cost of Goods Sold,Expense,Inventory purchase,4500,0,19500,Operations,Supplier Z
+2024-01-10,Accounts Receivable,Asset,Invoice 1001,5000,0,24500,Sales,Customer B
+2024-01-15,Sales Revenue,Revenue,Invoice 1001 payment,0,5000,29500,Sales,Customer B
+2024-02-01,Rent Expense,Expense,Monthly rent,3000,0,26500,Operations,Landlord LLC
+2024-02-02,Utilities Expense,Expense,Electricity bill,800,0,25700,Operations,Utility Co
+2024-02-03,Accounts Payable,Liability,Supplier invoice,0,2500,23200,Operations,Supplier Z
+2024-02-05,Inventory,Asset,Inventory purchase,2500,0,25700,Operations,Supplier Z
+2024-02-10,Sales Revenue,Revenue,In-store sales,0,15000,40700,Sales,Customer C
+2024-03-01,Payroll Expense,Expense,Monthly payroll,7000,0,33700,Operations,
+2024-03-05,Sales Revenue,Revenue,Online sales,0,18000,51700,Sales,Customer D
+2024-03-07,Accounts Receivable,Asset,Invoice 1002,8000,0,59700,Sales,Customer E
+2024-03-15,Sales Revenue,Revenue,Invoice 1002 payment,0,8000,67700,Sales,Customer E
+2024-03-20,Marketing Expense,Expense,Digital ads,2000,0,65700,Marketing,Ad Agency