# Dependencies — Report

This notebook visualizes the CSV outputs generated by the **Dependencies** block.

## What this notebook shows
- **Circular dependencies** between packages: Top pairs by total dependencies (Bar) and a heatmap of the same pairs.
- **External dependencies** overview: group → artifact (Treemap) and Top groups by number of artifacts used (Bar).
- **Lines of code**: Top classes by LoC (Bar) and share (Donut) for the same Top set.
- **Modules & artifacts**: In/Out degree per artifact (Scatter) and Top artifacts by outgoing dependencies (Bar).
- **Package dependencies**: Grouped bars of total dependencies & distinct dependent types (Top origins), plus a heatmap of the Top origin → destination pairs.
- **Package dependencies — classes**: Top class-to-class dependency pairs by weight (Bar).

> If a CSV is missing, empty, or lacks the expected columns, the cell prints an info message and skips its charts.


In [None]:
# Setup: imports, paths, helpers
# - CSVs are read from ../reports/csv-reports/<CATEGORY>/<file>.csv, resolved against the current working directory (normally this notebook's folder).
# - Minimal console output; only show information if a CSV is missing/empty.
# - Bar charts use an explicit default color so it's easy to tweak later.
# - Titles are standardized without block prefixes.

import os
from pathlib import Path
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Opt in to pandas' "no silent downcasting" behaviour where it is available.
# The option is not registered in every pandas release; an unknown key raises
# OptionError (a KeyError subclass), which must not abort the whole setup cell.
try:
    pd.set_option('future.no_silent_downcasting', True)
except KeyError:
    # Option not known to this pandas version: nothing to opt in to.
    pass

# All CSVs of this report live in <CSV_BASE>/<CATEGORY>/.
# CSV_BASE is resolved to an absolute path against the current working directory.
CATEGORY = "Dependencies"
CSV_BASE = Path("../reports/csv-reports").resolve()
DEPS_DIR = CSV_BASE / CATEGORY

# Explicit default color for all bar charts in this notebook
# (a one-element list because plotly's color_discrete_sequence expects a sequence).
DEFAULT_BAR_COLOR = ["#1f77b4"]

# CSV IO helpers
# Extra literals treated as missing values when reading CSVs
# (passed to pd.read_csv via na_values in read_csv_safe).
NA_LITS = ["", " ", "NA", "N/A", "n/a", "NaN", "NULL", "Null", "null", "None", "none", "-", "--"]

def read_csv_safe(p: Path) -> pd.DataFrame:
    """Load the CSV at ``p`` and never raise: fall back to an empty DataFrame.

    On success the column names are converted to stripped strings and rows
    that are entirely missing are dropped. The literals in ``NA_LITS`` are
    parsed as missing values in addition to pandas' defaults.

    A one-line ``[info]`` message is printed when the file does not exist and
    a ``[warn]`` message when reading it fails for any reason.
    """
    csv_path = Path(p)

    if csv_path.exists():
        try:
            frame = pd.read_csv(csv_path, na_values=NA_LITS, keep_default_na=True)
            # Normalize headers so later column lookups are not tripped by padding.
            frame.columns = [str(name).strip() for name in frame.columns]
            return frame.dropna(how="all")
        except Exception as e:
            # Best effort by design: report the problem and let the caller skip the chart.
            print(f"[warn] Failed to read {csv_path}: {e}")
    else:
        print(f"[info] Missing CSV: {csv_path}")

    return pd.DataFrame()

def labelize_na(s, label="N/A"):
    """Return a string-typed copy of ``s`` in which missing values read ``label``.

    Entries that are already the text ``"nan"`` or ``"NaN"`` are replaced by
    ``label`` as well. The input Series is left untouched.
    """
    duplicate = s.copy()
    as_text = duplicate.mask(duplicate.isna(), label).astype(str)
    return as_text.replace({"nan": label, "NaN": label})

def find_col(df, *cands, default=None, contains=None):
    """Resolve a column name of ``df`` case-insensitively.

    The candidates in ``cands`` are tried in order for an exact
    (case-insensitive) match; empty/None candidates are ignored. If none
    matches and ``contains`` is given, the first column whose lower-cased name
    includes that substring is returned. Otherwise ``default`` is returned.
    """
    by_lower = {name.lower(): name for name in df.columns}

    # Pass 1: exact candidates, in the order given by the caller.
    exact = next(
        (by_lower[c.lower()] for c in cands if c and c.lower() in by_lower),
        None,
    )
    if exact is not None:
        return exact

    # Pass 2: substring fallback, in column order.
    if contains:
        return next(
            (original for lowered, original in by_lower.items()
             if contains.lower() in lowered),
            default,
        )
    return default

# Cap for long bar charts: the chart cells keep only the first MAX_BARS rows
# (via .head(MAX_BARS)) after sorting by the charted metric.
MAX_BARS = 25  # cap for long bar charts


## 1) Circular dependencies (package ↔ package)

In [None]:
# Section 1 — Circular_Dependencies.csv
# Figures produced by this cell:
#  - 1A) Bar: top circular package pairs ranked by total dependencies (explicit bar color)
#  - 1B) Heatmap of the same top pairs

path = DEPS_DIR / "Circular_Dependencies.csv"
df_circ = read_csv_safe(path)

if df_circ.empty:
    print("[info] Circular_Dependencies.csv missing or empty.")
else:
    # Resolve the required columns: exact name first, substring as fallback.
    c_p1 = find_col(df_circ, "package1", contains="package1")
    c_p2 = find_col(df_circ, "package2", contains="package2")
    c_fwd = find_col(df_circ, "totalDepsP1toP2", contains="p1top2")
    c_bwd = find_col(df_circ, "totalDepsP2toP1", contains="p2top1")

    if not (c_p1 and c_p2 and c_fwd and c_bwd):
        print("[info] Circular_Dependencies.csv lacks expected columns — skipping charts.")
    else:
        # Work on a copy with canonical column names.
        tmp = df_circ[[c_p1, c_p2, c_fwd, c_bwd]].copy()
        tmp.columns = ["package1", "package2", "fwd", "bwd"]

        # Non-numeric counts become 0 so they cannot break the sum below.
        tmp = tmp.assign(
            fwd=lambda d: pd.to_numeric(d["fwd"], errors="coerce").fillna(0),
            bwd=lambda d: pd.to_numeric(d["bwd"], errors="coerce").fillna(0),
        )
        # Total = dependencies in both directions of the cycle.
        tmp["total"] = tmp["fwd"] + tmp["bwd"]

        ranked = tmp.sort_values("total", ascending=False)
        top_pairs = ranked.head(MAX_BARS)

        # 1A) one bar per "package1 ⇄ package2" pair
        fig = px.bar(
            top_pairs,
            x=top_pairs["package1"] + " ⇄ " + top_pairs["package2"],
            y="total",
            text="total",
            title="Top circular package pairs by total dependencies",
            color_discrete_sequence=DEFAULT_BAR_COLOR,
        )
        fig.update_traces(textposition="outside", cliponaxis=False)
        fig.update_layout(
            width=1200,
            height=550,
            xaxis_tickangle=-35,
            xaxis_title="package pair",
            yaxis_title="total dependencies",
        )
        fig.show()

        # 1B) same pairs as a package1 × package2 heatmap
        fig2 = px.density_heatmap(
            top_pairs,
            x="package1",
            y="package2",
            z="total",
            title="Circular dependencies heatmap (top pairs)",
        )
        fig2.update_layout(
            width=900,
            height=700,
            xaxis_title="package1",
            yaxis_title="package2",
        )
        fig2.show()


## 2) External dependencies (group → artifact)

In [None]:
# Section 2 — External_Dependencies.csv
# Figures produced by this cell:
#  - 2A) Treemap: external dependencies (group → artifact)
#  - 2B) Bar: top groups by number of artifacts used (explicit bar color)

path = DEPS_DIR / "External_Dependencies.csv"
df_ext = read_csv_safe(path)

if df_ext.empty:
    print("[info] External_Dependencies.csv missing or empty.")
else:
    c_group = find_col(df_ext, "group", "artifact.group", contains="group")
    c_name = find_col(df_ext, "name", "artifact.name", contains="name")

    if not (c_group and c_name):
        print("[info] External_Dependencies.csv lacks expected columns — skipping charts.")
    else:
        # Canonical "group"/"name" columns with missing values shown as "N/A".
        df_ext["group"] = labelize_na(df_ext[c_group])
        df_ext["name"] = labelize_na(df_ext[c_name])

        # One row per (group, name) with the number of CSV rows it covers.
        pair_sizes = df_ext.groupby(["group", "name"]).size()
        treemap = pair_sizes.reset_index(name="count")

        # 2A) treemap sized by row count
        fig = px.treemap(
            treemap,
            path=["group", "name"],
            values="count",
            title="External dependencies (group → artifact)",
        )
        fig.update_layout(width=1000, height=650)
        fig.show()

        # 2B) "artifacts" is the sum of the per-artifact row counts of each group.
        group_totals = treemap.groupby("group")["count"].sum()
        by_group = group_totals.reset_index(name="artifacts")
        top_groups = by_group.sort_values("artifacts", ascending=False).head(MAX_BARS)

        fig2 = px.bar(
            top_groups,
            x="group",
            y="artifacts",
            text="artifacts",
            title="Top groups by number of artifacts used",
            color_discrete_sequence=DEFAULT_BAR_COLOR,
        )
        fig2.update_traces(textposition="outside", cliponaxis=False)
        fig2.update_layout(
            width=1100,
            height=550,
            xaxis_tickangle=-30,
            xaxis_title="group",
            yaxis_title="artifacts used",
        )
        fig2.show()


## 3) Lines of code (per class)

In [None]:
# Section 3 — Lines_Of_Code.csv
# Figures produced by this cell:
#  - 3A) Bar: top classes by lines of code (explicit bar color)
#  - 3B) Donut: LoC share within that same top set

path = DEPS_DIR / "Lines_Of_Code.csv"
df_loc = read_csv_safe(path)

if df_loc.empty:
    print("[info] Lines_Of_Code.csv missing or empty.")
else:
    c_cls = find_col(df_loc, "CompleteClassPath", contains="class")
    c_loc = find_col(df_loc, "LoC", contains="loc")

    if not (c_cls and c_loc):
        print("[info] Lines_Of_Code.csv lacks expected columns — skipping charts.")
    else:
        # Canonical integer "LoC" column; unparsable values count as 0.
        loc_numeric = pd.to_numeric(df_loc[c_loc], errors="coerce")
        df_loc["LoC"] = loc_numeric.fillna(0).astype(int)

        top_loc = df_loc.sort_values("LoC", ascending=False).head(MAX_BARS)

        # 3A) bar chart of the largest classes
        fig = px.bar(
            top_loc,
            x=c_cls,
            y="LoC",
            text="LoC",
            title="Top classes by lines of code",
            color_discrete_sequence=DEFAULT_BAR_COLOR,
        )
        fig.update_traces(textposition="outside", cliponaxis=False)
        fig.update_layout(
            width=1200,
            height=550,
            xaxis_tickangle=-40,
            xaxis_title="class",
            yaxis_title="LoC",
        )
        fig.show()

        # 3B) donut over exactly the rows shown in 3A
        fig2 = px.pie(
            top_loc,
            names=c_cls,
            values="LoC",
            hole=0.35,
            title=f"LoC share — Top {len(top_loc)} classes",
        )
        fig2.update_traces(textposition="inside")
        fig2.update_layout(width=850, height=650)
        fig2.show()


## 4) Modules & artifacts (in/out degree per artifact)

In [None]:
# Charts for Modules_And_Artifacts
# Where charts are generated:
#  - 4A) Artifact degree: outgoing vs incoming (Scatter, size = total)
#  - 4B) Top artifacts by number of outgoing dependencies (Bar, explicit color)

path = DEPS_DIR / "Modules_And_Artifacts.csv"
df_mod = read_csv_safe(path)

if not df_mod.empty:
    c_a1 = find_col(df_mod, "Artifact_1_Name", contains="_1_name", default=None)
    c_a2 = find_col(df_mod, "Artifact_2_Name", contains="_2_name", default=None)

    if c_a1 and c_a2:
        # Each CSV row is counted once as "outgoing" for artifact 1 and once as
        # "incoming" for artifact 2.
        a1 = df_mod[c_a1].astype(str)
        a2 = df_mod[c_a2].astype(str)
        out_deg = a1.value_counts().rename("outgoing").to_frame()
        in_deg  = a2.value_counts().rename("incoming").to_frame()

        # Name the index explicitly before resetting it. Relying on the default
        # "index" column is fragile: value_counts() may name its index after the
        # source column (pandas >= 2.0), and whether the outer join keeps that
        # name is version-dependent, so renaming {"index": "artifact"} could
        # silently do nothing and leave the charts without an "artifact" column.
        deg = (
            out_deg.join(in_deg, how="outer")
                   .fillna(0)          # artifact seen on one side only -> degree 0 on the other
                   .astype(int)
                   .rename_axis("artifact")
                   .reset_index()
        )
        deg["total"] = deg["outgoing"] + deg["incoming"]

        fig = px.scatter(deg, x="outgoing", y="incoming", size="total", hover_name="artifact",
                         title="Artifact degree: outgoing vs incoming (size = total)")
        fig.update_layout(width=900, height=650, xaxis_title="outgoing", yaxis_title="incoming")
        fig.show()

        top_out = deg.sort_values("outgoing", ascending=False).head(MAX_BARS)
        fig2 = px.bar(top_out, x="artifact", y="outgoing", text="outgoing",
                      title="Top artifacts by number of outgoing dependencies",
                      color_discrete_sequence=DEFAULT_BAR_COLOR)
        fig2.update_traces(textposition="outside", cliponaxis=False)
        fig2.update_layout(xaxis_tickangle=-35, width=1100, height=550,
                           xaxis_title="artifact", yaxis_title="outgoing dependencies")
        fig2.show()
    else:
        print("[info] Modules_And_Artifacts.csv lacks expected columns — skipping charts.")
else:
    print("[info] Modules_And_Artifacts.csv missing or empty.")


## 5) Package dependencies (origin → destination)

In [None]:
# Charts for Package_Dependencies
# Where charts are generated:
#  - 5A) Top origin packages: total deps vs distinct dependent types (Grouped bars, one explicit color per series)
#  - 5B) Top origin → destination package pairs by total dependencies (Heatmap)

path = DEPS_DIR / "Package_Dependencies.csv"
df_pkg = read_csv_safe(path)

if not df_pkg.empty:
    c_org = find_col(df_pkg, "originPackage", contains="origin", default=None)
    c_dst = find_col(df_pkg, "destinationPackage", contains="destination", default=None)
    c_types = find_col(df_pkg, "typesThatDepend", contains="types", default=None)
    c_total = find_col(df_pkg, "totalDependencies", contains="total", default=None)

    if c_org and c_dst and c_types and c_total:
        # Work on a copy with canonical column names; unparsable counts become 0.
        tmp = df_pkg[[c_org, c_dst, c_types, c_total]].copy()
        tmp.columns = ["origin", "destination", "types", "total"]
        tmp["types"] = pd.to_numeric(tmp["types"], errors="coerce").fillna(0).astype(int)
        tmp["total"] = pd.to_numeric(tmp["total"], errors="coerce").fillna(0).astype(int)

        # Per-origin sums over all destinations.
        # NOTE(review): "distinctTypes" is the SUM of the per-row type counts, so a
        # type that depends on several destinations would be counted once per
        # destination — confirm whether a true distinct count is intended.
        agg = tmp.groupby("origin").agg(
            totalDeps=("total", "sum"),
            distinctTypes=("types", "sum")
        ).reset_index()

        top_origins = agg.sort_values("totalDeps", ascending=False).head(MAX_BARS)

        # The grouped chart draws two series (totalDeps, distinctTypes). Plotly
        # cycles through color_discrete_sequence, so the one-entry
        # DEFAULT_BAR_COLOR would paint both series identically and make bars and
        # legend indistinguishable. Keep the default for the first series and add
        # a contrasting color for the second.
        grouped_colors = [*DEFAULT_BAR_COLOR, "#ff7f0e"]

        fig = px.bar(top_origins, x="origin", y=["totalDeps", "distinctTypes"],
                     barmode="group",
                     title="Top origin packages: total deps vs distinct dependent types",
                     color_discrete_sequence=grouped_colors)
        fig.update_layout(xaxis_tickangle=-35, width=1200, height=600,
                          xaxis_title="origin package", yaxis_title="count")
        fig.show()

        # 5B) the 30 strongest origin → destination rows as a heatmap
        pairs = tmp.sort_values("total", ascending=False).head(30)
        fig2 = px.density_heatmap(pairs, x="origin", y="destination", z="total",
                                  title="Top origin → destination package pairs by total dependencies")
        fig2.update_layout(width=1000, height=700, xaxis_title="origin", yaxis_title="destination")
        fig2.show()
    else:
        print("[info] Package_Dependencies.csv lacks expected columns — skipping charts.")
else:
    print("[info] Package_Dependencies.csv missing or empty.")


## 6) Package dependencies — classes (top pairs by weight)

In [None]:
# Section 6 — Package_Dependencies_Classes.csv
# Figure produced by this cell:
#  - 6A) Bar: top class-to-class dependencies by weight (explicit bar color)

path = DEPS_DIR / "Package_Dependencies_Classes.csv"
df_cls = read_csv_safe(path)

if df_cls.empty:
    print("[info] Package_Dependencies_Classes.csv missing or empty.")
else:
    c_c1 = find_col(df_cls, "Class_1_fqn", contains="_1_fqn")
    c_w = find_col(df_cls, "dependencyWeight", contains="weight")
    c_c2 = find_col(df_cls, "Class_2_fqn", contains="_2_fqn")

    if not (c_c1 and c_w and c_c2):
        print("[info] Package_Dependencies_Classes.csv lacks expected columns — skipping charts.")
    else:
        # Work on a copy with canonical column names; unparsable weights become 0.
        tmp = df_cls[[c_c1, c_w, c_c2]].copy()
        tmp.columns = ["class1", "weight", "class2"]
        tmp["weight"] = pd.to_numeric(tmp["weight"], errors="coerce").fillna(0)

        ranked = tmp.sort_values("weight", ascending=False)
        top_pairs = ranked.head(MAX_BARS)

        # 6A) one bar per "class1 → class2" pair
        fig = px.bar(
            top_pairs,
            x=top_pairs["class1"] + " → " + top_pairs["class2"],
            y="weight",
            text="weight",
            title="Top class-to-class dependencies by weight",
            color_discrete_sequence=DEFAULT_BAR_COLOR,
        )
        fig.update_traces(textposition="outside", cliponaxis=False)
        fig.update_layout(
            width=1200,
            height=600,
            xaxis_tickangle=-40,
            xaxis_title="class pair",
            yaxis_title="weight",
        )
        fig.show()
