
# Dependencies — Report

Interactive visuals for the **Dependencies** block using CSV files located at:
`reports/custom-queries-csv/Dependencies/`

**What’s Included**
- **Circular Dependencies** between packages (top pairs).
- **External Dependencies** overview (group → artifact).
- **Lines of Code**: **Top classes by LoC** (bar) **+ share pie** (donut) for the same Top set.
- **Modules & Artifacts**: in/out degree per artifact (scatter) and top outgoing.
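- **Package Dependencies**: grouped bars of total dependencies & distinct dependent types (top origins), plus a heatmap of the top origin → destination pairs.
- **Package Dependencies — Classes**: top class-to-class dependencies by weight (bar).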

> The notebook is resilient: when a CSV is missing it prints an info message and continues without failing.


In [1]:

# Setup: imports, robust path resolver, helpers
import os, ast, glob
from pathlib import Path
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Opt in to pandas' `future.no_silent_downcasting` behaviour for the whole notebook.
# NOTE(review): presumably this option only exists in recent pandas releases and
# set_option fails on older ones — confirm the minimum pandas version in use.
pd.set_option('future.no_silent_downcasting', True)

# ---- Path resolution ----
def resolve_reports_dir():
    """Locate the ``reports`` directory that holds the exported CSV files.

    Resolution order:

    1. The ``REPORTS_DIRECTORY`` environment variable, when it is set and
       points to an existing path.
    2. The first existing ``reports`` entry found in the current working
       directory or in one of its seven nearest ancestors.
    3. ``<cwd>/reports`` as a fallback, whether or not it exists.

    Returns:
        str: Absolute, resolved path of the selected directory.
    """
    # 1) Honor environment variable if valid
    env = os.environ.get("REPORTS_DIRECTORY")
    if env:
        p = Path(env).expanduser().resolve()
        if p.exists():
            return str(p)
    # 2) Walk upwards to find a 'reports' folder
    cwd = Path.cwd()
    # Slice the ancestor chain instead of indexing parents[i-1]: a shallow
    # working directory (fewer than seven ancestors) would otherwise raise
    # IndexError before the fallback below is ever reached.
    for base in [cwd, *cwd.parents][:8]:
        cand = base / "reports"
        if cand.exists():
            return str(cand.resolve())
    # 3) Fallback relative to CWD
    return str((cwd / "reports").resolve())

# Resolve the data locations once; the later cells build their CSV paths from DEPS_DIR.
REPORTS_DIR = resolve_reports_dir()
DEPS_DIR = os.path.join(REPORTS_DIR, "custom-queries-csv", "Dependencies")

# Echo what was resolved so a wrong working directory is visible in the cell output.
for label, value in (
    ("REPORTS_DIR =", REPORTS_DIR),
    ("DEPS_DIR    =", DEPS_DIR),
    ("DEPS_DIR exists? ->", os.path.exists(DEPS_DIR)),
):
    print(label, value)

# List every entry directly under DEPS_DIR, in sorted order.
print("Files under DEPS_DIR:")
for p in sorted(glob.glob(os.path.join(DEPS_DIR, "*"))):
    print(" -", p)

# ---- CSV IO helpers ----
# Extra cell values treated as missing when reading CSVs (on top of pandas' defaults).
NA_LITS = [
    "", " ", "NA", "N/A", "n/a", "NaN", "NULL", "Null", "null",
    "None", "none", "-", "--",
]


def read_csv_safe(p):
    """Read the CSV at path *p* without ever raising.

    Column names are converted to stripped strings and rows in which every
    value is missing are dropped.

    Returns:
        pandas.DataFrame: the parsed data, or an empty frame when the path
        does not exist (an ``[info]`` line is printed) or when reading fails
        (a ``[warn]`` line with the error is printed).
    """
    if os.path.exists(p):
        try:
            frame = pd.read_csv(p, na_values=NA_LITS, keep_default_na=True)
            frame.columns = [str(name).strip() for name in frame.columns]
            # drop completely empty rows if any
            return frame.dropna(how="all")
        except Exception as e:
            # Best effort by design: report the problem and fall through to the empty frame.
            print(f"[warn] Failed to read {p}: {e}")
    else:
        print(f"[info] Missing CSV: {p}")
    return pd.DataFrame()

def labelize_na(s, label="N/A"):
    """Return a string version of Series *s* with missing values shown as *label*.

    Missing entries are replaced by *label*, everything is converted to
    ``str``, and the literal strings ``"nan"`` / ``"NaN"`` are mapped to
    *label* as well. The input Series is not modified.
    """
    filled = s.where(s.notna(), label)
    as_text = filled.astype(str)
    nan_spellings = {token: label for token in ("nan", "NaN")}
    return as_text.replace(nan_spellings)

def find_col(df, *cands, default=None, contains=None):
    """Find a column by exact candidates or by substring (contains).

    Matching is case-insensitive. Candidates are tried in the order given;
    if none matches and *contains* is set, the first column (in column
    order) whose lower-cased label contains that substring is used.

    Args:
        df: Object exposing a ``columns`` iterable (a DataFrame).
        *cands: Candidate column names; falsy entries are skipped.
        default: Value returned when nothing matches.
        contains: Optional substring used as a fallback match.

    Returns:
        The original column label that matched, or *default*.
    """
    # Labels go through str() so non-string labels (e.g. integer column
    # labels) take part in matching instead of raising on .lower().
    low = {str(c).lower(): c for c in df.columns}
    for c in cands:
        if c and str(c).lower() in low:
            return low[str(c).lower()]
    if contains:
        needle = str(contains).lower()
        for k, orig in low.items():
            if needle in k:
                return orig
    return default

# NOTE(review): TOP_N is not referenced by any cell visible in this section —
# confirm it is still needed before relying on or removing it.
TOP_N = 40
# Row limit passed to .head() before the bar charts are drawn.
MAX_BARS = 25  # cap for long bar charts


REPORTS_DIR = /Users/jonathan.nervaez/Documents/AppModPractice/E2E-decomposition/reports
DEPS_DIR    = /Users/jonathan.nervaez/Documents/AppModPractice/E2E-decomposition/reports/custom-queries-csv/Dependencies
DEPS_DIR exists? -> True
Files under DEPS_DIR:
 - /Users/jonathan.nervaez/Documents/AppModPractice/E2E-decomposition/reports/custom-queries-csv/Dependencies/Circular_Dependencies.csv
 - /Users/jonathan.nervaez/Documents/AppModPractice/E2E-decomposition/reports/custom-queries-csv/Dependencies/External_Dependencies.csv
 - /Users/jonathan.nervaez/Documents/AppModPractice/E2E-decomposition/reports/custom-queries-csv/Dependencies/Lines_Of_Code.csv
 - /Users/jonathan.nervaez/Documents/AppModPractice/E2E-decomposition/reports/custom-queries-csv/Dependencies/Modules_And_Artifacts.csv
 - /Users/jonathan.nervaez/Documents/AppModPractice/E2E-decomposition/reports/custom-queries-csv/Dependencies/Package_Dependencies.csv
 - /Users/jonathan.nervaez/Documents/AppModPractice/E2E-decomposition/re

## 1) Circular Dependencies (package ↔ package)

In [2]:

# Load the circular-dependency pairs and preview the first rows.
path = os.path.join(DEPS_DIR, "Circular_Dependencies.csv")
df_circ = read_csv_safe(path)
display(df_circ.head(10))

if df_circ.empty:
    print("[info] Circular_Dependencies.csv missing or empty.")
else:
    # Resolve the four columns the charts need (exact name first, substring as fallback).
    c_p1 = find_col(df_circ, "package1", contains="package1", default=None)
    c_p2 = find_col(df_circ, "package2", contains="package2", default=None)
    c_fwd = find_col(df_circ, "totalDepsP1toP2", contains="p1toP2", default=None)
    c_bwd = find_col(df_circ, "totalDepsP2toP1", contains="p2toP1", default=None)

    if not all([c_p1, c_p2, c_fwd, c_bwd]):
        print("[info] Circular_Dependencies.csv lacks expected columns — skipping charts.")
    else:
        tmp = df_circ[[c_p1, c_p2, c_fwd, c_bwd]].copy()
        tmp.columns = ["package1", "package2", "fwd", "bwd"]

        # Pair weight = dependencies in both directions; unparsable counts contribute 0.
        fwd = pd.to_numeric(tmp["fwd"], errors="coerce").fillna(0)
        bwd = pd.to_numeric(tmp["bwd"], errors="coerce").fillna(0)
        tmp["total"] = fwd + bwd
        top_pairs = tmp.sort_values("total", ascending=False).head(MAX_BARS)

        # Bar chart: one bar per package pair, labelled "package1 ⇄ package2".
        pair_labels = top_pairs["package1"] + " ⇄ " + top_pairs["package2"]
        fig = px.bar(
            top_pairs,
            x=pair_labels,
            y="total",
            text="total",
            title="Top circular package pairs by total dependencies",
        )
        fig.update_traces(textposition="outside", cliponaxis=False)
        fig.update_layout(xaxis_tickangle=-35, width=1200, height=550)
        fig.show()

        # Heatmap of the same top pairs (package1 on x, package2 on y).
        hp = top_pairs.copy()
        fig2 = px.density_heatmap(
            hp,
            x="package1",
            y="package2",
            z="total",
            nbinsx=hp["package1"].nunique(dropna=False),
            title="Circular dependencies heatmap (top pairs)",
        )
        fig2.update_layout(width=900, height=700)
        fig2.show()


Unnamed: 0,artifact1,package1,artifact2,package2,totalDepsP1toP2,totalDepsP2toP1,sampleDepsP1toP2,sampleDepsP2toP1,Source Cypher File: Custom_Queries/Dependencies/Circular_Dependencies.cypher
0,,com.salesmanager.shop.model.catalog.product.at...,,com.salesmanager.shop.model.catalog.product.at...,4,10,PersistableProductAttribute → ProductAttribute...,PersistableProductOptionValueEntity → ProductO...,
1,,com.salesmanager.shop.model.catalog.product,,com.salesmanager.shop.model.catalog.product.pr...,3,4,"ReadableMinimalProduct → ProductEntity,Readabl...",PersistableProductInventory → PersistableProdu...,
2,,com.salesmanager.shop.model.order,,com.salesmanager.shop.model.order.v0,3,4,"ShopOrder → PersistableOrder,OrderEntity → Ord...","PersistableOrder → OrderEntity,ReadableOrder →...",
3,,com.salesmanager.core.business.services.order,,com.salesmanager.core.business.services.payments,2,1,"OrderServiceImpl → PaymentService,OrderService...",PaymentServiceImpl → OrderService,
4,,com.salesmanager.core.model.catalog.product,,com.salesmanager.core.model.catalog.product.at...,2,1,"Product → ProductAttribute,ProductCriteria → A...",ProductAttribute → Product,
5,,com.salesmanager.core.model.catalog.product,,com.salesmanager.core.model.catalog.product.av...,1,2,Product → ProductAvailability,"ProductAvailability → Product,ProductAvailabil...",
6,,com.salesmanager.core.model.catalog.product,,com.salesmanager.core.model.catalog.product.va...,1,2,Product → ProductVariant,"ProductVariantImageDescription → Product,Produ...",
7,,com.salesmanager.core.model.order,,com.salesmanager.core.model.order.orderstatus,2,1,"Order → OrderStatus,Order → OrderStatusHistory",OrderStatusHistory → Order,
8,,com.salesmanager.shop.model.catalog,,com.salesmanager.shop.model.catalog.product,1,2,ProductList → ReadableProduct,"ProductPriceDescription → NamedEntity,ProductD...",
9,,com.salesmanager.shop.model.catalog.product,,com.salesmanager.shop.model.catalog.product.pr...,1,2,ReadableProduct → ReadableProductVariant,"ReadableProductVariant → ReadableImage,Product...",


## 2) External Dependencies (group → artifact)

In [3]:

# Load the external dependency list and preview the first rows.
path = os.path.join(DEPS_DIR, "External_Dependencies.csv")
df_ext = read_csv_safe(path)
display(df_ext.head(10))

if df_ext.empty:
    print("[info] External_Dependencies.csv missing or empty.")
else:
    c_group = find_col(df_ext, "group", "artifact.group", contains="group", default=None)
    c_name  = find_col(df_ext, "name", "artifact.name", contains="name", default=None)
    c_ver   = find_col(df_ext, "version", "artifact.version", contains="version", default=None)

    if not (c_group and c_name):
        print("[info] External_Dependencies.csv lacks expected columns — skipping charts.")
    else:
        # Helper columns on df_ext: missing group/name values become the label "N/A".
        df_ext["group"] = labelize_na(df_ext[c_group])
        df_ext["name"]  = labelize_na(df_ext[c_name])

        # One row per (group, name) with the number of CSV rows behind it.
        pair_sizes = df_ext.groupby(["group", "name"]).size()
        treemap = pair_sizes.reset_index(name="count")
        fig = px.treemap(
            treemap,
            path=["group", "name"],
            values="count",
            title="External dependencies (group → artifact)",
        )
        fig.update_layout(width=1000, height=650)
        fig.show()

        # Per-group totals of those counts, largest first, capped at MAX_BARS.
        by_group = treemap.groupby("group")["count"].sum().reset_index(name="artifacts")
        top_groups = by_group.sort_values("artifacts", ascending=False).head(MAX_BARS)
        fig2 = px.bar(
            top_groups,
            x="group",
            y="artifacts",
            text="artifacts",
            title="Top groups by # of artifacts used",
        )
        fig2.update_traces(textposition="outside", cliponaxis=False)
        fig2.update_layout(xaxis_tickangle=-30, width=1100, height=550)
        fig2.show()


Unnamed: 0,artifact.group,artifact.name,artifact.version,Source Cypher File: Custom_Queries/Dependencies/External_Dependencies.cypher
0,${hibernate.groupId},hibernate-core,${hibernate},
1,${hibernate.groupId},hibernate-jpamodelgen,${hibernate},
2,${project.groupId},txw2,,
3,${project.groupId},spring-data-commons,${springdata.commons},
4,${project.groupId},wagon-http-shared,${project.version},
5,${project.groupId},google-cloud-core,,
6,${project.groupId},google-cloud-core-http,,
7,${project.groupId},protobuf-java,,
8,${project.groupId},spring-plugin-core,${project.version},
9,${project.groupId},byte-buddy-dep,${project.version},


## 3) Lines Of Code (per class) — Top bar + share pie

In [4]:

# display() is called right below, so import it before its first use instead
# of after it (previously the import sat near the end of this cell and the
# first call presumably relied on the notebook runtime providing the name).
from IPython.display import display

# Load the per-class lines-of-code table and preview the first rows.
path = os.path.join(DEPS_DIR, "Lines_Of_Code.csv")
df_loc = read_csv_safe(path)
display(df_loc.head(10))

if not df_loc.empty:
    c_cls = find_col(df_loc, "CompleteClassPath", contains="class", default=None)
    c_loc = find_col(df_loc, "LoC", contains="loc", default=None)

    if c_cls and c_loc:
        # Normalised integer column; values that cannot be parsed count as 0.
        df_loc["LoC"] = pd.to_numeric(df_loc[c_loc], errors="coerce").fillna(0).astype(int)

        # --- Top bar ---
        top_loc = df_loc.sort_values("LoC", ascending=False).head(MAX_BARS)
        fig = px.bar(top_loc, x=c_cls, y="LoC", text="LoC",
                     title="Top classes by Lines of Code (LoC)")
        fig.update_traces(textposition="outside", cliponaxis=False)
        fig.update_layout(xaxis_tickangle=-40, width=1200, height=550)
        fig.show()

        # --- Share pie (donut) for the same Top set ---
        # This shows LoC share contribution for the Top classes only.
        fig2 = px.pie(top_loc, names=c_cls, values="LoC", hole=0.35,
                      title=f"LoC share — Top {len(top_loc)} classes")
        fig2.update_traces(textposition="inside")
        fig2.update_layout(width=850, height=650)
        fig2.show()

        # Optional: interactive table for detailed inspection (kept minimal)
        display(top_loc[[c_cls, "LoC"]].reset_index(drop=True).head(100))
    else:
        print("[info] Lines_Of_Code.csv lacks expected columns — skipping charts.")
else:
    print("[info] Lines_Of_Code.csv missing or empty.")


Unnamed: 0,CompleteClassPath,LoC,Source Cypher File: Custom_Queries/Dependencies/Lines_Of_Code.cypher
0,net.sourceforge.htmlunit.cyberneko.HTMLEntitie...,39860,
1,org.drools.compiler.shade.org.eclipse.jdt.inte...,8381,
2,org.eclipse.jdt.internal.compiler.problem.Prob...,8354,
3,org.drools.compiler.shade.org.eclipse.jdt.inte...,7843,
4,org.eclipse.jdt.internal.compiler.parser.Parser,7830,
5,org.drools.compiler.rule.builder.dialect.java....,6989,
6,org.drools.mvel.parser.GeneratedMvelParser,5608,
7,org.h2.command.Parser,5313,
8,freemarker.core.FMParserTokenManager,4913,
9,org.drools.compiler.shade.org.eclipse.jdt.inte...,4670,


Unnamed: 0,CompleteClassPath,LoC
0,net.sourceforge.htmlunit.cyberneko.HTMLEntitie...,39860
1,org.drools.compiler.shade.org.eclipse.jdt.inte...,8381
2,org.eclipse.jdt.internal.compiler.problem.Prob...,8354
3,org.drools.compiler.shade.org.eclipse.jdt.inte...,7843
4,org.eclipse.jdt.internal.compiler.parser.Parser,7830
5,org.drools.compiler.rule.builder.dialect.java....,6989
6,org.drools.mvel.parser.GeneratedMvelParser,5608
7,org.h2.command.Parser,5313
8,freemarker.core.FMParserTokenManager,4913
9,org.drools.compiler.shade.org.eclipse.jdt.inte...,4670


## 4) Modules & Artifacts (in/out degree per artifact)

In [5]:

# Load the artifact → artifact dependency edges and preview the first rows.
path = os.path.join(DEPS_DIR, "Modules_And_Artifacts.csv")
df_mod = read_csv_safe(path)
display(df_mod.head(10))

if not df_mod.empty:
    c_a1 = find_col(df_mod, "Artifact_1_Name", contains="_1_name", default=None)
    c_a2 = find_col(df_mod, "Artifact_2_Name", contains="_2_name", default=None)

    if c_a1 and c_a2:
        # labelize_na instead of astype(str): a missing endpoint is shown as
        # "N/A" (as in the External Dependencies cell) rather than turning
        # into an artifact literally named "nan".
        a1 = labelize_na(df_mod[c_a1])
        a2 = labelize_na(df_mod[c_a2])

        # Degree per artifact = number of rows in which it appears on each side.
        out_deg = a1.value_counts().rename("outgoing").to_frame()
        in_deg  = a2.value_counts().rename("incoming").to_frame()

        # Name the index explicitly before reset_index() so the "artifact"
        # column does not depend on what name the joined index happens to carry.
        deg = (
            out_deg.join(in_deg, how="outer")
            .fillna(0)          # artifacts seen on one side only
            .astype(int)
            .rename_axis("artifact")
            .reset_index()
        )
        deg["total"] = deg["outgoing"] + deg["incoming"]

        fig = px.scatter(deg, x="outgoing", y="incoming", size="total", hover_name="artifact",
                         title="Artifact degree: outgoing vs incoming (size = total)")
        fig.update_layout(width=900, height=650)
        fig.show()

        top_out = deg.sort_values("outgoing", ascending=False).head(MAX_BARS)
        fig2 = px.bar(top_out, x="artifact", y="outgoing", text="outgoing",
                      title="Top artifacts by number of outgoing dependencies")
        fig2.update_traces(textposition="outside", cliponaxis=False)
        fig2.update_layout(xaxis_tickangle=-35, width=1100, height=550)
        fig2.show()
    else:
        print("[info] Modules_And_Artifacts.csv lacks expected columns — skipping charts.")
else:
    print("[info] Modules_And_Artifacts.csv missing or empty.")


Unnamed: 0,Artifact_1_Name,Artifact_1_Type,Artifact_1_Version,Artifact_1_Group,Artifact_2_Name,Artifact_2_Type,Artifact_2_Version,Artifact_2_Group,Source Cypher File: Custom_Queries/Dependencies/Modules_And_Artifacts.cypher


[info] Modules_And_Artifacts.csv missing or empty.


## 5) Package Dependencies (origin → destination)

In [6]:

# Load the package → package dependency table and preview the first rows.
path = os.path.join(DEPS_DIR, "Package_Dependencies.csv")
df_pkg = read_csv_safe(path)
display(df_pkg.head(10))

if df_pkg.empty:
    print("[info] Package_Dependencies.csv missing or empty.")
else:
    c_org = find_col(df_pkg, "originPackage", contains="origin", default=None)
    c_dst = find_col(df_pkg, "destinationPackage", contains="destination", default=None)
    c_types = find_col(df_pkg, "typesThatDepend", contains="types", default=None)
    c_total = find_col(df_pkg, "totalDependencies", contains="total", default=None)

    if not all([c_org, c_dst, c_types, c_total]):
        print("[info] Package_Dependencies.csv lacks expected columns — skipping charts.")
    else:
        tmp = df_pkg[[c_org, c_dst, c_types, c_total]].copy()
        tmp.columns = ["origin", "destination", "types", "total"]
        # Both count columns become integers; unparsable values count as 0.
        for col in ("types", "total"):
            tmp[col] = pd.to_numeric(tmp[col], errors="coerce").fillna(0).astype(int)

        # Per-origin sums over all destinations.
        # NOTE(review): "distinctTypes" is the SUM of the per-destination type
        # counts, so a type that depends on several destinations is presumably
        # counted more than once — confirm the label matches the intent.
        agg = (
            tmp.groupby("origin")[["total", "types"]]
            .sum()
            .rename(columns={"total": "totalDeps", "types": "distinctTypes"})
            .reset_index()
        )

        top_origins = agg.sort_values("totalDeps", ascending=False).head(MAX_BARS)

        fig = px.bar(
            top_origins,
            x="origin",
            y=["totalDeps", "distinctTypes"],
            barmode="group",
            title="Top origin packages (total deps vs. distinct dependent types)",
        )
        fig.update_layout(xaxis_tickangle=-35, width=1200, height=600)
        fig.show()

        # Heatmap restricted to the 30 heaviest origin → destination rows.
        pairs = tmp.sort_values("total", ascending=False).head(30)
        fig2 = px.density_heatmap(
            pairs,
            x="origin",
            y="destination",
            z="total",
            title="Top origin → destination package pairs by total dependencies",
        )
        fig2.update_layout(width=1000, height=700)
        fig2.show()


Unnamed: 0,originPackage,destinationPackage,typesThatDepend,totalDependencies,Source Cypher File: Custom_Queries/Dependencies/Package_Dependencies.cypher
0,com.mysql.cj.x.protobuf,com.google.protobuf,352,3272,
1,com.google.api,com.google.protobuf,308,2827,
2,org.drools.core.marshalling.impl,com.google.protobuf,239,2417,
3,org.openxmlformats.schemas.spreadsheetml.x2006...,org.apache.xmlbeans,437,1480,
4,freemarker.core,freemarker.template,400,1294,
5,org.openxmlformats.schemas.drawingml.x2006.main,org.apache.xmlbeans,381,1218,
6,org.openxmlformats.schemas.wordprocessingml.x2...,org.apache.xmlbeans,375,1134,
7,org.apache.xmlbeans.impl.xb.xsdschema,org.apache.xmlbeans,276,1073,
8,org.jbpm.marshalling.impl,com.google.protobuf,106,1049,
9,com.amazonaws.services.simpleemail,com.amazonaws.services.simpleemail.model,78,998,


## 6) Package Dependencies — Classes (top pairs by weight)

In [7]:

# Load the class → class dependency weights and preview the first rows.
path = os.path.join(DEPS_DIR, "Package_Dependencies_Classes.csv")
df_cls = read_csv_safe(path)
display(df_cls.head(10))

if df_cls.empty:
    print("[info] Package_Dependencies_Classes.csv missing or empty.")
else:
    c_c1 = find_col(df_cls, "Class_1_fqn", contains="_1_fqn", default=None)
    c_w  = find_col(df_cls, "dependencyWeight", contains="weight", default=None)
    c_c2 = find_col(df_cls, "Class_2_fqn", contains="_2_fqn", default=None)

    if not all([c_c1, c_w, c_c2]):
        print("[info] Package_Dependencies_Classes.csv lacks expected columns — skipping charts.")
    else:
        tmp = df_cls[[c_c1, c_w, c_c2]].copy()
        tmp.columns = ["class1", "weight", "class2"]
        # Unparsable weights count as 0.
        tmp["weight"] = pd.to_numeric(tmp["weight"], errors="coerce").fillna(0)

        # Heaviest pairs first, capped at MAX_BARS; bars are labelled "class1 → class2".
        top_pairs = tmp.sort_values("weight", ascending=False).head(MAX_BARS)
        pair_labels = top_pairs["class1"] + " → " + top_pairs["class2"]
        fig = px.bar(
            top_pairs,
            x=pair_labels,
            y="weight",
            text="weight",
            title="Top class-to-class dependencies by weight",
        )
        fig.update_traces(textposition="outside", cliponaxis=False)
        fig.update_layout(xaxis_tickangle=-40, width=1200, height=600)
        fig.show()


Unnamed: 0,Class_1_fqn,dependencyWeight,Class_2_fqn,Source Cypher File: Custom_Queries/Dependencies/Package_Dependencies_Classes.cypher
0,org.springframework.boot.loader.ExecutableArch...,5,org.springframework.boot.loader.ClassPathIndex...,
1,org.springframework.boot.loader.JarLauncher,2,org.springframework.boot.loader.ClassPathIndex...,
2,org.springframework.boot.loader.WarLauncher,3,org.springframework.boot.loader.ExecutableArch...,
3,org.springframework.boot.loader.JarLauncher,4,org.springframework.boot.loader.ExecutableArch...,
4,org.springframework.boot.loader.ExecutableArch...,3,org.springframework.boot.loader.Launcher,
5,org.springframework.boot.loader.PropertiesLaun...,3,org.springframework.boot.loader.Launcher,
6,org.springframework.boot.loader.PropertiesLaun...,1,org.springframework.boot.loader.JarLauncher,
7,org.springframework.boot.loader.JarLauncher,1,org.springframework.boot.loader.archive.Explod...,
8,org.springframework.boot.loader.archive.Explod...,1,org.springframework.boot.loader.archive.Explod...,
9,org.springframework.boot.loader.PropertiesLaun...,3,org.springframework.boot.loader.archive.Explod...,
