
# External Integration — Report

Interactive visuals for the **External_Integration** block using CSV files located at:
`reports/custom-queries-csv/External_Integration/`

**What’s Included**
- **External SDKs**: Group → Artifact treemap, Top SDK artifacts bar, and Group-level summary.
- **Hardcoded URLs**: Top hosts bar, Host → Class treemap, HTTP vs HTTPS share, and a compact table.


In [1]:

# Setup: imports, robust path resolver, helpers
import ast
import glob
import os
from pathlib import Path
from urllib.parse import urlparse

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display

# Opt in to pandas' future "no silent downcasting" behaviour when the installed
# pandas knows the option. Versions without it raise OptionError (a KeyError
# subclass); the option is not essential to this report, so the setup cell
# carries on instead of failing.
try:
    pd.set_option('future.no_silent_downcasting', True)
except KeyError:
    pass

# ---- Path resolution ----
def resolve_reports_dir(max_levels=8):
    """Locate the ``reports`` directory and return its absolute path as a string.

    Resolution order:
      1. The ``REPORTS_DIRECTORY`` environment variable, if it names an
         existing directory.
      2. The first ``reports`` directory found in the current working
         directory or its ancestors, nearest first.
      3. ``<cwd>/reports`` as a fallback, whether or not it exists.

    Parameters
    ----------
    max_levels : int, default 8
        How many directories to inspect in step 2 (the working directory
        itself plus up to ``max_levels - 1`` ancestors).

    Returns
    -------
    str
        Absolute path of the chosen directory.
    """
    # 1) Honor environment variable if valid
    env = os.environ.get("REPORTS_DIRECTORY")
    if env:
        p = Path(env).expanduser().resolve()
        if p.is_dir():
            return str(p)

    # 2) Walk upwards to find a 'reports' folder.
    # Slicing the ancestor list (rather than indexing cwd.parents[i]) keeps a
    # shallow working directory from raising IndexError before the fallback.
    cwd = Path.cwd()
    for base in [cwd, *cwd.parents][:max_levels]:
        cand = base / "reports"
        if cand.is_dir():
            return str(cand.resolve())

    # 3) Fallback relative to CWD
    return str((cwd / "reports").resolve())

# Resolve the report locations once; the later cells build their CSV paths from EXT_DIR.
REPORTS_DIR = resolve_reports_dir()
EXT_DIR = os.path.join(REPORTS_DIR, "custom-queries-csv", "External_Integration")

# Echo what was resolved so a wrong working directory is obvious in the cell output.
for label, value in (
    ("REPORTS_DIR =", REPORTS_DIR),
    ("EXT_DIR     =", EXT_DIR),
    ("EXT_DIR exists? ->", os.path.exists(EXT_DIR)),
):
    print(label, value)

# List every entry directly under EXT_DIR, in sorted order.
print("Files under EXT_DIR:")
ext_entries = glob.glob(os.path.join(EXT_DIR, "*"))
ext_entries.sort()
for entry in ext_entries:
    print(" -", entry)

# ---- CSV IO helpers ----
# Extra cell texts that read_csv_safe() treats as missing, on top of pandas' defaults.
NA_LITS = [
    "", " ", "NA", "N/A", "n/a", "NaN", "NULL", "Null", "null",
    "None", "none", "-", "--",
]

def read_csv_safe(p):
    """Load the CSV at ``p``, never raising.

    Column names are converted to ``str`` and stripped of surrounding
    whitespace, and rows whose cells are all missing are dropped.

    Returns an empty ``DataFrame`` (after printing an ``[info]`` or ``[warn]``
    line) when the file does not exist or cannot be read.
    """
    if os.path.exists(p):
        try:
            frame = pd.read_csv(p, na_values=NA_LITS, keep_default_na=True)
            frame.columns = [str(name).strip() for name in frame.columns]
            # drop completely empty rows if any
            return frame.dropna(how="all")
        except Exception as err:
            print(f"[warn] Failed to read {p}: {err}")
    else:
        print(f"[info] Missing CSV: {p}")
    return pd.DataFrame()

def labelize_na(s, label="N/A"):
    """Return a string-typed version of Series ``s`` with missing entries shown as ``label``.

    Entries whose text is exactly "nan" or "NaN" after the string conversion
    are replaced by ``label`` as well. The input Series is left unmodified.
    """
    filled = s.where(s.notna(), label)
    as_text = filled.astype(str)
    return as_text.replace({"nan": label, "NaN": label})

def find_col(df, *cands, default=None, contains=None):
    """Find a column by exact candidates or by substring (contains).

    Matching is case-insensitive. Exact candidates are tried first, in the
    order given; only if none matches is the ``contains`` substring searched
    for, in column order.

    Parameters
    ----------
    df : DataFrame
        Frame whose columns are searched.
    *cands : str
        Exact column names to try. Falsy entries are ignored.
    default : object, optional
        Value returned when nothing matches.
    contains : str, optional
        Substring used as a fallback match.

    Returns
    -------
    The original column label of the first match, or ``default``.
    """
    # str() so that non-string labels (e.g. integer headers) cannot crash the lookup.
    low = {str(c).lower(): c for c in df.columns}
    for c in cands:
        if c and c.lower() in low:
            return low[c.lower()]
    if contains:
        needle = contains.lower()
        for k, orig in low.items():
            # pandas names header-less columns "Unnamed: <n>". Such a placeholder
            # would otherwise win for needles like "name", so the substring
            # fallback skips it. An exact candidate can still select it.
            if k.startswith("unnamed: "):
                continue
            if needle in k:
                return orig
    return default

# NOTE(review): TOP_N is not referenced by any cell visible in this notebook — confirm it is still needed.
TOP_N = 40
# Maximum number of rows kept (via .head) for each bar chart in the sections below.
MAX_BARS = 25  # cap for long bar charts


REPORTS_DIR = /Users/jonathan.nervaez/Documents/AppModPractice/E2E-decomposition/reports
EXT_DIR     = /Users/jonathan.nervaez/Documents/AppModPractice/E2E-decomposition/reports/custom-queries-csv/External_Integration
EXT_DIR exists? -> True
Files under EXT_DIR:
 - /Users/jonathan.nervaez/Documents/AppModPractice/E2E-decomposition/reports/custom-queries-csv/External_Integration/External_SDKs.csv
 - /Users/jonathan.nervaez/Documents/AppModPractice/E2E-decomposition/reports/custom-queries-csv/External_Integration/Hardcoded_URLs.csv


## 1) External SDKs — usage overview

In [2]:

# Load the SDK usage export; read_csv_safe() returns an empty frame when the
# file is missing or unreadable, in which case every chart below is skipped.
path = os.path.join(EXT_DIR, "External_SDKs.csv")
df_sdks = read_csv_safe(path)
display(df_sdks.head(10))

if not df_sdks.empty:
    # Resolve the columns by name. Exact candidates take precedence over the
    # substring fallback, which is deliberately loose.
    # NOTE: the "name" fallback for c_art is broad — it also matches e.g.
    # "className" — so it is only reliable when no exact candidate is present.
    c_cls = find_col(df_sdks, "className", contains="class", default=None)
    c_grp = find_col(df_sdks, "artifactGroup", "group", contains="group", default=None)
    c_art = find_col(df_sdks, "artifactName", "name", contains="name", default=None)
    c_ver = find_col(df_sdks, "artifactVersion", "version", contains="version", default=None)

    if c_grp and c_art:
        # Label missing keys BEFORE grouping: groupby() drops rows whose key is
        # NaN by default, so labelling the grouped result (as this cell used to)
        # silently lost every row with a missing group or artifact.
        sdk_keys = pd.DataFrame({
            c_grp: labelize_na(df_sdks[c_grp]),
            c_art: labelize_na(df_sdks[c_art]),
        })

        # Count class references per (group, artifact)
        usage = (sdk_keys.groupby([c_grp, c_art]).size()
                         .reset_index(name="count"))

        # Treemap: Group → Artifact sized by count
        fig = px.treemap(usage, path=[c_grp, c_art], values="count",
                         title="External SDK usage (Group → Artifact)")
        fig.update_layout(width=1000, height=650)
        fig.show()

        # Top artifacts by usage (summed across groups that share an artifact name)
        top_art = (usage.groupby(c_art)["count"].sum()
                         .reset_index()
                         .sort_values("count", ascending=False)
                         .head(MAX_BARS))
        fig2 = px.bar(top_art, x=c_art, y="count", text="count",
                      title="Top external SDK artifacts by usage (by class reference count)")
        fig2.update_traces(textposition="outside", cliponaxis=False)
        fig2.update_layout(xaxis_tickangle=-30, width=1100, height=550)
        fig2.show()

        # Group-level summary
        by_group = usage.groupby(c_grp)["count"].sum().reset_index(name="usage")
        top_groups = by_group.sort_values("usage", ascending=False).head(MAX_BARS)
        fig3 = px.bar(top_groups, x=c_grp, y="usage", text="usage",
                      title="External SDK usage by group")
        fig3.update_traces(textposition="outside", cliponaxis=False)
        fig3.update_layout(xaxis_tickangle=-30, width=1100, height=550)
        fig3.show()
    else:
        print("[info] External_SDKs.csv lacks expected columns — skipping charts.")
else:
    print("[info] External_SDKs.csv missing or empty.")


Unnamed: 0,className,artifactGroup,artifactName,artifactVersion,Source Cypher File: Custom_Queries/External_Integration/External_SDKs.cypher


[info] External_SDKs.csv missing or empty.


## 2) Hardcoded URLs — endpoints and hosts

In [3]:

# Load the hardcoded-URL export; read_csv_safe() returns an empty frame when the
# file is missing or unreadable, in which case every chart below is skipped.
path = os.path.join(EXT_DIR, "Hardcoded_URLs.csv")
df_urls = read_csv_safe(path)
display(df_urls.head(10))

# The Host → Class treemap is limited to this many of the most frequent hosts.
MAX_TREEMAP_HOSTS = 12


def parse_host(url):
    """Split ``url`` into ``(scheme, host)``.

    The host comes from ``urlparse(...).hostname``, i.e. without userinfo or
    port, lower-cased, and with IPv6 brackets removed. Any part that cannot be
    determined (missing value, no scheme, no network location, unparsable URL)
    is reported as "N/A".
    """
    if pd.isna(url):
        return ("N/A", "N/A")
    try:
        u = urlparse(str(url).strip())
        return (u.scheme or "N/A", u.hostname or "N/A")
    except ValueError:  # urlparse rejects malformed URLs (e.g. unbalanced IPv6 brackets)
        return ("N/A", "N/A")


if not df_urls.empty:
    c_ep  = find_col(df_urls, "endpoint", contains="endpoint", default=None)
    c_cls = find_col(df_urls, "declaringClass", contains="class", default=None)
    c_fld = find_col(df_urls, "fieldName", contains="field", default=None)

    if c_ep:
        # Parse URLs. `work` keeps df_urls' index, so the two frames stay
        # row-aligned and can be combined without a join.
        work = df_urls[[c_ep]].copy()
        work.columns = ["endpoint"]
        parsed = [parse_host(v) for v in work["endpoint"]]
        work["scheme"] = [scheme for scheme, _ in parsed]
        work["host"] = [host for _, host in parsed]

        # Top hosts
        top_hosts = (work.groupby("host").size()
                          .reset_index(name="count")
                          .sort_values("count", ascending=False)
                          .head(MAX_BARS))
        fig = px.bar(top_hosts, x="host", y="count", text="count",
                     title="Top hardcoded URL hosts")
        fig.update_traces(textposition="outside", cliponaxis=False)
        fig.update_layout(xaxis_tickangle=-30, width=1100, height=550)
        fig.show()

        # Host → Class treemap (limited to top hosts for readability)
        if c_cls:
            top_host_set = set(top_hosts["host"].head(MAX_TREEMAP_HOSTS))
            in_top = work["host"].isin(top_host_set)

            # Pair each row's host with its declaring class through the shared
            # index. The previous merge on "endpoint" failed whenever the
            # endpoint column had a different name, and counted an endpoint
            # that occurs k times as k*k rows.
            sub = pd.DataFrame({
                "host": work.loc[in_top, "host"],
                c_cls: labelize_na(df_urls.loc[in_top, c_cls]),
            })
            sub = sub.groupby(["host", c_cls]).size().reset_index(name="value")

            fig2 = px.treemap(sub, path=["host", c_cls], values="value",
                              title="Hardcoded URLs — Host → Declaring Class (Top hosts)")
            fig2.update_layout(width=1100, height=700)
            fig2.show()

        # HTTP vs HTTPS share
        scheme_share = (work.groupby("scheme").size().reset_index(name="count"))
        fig3 = px.pie(scheme_share, names="scheme", values="count", hole=0.35,
                      title="Scheme share (http vs https)")
        fig3.update_traces(textposition="inside")
        fig3.update_layout(width=700, height=550)
        fig3.show()

        # Compact table
        cols = [c for c in [c_ep, c_cls, c_fld] if c]
        display(df_urls[cols].head(100))
    else:
        print("[info] Hardcoded_URLs.csv lacks expected columns — skipping charts.")
else:
    print("[info] Hardcoded_URLs.csv missing or empty.")


Unnamed: 0,endpoint,declaringClass,fieldName,Source Cypher File: Custom_Queries/External_Integration/Hardcoded_URLs.cypher


[info] Hardcoded_URLs.csv missing or empty.
