# External Integration — Report

This notebook visualizes the CSV outputs generated by the **External_Integration** block.

## What this notebook shows
- **External SDKs**: Group → Artifact treemap, Top artifacts (Bar), and Group-level summary (Bar).
- **Hardcoded URLs**: Top hosts (Bar), Host → Class treemap, HTTP vs HTTPS share (Donut), and a compact table.

> If a CSV is missing or empty, the cell prints an info message and skips the chart.


In [None]:
# Setup: imports, paths, helpers
# - CSVs are read from ../reports/csv-reports/<CATEGORY>/<file>.csv, resolved against the current working directory (normally this notebook's folder).
# - Minimal console output; only show information if a CSV is missing/empty.
# - Bar charts use an explicit default color so it's easy to tweak later.
# - Titles are standardized without block prefixes.

import os
from pathlib import Path
from urllib.parse import urlparse
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

pd.set_option('future.no_silent_downcasting', True)

# Report category; it is also the name of the CSV sub-folder for this block.
CATEGORY = "External_Integration"
# Root of the CSV reports, made absolute against the current working directory.
CSV_BASE = Path("../reports/csv-reports").resolve()
EXT_DIR = CSV_BASE.joinpath(CATEGORY)

# Explicit default color for all bar charts in this notebook
# (a one-element sequence, so every bar trace gets the same color).
DEFAULT_BAR_COLOR = ["#1f77b4"]

# CSV IO helpers
# Extra literals passed to pandas as missing-value markers when reading CSVs.
NA_LITS = [
    "", " ",
    "NA", "N/A", "n/a", "NaN",
    "NULL", "Null", "null",
    "None", "none",
    "-", "--",
]

def read_csv_safe(p: Path) -> pd.DataFrame:
    """Load a CSV into a DataFrame, falling back to an empty frame.

    A missing file prints an ``[info]`` message and an unreadable one a
    ``[warn]`` message; in both cases an empty DataFrame is returned so the
    calling cell can skip its charts. On success, column labels are
    stringified and stripped of surrounding whitespace, and rows in which
    every value is missing are dropped.
    """
    csv_path = Path(p)
    if csv_path.exists():
        try:
            frame = pd.read_csv(csv_path, na_values=NA_LITS, keep_default_na=True)
            frame.columns = [str(label).strip() for label in frame.columns]
            return frame.dropna(how="all")
        except Exception as exc:
            # Deliberately broad: any read/parse problem degrades to "no data".
            print(f"[warn] Failed to read {csv_path}: {exc}")
            return pd.DataFrame()
    print(f"[info] Missing CSV: {csv_path}")
    return pd.DataFrame()

def labelize_na(s, label="N/A"):
    """Return a string copy of Series *s* with missing values shown as *label*.

    Missing entries are replaced by *label*, every value is converted to
    ``str``, and any remaining literal "nan"/"NaN" strings are mapped to
    *label* as well. The input Series is not modified.
    """
    is_missing = s.isna()
    as_text = s.copy().mask(is_missing, label).astype(str)
    return as_text.replace(dict.fromkeys(("nan", "NaN"), label))

def find_col(df, *cands, default=None, contains=None):
    """Find a column by exact candidates or by substring (contains).

    Matching is case-insensitive. Exact candidates are tried first, in the
    order given; only when none of them matches is the ``contains`` substring
    searched, in column order.

    Args:
        df: Object exposing ``.columns`` (a DataFrame).
        *cands: Candidate column names; falsy candidates are skipped.
        default: Value returned when nothing matches.
        contains: Optional substring to look for in the column names.

    Returns:
        The matching column label exactly as it appears in ``df.columns``,
        or ``default`` when there is no match.
    """
    # Compare on str(label) so frames with non-string labels (e.g. integer
    # column positions) do not raise; the original label is what we return.
    by_lower = {str(col).lower(): col for col in df.columns}

    for cand in cands:
        if not cand:
            continue
        key = str(cand).lower()
        if key in by_lower:
            return by_lower[key]

    if contains:
        needle = str(contains).lower()
        for key, original in by_lower.items():
            if needle in key:
                return original

    return default


## 1) External SDKs — usage overview

In [None]:
# Charts for External_SDKs
# Where charts are generated:
#  - 1A) External SDK usage (Group → Artifact) (Treemap)
#  - 1B) Top external SDK artifacts by usage (Bar, explicit color)
#  - 1C) External SDK usage by group (Bar, explicit color)
#
# Each CSV row counts as one usage. Rows with a missing group or artifact are
# kept and shown under the "N/A" label.

path = EXT_DIR / "External_SDKs.csv"
df_sdks = read_csv_safe(path)

if not df_sdks.empty:
    c_grp = find_col(df_sdks, "artifactGroup", "group", contains="group", default=None)
    c_art = find_col(df_sdks, "artifactName", "name", contains="name", default=None)

    # Both lookups may fall back to substring matching; if they resolve to the
    # same column there is no Group → Artifact hierarchy to draw.
    if c_grp and c_art and c_grp != c_art:
        # Label missing values BEFORE grouping: groupby drops NaN keys by
        # default, so labelling afterwards would silently lose those rows.
        sdk_keys = pd.DataFrame({
            c_grp: labelize_na(df_sdks[c_grp]),
            c_art: labelize_na(df_sdks[c_art]),
        })
        usage = (sdk_keys.groupby([c_grp, c_art]).size()
                         .reset_index(name="count"))

        # 1A) Treemap — Group → Artifact sized by count
        fig = px.treemap(usage, path=[c_grp, c_art], values="count",
                         title="External SDK usage (Group → Artifact)")
        fig.update_layout(width=1000, height=650)
        fig.show()

        # 1B) Top artifacts by usage (bar) — artifacts are summed across groups
        top_art = (usage.groupby(c_art)["count"].sum()
                         .reset_index()
                         .sort_values("count", ascending=False)
                         .head(25))
        fig2 = px.bar(top_art, x=c_art, y="count", text="count",
                      title="Top external SDK artifacts by usage",
                      color_discrete_sequence=DEFAULT_BAR_COLOR)
        fig2.update_traces(textposition="outside", cliponaxis=False)
        fig2.update_layout(xaxis_tickangle=-30, width=1100, height=550,
                           xaxis_title="artifact", yaxis_title="usage (class references)")
        fig2.show()

        # 1C) Group-level summary (bar)
        by_group = usage.groupby(c_grp)["count"].sum().reset_index(name="usage")
        top_groups = by_group.sort_values("usage", ascending=False).head(25)
        fig3 = px.bar(top_groups, x=c_grp, y="usage", text="usage",
                      title="External SDK usage by group",
                      color_discrete_sequence=DEFAULT_BAR_COLOR)
        fig3.update_traces(textposition="outside", cliponaxis=False)
        fig3.update_layout(xaxis_tickangle=-30, width=1100, height=550,
                           xaxis_title="group", yaxis_title="usage (class references)")
        fig3.show()
    else:
        print("[info] External_SDKs.csv lacks expected columns — skipping charts.")
else:
    print("[info] External_SDKs.csv missing or empty.")


## 2) Hardcoded URLs — endpoints and hosts

In [None]:
# Charts for Hardcoded_URLs
# Where charts are generated:
#  - 2A) Top hardcoded URL hosts (Bar, explicit color)
#  - 2B) Hardcoded URLs — Host → Declaring Class (Treemap; limited to top hosts)
#  - 2C) Scheme share (http vs https) (Donut)
#  - 2D) Compact table (first rows)
#
# Each CSV row counts once. Endpoints whose scheme or host cannot be
# determined are reported under "N/A".

path = EXT_DIR / "Hardcoded_URLs.csv"
df_urls = read_csv_safe(path)

if not df_urls.empty:
    from urllib.parse import urlparse

    c_ep  = find_col(df_urls, "endpoint", contains="endpoint", default=None)
    c_cls = find_col(df_urls, "declaringClass", contains="class", default=None)
    c_fld = find_col(df_urls, "fieldName", contains="field", default=None)

    if c_ep:
        # Parse URLs → derive scheme and host.
        # `work` keeps df_urls' index, so its rows stay aligned with df_urls.
        work = df_urls[[c_ep]].copy()
        work.columns = ["endpoint"]

        def parse_host(url):
            """Return (scheme, host) for one endpoint value.

            Parts that cannot be determined (missing value, unparseable URL,
            no scheme / no network location) are returned as "N/A".
            """
            if pd.isna(url):
                return ("N/A", "N/A")
            try:
                u = urlparse(str(url).strip())
                # .hostname drops userinfo and port, lower-cases the name and
                # unwraps IPv6 brackets; it is None when there is no netloc.
                host = u.hostname
            except ValueError:
                # urlparse rejects malformed netlocs (e.g. broken IPv6 literals).
                return ("N/A", "N/A")
            return (u.scheme or "N/A", host or "N/A")

        parsed = [parse_host(v) for v in work["endpoint"]]
        work["scheme"] = [scheme for scheme, _ in parsed]
        work["host"] = [host for _, host in parsed]

        # 2A) Top hosts (bar)
        top_hosts = (work.groupby("host").size()
                          .reset_index(name="count")
                          .sort_values("count", ascending=False)
                          .head(25))
        fig = px.bar(top_hosts, x="host", y="count", text="count",
                     title="Top hardcoded URL hosts",
                     color_discrete_sequence=DEFAULT_BAR_COLOR)
        fig.update_traces(textposition="outside", cliponaxis=False)
        fig.update_layout(xaxis_tickangle=-30, width=1100, height=550,
                          xaxis_title="host", yaxis_title="count")
        fig.show()

        # 2B) Host → Class treemap (limited to top hosts)
        if c_cls:
            top_host_set = set(top_hosts["host"].head(12))
            # Select rows by index alignment instead of merging on the endpoint
            # text: a merge multiplies rows whenever an endpoint occurs more
            # than once, and fails when the column is not literally "endpoint".
            in_top = work["host"].isin(top_host_set)
            sub = df_urls.loc[in_top].copy()
            sub[c_cls] = labelize_na(sub[c_cls])
            sub["host"] = work.loc[in_top, "host"].astype(str)
            sub["value"] = 1  # one unit of area per CSV row

            fig2 = px.treemap(sub, path=["host", c_cls], values="value",
                              title="Hardcoded URLs — Host → Declaring Class (Top hosts)")
            fig2.update_layout(width=1100, height=700)
            fig2.show()

        # 2C) HTTP vs HTTPS share (donut) — any other scheme gets its own slice
        scheme_share = (work.groupby("scheme").size().reset_index(name="count"))
        fig3 = px.pie(scheme_share, names="scheme", values="count", hole=0.35,
                      title="Scheme share (http vs https)")
        fig3.update_traces(textposition="inside")
        fig3.update_layout(width=700, height=550)
        fig3.show()

        # 2D) Compact table (first rows)
        from IPython.display import display
        cols = [c for c in [c_ep, c_cls, c_fld] if c]
        display(df_urls[cols].head(100))
    else:
        print("[info] Hardcoded_URLs.csv lacks expected columns — skipping charts.")
else:
    print("[info] Hardcoded_URLs.csv missing or empty.")
