# Package Dependencies — Graph (Prototype)

Este notebook de prueba carga `Package_Dependencies.csv` desde:

`../reports/csv-reports/Dependencies/Package_Dependencies.csv`

y genera un grafo interactivo de dependencias entre paquetes:

- **Nodo** = paquete (`originPackage` / `destinationPackage`)
- **Arista dirigida** = dependencia (`origin → destination`)
- **Grosor** de la flecha = proporcional a `totalDependencies` (escalado linealmente entre un grosor mínimo y uno máximo)
- Solo se muestran los **N paquetes más “fuertes”** (mayor suma de `totalDependencies` entrantes y salientes) para que el grafo sea legible (por defecto, N = 15).

> Requisitos:
> - `pandas`
> - `networkx`
> - `plotly`
> - CSVs generados previamente por el pipeline (`DependenciesCsv.sh`).


In [None]:
import os
from pathlib import Path

import pandas as pd
import networkx as nx
import plotly.graph_objects as go

# Opt in to pandas' `future.no_silent_downcasting` behaviour for the whole session.
pd.set_option('future.no_silent_downcasting', True)

# Paths (same conventions as in the larger notebook).
# CSV_BASE is resolved to an absolute path relative to the current working directory.
CATEGORY = "Dependencies"
CSV_BASE = Path("../reports/csv-reports").resolve()
DEPS_DIR = CSV_BASE / CATEGORY

# Extra literals treated as missing values when parsing CSVs
# (passed as `na_values` to `pd.read_csv` in `read_csv_safe`).
NA_LITS = ["", " ", "NA", "N/A", "n/a", "NaN", "NULL", "Null", "null", "None", "none", "-", "--"]

def read_csv_safe(p: Path) -> pd.DataFrame:
    """Load a CSV into a DataFrame, falling back to an empty frame on failure.

    The file is parsed with the ``NA_LITS`` literals treated as missing values
    (in addition to pandas' defaults). Column names are converted to stripped
    strings and rows that are entirely missing are dropped.

    Args:
        p: Location of the CSV file (anything accepted by ``Path``).

    Returns:
        The parsed DataFrame, or an empty DataFrame when the path does not
        exist or reading/cleaning raises. A short message is printed in both
        fallback cases.
    """
    csv_path = Path(p)

    if csv_path.exists():
        try:
            frame = pd.read_csv(csv_path, na_values=NA_LITS, keep_default_na=True)
            # Normalise header labels so later lookups are not tripped by padding.
            frame.columns = [str(label).strip() for label in frame.columns]
            return frame.dropna(how="all")
        except Exception as exc:
            # Best-effort loader: report the problem and let callers see "no data".
            print(f"[warn] Failed to read {csv_path}: {exc}")
            return pd.DataFrame()

    print(f"[info] Missing CSV: {csv_path}")
    return pd.DataFrame()

def find_col(df, *cands, default=None, contains=None):
    """Return the label of the first column of ``df`` matching the given hints.

    Matching is case-insensitive and happens in two stages:

    1. Each name in ``cands`` is tried, in order, as an exact column name.
    2. If none matches and ``contains`` is given, the first column whose name
       contains that substring is returned.

    Column labels that are not strings (e.g. integer labels) are compared
    through their string form instead of raising. When several labels differ
    only by case, the last one is the one returned for that name.

    Args:
        df: Object exposing a ``columns`` iterable (typically a DataFrame).
        *cands: Candidate column names; falsy entries (``None``, ``""``) are
            skipped.
        default: Value returned when nothing matches.
        contains: Optional substring used as a fallback search.

    Returns:
        The original (un-lowered) column label, or ``default``.
    """
    # Map lowered name -> original label; str() keeps non-string labels usable.
    by_lower = {str(col).lower(): col for col in df.columns}

    for cand in cands:
        if cand and cand.lower() in by_lower:
            return by_lower[cand.lower()]

    if contains:
        needle = contains.lower()
        for lowered, original in by_lower.items():
            if needle in lowered:
                return original

    return default

# Load Package_Dependencies.csv (an empty DataFrame when it is missing or unreadable)
path = DEPS_DIR / "Package_Dependencies.csv"
df_pkg = read_csv_safe(path)

print("Columnas disponibles en Package_Dependencies.csv:")
print(list(df_pkg.columns))

# Resolve the key columns; all three stay None when there is no data to inspect
c_org = c_dst = c_total = None
if not df_pkg.empty:
    c_org = find_col(df_pkg, "originPackage", contains="origin", default=None)
    c_dst = find_col(df_pkg, "destinationPackage", contains="destination", default=None)
    c_total = find_col(df_pkg, "totalDependencies", contains="total", default=None)

    # Report which column was picked for each role
    for _label, _column in (
        ("Columna origen:", c_org),
        ("Columna destino:", c_dst),
        ("Columna total deps:", c_total),
    ):
        print(_label, _column)


In [None]:
# Visualization parameters (adjust as needed)
N_NODES = 15  # maximum number of packages to display

if df_pkg.empty or not (c_org and c_dst and c_total):
    print("[info] No se puede construir el grafo: falta CSV o columnas esperadas.")
else:
    # Keep only the columns the graph needs
    graph_df = df_pkg[[c_org, c_dst, c_total]].copy()

    # Coerce weights to numbers: a single non-numeric cell makes pandas load the
    # whole column as text, which would break the sums and the width scaling
    # below. Unparseable or missing weights count as 0.
    graph_df[c_total] = pd.to_numeric(graph_df[c_total], errors="coerce").fillna(0)

    # Sum weights per (origin, destination) pair in case the CSV has duplicates
    graph_df = graph_df.groupby([c_org, c_dst], as_index=False)[c_total].sum()

    # Node "strength" = outgoing + incoming weight; used to rank the packages
    out_strength = graph_df.groupby(c_org)[c_total].sum()
    in_strength = graph_df.groupby(c_dst)[c_total].sum()
    node_strength = out_strength.add(in_strength, fill_value=0)

    # Pick the N_NODES strongest packages
    top_nodes = set(node_strength.sort_values(ascending=False).head(N_NODES).index)

    # Keep only edges whose two endpoints are both in the selected subset
    mask = graph_df[c_org].isin(top_nodes) & graph_df[c_dst].isin(top_nodes)
    graph_df = graph_df[mask]

    print(f"Total de nodos top seleccionados: {len(top_nodes)}")
    print(f"Total de aristas después del filtrado: {len(graph_df)}")

    # Build the directed graph; each row becomes an (origin, destination, weight) edge
    G = nx.DiGraph()
    G.add_weighted_edges_from(
        graph_df[[c_org, c_dst, c_total]].itertuples(index=False, name=None)
    )

    if len(G) == 0:
        print("[info] No hay suficientes aristas entre los top nodes para graficar.")
    else:
        # Force-directed layout so package clusters become visible; the fixed
        # seed keeps the picture reproducible between runs.
        pos = nx.spring_layout(G, k=0.6, seed=42)

        # Edge weights, used to scale the visual thickness
        weights = [w for _, _, w in G.edges(data="weight")]
        w_min, w_max = min(weights), max(weights)

        def map_width(w):
            """Map an edge weight linearly onto a thickness between 1 and 5."""
            if w_min == w_max:
                # All edges weigh the same: use a mid-range thickness
                return 3.0
            return float(1.0 + 4.0 * (w - w_min) / (w_max - w_min))

        node_size = 18  # marker diameter in pixels

        edge_x = []
        edge_y = []
        annotations = []

        for src, dst, weight in G.edges(data="weight"):
            x0, y0 = pos[src]
            x1, y1 = pos[dst]
            # None breaks the line so consecutive edges are not joined together
            edge_x += [x0, x1, None]
            edge_y += [y0, y1, None]

            width = map_width(weight)

            # Arrow indicating the direction src -> dst
            annotations.append(
                dict(
                    ax=x0,
                    ay=y0,
                    x=x1,
                    y=y1,
                    xref="x",
                    yref="y",
                    axref="x",
                    ayref="y",
                    showarrow=True,
                    arrowhead=2,
                    arrowsize=1,
                    arrowwidth=width / 2.0,
                    # Stop the arrowhead at the rim of the target marker instead
                    # of its centre, so heads do not pile up on top of the node.
                    standoff=node_size / 2 + 1,
                    opacity=0.6,
                )
            )

        # Edges (thin base lines; the actual "thickness" is carried by the arrows
        # in annotations)
        edge_trace = go.Scatter(
            x=edge_x,
            y=edge_y,
            mode="lines",
            line=dict(width=1, color="rgba(120,120,120,0.4)"),
            hoverinfo="none",
        )

        # Nodes: coordinates and labels in the graph's node order
        node_text = list(G.nodes())
        node_x = [pos[node][0] for node in node_text]
        node_y = [pos[node][1] for node in node_text]

        node_trace = go.Scatter(
            x=node_x,
            y=node_y,
            mode="markers+text",
            text=node_text,
            textposition="top center",
            hovertext=node_text,
            hoverinfo="text",
            marker=dict(
                size=node_size,
                line=dict(width=1, color="#444"),
            ),
        )

        fig_graph = go.Figure(
            data=[edge_trace, node_trace],
            layout=go.Layout(
                title=f"Package dependency graph (top {N_NODES} paquetes por dependencias)",
                showlegend=False,
                hovermode="closest",
                margin=dict(b=20, l=20, r=20, t=60),
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                annotations=annotations,
            ),
        )

        fig_graph.show()
