# Overview in General
Interactive overview using CSV files from `reports/overview-csv`. Uses **Plotly** for visualizations with robust column detection and fallbacks.

In [1]:
# --- Imports & global theme ---
import os
from typing import List, Optional
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio

pio.templates.default = 'plotly_white'

In [2]:
# --- Paths & CSV helpers ---
REPORTS_ROOT = os.environ.get('REPORTS_DIRECTORY', os.path.abspath(os.path.join(os.getcwd(), '..', 'reports')))
OVERVIEW_DIR = os.path.join(REPORTS_ROOT, 'overview-csv')
print('REPORTS_ROOT =', REPORTS_ROOT)
print('OVERVIEW_DIR =', OVERVIEW_DIR)

def csv_path(name: str) -> str:
    return os.path.join(OVERVIEW_DIR, name)

def read_csv_or_empty(path: str) -> pd.DataFrame:
    if os.path.exists(path):
        try:
            return pd.read_csv(path, low_memory=False)
        except Exception as e:
            print(f"[warn] Failed to read {path}: {e}")
            return pd.DataFrame()
    else:
        print(f"[warn] Missing CSV: {path}")
        return pd.DataFrame()

REPORTS_ROOT = /Users/jonathan.nervaez/Documents/AppModPractice/E2E-decomposition/reports
OVERVIEW_DIR = /Users/jonathan.nervaez/Documents/AppModPractice/E2E-decomposition/reports/overview-csv


In [3]:
# --- Column helpers & numeric coercion ---
def to_numeric_safe(s):
    if s is None:
        return pd.Series([], dtype='float64')
    try:
        return pd.to_numeric(s, errors='coerce')
    except Exception:
        return pd.to_numeric(pd.Series(s), errors='coerce')

def pick_first(df: pd.DataFrame, candidates: List[str], kind: Optional[str] = None) -> Optional[str]:
    """
    Pick the first suitable column by name or by kind ('numeric' or 'text').
    """
    if df.empty:
        return None
    if kind is None:
        for c in candidates:
            if c in df.columns:
                return c
        return None
    if kind == 'numeric':
        for c in df.columns:
            if pd.api.types.is_numeric_dtype(df[c]):
                return c
        for c in df.columns:
            if pd.to_numeric(df[c], errors='coerce').notna().any():
                return c
        return None
    if kind == 'text':
        for c in df.columns:
            if not pd.api.types.is_numeric_dtype(df[c]):
                return c
        return None
    return None

def find_name_col(df: pd.DataFrame, preferred: List[str]) -> Optional[str]:
    for c in preferred:
        if c in df.columns:
            return c
    for c in df.columns:
        if not pd.api.types.is_numeric_dtype(df[c]) and df[c].notna().sum() >= 2:
            return c
    return None

In [4]:
# --- Plot helpers ---
def bar_smart_clean(df: pd.DataFrame, name_col: str, value_col: str, title: str, ylabel: str, top_n: int = 20):
    if df.empty or name_col not in df.columns or value_col not in df.columns:
        print(f"[info] {title}: missing columns or empty data.")
        return
    sub = df[[name_col, value_col]].copy()
    sub[name_col] = sub[name_col].astype(str)
    sub[value_col] = to_numeric_safe(sub[value_col])
    sub = sub.dropna(subset=[value_col])
    sub = sub[sub[name_col].str.strip().ne('')]
    if sub.empty:
        print(f"[info] {title}: no rows to plot after cleaning.")
        return
    sub = sub.sort_values(value_col, ascending=False).head(top_n)
    fig = px.bar(sub, x=name_col, y=value_col, title=title, labels={name_col: '', value_col: ylabel})
    fig.update_layout(xaxis_tickangle=-60, height=420, margin=dict(l=20,r=20,t=60,b=40))
    fig.show()

def pie_top_donut(df: pd.DataFrame, name_col: str, value_col: str, title: str, top_n: int = 12, others_label: str = 'others'):
    if df.empty or name_col not in df.columns or value_col not in df.columns:
        print(f"[info] {title}: missing columns or empty data.")
        return
    work = df[[name_col, value_col]].copy()
    work[name_col] = work[name_col].astype(str)
    work[value_col] = to_numeric_safe(work[value_col]).fillna(0)
    work = work.sort_values(value_col, ascending=False)
    top = work.head(top_n)
    if len(work) > top_n:
        rest = work.iloc[top_n:][value_col].sum()
        top = pd.concat([top, pd.DataFrame({name_col:[others_label], value_col:[rest]})], ignore_index=True)
    fig = px.pie(top, names=name_col, values=value_col, title=title, hole=0.45)
    fig.update_traces(textposition='inside', textinfo='percent+label')
    fig.update_layout(height=520, margin=dict(l=20,r=20,t=60,b=20))
    fig.show()

In [5]:
# --- Load CSVs ---
df_node_label_count = read_csv_or_empty(csv_path('NodeLabelCount.csv'))
df_rel_type_count = read_csv_or_empty(csv_path('RelationshipTypeCount.csv'))
df_labels_and_relationships = read_csv_or_empty(csv_path('NodeLabelsAndRelationships.csv'))
df_overview_size = read_csv_or_empty(csv_path('OverviewSize.csv'))
df_pkgs = read_csv_or_empty(csv_path('NumberOfPackagesPerArtifact.csv'))
df_types = read_csv_or_empty(csv_path('NumberOfTypesPerArtifact.csv'))

print('Shapes:')
for name, df in [
    ('NodeLabelCount', df_node_label_count),
    ('RelationshipTypeCount', df_rel_type_count),
    ('NodeLabelsAndRelationships', df_labels_and_relationships),
    ('OverviewSize', df_overview_size),
    ('NumberOfPackagesPerArtifact', df_pkgs),
    ('NumberOfTypesPerArtifact', df_types),
]:
    print(f' - {name}:', df.shape)

Shapes:
 - NodeLabelCount: (74, 4)
 - RelationshipTypeCount: (74, 4)
 - NodeLabelsAndRelationships: (578, 8)
 - OverviewSize: (1, 8)
 - NumberOfPackagesPerArtifact: (301, 3)
 - NumberOfTypesPerArtifact: (301, 3)


In [6]:
# --- Totals & density (robust) ---
total_nodes = 0
total_relationships = 0

if 'nodesWithThatLabel' in df_node_label_count.columns:
    total_nodes = pd.to_numeric(df_node_label_count['nodesWithThatLabel'], errors='coerce').sum()
if 'nodesWithThatRelationshipType' in df_rel_type_count.columns:
    total_relationships = pd.to_numeric(df_rel_type_count['nodesWithThatRelationshipType'], errors='coerce').sum()

print('Total nodes:', int(total_nodes))
print('Total relationships:', int(total_relationships))
if total_nodes > 1:
    dens = total_relationships / (total_nodes * (total_nodes - 1))
    print(f'Graph density: {dens:.6f} ({dens*100:.4f}%)')
else:
    print('[info] Not enough data to compute density.')

Total nodes: 12931953
Total relationships: 13919949
Graph density: 0.000000 (0.0000%)


In [7]:
# --- Charts ---
bar_smart_clean(df_node_label_count, 'nodeLabel', 'nodesWithThatLabel', 'Top Node Labels', 'nodes')
bar_smart_clean(df_rel_type_count, 'relationshipType', 'nodesWithThatRelationshipType', 'Top Relationship Types', 'relationships')
pie_top_donut(df_pkgs, 'artifactName', 'numberOfPackages', 'Packages per Artifact (Top + others)')
pie_top_donut(df_types, 'artifactName', 'numberOfTypes', 'Types per Artifact (Top + others)')