# ARC DP: Top Chief Investigators by Field of Research

Select Field of Research codes to see the top 30 Chief Investigators by number of Discovery Projects.

- Input: `arc_discovery_projects_2010_2025_with_for.csv` (includes Field of Research columns).
- UI: Select specific FoR codes and/or broad 2-digit FoR categories to filter (when both are used, a FoR code must satisfy both); the ranking table and the selected CI's project list update accordingly.




In [1]:
# Notebook settings
# How many Chief Investigators are kept in the ranking.
TOP_K = 30
# CSV file that the notebook loads its project data from.
INPUT_CSV = "./arc_discovery_projects_2010_2025_with_for.csv"


In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

from ipywidgets import widgets
from IPython.display import display, HTML

# Read the configured CSV into the master DataFrame, failing early with an
# explicit message when the file is not where INPUT_CSV says it is.
path = Path(INPUT_CSV)
if path.exists():
    df = pd.read_csv(path)
else:
    raise FileNotFoundError(f"Input CSV not found: {path}")

# Helper for parsing semicolon-delimited cells
def split_list(col):
    """Convert a Series of semicolon-separated strings into a Series of lists.

    Missing values are replaced by an empty string and every value is coerced
    to ``str`` before splitting on ``";"``. Each piece is whitespace-stripped
    and empty pieces are dropped, so a blank or missing cell yields ``[]``.
    """

    def _pieces(text):
        trimmed = (piece.strip() for piece in text.split(";"))
        return [piece for piece in trimmed if piece]

    as_text = col.fillna("").astype(str)
    return as_text.apply(_pieces)

# Long-format frame: one row per (project, chief investigator, FoR code)
ci_names = split_list(df["chief_investigators"]).rename("ci_name")
for_codes = split_list(df["for_all_codes"]).rename("for_code")

exploded = df.assign(ci_name=ci_names, for_code=for_codes)
for list_column in ("ci_name", "for_code"):
    exploded = exploded.explode(list_column)

# Keep only rows that carry both a non-empty CI name and a non-empty FoR code
# (rows left without a value after exploding are discarded here).
for required_column in ("ci_name", "for_code"):
    column_values = exploded[required_column]
    exploded = exploded[column_values.notna() & (column_values.str.len() > 0)]

# FoR code -> display name, pairing the i-th code of a row with the i-th name
# of the same row. The first name seen for a code is the one that is kept.
for_code_to_name = {}
for raw_codes, raw_names in zip(df["for_all_codes"], df["for_all_names"]):
    if pd.isna(raw_codes) or pd.isna(raw_names):
        continue
    code_parts = (part.strip() for part in str(raw_codes).split(";"))
    name_parts = (part.strip() for part in str(raw_names).split(";"))
    # Empty pieces are removed from each side before pairing.
    for fcode, fname in zip(filter(None, code_parts), filter(None, name_parts)):
        for_code_to_name.setdefault(fcode, fname)

# 2-digit prefix -> name. The name comes from the first code encountered with
# that prefix, cut at " — " when the separator is present.
for_2digit_to_name = {}
for full_code, full_name in for_code_to_name.items():
    if len(full_code) < 2:
        continue
    for_2digit_to_name.setdefault(full_code[:2], full_name.partition(" — ")[0])

# Sorted code lists that feed the selector widgets
choices = sorted(for_code_to_name)
two_digit_choices = sorted(for_2digit_to_name)

def label_for(c):
    """Return the selector label for FoR code *c*.

    The label is ``"<code> — <name>"`` when ``for_code_to_name`` holds a
    non-empty name for the code; otherwise the bare code is returned.
    """
    name = for_code_to_name.get(c)
    if not name:
        return c
    return f"{c} — {name}"

def label_2digit_for(c):
    """Return the selector label for the 2-digit FoR prefix *c*.

    The label is ``"<prefix> — <name>"`` when ``for_2digit_to_name`` holds a
    non-empty name for the prefix; otherwise the bare prefix is returned.
    """
    name = for_2digit_to_name.get(c)
    if not name:
        return c
    return f"{c} — {name}"

# Widgets
# Each option is a (label, value) pair so the widgets show a readable label
# while their .value holds the raw code.
specific_for_options = [(label_for(c), c) for c in choices]
broad_for_options = [(label_2digit_for(c), c) for c in two_digit_choices]

# Multi-select over individual FoR codes
for_selector = widgets.SelectMultiple(
    options=specific_for_options,
    description="Specific FoR code(s)",
    layout=widgets.Layout(width="500px", height="220px"),
)

# Multi-select over 2-digit FoR prefixes
for_2digit_selector = widgets.SelectMultiple(
    options=broad_for_options,
    description="Broad FoR (2-digit)",
    layout=widgets.Layout(width="500px", height="150px"),
)

# Single-select over the ranked CIs; starts empty and is filled by update_view
ci_selector = widgets.Select(
    options=[],
    description="CI",
    layout=widgets.Layout(width="500px", height="300px"),
)

# Output areas: ranking table and per-CI project detail
out_table = widgets.Output(layout=widgets.Layout(width="500px"))
out_detail = widgets.Output(layout=widgets.Layout(width="600px"))

# State: rows matching the current FoR filter (initially everything)
current_filtered = exploded.copy()


def _html_text(value):
    """Return *value* as HTML-safe text; missing values become an empty string."""
    from html import escape

    if pd.isna(value):
        return ""
    # Whole-number floats (e.g. a year held in a float column) are shown
    # without the trailing ".0".
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return escape(str(value))


def render_ci_detail(ci_name: str):
    """Show the Discovery Projects of *ci_name* in the ``out_detail`` pane.

    Only projects present in ``current_filtered`` (the rows matching the active
    FoR filter) are listed. Each entry links to the ARC data portal page for
    the grant code and shows the commencement year, administering organisation
    and primary FoR names, newest year first.

    Args:
        ci_name: Chief Investigator name as it appears in the ``ci_name``
            column. A falsy value (nothing selected) empties the pane.
    """
    project_codes = []
    if ci_name:
        # Restrict to the selected CI within the current FoR filter
        subset = current_filtered[current_filtered["ci_name"] == ci_name]
        project_codes = subset["code"].dropna().astype(str).unique().tolist()
    if not project_codes:
        # Clear immediately: with wait=True the clear is deferred until new
        # output arrives, so the previous CI's detail would stay visible.
        out_detail.clear_output()
        return

    # Pull the display columns from the original (un-exploded) frame so each
    # project appears exactly once.
    rows = df[df["code"].astype(str).isin(project_codes)]
    rows = rows[[
        "code",
        "funding_commencement_year",
        "administering_organisation",
        "for_primary_names",
    ]]
    rows = rows.sort_values(
        ["funding_commencement_year", "code"], ascending=[False, True]
    )

    # Every value comes from the CSV, so it is escaped before being placed in
    # the markup (including the code inside the single-quoted href).
    lines = [f"<h3>{_html_text(ci_name)}</h3>", "<ol>"]
    for _, r in rows.iterrows():
        code = _html_text(r["code"])
        year = _html_text(r.get("funding_commencement_year", ""))
        org = _html_text(r.get("administering_organisation", ""))
        forp = _html_text(r.get("for_primary_names", ""))
        url = f"https://dataportal.arc.gov.au/NCGP/Web/Grant/Grant/{code}"
        lines.append(
            f"<li><a target='_blank' href='{url}'>{code}</a> — {year} — {org} "
            f"<div style='color:#666;font-size:90%'>FoR: {forp}</div></li>"
        )
    lines.append("</ol>")

    # wait=True swaps old content for new without flicker; safe here because
    # new output always follows.
    out_detail.clear_output(wait=True)
    with out_detail:
        display(HTML("\n".join(lines)))


def update_view(selected_codes, selected_2digit_codes):
    """Re-filter by FoR, re-rank the CIs and refresh the table and detail pane.

    Args:
        selected_codes: Iterable of specific FoR codes to keep; empty means
            no restriction on specific codes.
        selected_2digit_codes: Iterable of 2-digit prefixes; when non-empty,
            only FoR codes starting with one of them are kept.

    Both filters are applied in sequence, so when both are given a row must
    satisfy each of them. The filtered rows are stored in the module-level
    ``current_filtered`` for use by ``render_ci_detail``.
    """
    global current_filtered
    out_table.clear_output(wait=True)

    # Filter by FoR
    filt = exploded
    if selected_codes:
        filt = filt[filt["for_code"].isin(list(selected_codes))]
    if selected_2digit_codes:
        # Keep codes that start with any of the selected 2-digit prefixes
        prefixes = tuple(selected_2digit_codes)
        filt = filt[filt["for_code"].str.startswith(prefixes)]
    current_filtered = filt

    # Rank CIs by number of distinct projects. The name is a secondary key so
    # that ties have a deterministic order and the TOP_K cut-off is repeatable.
    ranked = (
        filt.groupby("ci_name")["code"].nunique().reset_index(name="num_projects")
            .sort_values(["num_projects", "ci_name"], ascending=[False, True])
            .head(TOP_K)
    )

    # Update CI selector options
    ci_selector.options = ranked["ci_name"].tolist()
    with out_table:
        display(ranked)

    # Auto-render first CI if present
    if ci_selector.options:
        ci_selector.value = ci_selector.options[0]
        render_ci_detail(ci_selector.value)
    else:
        # No wait=True here: nothing new is displayed afterwards, so a
        # deferred clear would leave the previous CI's detail on screen.
        out_detail.clear_output()

# Wire interactions

def on_ci_change(change):
    """Render the detail pane for the new selection on a ``value`` change.

    Change notifications for any other trait name are ignored.
    """
    changed_trait = change.get('name')
    if changed_trait != 'value':
        return
    render_ci_detail(change['new'])

ci_selector.observe(on_ci_change, names='value')

# Initial render and layout
update_view((), ())

# Section headings use widgets.HTML rather than widgets.Label: a Label shows
# its value as plain text, so the <b> tags would be displayed literally.
left_col = widgets.VBox([
    widgets.HTML("<b>Specific FoR Codes:</b>", layout=widgets.Layout(width="500px")),
    for_selector,
    widgets.HTML("<b>Broad FoR Categories (2-digit):</b>", layout=widgets.Layout(width="500px")),
    for_2digit_selector,
    widgets.HTML("<b>Select CI:</b>", layout=widgets.Layout(width="500px")),
    ci_selector,
], layout=widgets.Layout(width="520px"))

# Ranking table on top, project detail of the selected CI below
right_col = widgets.VBox([
    out_table,
    out_detail,
], layout=widgets.Layout(width="700px"))

layout = widgets.HBox([
    left_col,
    right_col,
])

display(layout)

# Either FoR selector changing triggers the same refresh, always using the
# current value of both selectors.
def on_for_change(change):
    """Refresh the view after the specific-code selection changes."""
    specific, broad = for_selector.value, for_2digit_selector.value
    update_view(specific, broad)


def on_2digit_change(change):
    """Refresh the view after the broad (2-digit) selection changes."""
    specific, broad = for_selector.value, for_2digit_selector.value
    update_view(specific, broad)


for selector_widget, handler in (
    (for_selector, on_for_change),
    (for_2digit_selector, on_2digit_change),
):
    selector_widget.observe(handler, names='value')


HBox(children=(VBox(children=(Label(value='<b>Specific FoR Codes:</b>', layout=Layout(width='500px')), SelectM…