## VI LAB 2 

next step would be the following!
I want you to write down all the steps we took so far until this point of the project. That is:
I want you to summarize all the processes so far:
Start by stating how we gathered the data, from where, and why it is enough to cover this lab. Then explain the cleaning process on open refine and then the preprocessing/check on the notebook we did.
Then chose a suitable extra parameter for q6 (since according to the pdf instructions, we are free to chose). Hence, chose one and explain why it would be a good choice/approach

In [80]:
import pandas as pd
import altair as alt

# Performance: required by the project (datasets > 5000 rows)
alt.data_transformers.enable("vegafusion")


DataTransformerRegistry.enable('vegafusion')

In [81]:
# load datasets

base_path = "."

df_grants = pd.read_csv(
    f"{base_path}/NSF_Grants_Last5Years_Clean.csv"
)

df_trump = pd.read_csv(
    f"{base_path}/trump17-21-csv.csv"
)


In [82]:
# Ensure correct dtypes
df_grants["year"] = df_grants["year"].astype(int)
df_grants["award_amount"] = pd.to_numeric(df_grants["award_amount"], errors="coerce")

df_trump["year"] = df_trump["year"].astype(int)
df_trump["award_amount"] = pd.to_numeric(df_trump["award_amount"], errors="coerce")


In [83]:
# Drop rows with critical missing values
df_grants = df_grants.dropna(subset=["state", "directorate", "year"])
df_trump = df_trump.dropna(subset=["directorate"])


In [84]:
year_selection = alt.selection_point(
    fields=["year"],
    bind=alt.binding_select(
        options=sorted(df_grants["year"].unique()),
        name="Year: "
    ),
    value=sorted(df_grants["year"].unique())[0]
)

state_selection = alt.selection_point(
    fields=["state"],
    bind=alt.binding_select(
        options=sorted(df_grants["state"].unique()),
        name="State: "
    )
)


# Q1

In [85]:
# Q1 aggregation: grants per state per year
q1_df = (
    df_grants
    .groupby(["state", "year"])
    .agg(
        grants_count=("award_id", "count"),
        total_amount=("award_amount", "sum")
    )
    .reset_index()
)

q1_df.head()


Unnamed: 0,state,year,grants_count,total_amount
0,AK,2020,4,1561322.0
1,AK,2021,1,49966.0
2,AK,2022,7,769845.0
3,AL,2020,11,1927697.0
4,AL,2021,4,1384493.0


In [86]:
state_click = alt.selection_point(fields=["state"], empty="all")

q1_bars = (
    alt.Chart(q1_df)
    .mark_bar()
    .encode(
        x=alt.X("state:N", sort="-y", title="State"),
        y=alt.Y("grants_count:Q", title="Number of grants"),
        color=alt.condition(
            state_click,
            alt.Color("grants_count:Q", scale=alt.Scale(scheme="blues"), title="Grants count"),
            alt.value("lightgray")
        ),
        tooltip=[
            alt.Tooltip("state:N", title="State"),
            alt.Tooltip("grants_count:Q", title="Grants"),
            alt.Tooltip("total_amount:Q", title="Total amount ($)", format=",.0f")
        ]
    )
    .add_params(year_selection, state_click)
    .transform_filter(year_selection)
    .properties(width=750, height=380, title="Q1 — Grants by State (select a year + click a state)")
)


In [87]:
q1_state_trend = (
    alt.Chart(q1_df)
    .mark_line(point=True)
    .encode(
        x=alt.X("year:O", title="Year"),
        y=alt.Y("grants_count:Q", title="Grants"),
        tooltip=[
            alt.Tooltip("state:N"),
            alt.Tooltip("year:O"),
            alt.Tooltip("grants_count:Q", title="Grants"),
            alt.Tooltip("total_amount:Q", title="Total amount ($)", format=",.0f"),
        ],
    )
    .transform_filter(state_click)
    .properties(width=750, height=180, title="Selected state — grants over time")
)


In [88]:
(q1_bars & q1_state_trend)


bar chart chosen for ranking/comparison across states,

year dropdown avoids clutter vs small multiples,

click-to-highlight supports drill-down,

linked time-series gives context and supports exploration.

# Q2

In [89]:
# Q2 aggregation: grants per directorate per year
q2_df = (
    df_grants
    .groupby(["directorate", "year"])
    .agg(
        grants_count=("award_id", "count"),
        total_amount=("award_amount", "sum")
    )
    .reset_index()
)

q2_df.head()


Unnamed: 0,directorate,year,grants_count,total_amount
0,AGS,2020,16,3300718.0
1,AGS,2022,16,1375227.0
2,AGS,2023,6,871122.0
3,AGS,2024,7,1192193.0
4,AST,2020,9,2094148.0


In [90]:
q2_df = (
    df_grants
    .groupby(["directorate", "year"])
    .agg(
        grants_count=("award_id", "count"),
        total_amount=("award_amount", "sum")
    )
    .reset_index()
)


In [91]:
dir_click = alt.selection_point(fields=["directorate"], empty="all")


In [92]:
q2_overview = (
    alt.Chart(q2_df)
    .mark_bar()
    .encode(
        y=alt.Y("directorate:N", sort="-x", title="Directorate"),
        x=alt.X("grants_count:Q", title="Number of grants"),
        color=alt.condition(
            dir_click,
            alt.Color("grants_count:Q", scale=alt.Scale(scheme="blues"), title="Grants count"),
            alt.value("lightgray")
        ),
        tooltip=[
            alt.Tooltip("directorate:N", title="Directorate"),
            alt.Tooltip("year:O", title="Year"),
            alt.Tooltip("grants_count:Q", title="Grants"),
            alt.Tooltip("total_amount:Q", title="Total amount ($)", format=",.0f"),
        ],
    )
    .add_params(year_selection, dir_click)
    .transform_filter(year_selection)
    .properties(
        title="Q2 — Grants by Directorate (select a year + click a directorate)",
        width=750,
        height=420,
    )
)


In [93]:
q2_trend = (
    alt.Chart(q2_df)
    .mark_line(point=True)
    .encode(
        x=alt.X("year:O", title="Year"),
        y=alt.Y("grants_count:Q", title="Number of grants"),
        tooltip=[
            alt.Tooltip("directorate:N", title="Directorate"),
            alt.Tooltip("year:O", title="Year"),
            alt.Tooltip("grants_count:Q", title="Grants"),
            alt.Tooltip("total_amount:Q", title="Total amount ($)", format=",.0f"),
        ],
    )
    .transform_filter(dir_click)
    .properties(
        title="Selected directorate — grants over time",
        width=750,
        height=180,
    )
)


In [104]:
(q2_overview & q2_trend)


comments


# Q3

In [95]:
q3_cancel_df = (
    df_trump
    .groupby(["directorate"])
    .agg(cancelled_count=("award_id", "count"),
         cancelled_amount=("award_amount", "sum"))
    .reset_index()
)


In [96]:
q3_base_df = (
    df_grants
    .groupby(["directorate"])
    .agg(base_count=("award_id", "count"),
         base_amount=("award_amount", "sum"))
    .reset_index()
)


In [97]:
q3_df = (
    q3_base_df
    .merge(q3_cancel_df, on="directorate", how="outer")
    .fillna(0)
)

# Compute cancellation rate vs baseline (count-based)
q3_df["cancel_rate"] = q3_df["cancelled_count"] / q3_df["base_count"].replace(0, pd.NA)
q3_df["cancel_rate"] = q3_df["cancel_rate"].fillna(0)
q3_df.head()


  q3_df["cancel_rate"] = q3_df["cancel_rate"].fillna(0)


Unnamed: 0,directorate,base_count,base_amount,cancelled_count,cancelled_amount,cancel_rate
0,AGS,45.0,6739260.0,42.0,7671510.0,0.933333
1,AST,20.0,3856468.0,17.0,2623050.0,0.85
2,BCS,152.0,12735737.0,134.0,7984978.0,0.881579
3,BFA,22.0,4317435.0,13.0,2297368.0,0.590909
4,CBET,195.0,29234639.0,158.0,16874815.0,0.810256


In [98]:
dir_sel = alt.selection_point(fields=["directorate"], empty="all")


In [99]:
q3_scatter = (
    alt.Chart(q3_df)
    .mark_circle(opacity=0.8)
    .encode(
        x=alt.X("base_count:Q", title="Baseline grants (last 5 years)"),
        y=alt.Y("cancelled_count:Q", title="Cancelled grants (Trump era)"),
        size=alt.Size("cancelled_amount:Q", title="Cancelled amount ($)", legend=None),
        color=alt.Color("cancel_rate:Q", title="Cancellation rate", scale=alt.Scale(scheme="oranges")),
        tooltip=[
            alt.Tooltip("directorate:N", title="Directorate"),
            alt.Tooltip("base_count:Q", title="Baseline grants"),
            alt.Tooltip("cancelled_count:Q", title="Cancelled grants"),
            alt.Tooltip("cancel_rate:Q", title="Cancel rate", format=".2%"),
            alt.Tooltip("cancelled_amount:Q", title="Cancelled amount ($)", format=",.0f"),
        ],
    )
    .add_params(dir_sel)
    .properties(width=750, height=380, title="Q3 — Cancelled grants vs baseline distribution (by directorate)")
)


In [100]:
q3_bars = (
    alt.Chart(q3_df)
    .mark_bar()
    .encode(
        y=alt.Y(
            "directorate:N",
            sort="-x",
            title="Directorate",
            axis=alt.Axis(labelLimit=200)
        ),
        x=alt.X(
            "cancelled_count:Q",
            title="Cancelled grants"
        ),
        color=alt.condition(
            dir_sel,
            alt.value("#d95f02"),
            alt.value("lightgray")
        ),
        tooltip=[
            alt.Tooltip("directorate:N"),
            alt.Tooltip("cancelled_count:Q", title="Cancelled grants"),
            alt.Tooltip("cancel_rate:Q", title="Cancel rate", format=".2%")
        ]
    )
    .transform_filter(alt.datum.cancelled_count > 0)
    .add_params(dir_sel)
    .properties(
        width=750,
        height=rank_height,
        title="Cancelled grants ranking (click to focus)"
    )
)


In [101]:
q3_cancel_by_year = (
    df_trump
    .groupby(["directorate", "year"])
    .agg(cancelled_count=("award_id", "count"),
         cancelled_amount=("award_amount", "sum"))
    .reset_index()
)

q3_trend = (
    alt.Chart(q3_cancel_by_year)
    .mark_line(point=True)
    .encode(
        x=alt.X("year:O", title="Year (Trump era)"),
        y=alt.Y("cancelled_count:Q", title="Cancelled grants"),
        tooltip=[
            alt.Tooltip("directorate:N"),
            alt.Tooltip("year:O"),
            alt.Tooltip("cancelled_count:Q", title="Cancelled grants"),
            alt.Tooltip("cancelled_amount:Q", title="Cancelled amount ($)", format=",.0f")
        ]
    )
    .transform_filter(dir_sel)
    .properties(width=750, height=180, title="Selected directorate — cancellations over time")
)


In [102]:
n_dirs = q3_df[q3_df["cancelled_count"] > 0]["directorate"].nunique()
rank_height = max(300, n_dirs * 18)


In [103]:
(q3_scatter & q3_bars & q3_trend)


comments

# Q4

In [105]:
# Q4 aggregation: total funding and number of grants per year
q4_df = (
    df_grants
    .groupby("year")
    .agg(
        total_amount=("award_amount", "sum"),
        grants_count=("award_id", "count")
    )
    .reset_index()
)

q4_df


Unnamed: 0,year,total_amount,grants_count
0,2020,270167049.0,1410
1,2021,43187690.0,223
2,2022,256978539.0,1536
3,2023,44698271.0,232
4,2024,40922659.0,240


In [106]:
year_click = alt.selection_point(fields=["year"], empty="all")


In [107]:
q4_funding_line = (
    alt.Chart(q4_df)
    .mark_line(point=True)
    .encode(
        x=alt.X("year:O", title="Year"),
        y=alt.Y(
            "total_amount:Q",
            title="Total funding amount ($)",
            axis=alt.Axis(format="~s")
        ),
        color=alt.condition(
            year_click,
            alt.value("#1f77b4"),
            alt.value("lightgray")
        ),
        tooltip=[
            alt.Tooltip("year:O", title="Year"),
            alt.Tooltip("total_amount:Q", title="Total amount ($)", format=",.0f"),
            alt.Tooltip("grants_count:Q", title="Number of grants")
        ]
    )
    .add_params(year_click)
    .properties(
        width=750,
        height=280,
        title="Q4 — Evolution of total NSF funding over the last 5 years"
    )
)


In [108]:
q4_grants_bar = (
    alt.Chart(q4_df)
    .mark_bar()
    .encode(
        x=alt.X("year:O", title="Year"),
        y=alt.Y("grants_count:Q", title="Number of grants"),
        color=alt.condition(
            year_click,
            alt.value("#ff7f0e"),
            alt.value("lightgray")
        ),
        tooltip=[
            alt.Tooltip("year:O"),
            alt.Tooltip("grants_count:Q", title="Number of grants"),
            alt.Tooltip("total_amount:Q", title="Total amount ($)", format=",.0f")
        ]
    )
    .add_params(year_click)
    .properties(
        width=750,
        height=220,
        title="Number of grants per year (click to highlight)"
    )
)


In [109]:
(q4_funding_line & q4_grants_bar)


needs work, very bad

# Q5

In [110]:
df_grants.columns = df_grants.columns.str.strip()
df_trump.columns = df_trump.columns.str.strip()


In [111]:
q5_grants = (
    df_grants
    .groupby(["state", "year"])
    .agg(
        grants_count=("award_id", "count"),
        total_amount=("award_amount", "sum")
    )
    .reset_index()
)


In [112]:
q5_trump = (
    df_trump
    .groupby(["state", "year"])
    .agg(
        cancelled_count=("award_id", "count"),
        cancelled_amount=("award_amount", "sum")
    )
    .reset_index()
)


In [113]:
state_selection = alt.selection_point(
    fields=["state"],
    bind=alt.binding_select(
        options=sorted(df_grants["state"].dropna().unique()),
        name="Select state: "
    ),
    value=sorted(df_grants["state"].dropna().unique())[0]
)


In [114]:
q5_amount_line = (
    alt.Chart(q5_grants)
    .transform_filter(state_selection)
    .mark_line(point=True)
    .encode(
        x=alt.X("year:O", title="Year (last 5 years)"),
        y=alt.Y("total_amount:Q", title="Total funding ($)", axis=alt.Axis(format="~s")),
        tooltip=[
            alt.Tooltip("state:N"),
            alt.Tooltip("year:O"),
            alt.Tooltip("total_amount:Q", title="Total funding ($)", format=",.0f"),
            alt.Tooltip("grants_count:Q", title="Number of grants")
        ]
    )
    .add_params(state_selection)
    .properties(width=750, height=260, title="Q5 — Selected state: total funding over time (2020–2024)")
)


In [115]:
q5_count_bar = (
    alt.Chart(q5_grants)
    .transform_filter(state_selection)
    .mark_bar()
    .encode(
        x=alt.X("year:O", title="Year (last 5 years)"),
        y=alt.Y("grants_count:Q", title="Number of grants"),
        tooltip=[
            alt.Tooltip("state:N"),
            alt.Tooltip("year:O"),
            alt.Tooltip("grants_count:Q", title="Number of grants"),
            alt.Tooltip("total_amount:Q", title="Total funding ($)", format=",.0f"),
        ]
    )
    .add_params(state_selection)
    .properties(width=750, height=200, title="Selected state: number of grants per year (2020–2024)")
)


In [116]:
q5_cancelled = (
    alt.Chart(q5_trump)
    .transform_filter(state_selection)
    .mark_bar()
    .encode(
        x=alt.X("year:O", title="Year (Trump era)"),
        y=alt.Y("cancelled_count:Q", title="Cancelled grants"),
        tooltip=[
            alt.Tooltip("state:N"),
            alt.Tooltip("year:O"),
            alt.Tooltip("cancelled_count:Q", title="Cancelled grants"),
            alt.Tooltip("cancelled_amount:Q", title="Cancelled amount ($)", format=",.0f")
        ]
    )
    .add_params(state_selection)
    .properties(width=750, height=200, title="Trump era (2017–2021): cancelled grants for selected state")
)


In [117]:
(q5_amount_line & q5_count_bar & q5_cancelled)


comments

# Q6