In [46]:
# Project 1: NSF Terminations Analysis
# Data Cleaning and Analysis Framework

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set display options for better data viewing
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 100)

print("Libraries imported successfully!")
print("Ready to start data cleaning and analysis...")


Libraries imported successfully!
Ready to start data cleaning and analysis...


# Data Preprocessing

In this section, we clean and prepare our datasets for analysis. Each step is explained below.


## Step 1: Loading the Raw Datasets
We start by loading all three datasets we need for our analysis: the main NSF terminations data, the Cruz list data, and the flagged words list.
Each dataset is loaded from CSV files and we check their dimensions to ensure they loaded correctly.


In [47]:
# Load all datasets
print("Loading datasets...")

# Load main NSF terminations data
nsf_data = pd.read_csv('data/raw-leo/nsf_terminations_airtable_copy.csv')
print(f"NSF Terminations data shape: {nsf_data.shape}")

# Load Cruz list data
cruz_data = pd.read_csv('data/raw-leo/cruz_list_copy.csv', sep=';')
print(f"Cruz list data shape: {cruz_data.shape}")

# Load flagged words data
flagged_words = pd.read_csv('data/raw-leo/flagged_words_trump_admin_copy.csv')
print(f"Flagged words data shape: {flagged_words.shape}")

print("\nDatasets loaded successfully!")


Loading datasets...
NSF Terminations data shape: (1970, 35)
Cruz list data shape: (1041, 2)
Flagged words data shape: (54, 1)

Datasets loaded successfully!


In [48]:
# Examine the structure of the main dataset
print("=== NSF Terminations Dataset Structure ===")
print(f"Shape: {nsf_data.shape}")
print(f"\nColumns: {list(nsf_data.columns)}")
print(f"\nData types:\n{nsf_data.dtypes}")

print("\n=== First few rows ===")
print(nsf_data.head(2))

print("\n=== Missing values ===")
missing_values = nsf_data.isnull().sum()
print(missing_values[missing_values > 0].sort_values(ascending=False))


=== NSF Terminations Dataset Structure ===
Shape: (1970, 35)

Columns: ['grant_id', 'status', 'terminated', 'suspended', 'termination_date', 'reinstated', 'reinstatement_date', 'reinstatement_indicator', 'nsf_url', 'usaspending_url', 'project_title', 'abstract', 'org_name', 'org_state', 'org_city', 'award_type', 'usa_start_date', 'usa_end_date', 'nsf_start_date', 'nsf_end_date', 'nsf_program_name', 'nsf_primary_program', 'usa_nsf_office', 'nsf_total_budget', 'nsf_obligated', 'usaspending_obligated', 'usaspending_outlaid', 'estimated_budget', 'estimated_outlays', 'estimated_remaining', 'division', 'directorate', 'div', 'dir', 'record_sha1']

Data types:
grant_id                     int64
status                      object
terminated                    bool
suspended                     bool
termination_date            object
reinstated                    bool
reinstatement_date          object
reinstatement_indicator     object
nsf_url                     object
usaspending_url         

## Step 2: Identifying Columns to Remove
Before cleaning, we define which columns we don't need for our analysis (like URLs, redundant date fields, and other non-essential columns).
This helps us focus on the most relevant data and reduces the dataset size.


In [49]:
# Define columns to remove as specified by user
columns_to_remove = [
    'usa_start_date',
    'usa_end_date', 
    'nsf_start_date',
    'nsf_end_date',
    'status',
    'suspended',
    'nsf_url',
    'usaspending_url',
    'org_city',
    'award_type',
    'nsf_primary_program',
    'record_sha1'
]

print("=== Columns to be removed ===")
for col in columns_to_remove:
    if col in nsf_data.columns:
        print(f"✓ {col}")
    else:
        print(f"✗ {col} (not found)")

# Check which columns will remain
remaining_columns = [col for col in nsf_data.columns if col not in columns_to_remove]
print(f"\n=== Remaining columns ({len(remaining_columns)}) ===")
for col in remaining_columns:
    print(f"• {col}")


=== Columns to be removed ===
✓ usa_start_date
✓ usa_end_date
✓ nsf_start_date
✓ nsf_end_date
✓ status
✓ suspended
✓ nsf_url
✓ usaspending_url
✓ org_city
✓ award_type
✓ nsf_primary_program
✓ record_sha1

=== Remaining columns (23) ===
• grant_id
• terminated
• termination_date
• reinstated
• reinstatement_date
• reinstatement_indicator
• project_title
• abstract
• org_name
• org_state
• nsf_program_name
• usa_nsf_office
• nsf_total_budget
• nsf_obligated
• usaspending_obligated
• usaspending_outlaid
• estimated_budget
• estimated_outlays
• estimated_remaining
• division
• directorate
• div
• dir


## Step 3: Cleaning the Dataset
We remove the unnecessary columns and convert data types to their proper formats (dates to datetime, strings to numeric, etc.).
This ensures our data is in the right format for analysis and prevents errors during calculations.


In [50]:
# Clean the main dataset
print("=== Cleaning NSF Terminations Dataset ===")

# Remove specified columns
cleaned_nsf_data = nsf_data.drop(columns=columns_to_remove, errors='ignore')
print(f"Shape after removing columns: {cleaned_nsf_data.shape}")

# Handle missing values in key columns
print("\n=== Handling Missing Values ===")

# Convert termination_date to datetime
if 'termination_date' in cleaned_nsf_data.columns:
    cleaned_nsf_data['termination_date'] = pd.to_datetime(cleaned_nsf_data['termination_date'], errors='coerce')
    print(f"Converted termination_date to datetime")

# Convert boolean columns
bool_columns = ['terminated', 'reinstated']
for col in bool_columns:
    if col in cleaned_nsf_data.columns:
        cleaned_nsf_data[col] = cleaned_nsf_data[col].astype(bool)
        print(f"Converted {col} to boolean")

# Convert numeric columns
numeric_columns = ['nsf_total_budget', 'nsf_obligated', 'usaspending_obligated', 
                  'usaspending_outlaid', 'estimated_budget', 'estimated_outlays', 'estimated_remaining']

for col in numeric_columns:
    if col in cleaned_nsf_data.columns:
        cleaned_nsf_data[col] = pd.to_numeric(cleaned_nsf_data[col], errors='coerce')
        print(f"Converted {col} to numeric")

print(f"\nFinal cleaned dataset shape: {cleaned_nsf_data.shape}")
print(f"Remaining columns: {list(cleaned_nsf_data.columns)}")


=== Cleaning NSF Terminations Dataset ===
Shape after removing columns: (1970, 23)

=== Handling Missing Values ===
Converted termination_date to datetime
Converted terminated to boolean
Converted reinstated to boolean
Converted nsf_total_budget to numeric
Converted nsf_obligated to numeric
Converted usaspending_obligated to numeric
Converted usaspending_outlaid to numeric
Converted estimated_budget to numeric
Converted estimated_outlays to numeric
Converted estimated_remaining to numeric

Final cleaned dataset shape: (1970, 23)
Remaining columns: ['grant_id', 'terminated', 'termination_date', 'reinstated', 'reinstatement_date', 'reinstatement_indicator', 'project_title', 'abstract', 'org_name', 'org_state', 'nsf_program_name', 'usa_nsf_office', 'nsf_total_budget', 'nsf_obligated', 'usaspending_obligated', 'usaspending_outlaid', 'estimated_budget', 'estimated_outlays', 'estimated_remaining', 'division', 'directorate', 'div', 'dir']


## Step 4: Counting Flagged Words in Text
We prepare the flagged words list by cleaning them (lowercase, removing commas) and then count how many flagged words appear in each grant's abstract and title.
This helps us analyze whether certain words are associated with grant terminations.


In [51]:
# Prepare flagged words for text analysis
print("=== Preparing Flagged Words ===")
flagged_words_clean = [
    str(w).strip().lower().strip(",") for w in flagged_words["flagged_word"]
]
print(f"Loaded {len(flagged_words_clean)} flagged words")

# Create a function to check for flagged words in text using word boundaries
def count_flagged_words(text, words):
    """Count occurrences of flagged words in text (word-boundary aware)."""
    if pd.isna(text):
        return 0
    
    t = str(text).lower()
    count = 0
    for w in words:
        pattern = r"\b" + re.escape(w) + r"\b"
        matches = re.findall(pattern, t)
        count += len(matches)
    return count

# Add flagged word counts to the main dataset
if 'abstract' in cleaned_nsf_data.columns:
    cleaned_nsf_data['flagged_words_count'] = cleaned_nsf_data['abstract'].apply(
        lambda x: count_flagged_words(x, flagged_words_clean)
    )
    print("Added flagged_words_count column")

if 'project_title' in cleaned_nsf_data.columns:
    cleaned_nsf_data['title_flagged_words_count'] = cleaned_nsf_data['project_title'].apply(
        lambda x: count_flagged_words(x, flagged_words_clean)
    )
    print("Added title_flagged_words_count column")

print(f"\nCleaned dataset with flagged word analysis shape: {cleaned_nsf_data.shape}")


=== Preparing Flagged Words ===
Loaded 54 flagged words
Added flagged_words_count column
Added title_flagged_words_count column

Cleaned dataset with flagged word analysis shape: (1970, 25)


## Step 5: Merging with Cruz List Data
We merge our cleaned NSF data with the Cruz list to identify which grants were on Senator Cruz's list.
This allows us to compare termination and reinstatement rates between grants on the Cruz list and those not on it.


In [52]:
# Merge with Cruz list data
print("=== Merging with Cruz List Data ===")

# Rename grant_id to match cruz_data column
if 'grant_id' in cleaned_nsf_data.columns and 'grant_number' in cruz_data.columns:
    cruz_data_renamed = cruz_data.rename(columns={'grant_number': 'grant_id'})
    cleaned_nsf_data = cleaned_nsf_data.merge(
        cruz_data_renamed[['grant_id', 'in_cruz_list']], 
        on='grant_id', 
        how='left'
    )
    
    # Normalize in_cruz_list: fill NaNs with False and cast to bool
    if 'in_cruz_list' in cleaned_nsf_data.columns:
        cleaned_nsf_data['in_cruz_list'] = (
            cleaned_nsf_data['in_cruz_list']
            .fillna(False)
            .astype(bool)
        )
    
    # Add convenient label columns for visuals
    cleaned_nsf_data['cruz_label'] = np.where(
        cleaned_nsf_data['in_cruz_list'], "In Cruz list", "Not in Cruz list"
    )
    cleaned_nsf_data['reinstated_label'] = np.where(
        cleaned_nsf_data['reinstated'], "Reinstated", "Not reinstated"
    )

    print("Successfully merged with Cruz list data")
    print(f"Grants in Cruz list: {cleaned_nsf_data['in_cruz_list'].sum()}")
else:
    print("Could not merge with Cruz list - column names don't match")

print(f"\nFinal merged dataset shape: {cleaned_nsf_data.shape}")
print(f"Final columns: {list(cleaned_nsf_data.columns)}")


=== Merging with Cruz List Data ===
Successfully merged with Cruz list data
Grants in Cruz list: 467

Final merged dataset shape: (1970, 28)
Final columns: ['grant_id', 'terminated', 'termination_date', 'reinstated', 'reinstatement_date', 'reinstatement_indicator', 'project_title', 'abstract', 'org_name', 'org_state', 'nsf_program_name', 'usa_nsf_office', 'nsf_total_budget', 'nsf_obligated', 'usaspending_obligated', 'usaspending_outlaid', 'estimated_budget', 'estimated_outlays', 'estimated_remaining', 'division', 'directorate', 'div', 'dir', 'flagged_words_count', 'title_flagged_words_count', 'in_cruz_list', 'cruz_label', 'reinstated_label']


In [53]:
# Data Analysis Helper Functions
print("=== Setting up Analysis Functions ===")

def analyze_by_category(df, category_col, value_col=None, top_n=10):
    """Analyze data by a categorical column"""
    if value_col:
        result = df.groupby(category_col)[value_col].agg(['count', 'mean', 'sum']).round(2)
    else:
        result = df[category_col].value_counts().head(top_n)
    return result

def analyze_budget_distribution(df):
    """Analyze budget distribution across different dimensions"""
    budget_cols = ['nsf_total_budget', 'nsf_obligated', 'estimated_budget']
    available_cols = [col for col in budget_cols if col in df.columns]
    
    if available_cols:
        return df[available_cols].describe()
    return None

def analyze_flagged_words_distribution(df):
    """Analyze distribution of flagged words"""
    if 'flagged_words_count' in df.columns:
        return df['flagged_words_count'].describe()
    return None

def get_summary_stats(df):
    """Get overall summary statistics"""
    stats = {
        'total_grants': len(df),
        'terminated_grants': df['terminated'].sum() if 'terminated' in df.columns else None,
        'reinstated_grants': df['reinstated'].sum() if 'reinstated' in df.columns else None,
        'cruz_list_grants': df['in_cruz_list'].sum() if 'in_cruz_list' in df.columns else None,
        'unique_institutions': df['org_name'].nunique() if 'org_name' in df.columns else None,
        'unique_states': df['org_state'].nunique() if 'org_state' in df.columns else None,
    }
    return stats

print("Analysis functions defined successfully!")


=== Setting up Analysis Functions ===
Analysis functions defined successfully!


In [54]:
# Display cleaned data summary
print("=== CLEANED DATA SUMMARY ===")
print(f"Dataset shape: {cleaned_nsf_data.shape}")
print(f"Columns: {list(cleaned_nsf_data.columns)}")

print("\n=== OVERALL STATISTICS ===")
summary_stats = get_summary_stats(cleaned_nsf_data)
for key, value in summary_stats.items():
    print(f"{key}: {value}")

print("\n=== MISSING VALUES IN CLEANED DATA ===")
missing_cleaned = cleaned_nsf_data.isnull().sum()
missing_cleaned = missing_cleaned[missing_cleaned > 0].sort_values(ascending=False)
if len(missing_cleaned) > 0:
    print(missing_cleaned)
else:
    print("No missing values in cleaned data!")

print("\n=== DTYPES CHECK ===")
print(cleaned_nsf_data.dtypes)

print("\n=== SAMPLE OF CLEANED DATA ===")
print(cleaned_nsf_data.head(3))


=== CLEANED DATA SUMMARY ===
Dataset shape: (1970, 28)
Columns: ['grant_id', 'terminated', 'termination_date', 'reinstated', 'reinstatement_date', 'reinstatement_indicator', 'project_title', 'abstract', 'org_name', 'org_state', 'nsf_program_name', 'usa_nsf_office', 'nsf_total_budget', 'nsf_obligated', 'usaspending_obligated', 'usaspending_outlaid', 'estimated_budget', 'estimated_outlays', 'estimated_remaining', 'division', 'directorate', 'div', 'dir', 'flagged_words_count', 'title_flagged_words_count', 'in_cruz_list', 'cruz_label', 'reinstated_label']

=== OVERALL STATISTICS ===
total_grants: 1970
terminated_grants: 1970
reinstated_grants: 420
cruz_list_grants: 467
unique_institutions: 507
unique_states: 52

=== MISSING VALUES IN CLEANED DATA ===
reinstatement_indicator    1554
reinstatement_date         1550
usa_nsf_office              711
usaspending_obligated       711
usaspending_outlaid         711
division                    711
directorate                 711
div                

# Data Analysis Framework Ready!

## What we've accomplished:

1. **Data Loading**: Loaded all three datasets (NSF terminations, Cruz list, flagged words)
2. **Data Cleaning**: Removed specified columns and handled data types
3. **Text Analysis**: Added flagged word counting functionality
4. **Data Merging**: Combined NSF data with Cruz list information
5. **Analysis Functions**: Created helper functions for common analysis tasks

## Available datasets:
- `cleaned_nsf_data`: Main cleaned dataset ready for analysis
- `cruz_data`: Cruz list information
- `flagged_words`: List of flagged words for text analysis

## Key columns in cleaned data:
- `grant_id`: Unique grant identifier
- `terminated`, `reinstated`: Boolean flags
- `termination_date`: Date of termination
- `project_title`, `abstract`: Text fields for analysis
- `org_name`, `org_state`: Organization information
- `flagged_words_count`: Count of flagged words in abstract
- `in_cruz_list`: Whether grant is in Cruz list
- Various budget columns for financial analysis

## Ready for analysis questions!


In [55]:
# Import Altair for interactive visualizations
import altair as alt

# Enable Altair to work with Jupyter notebooks
alt.data_transformers.enable('json')

print("Altair imported successfully!")
print("Ready to create interactive visualizations...")


Altair imported successfully!
Ready to create interactive visualizations...


In [79]:
import altair as alt

# palete 
BLUE = "#3B82F6"  
EMERALD = "#10B981" 
BLUE_LIGHT = "#6fa8fa"  # Lighter blue (Reinstated)
BLUE_DARK = "#1e40af"
GRAY = "#a8a8a8"     

# ---- Altair theme ----
def vi_theme():
    return {
        "config": {
            "background": "white",
            "view": {"strokeWidth": 0},
            "title": {
                "fontSize": 14,
                "fontWeight": "bold",
                "anchor": "start",
                "color": "#0F172A"
            },
            "axis": {
                "labelFontSize": 11,
                "titleFontSize": 12,
                "gridColor": VI_GRID,
                "domainColor": VI_GRID,
                "labelColor": "#111827",
                "titleColor": "#111827",
                "tickColor": VI_GRID
            },
            "legend": {
                "labelFontSize": 11,
                "titleFontSize": 12,
                "symbolType": "circle",
                "symbolSize": 80,
                "titleColor": "#111827",
                "labelColor": "#111827"
            },
            "range": {
                "category": [VI_BLUE_DARK, VI_BLUE_LIGHT, "#94a3b8", "#64748b"]
            }
        }
    }
    
alt.themes.register("vi_theme", vi_theme)
alt.themes.enable("vi_theme")

# ---- Reusable styling helpers ----
def vi_style(chart, title=None, width=360, height=260):
    props = {}
    if title is not None:
        props["title"] = title
    return chart.properties(width=width, height=height, **props)

def vi_pct_axis(title):
    return alt.Axis(title=title, format="%")

def vi_dollar_axis(title):
    # compact, $ with thousands separators
    return alt.Axis(title=title, format="$,.0f")

def vi_count_axis(title):
    return alt.Axis(title=title)


# Q1: How are the cancellations distributed by states?


**Visualization 1: Horizontal Bar Chart**

We start with a simple horizontal bar chart to show the number of cancelled grants per state, ranking states from highest to lowest.
This visualization helps us quickly identify which states were most affected by grant cancellations (e.g., California with 466 cancellations).


In [57]:
# Q1: Cancellations distribution by states
print("=== Q1: Cancellations Distribution by States ===")

# Filter only terminated grants
terminated_grants = cleaned_nsf_data[cleaned_nsf_data['terminated'] == True]

# Count cancellations by state
state_cancellations = terminated_grants['org_state'].value_counts().reset_index()
state_cancellations.columns = ['state', 'cancelled_grants']

print(f"Total cancelled grants: {len(terminated_grants)}")
print(f"States with cancelled grants: {len(state_cancellations)}")
print("\nTop 10 states by number of cancelled grants:")
print(state_cancellations.head(10))

# Create visualization
chart_q1 = alt.Chart(state_cancellations.head(15)).mark_bar().add_selection(
    alt.selection_interval()
).encode(
    x=alt.X('cancelled_grants:Q', title='Number of Cancelled Grants'),
    y=alt.Y('state:N', sort='-x', title='State'),
    color=alt.Color('cancelled_grants:Q', scale=alt.Scale(scheme='blues'), legend=alt.Legend(title='Cancelled Grants')),
    tooltip=['state', 'cancelled_grants']
).properties(
    width=600,
    height=400,
    title='Distribution of NSF Grant Cancellations by State (Top 15)'
).interactive()

chart_q1


=== Q1: Cancellations Distribution by States ===
Total cancelled grants: 1970
States with cancelled grants: 52

Top 10 states by number of cancelled grants:
  state  cancelled_grants
0    CA               466
1    MA               256
2    TX               122
3    NY               102
4    FL                59
5    IL                58
6    PA                55
7    VA                55
8    NC                55
9    GA                54


**Visualization 2: Choropleth Map**

We improve the visualization by creating a choropleth map to show the geographical distribution of cancellations across the United States.
This allows us to see regional patterns and understand the spatial impact, not just the ranking of states.


In [74]:
import altair as alt
from vega_datasets import data

# Load the US topojson
states = alt.topo_feature(data.us_10m.url, "states")

# FIPS code mapping between state abbreviations and numeric IDs used in the topojson
state_fips = {
    "AL": 1,
    "AK": 2,
    "AZ": 4,
    "AR": 5,
    "CA": 6,
    "CO": 8,
    "CT": 9,
    "DE": 10,
    "DC": 11,
    "FL": 12,
    "GA": 13,
    "HI": 15,
    "ID": 16,
    "IL": 17,
    "IN": 18,
    "IA": 19,
    "KS": 20,
    "KY": 21,
    "LA": 22,
    "ME": 23,
    "MD": 24,
    "MA": 25,
    "MI": 26,
    "MN": 27,
    "MS": 28,
    "MO": 29,
    "MT": 30,
    "NE": 31,
    "NV": 32,
    "NH": 33,
    "NJ": 34,
    "NM": 35,
    "NY": 36,
    "NC": 37,
    "ND": 38,
    "OH": 39,
    "OK": 40,
    "OR": 41,
    "PA": 42,
    "RI": 44,
    "SC": 45,
    "SD": 46,
    "TN": 47,
    "TX": 48,
    "UT": 49,
    "VT": 50,
    "VA": 51,
    "WA": 53,
    "WV": 54,
    "WI": 55,
    "WY": 56,
}

# Create a copy and add FIPS code column
state_cancellations_map = state_cancellations.copy()
state_cancellations_map["id"] = state_cancellations_map["state"].map(state_fips)

# Create the choropleth
chart_q1_map = (
    alt.Chart(states)
    .mark_geoshape(stroke="white")
    .encode(
        color=alt.Color(
            "cancelled_grants:Q",
            title="Cancelled Grants",
            scale=alt.Scale(
                scheme="blues",
                domain=[0, 150] 
            ),
        ),
        tooltip=[
            alt.Tooltip("state:N", title="State"),
            alt.Tooltip("cancelled_grants:Q", title="Cancelled Grants"),
        ],
    )
    .transform_lookup(
        lookup="id",
        from_=alt.LookupData(
            state_cancellations_map, "id", ["state", "cancelled_grants"]
        ),
    )
    .project(type="albersUsa")
    .properties(
        width=800,
        height=500,
        title="NSF Grant Cancellations by U.S. State (Capped Scale)",
    )
)
chart_q1_map

**Visualization 3: Combined Map and Bar Chart**

We combine the choropleth map with a bar chart showing the top 10 states side-by-side.
This provides both geographical context and precise numerical rankings in a single view.


In [92]:
import altair as alt
from vega_datasets import data

# Load US topojson
states = alt.topo_feature(data.us_10m.url, "states")

# FIPS mapping
state_fips = {
    "AL": 1, "AK": 2, "AZ": 4, "AR": 5, "CA": 6, "CO": 8, "CT": 9, "DE": 10, "DC": 11,
    "FL": 12, "GA": 13, "HI": 15, "ID": 16, "IL": 17, "IN": 18, "IA": 19, "KS": 20,
    "KY": 21, "LA": 22, "ME": 23, "MD": 24, "MA": 25, "MI": 26, "MN": 27, "MS": 28,
    "MO": 29, "MT": 30, "NE": 31, "NV": 32, "NH": 33, "NJ": 34, "NM": 35, "NY": 36,
    "NC": 37, "ND": 38, "OH": 39, "OK": 40, "OR": 41, "PA": 42, "RI": 44, "SC": 45,
    "SD": 46, "TN": 47, "TX": 48, "UT": 49, "VT": 50, "VA": 51, "WA": 53, "WV": 54,
    "WI": 55, "WY": 56
}

# Copy & map FIPS IDs
state_cancellations_map = state_cancellations.copy()
state_cancellations_map["id"] = state_cancellations_map["state"].map(state_fips)


chart_map = (
    alt.Chart(states)
    .mark_geoshape(stroke="white")
    .encode(
        color=alt.Color(
            "cancelled_grants:Q",
            title="Cancelled grants",
            scale=alt.Scale(scheme="blues", domain=[0, 150]),  # tweak domain to improve contrast
        ),
        tooltip=[
            alt.Tooltip("state:N", title="State"),
            alt.Tooltip("cancelled_grants:Q", title="Cancelled grants"),
        ],
    )
    .transform_lookup(
        lookup="id",
        from_=alt.LookupData(
            state_cancellations_map, "id", ["state", "cancelled_grants"]
        ),
    )
    .project(type="albersUsa")
    .properties(width=400, height=400)
)


top10_states = state_cancellations.head(10)

chart_bar = (
    alt.Chart(top10_states)
    .mark_bar()
    .encode(
        y=alt.Y("state:N", sort="-x", title=""),
        x=alt.X("cancelled_grants:Q", title="Cancelled grants"),
        color=alt.Color("cancelled_grants:Q", scale=alt.Scale(scheme="blues")),
        tooltip=["state", "cancelled_grants"]
    )
    .properties(width=350, height=400, title="Top 10 States")
)


combined_chart = chart_map | chart_bar
combined_chart.properties(title="NSF Grant Cancellations by State")

combined_chart


In [90]:
import altair as alt
from vega_datasets import data

# Load US topojson
states = alt.topo_feature(data.us_10m.url, "states")

# FIPS mapping
state_fips = {
    "AL": 1, "AK": 2, "AZ": 4, "AR": 5, "CA": 6, "CO": 8, "CT": 9, "DE": 10, "DC": 11,
    "FL": 12, "GA": 13, "HI": 15, "ID": 16, "IL": 17, "IN": 18, "IA": 19, "KS": 20,
    "KY": 21, "LA": 22, "ME": 23, "MD": 24, "MA": 25, "MI": 26, "MN": 27, "MS": 28,
    "MO": 29, "MT": 30, "NE": 31, "NV": 32, "NH": 33, "NJ": 34, "NM": 35, "NY": 36,
    "NC": 37, "ND": 38, "OH": 39, "OK": 40, "OR": 41, "PA": 42, "RI": 44, "SC": 45,
    "SD": 46, "TN": 47, "TX": 48, "UT": 49, "VT": 50, "VA": 51, "WA": 53, "WV": 54,
    "WI": 55, "WY": 56
}

# Create data copy and map FIPS IDs
state_cancellations_map = state_cancellations.copy()
state_cancellations_map["id"] = state_cancellations_map["state"].map(state_fips)
state_cancellations_map["cancelled_grants"] = state_cancellations_map["cancelled_grants"].fillna(0)


chart_map2 = (
    alt.Chart(states)
    .mark_geoshape(stroke="white")
    .transform_lookup(
        lookup="id",
        from_=alt.LookupData(
            state_cancellations_map, "id", ["state", "cancelled_grants"]
        ),
    )
    .encode(
        color=alt.Color(
            "cancelled_grants:Q",
            title="Cancelled Grants",
            scale=alt.Scale(
                scheme="blues",
                domain=[1, 500],        
                type="sqrt",            
                interpolate="lab"
            ),
        ),
        tooltip=[
            alt.Tooltip("state:N", title="State"),
            alt.Tooltip("cancelled_grants:Q", title="Cancelled Grants"),
        ],
    )
    .project(type="albersUsa")
    .properties(width=560, height=460)
)


top10_states = state_cancellations.head(10)

chart_bar = (
    alt.Chart(top10_states)
    .mark_bar()
    .encode(
        y=alt.Y("state:N", sort="-x", title=""),
        x=alt.X("cancelled_grants:Q", title="Cancelled Grants"),
        color=alt.Color(
            "cancelled_grants:Q",
            scale=alt.Scale(
                scheme="blues",
                domain=[1, 500],
                type="sqrt",
                interpolate="lab"
            ),
            legend=None
        ),
        tooltip=["state", "cancelled_grants"]
    )
    .properties(width=280, height=460, title="Top 10 States")
)

combined_chart = (chart_map2 | chart_bar).properties(
    title="NSF Grant Cancellations by U.S. State (Smooth Blue Gradient — No White States)"
)

Q1 = combined_chart
Q1

**Why We Chose Our Final Representation**

Our journey toward the final visualization began with a horizontal bar chart, which effectively ranked states by the number of cancelled grants and quickly identified the most affected states like California (466 cancellations) and Massachusetts (256 cancellations). While this initial approach successfully communicated quantitative rankings, it failed to convey the geographical context and spatial relationships inherent in the data.

Viewers could see which states were most impacted, but they couldn't understand where these states were located geographically or whether there were any regional patterns—such as whether cancellations were concentrated along the coasts or distributed more evenly across the country. However, after reviewing the visualization, we realized that while it showed quantities, it didn’t show the geographical distribution or the spatial impact of these cancellations. The audience could see which states were most affected, but not where in the country these states were located, nor whether there were any regional patterns (for example, whether cancellations were concentrated on the coasts or more evenly distributed).

Recognizing this limitation, we transitioned to a choropleth map of the United States, where each state's color intensity corresponds to its number of cancelled grants. This transformation turned abstract numbers into a spatial narrative, providing an immediate, intuitive understanding of which regions were most impacted. The map format allows viewers to instantly recognize geographical patterns and regional concentrations that would be invisible in a simple bar chart. However, we found that a map alone, while excellent for spatial understanding, sometimes makes it difficult to extract precise numerical values for comparison.

Our final solution combines the best of both approaches: a choropleth map paired with a bar chart showing the top 10 states. This dual-panel design provides both geographical context and precise numerical rankings in a single view. We further refined the visualization by implementing a square root color scale with improved color interpolation, which eliminates the problem of white or barely visible states and creates better visual contrast across the entire range of values. This final representation achieves our goal of making the data not just readable, but visually meaningful and explanatory, allowing viewers to simultaneously grasp both the spatial distribution and the quantitative rankings of grant cancellations across the United States.

# Q2: What are the institutions that have been more affected in terms of number of cancelled grants. How does this compare to the others?


In [61]:
# Q2: Institutions most affected by number of cancelled grants
print("=== Q2: Institutions Most Affected by Number of Cancelled Grants ===")

# Count cancellations by institution
institution_cancellations = terminated_grants['org_name'].value_counts().reset_index()
institution_cancellations.columns = ['institution', 'cancelled_grants']

print(f"Total institutions with cancelled grants: {len(institution_cancellations)}")
print("\nTop 15 institutions by number of cancelled grants:")
print(institution_cancellations.head(15))

# Calculate statistics
print(f"\nStatistics:")
print(f"Mean cancelled grants per institution: {institution_cancellations['cancelled_grants'].mean():.2f}")
print(f"Median cancelled grants per institution: {institution_cancellations['cancelled_grants'].median():.2f}")
print(f"Max cancelled grants by single institution: {institution_cancellations['cancelled_grants'].max()}")

# Create visualization
chart_q2 = alt.Chart(institution_cancellations.head(20)).mark_bar().encode(
    x=alt.X('cancelled_grants:Q', title='Number of Cancelled Grants'),
    y=alt.Y('institution:N', sort='-x', title='Institution'),
    color=alt.Color(
    'cancelled_grants:Q',
    scale=alt.Scale(scheme='blues'),
    legend=alt.Legend(title='Cancelled Grants')
),
    tooltip=['institution', 'cancelled_grants']
).properties(
    width=700,
    height=500,
    title='Top 20 Institutions by Number of Cancelled NSF Grants'
).interactive()

Q2 = chart_q2
Q2


=== Q2: Institutions Most Affected by Number of Cancelled Grants ===
Total institutions with cancelled grants: 507

Top 15 institutions by number of cancelled grants:
                                          institution  cancelled_grants
0                University of California-Los Angeles               306
1                                  Harvard University               199
2   Regents of the University of Michigan - Ann Arbor                28
3                            Arizona State University                27
4                   University of Colorado at Boulder                24
5                           Michigan State University                17
6                    Florida International University                16
7                            University of Washington                16
8                     University of Wisconsin-Madison                16
9                       University of Texas at Austin                15
10                        University of S

**Why We Chose This Representation for Question 2**

To answer which institutions were most affected by the number of cancelled grants, we created a horizontal bar chart displaying the top 20 institutions ranked by their total number of cancelled grants. This visualization effectively communicates the quantitative impact on individual institutions, allowing viewers to immediately identify the most severely affected organizations. The chart uses a red color scheme to emphasize the severity of the cancellations, with color intensity corresponding to the number of cancelled grants, making it easy to distinguish between institutions at a glance.

The process to create this visualization involved aggregating all terminated grants by institution name, counting the occurrences for each organization, and then sorting them in descending order. We chose to display the top 20 institutions to provide a comprehensive view of the most impacted organizations while maintaining readability. The horizontal orientation allows for clear labeling of institution names, which can be quite long, and the interactive tooltips provide additional context when hovering over each bar.

We believe this horizontal bar chart is the best representation for this question because it directly answers the query about which institutions were most affected, provides clear rankings that are easy to compare, and effectively handles the challenge of displaying many institution names. Unlike alternative visualizations such as pie charts or tables, this bar chart maintains visual clarity while enabling quick identification of patterns—such as the dramatic disparity between UCLA (306 cancellations) and Harvard (199 cancellations) compared to other institutions. The color gradient further enhances the visual hierarchy, making it immediately apparent which institutions suffered the greatest impact from grant terminations.

# Q3: What are the institutions that have been more affected in terms of budget, and how does this compare to the others?


In [62]:
# Q3: Institutions most affected by budget losses
print("=== Q3: Institutions Most Affected by Budget Losses ===")

# Calculate budget impact by institution
budget_impact = terminated_grants.groupby('org_name').agg({
    'nsf_total_budget': ['sum', 'count', 'mean'],
    'nsf_obligated': 'sum',
    'estimated_budget': 'sum'
}).round(2)

# Flatten column names
budget_impact.columns = ['total_budget_sum', 'grant_count', 'avg_budget', 'obligated_sum', 'estimated_sum']
budget_impact = budget_impact.reset_index()

# Use total budget as primary metric, fill with estimated if missing
budget_impact['budget_impact'] = budget_impact['total_budget_sum'].fillna(budget_impact['estimated_sum'])
budget_impact = budget_impact[budget_impact['budget_impact'] > 0].sort_values('budget_impact', ascending=False)

print(f"Institutions with budget data: {len(budget_impact)}")
print(f"\nTop 15 institutions by budget impact (in dollars):")
print(budget_impact[['org_name', 'budget_impact', 'grant_count']].head(15))

# Calculate total budget impact
total_budget_impact = budget_impact['budget_impact'].sum()
print(f"\nTotal budget impact across all institutions: ${total_budget_impact:,.0f}")

# Create visualization
chart_q3 = alt.Chart(budget_impact.head(20)).mark_bar().encode(
    x=alt.X('budget_impact:Q', title='Total Budget Impact ($)', axis=alt.Axis(format='$,.0f')),
    y=alt.Y('org_name:N', sort='-x', title='Institution'),
    color=alt.Color(
    'budget_impact:Q',
    scale=alt.Scale(scheme='blues'),
    legend=alt.Legend(title='Budget Impact ($)', format='$,.0f')
),
    tooltip=[
        alt.Tooltip('org_name', title='Institution'),
        alt.Tooltip('budget_impact:Q', title='Budget Impact', format='$,.0f'),
        alt.Tooltip('grant_count:Q', title='Number of Grants')
    ]
).properties(
    width=700,
    height=500,
    title='Top 20 Institutions by Total Budget Impact from Cancelled NSF Grants'
).interactive()

Q3 = chart_q3
Q3

=== Q3: Institutions Most Affected by Budget Losses ===
Institutions with budget data: 507

Top 15 institutions by budget impact (in dollars):
                                              org_name  budget_impact  \
385               University of California-Los Angeles      199678586   
156                                 Harvard University      149391352   
24                            Arizona State University       36774883   
394                  University of Colorado at Boulder       27347170   
382                  University of California-Berkeley       23252270   
461                     University of Texas at El Paso       21304493   
459                      University of Texas at Austin       20581798   
276  Regents of the University of Michigan - Ann Arbor       17569315   
384                    University of California-Irvine       17553149   
468                           University of Washington       16739533   
30                                   Auburn University

**Why We Chose This Representation for Question 3**

To identify which institutions were most affected in terms of budget losses from cancelled grants, we created a horizontal bar chart displaying the top 20 institutions ranked by their total budget impact in dollars. This visualization effectively communicates the financial magnitude of grant terminations, allowing viewers to immediately understand which institutions suffered the greatest monetary losses. The chart uses an orange color scheme to represent financial impact, with color intensity corresponding to the total budget amount, making it easy to distinguish between institutions at different financial impact levels.

The process to create this visualization involved aggregating all terminated grants by institution name, calculating the total budget impact for each organization by summing their `nsf_total_budget` values (using `estimated_budget` as a fallback when total budget data was missing), and then sorting institutions in descending order by budget impact. We chose to display the top 20 institutions to provide a comprehensive view of the most financially impacted organizations while maintaining readability. The horizontal orientation allows for clear labeling of institution names, and the interactive tooltips provide additional context including both the budget impact and the number of grants affected when hovering over each bar.

We believe this horizontal bar chart is the best representation for this question because it directly answers the query about which institutions were most affected financially, provides clear rankings that are easy to compare, and effectively communicates the substantial monetary impact of grant terminations. Unlike alternative visualizations such as pie charts or tables, this bar chart maintains visual clarity while enabling quick identification of patterns—such as the dramatic financial disparity between UCLA (nearly $200 million) and Harvard (approximately $149 million) compared to other institutions. The dollar-formatted axis and color gradient further enhance the visual hierarchy, making it immediately apparent which institutions suffered the greatest financial impact from grant terminations, which is crucial information for understanding the economic consequences of these cancellations.

# Q4: Is there any correlation between the cancelled grants and the list of flagged words?


**Initial Approach: Boxplot and Scatterplot**

In [63]:
# Q4: Correlation between cancelled grants and flagged words
print("=== Q4: Correlation between Cancelled Grants and Flagged Words ===")

# Analyze flagged words distribution
print("Flagged words analysis:")
flagged_analysis = cleaned_nsf_data.groupby('terminated').agg({
    'flagged_words_count': ['mean', 'median', 'std', 'count'],
    'title_flagged_words_count': ['mean', 'median', 'std']
}).round(2)

print(flagged_analysis)

# Statistical comparison
terminated_flagged = cleaned_nsf_data[cleaned_nsf_data['terminated'] == True]['flagged_words_count']
non_terminated_flagged = cleaned_nsf_data[cleaned_nsf_data['terminated'] == False]['flagged_words_count']

print(f"\nStatistical comparison:")
print(f"Terminated grants - Mean flagged words: {terminated_flagged.mean():.2f}")
print(f"Non-terminated grants - Mean flagged words: {non_terminated_flagged.mean():.2f}")
print(f"Difference: {terminated_flagged.mean() - non_terminated_flagged.mean():.2f}")

# Create correlation analysis
correlation_data = cleaned_nsf_data[['terminated', 'flagged_words_count', 'title_flagged_words_count']].copy()
correlation_data['terminated_numeric'] = correlation_data['terminated'].astype(int)

# Calculate correlation
correlation_abstract = correlation_data['terminated_numeric'].corr(correlation_data['flagged_words_count'])
correlation_title = correlation_data['terminated_numeric'].corr(correlation_data['title_flagged_words_count'])

print(f"\nCorrelation coefficients:")
print(f"Termination vs Abstract flagged words: {correlation_abstract:.4f}")
print(f"Termination vs Title flagged words: {correlation_title:.4f}")

# Create visualization - Distribution of flagged words by termination status
chart_q4_1 = alt.Chart(cleaned_nsf_data).mark_boxplot().encode(
    x=alt.X('terminated:N', title='Grant Status'),
    y=alt.Y('flagged_words_count:Q', title='Number of Flagged Words in Abstract'),
    color=alt.Color('terminated:N', scale=alt.Scale(scheme='set2'))
).properties(
    width=400,
    height=300,
    title='Distribution of Flagged Words in Abstracts by Grant Status'
)

# Create visualization - Scatter plot
chart_q4_2 = alt.Chart(cleaned_nsf_data).mark_circle(opacity=0.6).encode(
    x=alt.X('flagged_words_count:Q', title='Number of Flagged Words in Abstract'),
    y=alt.Y('terminated:N', title='Grant Terminated'),
    color=alt.Color('terminated:N', scale=alt.Scale(scheme='set2')),
    size=alt.Size('nsf_total_budget:Q', scale=alt.Scale(range=[20, 200]), legend=alt.Legend(title='Budget ($)'))
).properties(
    width=500,
    height=200,
    title='Relationship between Flagged Words and Grant Termination'
)

# Combine charts
chart_q4 = chart_q4_1 | chart_q4_2
chart_q4


=== Q4: Correlation between Cancelled Grants and Flagged Words ===
Flagged words analysis:
           flagged_words_count                    title_flagged_words_count  \
                          mean median   std count                      mean   
terminated                                                                    
True                      6.76    5.0  6.64  1970                      0.47   

                         
           median   std  
terminated               
True          0.0  0.73  

Statistical comparison:
Terminated grants - Mean flagged words: 6.76
Non-terminated grants - Mean flagged words: nan
Difference: nan

Correlation coefficients:
Termination vs Abstract flagged words: nan
Termination vs Title flagged words: nan


**Comment**

We started with a boxplot and scatterplot combination to explore the correlation between flagged words and grant terminations, as these are standard statistical visualizations for examining relationships between variables.
However, we quickly realized this approach was problematic because our dataset only contains terminated grants, making correlation analysis impossible and the visualizations misleading.
This initial attempt helped us understand that we needed a different approach focused on understanding how flagged words manifest within cancelled grants, rather than trying to establish correlation.

To improve our previous chart we swapped in a straightforward histogram so it’s easier to see how flagged-word counts are distributed across cancelled grants.
We also kept the side ranking of top flagged words so the specific terms that pop up most often stay front and center.

In [64]:
# Q4 (Redesign)

import altair as alt
import pandas as pd
from collections import Counter
import re

print("=== Q4 (Redesign): Distribution + Top Flagged Words in Cancelled Grants ===")

# Base dataset (already filtered to terminated grants)
df_q4 = cleaned_nsf_data.copy()
df_q4["flagged_words_count"] = df_q4["flagged_words_count"].fillna(0)

# Left panel — distribution of how many flagged words appear per cancelled grant
chart_q4_hist = (
    alt.Chart(df_q4)
    .transform_filter(alt.datum.flagged_words_count < 40)
    .mark_bar(opacity=0.8)
    .encode(
        x=alt.X(
            "flagged_words_count:Q",
            bin=alt.Bin(maxbins=40),
            title="Number of Flagged Words per Grant"
        ),
        y=alt.Y("count():Q", title="Number of Grants"),
        color=alt.value("#1d4ed8"),
        tooltip=[
            alt.Tooltip("flagged_words_count:Q", title="Flagged words (binned)"),
            alt.Tooltip("count():Q", title="Grants in bin")
        ]
    )
    .properties(
        width=350,
        height=250,
        title="Distribution of Flagged Word Counts in Cancelled Grants"
    )
)

# 2
flagged_words_list = flagged_words_clean  

# Count flagged word occurrences in title + abstract for terminated grants
all_texts = (
    df_q4.get("project_title", df_q4.get("title", pd.Series("", index=df_q4.index))).fillna("") 
    + " " 
    + df_q4.get("abstract", pd.Series("", index=df_q4.index)).fillna("")
).str.lower()

word_counter = Counter()
for text in all_texts:
    for word in flagged_words_list:
        if re.search(rf"\b{re.escape(word)}\b", text):
            word_counter[word] += 1

# Convert to DataFrame for plotting
df_top_words = (
    pd.DataFrame(word_counter.items(), columns=["word", "count"])
    .sort_values("count", ascending=False)
    .head(15)
)

chart_q4_words = (
    alt.Chart(df_top_words)
    .mark_bar()
    .encode(
        y=alt.Y("word:N", sort="-x", title="Flagged Word"),
        x=alt.X("count:Q", title="Occurrences in Cancelled Grants"),
        color=alt.Color("count:Q", scale=alt.Scale(scheme="blues")),
        tooltip=[
            alt.Tooltip("word:N", title="Word"),
            alt.Tooltip("count:Q", title="Occurrences")
        ]
    )
    .properties(
        width=350,
        height=250,
        title="Top 15 Flagged Words Found in Cancelled Grants"
    )
)

# 3
chart_q4_final = chart_q4_hist | chart_q4_words
chart_q4_final


=== Q4 (Redesign): Distribution + Top Flagged Words in Cancelled Grants ===


**Comment**

Finally, we decided to replaced the old bar chart with a frequency-polygon view so you can more clearly see how many cancelled grants fall into each flagged-word count.

In [65]:
# Q4 Re-redesgin : Frequency polygon (line) for the left chart
import numpy as np
import pandas as pd
import altair as alt

df_q4 = cleaned_nsf_data.copy()
vals = df_q4["flagged_words_count"].fillna(0).clip(upper=40)

# 1-bin per integer count (0..40)
bins = np.arange(0, 41, 1)
hist, edges = np.histogram(vals, bins=bins)

df_bins = pd.DataFrame({
    "bin_start": edges[:-1],
    "bin_end": edges[1:],
    "count": hist
})
df_bins["mid"] = (df_bins["bin_start"] + df_bins["bin_end"]) / 2  # frequency polygon x

# frequency polygon (points + line through bin midpoints)
chart_q4_hist = (
    alt.Chart(df_bins)
    .mark_line(point=True)
    .encode(
        x=alt.X("mid:Q", title="Number of Flagged Words per Grant"),
        y=alt.Y("count:Q", title="Number of Grants"),
    )
    .properties(width=350, height=250,
                title="Distribution of Flagged Word Counts in Cancelled Grants (Frequency Polygon)")
)

chart_q4_final = chart_q4_hist | chart_q4_words
chart_q4_final
Q4 = chart_q4_final
Q4


**Why We Chose This Representation for Question 4**

To investigate the correlation between cancelled grants and flagged words, we created a dual-panel visualization consisting of a frequency polygon (line chart) showing the distribution of flagged word counts per cancelled grant, and a bar chart displaying the top 15 most frequently occurring flagged words in cancelled grants. This two-panel design effectively addresses the question from two complementary perspectives: the quantitative distribution of flagged word intensity across grants, and the semantic identification of which specific flagged terms appear most often.

The process to create this visualization involved first counting flagged words in each grant's abstract and title using word-boundary matching to ensure accurate detection. For the left panel, we binned the flagged word counts to show how many grants contain different numbers of flagged words, and used a line chart instead of bars to make overall trends easier to perceive. This change provides a smoother, less cluttered view of the distribution, helping the viewer quickly identify the concentration of grants with few flagged terms and the long tail of higher counts. For the right panel, we aggregated occurrences of each flagged word across all cancelled grants' titles and abstracts, then ranked them to identify the most common terms. We chose to display the top 15 words to provide meaningful insight while maintaining readability.

We believe this dual-panel design is the best representation for this question because it provides both quantitative and semantic insights that a single visualization could not achieve. Unlike scatterplots or correlation matrices, which would be misleading given that all grants in our dataset are terminated (making correlation analysis impossible), this approach focuses on understanding how flagged words manifest within cancelled grants. The frequency polygon reveals the overall distribution pattern—showing that most grants have low flagged word counts—while the bar chart identifies which specific terms (such as "diverse," "equity," or "underrepresented") appear most frequently, offering actionable insights into the language patterns associated with cancelled grants. This concise, informative layout achieves higher interpretive accuracy and visual clarity than alternative approaches.

# Q5: Is there any correlation between the cancelled grants and the list of grants in Cruz’s list? And with respect to reinstated grants?


**Initial Approach: Multiple Bar Charts**

We started with three separate bar charts to compare termination and reinstatement rates between Cruz List and Non-Cruz List grants, as bar charts are effective for comparing categorical groups and their associated rates.
However, we quickly realized this approach fragmented the information across multiple visualizations, making it difficult to see the relationship between termination and reinstatement rates simultaneously.
This initial attempt helped us understand that we needed a more integrated visualization that could show both metrics together, allowing for easier comparison of the relationship between Cruz List status and both termination and reinstatement outcomes.


In [66]:
# Q5: Correlation between cancelled grants and Cruz list
print("=== Q5: Correlation between Cancelled Grants and Cruz List ===")

# Analyze Cruz list relationship
cruz_analysis = cleaned_nsf_data.groupby('in_cruz_list').agg({
    'terminated': ['sum', 'count', 'mean'],
    'reinstated': ['sum', 'count', 'mean']
}).round(4)

print("Cruz list analysis:")
print(cruz_analysis)

# Create contingency table
contingency_table = pd.crosstab(cleaned_nsf_data['in_cruz_list'], cleaned_nsf_data['terminated'], margins=True)
print(f"\nContingency table (Cruz List vs Terminated):")
print(contingency_table)

# Calculate rates
cruz_terminated_rate = cleaned_nsf_data[cleaned_nsf_data['in_cruz_list'] == True]['terminated'].mean()
non_cruz_terminated_rate = cleaned_nsf_data[cleaned_nsf_data['in_cruz_list'] == False]['terminated'].mean()

print(f"\nTermination rates:")
print(f"Cruz list grants termination rate: {cruz_terminated_rate:.4f} ({cruz_terminated_rate*100:.2f}%)")
print(f"Non-Cruz list grants termination rate: {non_cruz_terminated_rate:.4f} ({non_cruz_terminated_rate*100:.2f}%)")
print(f"Difference: {cruz_terminated_rate - non_cruz_terminated_rate:.4f} ({(cruz_terminated_rate - non_cruz_terminated_rate)*100:.2f} percentage points)")

# Analyze reinstated grants
cruz_reinstated_rate = cleaned_nsf_data[cleaned_nsf_data['in_cruz_list'] == True]['reinstated'].mean()
non_cruz_reinstated_rate = cleaned_nsf_data[cleaned_nsf_data['in_cruz_list'] == False]['reinstated'].mean()

print(f"\nReinstatement rates:")
print(f"Cruz list grants reinstatement rate: {cruz_reinstated_rate:.4f} ({cruz_reinstated_rate*100:.2f}%)")
print(f"Non-Cruz list grants reinstatement rate: {non_cruz_reinstated_rate:.4f} ({non_cruz_reinstated_rate*100:.2f}%)")
print(f"Difference: {cruz_reinstated_rate - non_cruz_reinstated_rate:.4f} ({(cruz_reinstated_rate - non_cruz_reinstated_rate)*100:.2f} percentage points)")

# Create visualization - Termination rates
termination_rates = pd.DataFrame({
    'group': ['Cruz List', 'Non-Cruz List'],
    'termination_rate': [cruz_terminated_rate, non_cruz_reinstated_rate],
    'reinstatement_rate': [cruz_reinstated_rate, non_cruz_reinstated_rate]
})

chart_q5_1 = alt.Chart(termination_rates).mark_bar().encode(
    x=alt.X('group:N', title='Grant Group'),
    y=alt.Y('termination_rate:Q', title='Termination Rate', axis=alt.Axis(format='.2%')),
    color=alt.Color('group:N', scale=alt.Scale(scheme='set1'))
).properties(
    width=300,
    height=300,
    title='Termination Rates by Cruz List Status'
)

chart_q5_2 = alt.Chart(termination_rates).mark_bar().encode(
    x=alt.X('group:N', title='Grant Group'),
    y=alt.Y('reinstatement_rate:Q', title='Reinstatement Rate', axis=alt.Axis(format='.2%')),
    color=alt.Color('group:N', scale=alt.Scale(scheme='set1'))
).properties(
    width=300,
    height=300,
    title='Reinstatement Rates by Cruz List Status'
)

# Create stacked bar chart showing both rates
melted_rates = termination_rates.melt(id_vars=['group'], var_name='rate_type', value_name='rate')
melted_rates['rate_type'] = melted_rates['rate_type'].str.replace('_rate', '').str.title()

chart_q5_3 = alt.Chart(melted_rates).mark_bar().encode(
    x=alt.X('group:N', title='Grant Group'),
    y=alt.Y('rate:Q', title='Rate', axis=alt.Axis(format='.2%')),
    color=alt.Color('rate_type:N', scale=alt.Scale(scheme='category20')),
    tooltip=['group', 'rate_type', alt.Tooltip('rate:Q', format='.2%')]
).properties(
    width=400,
    height=300,
    title='Termination and Reinstatement Rates by Cruz List Status'
)

# Combine charts
chart_q5 = (chart_q5_1 | chart_q5_2) & chart_q5_3
chart_q5


=== Q5: Correlation between Cancelled Grants and Cruz List ===
Cruz list analysis:
             terminated            reinstated              
                    sum count mean        sum count    mean
in_cruz_list                                               
False              1503  1503  1.0        381  1503  0.2535
True                467   467  1.0         39   467  0.0835

Contingency table (Cruz List vs Terminated):
terminated    True   All
in_cruz_list            
False         1503  1503
True           467   467
All           1970  1970

Termination rates:
Cruz list grants termination rate: 1.0000 (100.00%)
Non-Cruz list grants termination rate: 1.0000 (100.00%)
Difference: 0.0000 (0.00 percentage points)

Reinstatement rates:
Cruz list grants reinstatement rate: 0.0835 (8.35%)
Non-Cruz list grants reinstatement rate: 0.2535 (25.35%)
Difference: -0.1700 (-17.00 percentage points)


*Second approach: Slope chart*

This slope chart was an early attempt to compare termination and reinstatement rates between Cruz and Non-Cruz list grants. However, it proved ineffective since all grants in the dataset were already terminated, making the termination rate constant (100%) and the visualization uninformative.

In [67]:
#  Q5 (Redesign)

import altair as alt
import pandas as pd

print("=== Q5 (Redesign): Cruz List vs Termination & Reinstatement Rates ===")

# Calculate rates per group 
summary_q5 = (
    cleaned_nsf_data.groupby("in_cruz_list")
    .agg({
        "terminated": "mean",
        "reinstated": "mean"
    })
    .reset_index()
)

summary_q5["group"] = summary_q5["in_cruz_list"].map({True: "Cruz List", False: "Non-Cruz List"})
summary_q5 = summary_q5.melt(id_vars=["group"], var_name="metric", value_name="rate")

# Prepare labels for clarity
summary_q5["metric"] = summary_q5["metric"].replace({
    "terminated": "Termination Rate",
    "reinstated": "Reinstatement Rate"
})

print(summary_q5.round(3))

# --- Create the slope chart ---
chart_q5_slope = (
    alt.Chart(summary_q5)
    .mark_line(
        point=alt.OverlayMarkDef(filled=True, size=90),
        strokeWidth=3
    )
    .encode(
        x=alt.X("metric:N", title=None, axis=alt.Axis(labelAngle=0)),
        y=alt.Y("rate:Q", title="Rate", axis=alt.Axis(format="%")),
        color=alt.Color(
            "group:N",
            title="Grant Group",
            scale=alt.Scale(domain=["Cruz List", "Non-Cruz List"],
                            range=["#1565C0", "#B06500"])
        ),
        tooltip=[
            alt.Tooltip("group:N", title="Group"),
            alt.Tooltip("metric:N", title="Metric"),
            alt.Tooltip("rate:Q", title="Rate", format=".1%")
        ]
    )
    .properties(
        width=420,
        height=300,
        title="Cruz List vs Non-Cruz List: Termination and Reinstatement Rates"
    )
)

chart_q5_labels = (
    alt.Chart(summary_q5)
    .mark_text(
        align="left",
        dx=6,
        dy=-4,
        fontSize=11,
        color="black"
    )
    .encode(
        x="metric:N",
        y="rate:Q",
        text=alt.Text("rate:Q", format=".1%"),
        color=alt.Color("group:N",
                        scale=alt.Scale(domain=["Cruz List", "Non-Cruz List"],
                                        range=["#1565C0", "#B06500"]))
    )
)

chart_q5_final = (chart_q5_slope + chart_q5_labels).configure_view(strokeWidth=0)
chart_q5_final


=== Q5 (Redesign): Cruz List vs Termination & Reinstatement Rates ===
           group              metric      rate
0  Non-Cruz List        in_cruz_list       0.0
1      Cruz List        in_cruz_list       1.0
2  Non-Cruz List    Termination Rate       1.0
3      Cruz List    Termination Rate       1.0
4  Non-Cruz List  Reinstatement Rate  0.253493
5      Cruz List  Reinstatement Rate  0.083512


**Scatterplot Matrix: Correlation Analysis**

The scatterplot matrix below shows the relationships between cancelled grants (terminated), Cruz list status, and reinstated grants. Since these are binary variables, we convert them to numeric (0/1) and add jitter to visualize the distributions and correlations.


In [68]:
# Q5: Scatterplot Matrix for Correlation Analysis
import numpy as np

# Prepare data for scatterplot matrix
q5_matrix_data = cleaned_nsf_data[['terminated', 'in_cruz_list', 'reinstated']].copy()

# Convert to numeric (True=1, False=0)
q5_matrix_data['terminated_num'] = q5_matrix_data['terminated'].astype(int)
q5_matrix_data['cruz_list_num'] = q5_matrix_data['in_cruz_list'].astype(int)
q5_matrix_data['reinstated_num'] = q5_matrix_data['reinstated'].astype(int)

np.random.seed(42)  
jitter_amount = 0.15
q5_matrix_data['terminated_jitter'] = q5_matrix_data['terminated_num'] + np.random.uniform(-jitter_amount, jitter_amount, len(q5_matrix_data))
q5_matrix_data['cruz_list_jitter'] = q5_matrix_data['cruz_list_num'] + np.random.uniform(-jitter_amount, jitter_amount, len(q5_matrix_data))
q5_matrix_data['reinstated_jitter'] = q5_matrix_data['reinstated_num'] + np.random.uniform(-jitter_amount, jitter_amount, len(q5_matrix_data))

# Sample data for better performance (use all data but sample for visualization)
q5_sample = q5_matrix_data.sample(min(2000, len(q5_matrix_data)), random_state=42)

# Calculate correlation coefficients
corr_terminated_cruz = q5_matrix_data['terminated_num'].corr(q5_matrix_data['cruz_list_num'])
corr_terminated_reinstated = q5_matrix_data['terminated_num'].corr(q5_matrix_data['reinstated_num'])
corr_cruz_reinstated = q5_matrix_data['cruz_list_num'].corr(q5_matrix_data['reinstated_num'])

print("Correlation Coefficients:")
print(f"Terminated vs Cruz List: {corr_terminated_cruz:.4f}")
print(f"Terminated vs Reinstated: {corr_terminated_reinstated:.4f}")
print(f"Cruz List vs Reinstated: {corr_cruz_reinstated:.4f}")

# Define variable names for labels
variables = ['Terminated', 'Cruz List', 'Reinstated']
var_columns = ['terminated_jitter', 'cruz_list_jitter', 'reinstated_jitter']

# Create scatterplots for off-diagonal elements
scatter_base = alt.Chart(q5_sample).mark_circle(opacity=0.4, size=30).encode(
    alt.X(alt.repeat("column"), type='quantitative', scale=alt.Scale(domain=[-0.3, 1.3]), title=''),
    alt.Y(alt.repeat("row"), type='quantitative', scale=alt.Scale(domain=[-0.3, 1.3]), title=''),
    color=alt.Color('in_cruz_list:N', scale=alt.Scale(scheme='category10'), legend=alt.Legend(title='Cruz List'))
).properties(
    width=180,
    height=180
)

# Create histograms for diagonal elements
hist_base = alt.Chart(q5_sample).mark_bar(opacity=0.7).encode(
    alt.X(alt.repeat("column"), type='quantitative', bin=alt.Bin(maxbins=20), title=''),
    alt.Y('count()', title=''),
    color=alt.value('#1f77b4')
).properties(
    width=180,
    height=180
)

# Create the scatterplot matrix
scatter_matrix = scatter_base.repeat(
    row=var_columns,
    column=var_columns
).resolve_scale(
    x='independent',
    y='independent'
)

# Create histogram matrix for diagonal
hist_matrix = hist_base.repeat(
    row=var_columns,
    column=var_columns
).resolve_scale(
    x='independent',
    y='independent'
)

chart_q5_matrix = scatter_matrix.properties(
    title='Scatterplot Matrix: Correlation Between Terminated, Cruz List, and Reinstated Grants'
)

chart_q5_matrix


Correlation Coefficients:
Terminated vs Cruz List: nan
Terminated vs Reinstated: nan
Cruz List vs Reinstated: -0.1765


In [69]:
# Q5: Scatterplot Matrix (Improved - Like Example)
import numpy as np

# Prepare data
q5_data = cleaned_nsf_data[['terminated', 'in_cruz_list', 'reinstated']].copy()

# Convert to numeric
q5_data['Terminated'] = q5_data['terminated'].astype(int)
q5_data['Cruz_List'] = q5_data['in_cruz_list'].astype(int)
q5_data['Reinstated'] = q5_data['reinstated'].astype(int)

# Add jitter for binary variables
np.random.seed(42)
jitter = 0.12
q5_data['Terminated_jitter'] = q5_data['Terminated'] + np.random.uniform(-jitter, jitter, len(q5_data))
q5_data['Cruz_List_jitter'] = q5_data['Cruz_List'] + np.random.uniform(-jitter, jitter, len(q5_data))
q5_data['Reinstated_jitter'] = q5_data['Reinstated'] + np.random.uniform(-jitter, jitter, len(q5_data))

# Sample for better performance
q5_viz = q5_data.sample(min(2000, len(q5_data)), random_state=42)

# Variable names and their jittered columns
vars_info = [
    {'name': 'Terminated', 'col': 'Terminated_jitter'},
    {'name': 'Cruz List', 'col': 'Cruz_List_jitter'},
    {'name': 'Reinstated', 'col': 'Reinstated_jitter'}
]

# Create individual charts for each cell
charts = []
for i, row_var in enumerate(vars_info):
    row_charts = []
    for j, col_var in enumerate(vars_info):
        if i == j:
            # Diagonal: Show variable name
            label_data = pd.DataFrame({
                'x': [0.5],
                'y': [0.5],
                'label': [row_var['name']]
            })
            chart = alt.Chart(label_data).mark_text(
                size=16,
                fontWeight='bold',
                align='center',
                baseline='middle'
            ).encode(
                x=alt.X('x:Q', scale=alt.Scale(domain=[-0.2, 1.2]), axis=None),
                y=alt.Y('y:Q', scale=alt.Scale(domain=[-0.2, 1.2]), axis=None),
                text='label:N'
            ).properties(
                width=200,
                height=200
            )
        else:
            # Off-diagonal: Scatterplot
            chart = alt.Chart(q5_viz).mark_circle(
                opacity=0.5,
                size=25,
                stroke='black',
                strokeWidth=0.5
            ).encode(
                x=alt.X(
                    col_var['col'] + ':Q',
                    scale=alt.Scale(domain=[-0.2, 1.2]),
                    title=col_var['name'] if i == len(vars_info) - 1 else '',
                    axis=alt.Axis(labelAngle=0 if i == len(vars_info) - 1 else -90, labelFontSize=10)
                ),
                y=alt.Y(
                    row_var['col'] + ':Q',
                    scale=alt.Scale(domain=[-0.2, 1.2]),
                    title=row_var['name'] if j == 0 else '',
                    axis=alt.Axis(labelFontSize=10)
                ),
                color=alt.Color('in_cruz_list:N', 
                              scale=alt.Scale(scheme='category10'),
                              legend=alt.Legend(title='Cruz List') if i == 0 and j == len(vars_info) - 1 else None)
            ).properties(
                width=200,
                height=200
            )
        row_charts.append(chart)
    charts.append(row_charts)

# Combine charts into matrix
# Top row
row1 = charts[0][0] | charts[0][1] | charts[0][2]

# Middle row  
row2 = charts[1][0] | charts[1][1] | charts[1][2]

# Bottom row
row3 = charts[2][0] | charts[2][1] | charts[2][2]

# Combine all rows
matrix_chart = (row1 & row2 & row3).properties(
    title='Scatterplot Matrix: Terminated, Cruz List, and Reinstated Grants'
).configure_view(
    strokeWidth=1,
    stroke='#ddd'
).configure_axis(
    grid=True,
    gridColor='#f0f0f0'
)

matrix_chart


In [70]:
# Q5: Slope Chart + Contingency Heatmap
# Calculate rates for visualization
cruz_reinstated_rate = cleaned_nsf_data[cleaned_nsf_data['in_cruz_list'] == True]['reinstated'].mean()
non_cruz_reinstated_rate = cleaned_nsf_data[cleaned_nsf_data['in_cruz_list'] == False]['reinstated'].mean()

# Prepare data for slope chart
slope_data = pd.DataFrame({
    'Group': ['Cruz List', 'Cruz List', 'Non-Cruz List', 'Non-Cruz List'],
    'Metric': ['Termination Rate', 'Reinstatement Rate', 'Termination Rate', 'Reinstatement Rate'],
    'Rate': [1.0, cruz_reinstated_rate, 1.0, non_cruz_reinstated_rate],
    'Group_Type': ['Cruz List', 'Cruz List', 'Non-Cruz List', 'Non-Cruz List']
})

# Create slope chart showing the relationship
slope_chart = alt.Chart(slope_data).mark_line(point=True, strokeWidth=3).encode(
    x=alt.X('Metric:N', 
            title='',
            axis=alt.Axis(labelAngle=0),
            sort=['Termination Rate', 'Reinstatement Rate']),
    y=alt.Y('Rate:Q', 
            title='Rate',
            axis=alt.Axis(format='.0%'),
            scale=alt.Scale(domain=[0, 1.1])),
    color=alt.Color('Group_Type:N', 
                   scale=alt.Scale(scheme='category10'),
                   legend=alt.Legend(title='Grant Group')),
    detail='Group_Type:N'
).properties(
    width=400,
    height=300,
    title='Termination and Reinstatement Rates by Cruz List Status'
) + alt.Chart(slope_data).mark_circle(size=100, stroke='white', strokeWidth=2).encode(
    x=alt.X('Metric:N', sort=['Termination Rate', 'Reinstatement Rate']),
    y=alt.Y('Rate:Q', axis=alt.Axis(format='.0%')),
    color=alt.Color('Group_Type:N', scale=alt.Scale(scheme='category10')),
    tooltip=[alt.Tooltip('Group_Type:N', title='Group'),
             alt.Tooltip('Metric:N', title='Metric'),
             alt.Tooltip('Rate:Q', title='Rate', format='.2%')]
)

# Create contingency table for heatmap
contingency = pd.crosstab(
    cleaned_nsf_data['in_cruz_list'], 
    cleaned_nsf_data['reinstated'],
    normalize='index'  
).round(3)

# Reshape for heatmap
contingency_melted = contingency.reset_index().melt(
    id_vars='in_cruz_list',
    var_name='reinstated',
    value_name='proportion'
)
contingency_melted['cruz_label'] = contingency_melted['in_cruz_list'].map({True: 'Cruz List', False: 'Non-Cruz List'})
contingency_melted['reinstated_label'] = contingency_melted['reinstated'].map({True: 'Reinstated', False: 'Not Reinstated'})

# Create heatmap
heatmap = alt.Chart(contingency_melted).mark_rect(stroke='white', strokeWidth=2).encode(
    x=alt.X('reinstated_label:N', title='Reinstatement Status', axis=alt.Axis(labelAngle=0)),
    y=alt.Y('cruz_label:N', title='Cruz List Status', sort=['Cruz List', 'Non-Cruz List']),
    color=alt.Color('proportion:Q',
                   scale=alt.Scale(scheme='blues', domain=[0, 1]),
                   legend=alt.Legend(title='Proportion', format='.0%')),
    tooltip=[
        alt.Tooltip('cruz_label:N', title='Group'),
        alt.Tooltip('reinstated_label:N', title='Status'),
        alt.Tooltip('proportion:Q', title='Proportion', format='.2%')
    ]
).properties(
    width=300,
    height=200,
    title='Reinstatement Rates by Cruz List Status (Heatmap)'
)

# Add text labels on heatmap
heatmap_text = alt.Chart(contingency_melted).mark_text(
    size=14,
    fontWeight='bold',
    color='white'
).encode(
    x=alt.X('reinstated_label:N'),
    y=alt.Y('cruz_label:N', sort=['Cruz List', 'Non-Cruz List']),
    text=alt.Text('proportion:Q', format='.1%')
)

heatmap_final = (heatmap + heatmap_text)

# Combine both visualizations
best_q5_viz = (slope_chart | heatmap_final).properties(
    title='Q5: Relationship Between Cruz List Status and Reinstatement Rates'
).resolve_scale(color='independent')

best_q5_viz


In [None]:
# === Q5 (final polish – centered labels, fixed axis, panel outline) ===
import altair as alt
import pandas as pd

df = cleaned_nsf_data.copy()

# Labels + explicit stack order (Reinstated on the RIGHT)
df["cruz_label"]   = df["in_cruz_list"].map({True: "Yes", False: "No"})
df["status_label"] = df["reinstated"].map({True: "Reinstated", False: "Terminated"})
df["status_order"] = df["status_label"].map({"Terminated": 0, "Reinstated": 1})

# Counts per (Cruz, Status) + row totals
q5_counts = (
    df.groupby(["cruz_label", "status_label", "status_order"])
      .size().reset_index(name="count")
)
row_totals = q5_counts.groupby("cruz_label")["count"].sum().reset_index(name="row_total")
q5_counts = q5_counts.merge(row_totals, on="cruz_label", how="left")

# Overall totals (single stacked bar)
totals = (
    df.groupby(["status_label", "status_order"])
      .size().reset_index(name="count")
    .sort_values("status_order")
)
totals["one"] = "Totals"
total_sum = int(totals["count"].sum())

# ---- palette ----
BLUE = "#60A5FA"   
GRAY = "#C4C4C4"   
GRID = "#D1D5DB"

# ---- left x-axis: force 0..1600, no scientific notation ----
x_max = 1600
tick_step = 200
axis_values = list(range(0, x_max + tick_step, tick_step))


# Left panel: stacked bars

base_left = alt.Chart(q5_counts).properties(width=560, height=230)

stack = base_left.mark_bar().encode(
    y=alt.Y("cruz_label:N", title="In Ted Cruz's List", sort=["No", "Yes"]),
    x=alt.X(
        "count:Q",
        title="Total Grants",
        scale=alt.Scale(domain=[0, x_max], nice=False, zero=True),
        axis=alt.Axis(values=axis_values, labelExpr='format(datum.value, ",")')
    ),
    color=alt.Color(
        "status_label:N",
        title=None,
        scale=alt.Scale(domain=["Terminated", "Reinstated"], range=[BLUE, GRAY]),
        legend=None
    ),
    order=alt.Order("status_order:Q"),
    tooltip=[
        alt.Tooltip("cruz_label:N", title="In Cruz's List"),
        alt.Tooltip("status_label:N", title="Status"),
        alt.Tooltip("count:Q", title="Count", format=",")
    ]
)

# End-of-row totals at right edge
totals_labels = alt.Chart(row_totals).mark_text(
    align="left", dx=8, fontSize=12, color="#0F172A"
).encode(
    y=alt.Y("cruz_label:N", sort=["No", "Yes"], title=None),
    x=alt.X("row_total:Q"),
    text=alt.Text("row_total:Q", format=",")
)

left_panel = (stack + totals_labels).properties(
    title="In the Cruz's List"
)


# Right panel: single stacked TOTAL bar (labels centered inside, no numbers)

right_base = (
    alt.Chart(totals)
      .transform_joinaggregate(total="sum(count)")
      .transform_window(
          cum="sum(count)",
          sort=[alt.SortField("status_order", order="ascending")]
      )
      .transform_calculate(center="datum.cum - datum.count / 2")
)

totals_bar = right_base.mark_bar().encode(
    x=alt.X("one:N", axis=None, title=""),
    y=alt.Y(
        "count:Q",
        stack="zero",
        axis=None, title="",
        scale=alt.Scale(domain=[0, total_sum], nice=False, zero=True)
    ),
    color=alt.Color(
        "status_label:N",
        scale=alt.Scale(domain=["Terminated", "Reinstated"], range=[BLUE, GRAY]),
        legend=None
    ),
    order=alt.Order("status_order:Q")
).properties(width=200, height=230, title="Totals")

# Text precisely at the vertical center of each segment
totals_center_labels = right_base.mark_text(
    baseline="middle",
    dy=0,
    fontSize=14,
    color="#0F172A"
).encode(
    x=alt.X("one:N"),
    y=alt.Y(
        "center:Q",
        axis=None,
        scale=alt.Scale(domain=[0, total_sum], nice=False, zero=True)
    ),
    text=alt.Text("status_label:N")
)

right_panel = totals_bar + totals_center_labels

# Combine & global config (gridlines + black outline)
chart_q5_final = (left_panel | right_panel) \
    .resolve_scale(color="shared") \
    .configure_axis(
        grid=True,
        gridColor=GRID,
        gridOpacity=1,
        tickColor="#D3D3D3",
        labelColor="#111827",
        titleColor="#111827"
    )

chart_q5_final


Final improvements

In [86]:
import altair as alt
import pandas as pd

df = cleaned_nsf_data.copy()

# Labels + explicit stack order
df["cruz_label"] = df["in_cruz_list"].map({True: "Yes", False: "No"})
df["status_label"] = df["reinstated"].map({True: "Reinstated", False: "Terminated"})
df["status_order"] = df["status_label"].map({"Terminated": 0, "Reinstated": 1})

# Counts per (Cruz, Status) + row totals and percentages
q5_counts = (
    df.groupby(["cruz_label", "status_label", "status_order"])
    .size()
    .reset_index(name="count")
)
row_totals = (
    q5_counts.groupby("cruz_label")["count"].sum().reset_index(name="row_total")
)
q5_counts = q5_counts.merge(row_totals, on="cruz_label", how="left")
q5_counts["percentage"] = (q5_counts["count"] / q5_counts["row_total"] * 100).round(1)

# Overall totals
totals = (
    df.groupby(["status_label", "status_order"])
    .size()
    .reset_index(name="count")
    .sort_values("status_order")
)
totals["one"] = "Totals"
total_sum = int(totals["count"].sum())
totals["percentage"] = (totals["count"] / total_sum * 100).round(1)

# ---- Enhanced palette ----
BLUE_DARK = "#3182bd"   
BLUE_LIGHT = "#9ecae1"  
GRID = "#E5E7EB"
DARK_TEXT = "#111827"

# ---- Left x-axis ----
x_max = 1600
tick_step = 200
axis_values = list(range(0, x_max + tick_step, tick_step))

# Left panel: stacked bars with percentages

base_left = alt.Chart(q5_counts).properties(width=580, height=250)

stack = base_left.mark_bar().encode(
    y=alt.Y(
        "cruz_label:N",
        title="In Ted Cruz's List",
        sort=["No", "Yes"],
        axis=alt.Axis(labelFontSize=13, titleFontSize=14, titleFontWeight=600),
    ),
    x=alt.X(
        "count:Q",
        title="Number of Grants",
        scale=alt.Scale(domain=[0, x_max], nice=False, zero=True),
        axis=alt.Axis(
            values=axis_values,
            labelExpr='format(datum.value, ",")',
            labelFontSize=12,
            titleFontSize=14,
            titleFontWeight=600,
        ),
    ),
    color=alt.Color(
    "status_label:N",
    title=None,
    scale=alt.Scale(
        domain=["Terminated", "Reinstated"],
        range=[BLUE_DARK, BLUE_LIGHT]
    ),
    legend=None,
),
    order=alt.Order("status_order:Q"),
    tooltip=[
        alt.Tooltip("cruz_label:N", title="In Cruz's List"),
        alt.Tooltip("status_label:N", title="Status"),
        alt.Tooltip("count:Q", title="Count", format=","),
        alt.Tooltip("percentage:Q", title="Percentage", format=".1f"),
    ],
)

# Percentage labels inside bars (centered in each segment)
percentage_labels = (
    base_left.transform_joinaggregate(total="sum(count)", groupby=["cruz_label"])
    .transform_window(
        cum="sum(count)",
        sort=[alt.SortField("status_order", order="ascending")],
        groupby=["cruz_label"],
    )
    .transform_calculate(center="datum.cum - datum.count / 2")
    .mark_text(
        align="center", baseline="middle", fontSize=14, fontWeight=600, color="white"
    )
    .encode(
        y=alt.Y("cruz_label:N", sort=["No", "Yes"]),
        x=alt.X("center:Q"),
        text=alt.Text("percentage:Q", format=".1f"),
        opacity=alt.condition(
            alt.datum.count > 50,  # Only show percentage if segment is large enough
            alt.value(1),
            alt.value(0),
        ),
    )
)

# Row totals at right edge
totals_labels = (
    alt.Chart(row_totals)
    .mark_text(align="left", dx=10, fontSize=13, fontWeight=600, color=DARK_TEXT)
    .encode(
        y=alt.Y("cruz_label:N", sort=["No", "Yes"], title=None),
        x=alt.X("row_total:Q"),
        text=alt.Text("row_total:Q", format=","),
    )
)

left_panel = (stack + percentage_labels + totals_labels).properties(
    title=alt.TitleParams(
        "Grants by Cruz List Status", fontSize=16, fontWeight=600, anchor="start"
    )
)

# Right panel: Totals with centered labels and percentages

right_base = (
    alt.Chart(totals)
    .transform_joinaggregate(total="sum(count)")
    .transform_window(
        cum="sum(count)", sort=[alt.SortField("status_order", order="ascending")]
    )
    .transform_calculate(center="datum.cum - datum.count / 2")
)

totals_bar = (
    right_base.mark_bar()
    .encode(
        x=alt.X("one:N", axis=None, title=""),
        y=alt.Y(
            "count:Q",
            stack="zero",
            axis=None,
            title="",
            scale=alt.Scale(domain=[0, total_sum], nice=False, zero=True),
        ),
        color=alt.Color(
    "status_label:N",
    scale=alt.Scale(
        domain=["Terminated", "Reinstated"],
        range=[BLUE_DARK, BLUE_LIGHT]
    ),
    legend=None,
),
        order=alt.Order("status_order:Q"),
    )
    .properties(
        width=200,
        height=250,
        title=alt.TitleParams(
            "Overall Totals", fontSize=16, fontWeight=600, anchor="middle"
        ),
    )
)

# Status labels with percentages
totals_labels_text = (
    right_base.transform_calculate(
        label='datum.status_label + " (" + toString(datum.percentage) + "%)"'
    )
    .mark_text(baseline="middle", fontSize=14, fontWeight=600, color="white")
    .encode(
        x=alt.X("one:N"),
        y=alt.Y(
            "center:Q",
            axis=None,
            scale=alt.Scale(domain=[0, total_sum], nice=False, zero=True),
        ),
        text=alt.Text("label:N"),
    )
)

right_panel = totals_bar + totals_labels_text

# Combine with enhanced styling

Q5 = (left_panel | right_panel).resolve_scale(color="shared")

Q5

comment on Q5 decisions


# Final Visualization

The final dashboard provides a comprehensive visual overview of the NSF grant cancellations dataset, addressing all analytical questions defined in the project. Each panel uses consistent blue colour encoding, ensuring visual coherence across maps, bars, and distributions.

Q1 combines a choropleth map and bar chart to reveal the geographic distributon of cancellations, highlighting states with notable concentration such as California and Massachusetts. Q2 and Q3 examine the institutional impact, ranking universities by number of cancelled grants and by total budget loss. These views allow comparison between a volume based and financial consequences, showing that some institutions are highly affected in funding despite fewer cancellations.

Q4 investigates the flagged-language patterns identified in the cancellation summaries. The frequency distribution shows how often grants include such terms, while the accompanying bar chart highlights the most commonly flagged words. This helps contextualize the linguistic indicators present in cancelled proposals.

Finally, Q5 analyzes the relationship between cancellations and Ted Cruz’s publicly flagged list, comparing reinstatement rates and overall proportions.

Together, these visualizations answer all project questions, aplying appropriate encoding choices, position, color, scale, and layout to support a clear and meaningful interpretation of the dataset.

In [91]:
import altair as alt

# Mini-panels for the dashboard (all single charts, no "|")
F1 = chart_map2.properties(
    width=260,
    height=200,
    title="Q1 – Cancellations by State (Map)"
)

F2 = chart_bar.properties(
    width=260,
    height=200,
    title="Q1 – Top 10 States by Cancellations"
)

F3 = chart_q2.properties(
    width=260,
    height=200,
    title="Q2 – Top Institutions by # Cancellations"
)

F4 = chart_q3.properties(
    width=260,
    height=200,
    title="Q3 – Top Institutions by Budget Loss"
)

F5 = chart_q4_hist.properties(
    width=260,
    height=200,
    title="Q4 – Flagged Words per Grant"
)

F6 = chart_q4_words.properties(
    width=260,
    height=200,
    title="Q4 – Top Flagged Words in Cancelled Grants"
)

F7 = left_panel.properties(
    width=260,
    height=200,
    title="Q5 – Grants by Cruz List Status"
)

F8 = right_panel.properties(
    width=260,
    height=200,
    title="Q5 – Overall Totals"
)

# One big dashboard: 4 columns x 2 rows
final_dashboard = (
    alt.concat(
        F1, F2, F3, F4,
        F5, F6, F7, F8,
        columns=4,
    )
    .properties(
        title="NSF Grant Cancellations — Final Overview (Q1–Q5)"
    )
    .configure_view(strokeWidth=0)
    .configure_axis(
        grid=True,
        gridColor=GRID,
        gridOpacity=0.5,
        domainColor=GRID,
        tickColor=GRID,
        labelColor=DARK_TEXT,
        titleColor=DARK_TEXT,
    )
    .configure_concat(
    spacing=40
    )
)

final_dashboard
