In [5]:
from pathlib import Path
import pandas as pd
import numpy as np
import json

# Methodology note (the canonical text that gets exported is METHODOLOGY_NOTE in the Config section):
# Counts are based on IPEDS completions data aggregated at the CIP2 level and may differ slightly
# from institution-published figures due to reporting windows and classification standards.
# (Kept as a real comment: a bare triple-quoted string here is an expression statement that is
# evaluated and discarded, not a docstring.)

# Settings to make sure outputs in the Jupyter window aren't abbreviated
# (but displaying large outputs CAN freeze the notebook, so watch it).
pd.set_option("display.max_rows", None)         # no cap on displayed rows
pd.set_option("display.max_columns", None)      # no cap on displayed columns
pd.set_option("display.max_colwidth", None)     # never truncate cell contents
pd.set_option("display.width", 200)             # display width, in characters
pd.set_option("display.expand_frame_repr", False)  # do not wrap wide frames across multiple lines


# ----------------------------------
# Config
# ----------------------------------

# Input workbook and output folder, both resolved relative to the notebook's working directory.
EXCEL_PATH = Path("..", "data", "raw", "ipeds", "Big10_CIP2_2024_20251227.xlsx")
OUT_DIR = Path("..", "data", "processed", "ipeds")

# Create the output folder (and any missing parents) up front; a no-op when it already exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Caveat text that is written into the "meta" section of the exported JSON.
METHODOLOGY_NOTE = (
    "Counts are based on IPEDS completions data aggregated at the CIP2 level and may differ slightly "
    "from institution-published figures due to reporting windows and classification standards."
)



In [6]:
# ----------------------------
# 1) Load
# ----------------------------

# Read identifier-like columns as text so they are never parsed as numbers
# (numeric parsing would drop leading zeros, e.g. a CIP code "01" would become 1).
#
# A dtype key is only applied to a column whose header matches it exactly. In this
# extract the CIP column is headed "C2024_A.CIP Code -  2020 Classification" (note the
# double space); it is only renamed to "cipCode" in the cleaning step. The legacy
# "cipcode" key is kept so a sheet that does use that header is still read as text.
df = pd.read_excel(
    EXCEL_PATH,
    sheet_name="Data",
    dtype={
        "unitid": str,
        "year": str,
        "cipcode": str,
        "C2024_A.CIP Code -  2020 Classification": str,
    },
)

In [7]:
# -----------------------------------------------
# 2) Standardize column names and basic cleaning
# -----------------------------------------------

# CIP codes arrive wrapped in single quotes; remove them from both ends.
quoted_cip = df["C2024_A.CIP Code -  2020 Classification"]
df["C2024_A.CIP Code -  2020 Classification"] = quoted_cip.str.strip("'")

# Identifier/text columns: cast to pandas' nullable string dtype and trim surrounding whitespace.
for text_col in ("unitid", "institution name", "year"):
    df[text_col] = df[text_col].astype("string").str.strip()

# Completions must be numeric: anything unparseable becomes NaN, NaN is counted as 0,
# and the result is stored as plain integers.
parsed_totals = pd.to_numeric(df["C2024_A.Grand total"], errors="coerce")
df["C2024_A.Grand total"] = parsed_totals.fillna(0).astype(int)

# Reader-friendly camelCase names for the raw IPEDS headers.
friendly_names = {
    "unitid": "unitId",
    "institution name": "institution",
    "year": "year",
    "C2024_A.First or Second Major": "majorNumber",
    "C2024_A.CIP Code -  2020 Classification": "cipCode",
    "CipTitle": "cipTitle",
    "C2024_A.Award Level code": "awardLevel",
    "C2024_A.Grand total": "totalCompletions",
    "IDX_C": "indexCode",
}
df = df.rename(columns=friendly_names)




In [8]:
# ----------------------------
# 3) Degree Group mapping
# ----------------------------

# Collapse the detailed award-level labels into broader degree groups.
# "Graduate" covers master's degrees and research/scholarship doctorates;
# professional-practice and "other" doctorates get a separate group so they can be
# told apart from it (grad vs. undergrad is the split we'll probably be using).
award_to_group = {
    "Bachelor's degree": "Bachelors",
    "Master's degree": "Graduate",
    "Doctor's degree - research/scholarship": "Graduate",
    "Associate's degree": "Associates",
    "Doctor's degree - professional practice": "Graduate-Doctoral Professional/Other",
    "Doctor's degree - other": "Graduate-Doctoral Professional/Other",
}

# Award levels missing from the lookup come back as NA; keep them explicit as "Other"
# rather than leaving nulls.
df["degreeGroup"] = (
    df["awardLevel"]
    .map(award_to_group)
    .astype("string")
    .fillna("Other")
)

In [9]:
# -------------------------------------------------
# 4) Exclude CIP2 = '99' (totals/unclassified rows) and select only Bachelor's and Graduate degree groups
# -------------------------------------------------
kept_degree_groups = ["Bachelors", "Graduate"]

# Build each condition separately so either one can be inspected on its own.
not_cip99 = df["cipCode"] != "99"
in_kept_group = df["degreeGroup"].isin(kept_degree_groups)

# .copy() detaches the result from the unfiltered frame.
df = df.loc[not_cip99 & in_kept_group].copy()


#Rows and columns
#print(f"Number of rows: {len(df)}")
#print(f"Number of columns: {len(df.columns)}")


In [None]:
#--------------------------------------------------
# Exclude
#--------------------------------------------------

In [19]:
# Sanity check
#df_no99_major1.info()

#df_no99_major1["CIP Code"].unique()

In [11]:
# Sanity check for duplicates - Should be none

#key_cols = ["Institution", "Award Level", "Degree Group", "CIP Code", "Major Number"]

#dupes = (
#    df_no99
#    .groupby(key_cols)
#    .size()
#    .reset_index(name="Row_Count")
#    .query("Row_Count > 1")
#)

#dupes



In [None]:
# Check unique institutions totals for major 1

#inst_totals_major_1 = (
#    df
#    .loc[df["Major Number"] == "First major"]
#    .groupby(["Institution", "Degree Group", "Award Level","Major Number"])["Total Completions"]
#    .sum()
#    .reset_index()
#    .sort_values(["Institution","Degree Group", "Award Level", "Major Number"])
#)

#inst_totals_major_1


In [10]:
# ----------------------------
# 5) Treemap-ready aggregation
#    (Front-end should not have to group large raw extracts)
#    Grain: institution × year × major × degree group × award level × cip2
# ----------------------------
group_cols = [
    "unitId",
    "institution",
    "year",
    "majorNumber",
    "degreeGroup",
    "awardLevel",
    "cipCode",
    "cipTitle",
]

# dropna=False keeps rows whose grouping keys are missing instead of silently dropping them.
completions_by_group = df.groupby(group_cols, dropna=False)["totalCompletions"].sum()

# Turn the grouping keys back into ordinary columns, then order rows for readable output.
treemap_df = completions_by_group.reset_index().sort_values(
    ["institution", "year", "majorNumber", "degreeGroup", "awardLevel", "cipCode"]
)

In [11]:
# ----------------------------
# 6) Quality checks (warn only - nothing here stops the export)
# ----------------------------
# Expect Big Ten count (there are 18 schools in the list)
expected_institutions = 18
inst_count = treemap_df["unitId"].nunique()
if inst_count < expected_institutions:
    print(f"Warning: institution count looks low ({inst_count}). Confirm the extract includes all Big Ten schools.")

# CIP2 codes should be exactly two digits; missing values count as "not two digits" (na=False).
is_two_digit = treemap_df["cipCode"].str.fullmatch(r"\d{2}", na=False)
cip2_bad = treemap_df.loc[~is_two_digit, "cipCode"].unique()
if len(cip2_bad) > 0:
    # Show at most ten distinct offending values.
    print("Warning: Some cip2 values are not 2 digits. Sample:", cip2_bad[:10])

In [12]:
# ----------------------------
# 7) Export
# ----------------------------
csv_path = OUT_DIR / "big10_cip2_2024_treemap.csv"
json_path = OUT_DIR / "big10_cip2_2024_treemap.json"

treemap_df.to_csv(csv_path, index=False)

# Build the records through pandas' own JSON writer and parse them back into plain
# Python objects. This turns missing values (NaN / pd.NA, which the aggregation keeps
# on purpose via dropna=False) into None -> JSON null. Passing to_dict() output straight
# to json.dump would raise TypeError on pd.NA and would write NaN as a bare `NaN`
# token, which is not valid JSON.
records = json.loads(treemap_df.to_json(orient="records"))

payload = {
    "meta": {
        "source": "IPEDS Completions (custom extract)",
        "methodology_note": METHODOLOGY_NOTE,
        "generated_from": str(EXCEL_PATH),
        "rows": int(len(treemap_df)),
        "institutions": int(treemap_df["unitId"].nunique()),
        "years": sorted(treemap_df["year"].dropna().unique().tolist()),
    },
    "data": records,
}

with open(json_path, "w", encoding="utf-8") as f:
    # allow_nan=False: refuse to emit NaN/Infinity tokens so the file is always strict JSON.
    json.dump(payload, f, ensure_ascii=False, allow_nan=False)

print("Wrote:")
print(" -", csv_path)
print(" -", json_path)

Wrote:
 - ..\data\processed\ipeds\big10_cip2_2024_treemap.csv
 - ..\data\processed\ipeds\big10_cip2_2024_treemap.json


In [13]:
# Inspection only: show the column names of the cleaned, filtered frame (nothing is modified).
df.columns

Index(['unitId', 'institution', 'year', 'majorNumber', 'cipCode', 'cipTitle', 'awardLevel', 'totalCompletions', 'indexCode', 'degreeGroup'], dtype='object')