In [None]:
# This code is the major steps taken for the following:

# 1) Getting the raw IPEDS data as CSV
# 2) Getting the institution header data
# 3) Getting the CIP titles corresponding to the codes
# 4) Cleaning the data to a basic level
# 5) Joining in the title data and the CIP code descriptions
# 6) Filtering just to the Big 10 schools. Outputting the research dataset for Big 10

In [40]:
import pandas as pd
import numpy as np
from pathlib import Path

pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 20)


In [34]:
# functions used elsewhere

def format_cipcode(val):
    if pd.isna(val):
        return pd.NA

    # If numeric, format to 4 decimal places (CIP6 standard formatting)
    try:
        f = float(val)
        s = f"{f:.4f}"  # e.g., 3.0104 -> "3.0104"
    except (ValueError, TypeError):
        s = str(val).strip()
        if s == "" or s.lower() == "nan":
            return pd.NA

    left, *rest = s.split(".", 1)
    left = left.zfill(2)
    if rest:
        right = (rest[0] + "0000")[:4]  # pad/right-trim to 4 digits
        return f"{left}.{right}"
    return left

In [41]:
# Read in CSV data

DATA_PATH = Path("..") / "data" / "raw" / "ipeds" / "C2024_A.csv"

df = pd.read_csv(DATA_PATH, low_memory=False)

#Rows and columns
df.shape

(307707, 64)

In [6]:
HD_PATH = Path("..") / "data" / "raw" / "ipeds" / "HD2024.csv"

hd = pd.read_csv(HD_PATH, low_memory=False)

hd.shape

(6072, 72)

In [45]:
CIP_PATH = Path("..") / "data" / "raw" / "ipeds" / "CIPCode2020.csv"
cip_raw = pd.read_csv(CIP_PATH, low_memory=False)
cip_raw.columns.tolist()

['CIPFamily',
 'CIPCode',
 'Action',
 'TextChange',
 'CIPTitle',
 'CIPDefinition',
 'CrossReferences',
 'Examples']

In [81]:
#cip_raw.head(10)

# Normalize the codes and titles so joins will work later
cip_lookup = (
    cip_raw[["CIPCode", "CIPTitle"]]
    .rename(columns={
        "CIPCode": "cipcode",
        "CIPTitle": "cip_title"
    })
    .copy()
)

cip_lookup["cipcode"] = (
    cip_lookup["cipcode"]
    .astype("string")
    .str.strip()
)

# String the leading =" and trailing ". This is from the coding of the CSV
cip_lookup["cipcode"] = (
    cip_lookup["cipcode"]
    .astype("string")
    .str.strip()
    .str.replace(r'^="?|"$', '', regex=True)
)

# Clean string in title
cip_lookup["cip_title"] = cip_lookup["cip_title"].astype("string").str.strip()

# CIP2 lookup (2-digit family)
cip_lookup["cip2"] = cip_lookup["cipcode"].str.split(".", n=1).str[0].str.zfill(2)

# CIP6 tells you what exact program; CIP2 tells you what broad field.

# Identify rows that are exactly 2 digits (e.g., "01", "03", "52")
cip2_only = cip_lookup[cip_lookup["cipcode"].str.fullmatch(r"\d{2}", na=False)].copy()

cip2_lookup = (
    cip2_only[["cipcode", "cip_title"]]
    .rename(columns={"cipcode": "cip2", "cip_title": "cip2_title"})
    .sort_values("cip2")
)


# Optional: CIP6 lookup (exact code)  - Program/Major level - likely too detailed for this analysis, but just here optionally
cip6_lookup = (
    cip_lookup[["cipcode", "cip_title"]]
    .dropna()
    .drop_duplicates()
    .rename(columns={"cip_title": "cip6_title"})
)

cip6_lookup = cip6_lookup[cip6_lookup["cipcode"].str.fullmatch(r"\d{2}\.\d{4}", na=False)].copy()


#cip_lookup.head(20)


In [None]:
# Get just the header columns we need
hd_small = hd[["UNITID", "INSTNM"]].copy()

#hd_small.head()


Unnamed: 0,UNITID,INSTNM
0,100654,Alabama A & M University
1,100663,University of Alabama at Birmingham
2,100690,Amridge University
3,100706,University of Alabama in Huntsville
4,100724,Alabama State University


In [None]:
# Left join the institution names to the main dataset

df_with_names = df.merge(
    hd_small,
    on="UNITID",
    how="left"
)

# df_with_names.shape


(307707, 65)

In [None]:
bigten_unitids = [
    145637,  # University of Illinois Urbana-Champaign
    151351,  # Indiana University-Bloomington
    153658,  # University of Iowa
    163286,  # University of Maryland-College Park
    170976,  # University of Michigan-Ann Arbor
    171100,  # Michigan State University
    174066,  # University of Minnesota-Twin Cities
    181464,  # University of Nebraska-Lincoln
    147767,  # Northwestern University
    204796,  # Ohio State University-Main Campus
    214777,  # Pennsylvania State University-Main Campus
    243780,  # Purdue University-Main Campus
    186380,  # Rutgers University-New Brunswick
    240444,  # University of Wisconsin-Madison
    110662,  # University of California-Los Angeles (UCLA)
    123961,  # University of Southern California (USC)
    209551,  # University of Oregon
    236948,  # University of Washington-Seattle Campus
]

# Make df with just Big 10 unit IDs
df_bigten = pd.DataFrame(
    {"UNITID": bigten_unitids}
)




In [None]:
# Inner join with the df_bigten to get the dataset for just the Big 10 schools

df_bigten_named = (
    df_with_names[["UNITID", "INSTNM"]]
    .drop_duplicates()
    .merge(df_bigten, on="UNITID", how="inner")
    .sort_values("INSTNM")
)

#df_bigten_named.head(20)



#df_bigten_named.to_csv(out_path, index=False)

Unnamed: 0,UNITID,INSTNM
4,151351,Indiana University-Bloomington
8,171100,Michigan State University
3,147767,Northwestern University
12,204796,Ohio State University-Main Campus
14,214777,Pennsylvania State University-Main Campus
17,243780,Purdue University-Main Campus
11,186380,Rutgers University-New Brunswick
0,110662,University of California-Los Angeles
2,145637,University of Illinois Urbana-Champaign
5,153658,University of Iowa


In [None]:
# Save the named Big 10 dataset as csv for later access in repo (processed)

#out_path = Path("..") / "data" / "processed" / "ipeds" / "bigten_unitids_named.csv"
#out_path.parent.mkdir(parents=True, exist_ok=True)

#df_bigten_named.to_csv(out_path, index=False)

#assert df_bigten_named.shape == (18, 2)


In [28]:
# Build the big 10 filter set

bigten_set = set(df_bigten_named["UNITID"].astype(int))
len(bigten_set)


18

In [None]:
# Create the Big 10 Research dataset and rename columns friendlier names

research = (
    df_with_names
    .loc[df_with_names["UNITID"].isin(bigten_set), ["UNITID", "INSTNM", "CIPCODE", "MAJORNUM", "AWLEVEL", "CTOTALT"]]
    .rename(columns={
        "UNITID": "unitid",
        "INSTNM": "institution",
        "CIPCODE": "cipcode",
        "MAJORNUM": "major_number",
        "AWLEVEL": "award_level_code",
        "CTOTALT": "award_count_total",
    })
    .copy()
)

#research.head()


Unnamed: 0,unitid,institution,cipcode,major_number,award_level_code,award_count_total
17963,110662,University of California-Los Angeles,3.0104,1,5,126
17964,110662,University of California-Los Angeles,3.0104,1,17,4
17965,110662,University of California-Los Angeles,4.0201,1,5,17
17966,110662,University of California-Los Angeles,4.0201,1,7,1
17967,110662,University of California-Los Angeles,4.0201,1,17,0


In [70]:
# Convert institution to pandas string dtype (cleaner than object)
research["institution"] = research["institution"].astype("string")

# Ensure numeric codes are numeric (already are, but this makes intent explicit)
for col in ["unitid", "major_number", "award_level_code", "award_count_total"]:
    research[col] = pd.to_numeric(research[col], errors="coerce").astype("Int64")

In [71]:
#Convert the cipcode to a string preserving leading 0's
#research["cipcode"] = research["cipcode"].apply(format_cipcode).astype("string")

research[["cipcode"]].head(10)

#research.info()

'''
CIP Code format, represents "family" (broad discipline) followed by . (separator) and the program code

CIP Code	Meaning
03.0104	Natural Resources → Environmental Science
04.0201	Architecture & Related Services → Architecture
04.0301	Architecture & Related Services → City / Urban Planning
05.0101	Area, Ethnic, Cultural Studies

'''

'\nCIP Code format, represents "family" (broad discipline) followed by . (separator) and the program code\n\nCIP Code\tMeaning\n03.0104\tNatural Resources → Environmental Science\n04.0201\tArchitecture & Related Services → Architecture\n04.0301\tArchitecture & Related Services → City / Urban Planning\n05.0101\tArea, Ethnic, Cultural Studies\n\n'

In [72]:
# Add the corresponding labels for the award level codes

award_level_labels = {
    5: "Bachelors",
    7: "Masters",
    17: "Doctoral (Research/Scholarship)",
}

# Map the labels
research["award_level_name"] = research["award_level_code"].map(award_level_labels).astype("string")

# Make a new grouping to distinguish grad from undergrad (that's the category we'll probably be using)
research["degree_group"] = pd.Series(pd.NA, index=research.index, dtype="string")
research.loc[research["award_level_code"] == 5, "degree_group"] = "Bachelors"
research.loc[research["award_level_code"].isin([7, 17]), "degree_group"] = "Graduate"

#research.head(20)

In [73]:
research[["award_level_code", "award_level_name", "degree_group"]].drop_duplicates().sort_values("award_level_code")


Unnamed: 0,award_level_code,award_level_name,degree_group
75673,2,,
82586,3,,
106076,4,,
17963,5,Bachelors,Bachelors
37584,6,,
17966,7,Masters,Graduate
18016,8,,
17964,17,Doctoral (Research/Scholarship),Graduate
18123,18,,
37094,19,,


In [74]:
# ----------------------------
# 2) Normalize research keys and join CIP titles
# ----------------------------
research = research.copy()
research["cipcode"] = research["cipcode"].astype("string").str.strip()
research["cip2"] = research["cipcode"].str.split(".", n=1).str[0].str.zfill(2)

research_final = (
    research
    .merge(cip2_lookup, on="cip2", how="left")
    .merge(cip6_lookup, on="cipcode", how="left")
)

In [75]:
# ----------------------------
# 4) Final column order + validation
# ----------------------------
final_cols = [
    "unitid",
    "institution",
    "cip2",
    "cip2_title",
    "cipcode",
    "cip6_title",
    "major_number",
    "award_level_code",
    "award_level_name",
    "degree_group",
    "award_count_total",
]
final_cols = [c for c in final_cols if c in research_final.columns]
research_final = research_final[final_cols].copy()

In [None]:
#Handle missing cip 2 codes

missing_cip2 = (
    research_final.loc[research_final["cip2_title"].isna(), "cip2"]
    .value_counts()
    .sort_index()
)
missing_cip2


cip2
99    150
Name: count, dtype: int64

In [None]:
# Join the lookup tables for CIP codes types 2 and 6
research_final = (
    research
    .merge(cip2_lookup, on="cip2", how="left")
    .merge(cip6_lookup, on="cipcode", how="left")
)


# Missing values in title for category 99 - no corresponding classification
research_final["cip2_title"] = research_final["cip2_title"].fillna("99 - Unclassified / Not in CIP taxonomy")


# Sanity checks
print("Missing cip2_title:", research_final["cip2_title"].isna().sum())
print("Missing cip6_title:", research_final["cip6_title"].isna().sum())


Missing cip2_title: 0
Missing cip6_title: 1589


In [None]:
# there are missing CIP6 titles because not all of them are 6 digit fine-grained. These ones are not part of CIP6

missing_cip6 = (
    research_final.loc[research_final["cip6_title"].isna(), "cip2"]
    .value_counts()
    .sort_index()
)
missing_cip6


cip2
01    288
03    156
04    115
05    371
09    215
13     25
15      2
16    121
22     13
26     37
30      9
40      1
44      4
50     20
51     61
52      1
99    150
Name: count, dtype: int64

In [None]:

# Flag for missing CIP6 title. These should not be considered if you are looking at that level, which we probably won't
research_final["is_cip6"] = research_final["cipcode"].str.fullmatch(r"\d{2}\.\d{4}", na=False)


In [95]:
# Fill the irrelevant CIP6 titles
research_final["cip6_title"] = research_final["cip6_title"].fillna(
    "Not a CIP6 program (aggregate / unclassified)"
)



In [96]:
# Guardrails (fail fast)
assert research_final["unitid"].nunique() == 18
assert research_final.loc[research_final["cip2"] != "99", "cip2_title"].isna().sum() == 0
assert research_final.loc[research_final["is_cip6"], "cip6_title"].isna().sum() == 0


In [97]:


out_path = (
    Path("..")
    / "data"
    / "processed"
    / "ipeds"
    / "research_bigten_completions_2024.csv"
)

out_path.parent.mkdir(parents=True, exist_ok=True)
research_final.to_csv(out_path, index=False)

print("Exported:", out_path)
print("Rows:", research_final.shape[0])
print("Columns:", research_final.shape[1])




Exported: ..\data\processed\ipeds\research_bigten_completions_2024.csv
Rows: 8880
Columns: 12
