In [21]:
import pandas as pd
import numpy as np
from pathlib import Path

# Show frames in full in the notebook: no row/column/cell-width truncation and no
# wrapping of wide frames. Caution: rendering a very large frame this way can
# freeze the notebook, so slice with .head() before displaying big results.
PANDAS_DISPLAY_OPTIONS = {
    "display.max_rows": None,
    "display.max_columns": None,
    "display.max_colwidth": None,
    "display.width": 200,
    "display.expand_frame_repr": False,
}
for option_name, option_value in PANDAS_DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)


# Load the research dataset (already prepared from the raw IPEDS data) from the
# processed-data folder, located relative to this notebook.
DATA_PATH = Path("..", "data", "processed", "ipeds", "research_bigten_completions_2024.csv")

# Identifier / code columns are read as text rather than letting pandas infer a dtype.
string_columns = {"unitid": str, "cipcode": str, "cip2": str}
df = pd.read_csv(DATA_PATH, dtype=string_columns, low_memory=False)

# Report the size of the loaded frame
n_rows, n_cols = df.shape
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_cols}")

Number of rows: 8880
Number of columns: 12


In [None]:
# Ad-hoc sanity checks kept for reference; uncomment one at a time to inspect.

#df.loc[df["cip2_title"].str.contains(",", na=False), "cip2_title"].head(20)

#df.head(10)

# UofI rows only
#uiuc_df = df.loc[df["institution"] == "University of Illinois Urbana-Champaign"]
#uiuc_df.head(10)

#print(df["cip2"].dtype)
#print(df["cipcode"].dtype)
#print(df["unitid"].dtype)

# Check that every row has a cip2 value
#print(df["cip2"].isna().sum())

# Check that every cip2 value is two characters long and has exactly one cip2 title
#df.loc[df["cip2"].str.len() != 2, "cip2"].value_counts(dropna=False)

#df.groupby(["cip2", "cip2_title"]).size().sort_index()

#df.groupby("cip2")["cip2_title"].nunique().sort_values(ascending=False)

# Check that each institution has just one unitid and that these are the right Big Ten institutions
#df.groupby("institution")["unitid"].nunique().sort_values(ascending=False)

# Check that award level codes and award level titles align (codes are only 5, 7, 17)
#df.groupby(["award_level_code", "award_level_name", "degree_group"]).size().sort_index()

#df.groupby("award_level_code")["award_level_name"].nunique().sort_values(ascending=False)

# Row integrity: is each row uniquely defined by institution, CIP/CIP2, award level
# and major number?
# Each column name needs its own comma. Python silently concatenates adjacent string
# literals, which would turn the last two entries into one non-existent column name
# and make the groupby below fail with a KeyError.
key_cols = [
    "unitid",
    "institution",
    "cipcode",
    "cip2",
    "cip2_title",
    "award_level_code",
    "major_number",
]

# Count rows per key combination. dropna=False keeps rows whose key contains a missing
# value; by default groupby drops them, which would hide them from this check.
group_counts = (
    df.groupby(key_cols, dropna=False)
    .size()
    .reset_index(name="row_count")
)
#group_counts.head(50)

# Key combinations that occur on more than one row, worst offenders first
dupe_keys = group_counts.loc[group_counts["row_count"] > 1].sort_values("row_count", ascending=False)

# Number of duplicated key combinations; 0 means the key columns uniquely identify a row
dupe_keys.shape[0]


# Alternative check: number of rows that repeat an earlier row's key
#df[key_cols].duplicated().sum()

np.int64(0)

In [48]:
# Totals by institution
# Question: what is the total number of degrees awarded by each institution in this dataset?

# Earlier version, summing over every row regardless of major number:
#institution_totals = df.groupby(["institution","award_level_name"])["award_count_total"].sum()
#institution_totals

# Keep only rows where major_number is 1, then sum award counts per
# institution and award level.
first_major_rows = df.loc[df["major_number"].eq(1)]
institution_totals_major1 = first_major_rows.groupby(
    ["institution", "award_level_name"]
)["award_count_total"].sum()
institution_totals_major1



institution                                award_level_name               
Indiana University-Bloomington             Bachelors                          14680
                                           Doctoral (Research/Scholarship)      876
                                           Masters                             6500
Michigan State University                  Bachelors                          17514
                                           Doctoral (Research/Scholarship)     1110
                                           Masters                             4158
Northwestern University                    Bachelors                           4320
                                           Doctoral (Research/Scholarship)     1072
                                           Masters                             9774
Ohio State University-Main Campus          Bachelors                          23240
                                           Doctoral (Research/Scholarship)     1764
 

In [55]:
# Restrict to rows with major_number 1, award level "Bachelors", and is_cip6 True,
# then total the award counts per institution.
bachelors_cip6_mask = (
    df["major_number"].eq(1)
    & df["award_level_name"].eq("Bachelors")
    & df["is_cip6"].eq(True)
)
df_inst = df.loc[bachelors_cip6_mask]

# NOTE: this reuses (and overwrites) the name institution_totals_major1.
grouped_awards = df_inst.groupby(["institution", "award_level_name"])["award_count_total"]
institution_totals_major1 = grouped_awards.sum()
institution_totals_major1


institution                                award_level_name
Indiana University-Bloomington             Bachelors            6703
Michigan State University                  Bachelors            7342
Northwestern University                    Bachelors            1813
Ohio State University-Main Campus          Bachelors           10365
Pennsylvania State University-Main Campus  Bachelors            9492
Purdue University-Main Campus              Bachelors            8332
Rutgers University-New Brunswick           Bachelors            7736
University of California-Los Angeles       Bachelors            9024
University of Illinois Urbana-Champaign    Bachelors            7269
University of Iowa                         Bachelors            4687
University of Maryland-College Park        Bachelors            7337
University of Michigan-Ann Arbor           Bachelors            7894
University of Minnesota-Twin Cities        Bachelors            6485
University of Nebraska-Lincoln             

In [56]:
# Inspect the rows flagged as not being CIP6 programs (is_cip6 is False).
# NOTE(review): the saved output for this cell lists codes such as 3.0104 and 4.0201
# with is_cip6 False — presumably leading zeros were lost before the flag was derived;
# confirm against the dataset-preparation step.
not_cip6_mask = df["is_cip6"].eq(False)
not_CIP6_df = df.loc[not_cip6_mask]
not_CIP6_df.head(n=20)

Unnamed: 0,unitid,institution,cipcode,major_number,award_level_code,award_count_total,award_level_name,degree_group,cip2,cip2_title,cip6_title,is_cip6
0,110662,University of California-Los Angeles,3.0104,1,5,126,Bachelors,Bachelors,3,NATURAL RESOURCES AND CONSERVATION.,Not a CIP6 program (aggregate / unclassified),False
1,110662,University of California-Los Angeles,3.0104,1,17,4,Doctoral (Research/Scholarship),Graduate,3,NATURAL RESOURCES AND CONSERVATION.,Not a CIP6 program (aggregate / unclassified),False
2,110662,University of California-Los Angeles,4.0201,1,5,17,Bachelors,Bachelors,4,ARCHITECTURE AND RELATED SERVICES.,Not a CIP6 program (aggregate / unclassified),False
3,110662,University of California-Los Angeles,4.0201,1,7,1,Masters,Graduate,4,ARCHITECTURE AND RELATED SERVICES.,Not a CIP6 program (aggregate / unclassified),False
4,110662,University of California-Los Angeles,4.0201,1,17,0,Doctoral (Research/Scholarship),Graduate,4,ARCHITECTURE AND RELATED SERVICES.,Not a CIP6 program (aggregate / unclassified),False
5,110662,University of California-Los Angeles,4.0201,2,5,3,Bachelors,Bachelors,4,ARCHITECTURE AND RELATED SERVICES.,Not a CIP6 program (aggregate / unclassified),False
6,110662,University of California-Los Angeles,4.0301,1,7,81,Masters,Graduate,4,ARCHITECTURE AND RELATED SERVICES.,Not a CIP6 program (aggregate / unclassified),False
7,110662,University of California-Los Angeles,4.0301,1,17,3,Doctoral (Research/Scholarship),Graduate,4,ARCHITECTURE AND RELATED SERVICES.,Not a CIP6 program (aggregate / unclassified),False
8,110662,University of California-Los Angeles,4.0902,1,7,68,Masters,Graduate,4,ARCHITECTURE AND RELATED SERVICES.,Not a CIP6 program (aggregate / unclassified),False
9,110662,University of California-Los Angeles,5.0101,1,7,6,Masters,Graduate,5,"AREA, ETHNIC, CULTURAL, GENDER, AND GROUP STUDIES.",Not a CIP6 program (aggregate / unclassified),False
