In [15]:
import pandas as pd
import os

# Correct directory path
data_dir = "data"

files = {
    "2019": "c2019_a_rv.csv",
    "2020": "c2020_a_rv.csv",
    "2021": "c2021_a_rv.csv",
    "2022": "c2022_a_rv.csv",
    "2023": "C2023_a_RV.csv",   # exact casing from folder
    "degree_field": "degree_field.csv"
}

dfs = {}

for year, filename in files.items():
    path = os.path.join(data_dir, filename)
    print("Looking for:", path)

    df = pd.read_csv(path, low_memory=False)
    dfs[year] = df

    print(f"Loaded {filename}  Shape = {df.shape}")


Looking for: data/c2019_a_rv.csv
Loaded c2019_a_rv.csv  Shape = (290972, 64)
Looking for: data/c2020_a_rv.csv
Loaded c2020_a_rv.csv  Shape = (289833, 64)
Looking for: data/c2021_a_rv.csv
Loaded c2021_a_rv.csv  Shape = (296365, 64)
Looking for: data/c2022_a_rv.csv
Loaded c2022_a_rv.csv  Shape = (301055, 64)
Looking for: data/C2023_a_RV.csv
Loaded C2023_a_RV.csv  Shape = (303460, 64)
Looking for: data/degree_field.csv
Loaded degree_field.csv  Shape = (15, 16)


In [16]:
# Combine ACS years into a single dataframe
acs_years = ["2019", "2020", "2021", "2022", "2023"]

acs_list = []
for year in acs_years:
    df = dfs[year].copy()
    df["YEAR"] = int(year)  # add year column
    acs_list.append(df)

acs = pd.concat(acs_list, ignore_index=True)

print("Combined ACS shape:", acs.shape)
acs.head()


Combined ACS shape: (1481685, 66)


Unnamed: 0,UNITID,CIPCODE,MAJORNUM,AWLEVEL,XCTOTALT,CTOTALT,XCTOTALM,CTOTALM,XCTOTALW,CTOTALW,XCAIANT,CAIANT,XCAIANM,CAIANM,XCAIANW,CAIANW,XCASIAT,CASIAT,XCASIAM,CASIAM,XCASIAW,CASIAW,XCBKAAT,CBKAAT,XCBKAAM,CBKAAM,XCBKAAW,CBKAAW,XCHISPT,CHISPT,XCHISPM,CHISPM,XCHISPW,CHISPW,XCNHPIT,CNHPIT,XCNHPIM,CNHPIM,XCNHPIW,CNHPIW,XCWHITT,CWHITT,XCWHITM,CWHITM,XCWHITW,CWHITW,XC2MORT,C2MORT,XC2MORM,C2MORM,XC2MORW,C2MORW,XCUNKNT,CUNKNT,XCUNKNM,CUNKNM,XCUNKNW,CUNKNW,XCNRALT,CNRALT,XCNRALM,CNRALM,XCNRALW,CNRALW,YEAR,CNRALW.1
0,100654,1.0999,1,5,R,6,Z,0,R,6,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,R,5,Z,0,R,5,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,R,1,Z,0,R,1,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0.0,2019,
1,100654,1.1001,1,5,R,7,R,2,R,5,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,R,7,R,2,R,5,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0.0,2019,
2,100654,1.1001,1,7,R,8,R,2,R,6,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,R,7,R,2,R,5,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,R,1,Z,0,R,1.0,2019,
3,100654,1.1001,1,17,R,2,Z,0,R,2,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,R,2,Z,0,R,2,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0.0,2019,
4,100654,1.9999,1,5,R,2,R,1,R,1,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,R,1,Z,0,R,1,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,R,1,R,1,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0,Z,0.0,2019,


In [18]:
degree_field = dfs["degree_field"]

print("Degree field file shape:", degree_field.shape)
degree_field.head()

Degree field file shape: (15, 16)


Unnamed: 0,YEAR,SAMPLE,SERIAL,CBSERIAL,HHWT,CLUSTER,STRATA,GQ,PERNUM,PERWT,EDUC,EDUCD,DEGFIELD,DEGFIELDD,DEGFIELD2,DEGFIELD2D
0,2013,201301,287071,485119,85.0,2013002870711,110512,1,1,85.0,10,101,62,6201,62,6201
1,2013,201301,1312870,1031177,155.0,2013013128701,1160353,1,1,154.0,10,101,62,6201,62,6201
2,2014,201401,81285,155141,57.0,2014000812851,810606,1,1,57.0,10,101,62,6201,62,6201
3,2015,201501,908047,511723,62.0,2015009080471,120637,1,1,63.0,10,101,62,6201,62,6201
4,2016,201601,595336,67618,84.0,2016005953361,330225,1,1,84.0,11,114,62,6201,62,6201


In [25]:
acs.columns.tolist()
[col for col in acs.columns if "deg" in col.lower()]
[col for col in acs.columns if "field" in col.lower()]
[col for col in acs.columns if "fod" in col.lower()]


[]