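This notebook stacks the county-level Decennial Census race tables for 1980, 1990, 2000, 2010, and 2020 into a single long table, then merges in the education data (population with a bachelor's degree or higher), which is loaded for 1980, 1990, and 2000.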

Raw data path: `GLOB~S/Data/U.S. County Data/county_controls/data/NHGIS_RACE_BY_COUNTY_1980_2020/`

In [35]:
import pandas as pd
import os
# Directory that holds the NHGIS county CSV extracts and the education workbook.
# The location is machine-specific, so it can be overridden by setting the
# SF1_PATH environment variable; when the variable is not set, the original
# hard-coded location is used, so existing behaviour is unchanged.
SF1_PATH = os.environ.get(
    "SF1_PATH",
    "/Users/koacow/BOSTON UNIVERSITY Dropbox/Ngoc Duy Khoa Cao/GLOB~S/Data/U.S. County Data/county_controls/data/NHGIS_RACE_BY_COUNTY_1980_2020",
)

In [36]:
# 1. STF1 1980: read the county extract and reduce it to the shared race-count layout
sf1_1980 = (
    pd.read_csv(
        os.path.join(SF1_PATH, "nhgis0002_ds104_1980_county.csv"),
        usecols=["YEAR", "C7L001","C9D001", "C9D002", "C9D006", "C9D007", "C9D008", "C9D009", "C9D010", "C9D011", "C9D012", "C9D013", "C9D014", "COUNTYA", "STATEA"],
    )
    # 1.1 Give the NHGIS codes the column names used by the other census years
    .rename(
        columns={
            "YEAR": "year",
            "C7L001": "total_pop",
            "C9D001": "white_pop",
            "C9D002": "black_pop",
        }
    )
    .assign(
        # 1.2 FIPS5 = 2-character zero-padded state code + 3-character zero-padded county code
        FIPS5=lambda df: df["STATEA"].astype(str).str.zfill(2)
        + df["COUNTYA"].astype(str).str.zfill(3),
        # 1.3 Asian population = row-wise total of the asian subcategory columns
        #     (skipna=False so a missing subcategory yields a missing total, as with `+`)
        asian_pop=lambda df: df[
            ["C9D006", "C9D007", "C9D008", "C9D009", "C9D010", "C9D011", "C9D012", "C9D013", "C9D014"]
        ].sum(axis=1, skipna=False),
    )
    # 1.4 Other population = total minus (white + black + asian)
    .assign(
        other_pop=lambda df: df["total_pop"]
        - (df["white_pop"] + df["black_pop"] + df["asian_pop"])
    )
    # 1.5 Keep only the columns shared by every census year, in a fixed order
    .loc[:, ["FIPS5", "year", "total_pop", "white_pop", "black_pop", "asian_pop", "other_pop"]]
)
sf1_1980

Unnamed: 0,FIPS5,year,total_pop,white_pop,black_pop,asian_pop,other_pop
0,01001,1980,32259,24814,7225,82,138
1,01003,1980,78556,65840,12048,96,572
2,01005,1980,24756,13693,11003,25,35
3,01007,1980,15723,12029,3675,11,8
4,01009,1980,36459,35761,598,18,82
...,...,...,...,...,...,...,...
3132,56037,1980,41723,39723,323,259,1418
3133,56039,1980,9355,9237,3,24,91
3134,56041,1980,13021,12781,17,17,206
3135,56043,1980,9496,8825,7,34,630


In [37]:
# 2. STF1 1990: read the county extract and reduce it to the shared race-count layout
sf1_1990 = (
    pd.read_csv(
        os.path.join(SF1_PATH, "nhgis0002_ds120_1990_county.csv"),
        usecols=["YEAR", "ET1001", "EUY001", "EUY002", "EUY004", "COUNTYA", "STATEA"],
    )
    # 2.1 FIPS5 = 2-character zero-padded state code + 3-character zero-padded county code
    .assign(
        FIPS5=lambda df: df["STATEA"].astype(str).str.zfill(2)
        + df["COUNTYA"].astype(str).str.zfill(3)
    )
    # 2.2 Give the NHGIS codes the column names used by the other census years
    .rename(
        columns={
            "ET1001": "total_pop",
            "EUY001": "white_pop",
            "EUY002": "black_pop",
            "EUY004": "asian_pop",
            "YEAR": "year",
        }
    )
    # 2.3 Other population = total minus (white + black + asian)
    .assign(
        other_pop=lambda df: df["total_pop"]
        - (df["white_pop"] + df["black_pop"] + df["asian_pop"])
    )
    # 2.4 Keep only the columns shared by every census year, in a fixed order
    .loc[:, ["FIPS5", "year", "total_pop", "white_pop", "black_pop", "asian_pop", "other_pop"]]
)
sf1_1990


Unnamed: 0,FIPS5,year,total_pop,white_pop,black_pop,asian_pop,other_pop
0,01001,1990,34222,27144,6845,120,113
1,01003,1990,98280,84565,12640,221,854
2,01005,1990,25417,14118,11194,44,61
3,01007,1990,16576,13052,3478,11,35
4,01009,1990,39248,38397,521,33,297
...,...,...,...,...,...,...,...
3136,56037,1990,38823,36564,289,254,1716
3137,56039,1990,11172,10989,17,49,117
3138,56041,1990,18705,18278,25,66,336
3139,56043,1990,8388,7864,14,41,469


In [38]:
# 3. SF1 2000: read the county extract and reduce it to the shared race-count layout
sf1_2000 = (
    pd.read_csv(
        os.path.join(SF1_PATH, "nhgis0002_ds146_2000_county.csv"),
        usecols=["YEAR", "FL5001", "FMR001", "FMR002", "FMR004", "COUNTYA", "STATEA"],
    )
    # 3.1 FIPS5 = 2-character zero-padded state code + 3-character zero-padded county code
    .assign(
        FIPS5=lambda df: df["STATEA"].astype(str).str.zfill(2)
        + df["COUNTYA"].astype(str).str.zfill(3)
    )
    # 3.2 Give the NHGIS codes the column names used by the other census years
    .rename(
        columns={
            "FL5001": "total_pop",
            "FMR001": "white_pop",
            "FMR002": "black_pop",
            "FMR004": "asian_pop",
            "YEAR": "year",
        }
    )
    # 3.3 Other population = total minus (white + black + asian)
    .assign(
        other_pop=lambda df: df["total_pop"]
        - (df["white_pop"] + df["black_pop"] + df["asian_pop"])
    )
    # 3.4 Keep only the columns shared by every census year, in a fixed order
    .loc[:, ["FIPS5", "year", "total_pop", "white_pop", "black_pop", "asian_pop", "other_pop"]]
)
sf1_2000


Unnamed: 0,FIPS5,year,total_pop,white_pop,black_pop,asian_pop,other_pop
0,01001,2000,43671,35221,7473,200,777
1,01003,2000,140415,122366,14444,537,3068
2,01005,2000,29038,14887,13451,84,616
3,01007,2000,20826,15966,4624,17,219
4,01009,2000,51024,48512,606,71,1835
...,...,...,...,...,...,...,...
3136,56037,2000,37613,34461,275,240,2637
3137,56039,2000,18251,17081,27,99,1044
3138,56041,2000,19742,18621,22,54,1045
3139,56043,2000,8289,7478,9,61,741


In [39]:
# 4. SF1 2010: read the county extract and reduce it to the shared race-count layout
sf1_2010 = (
    pd.read_csv(
        os.path.join(SF1_PATH, "nhgis0002_ds172_2010_county.csv"),
        usecols=["YEAR", "H7V001", "H7X002", "H7X003", "H7X005", "COUNTYA", "STATEA"],
    )
    # 4.1 FIPS5 = 2-character zero-padded state code + 3-character zero-padded county code
    .assign(
        FIPS5=lambda df: df["STATEA"].astype(str).str.zfill(2)
        + df["COUNTYA"].astype(str).str.zfill(3)
    )
    # 4.2 Give the NHGIS codes the column names used by the other census years
    .rename(
        columns={
            "H7V001": "total_pop",
            "H7X002": "white_pop",
            "H7X003": "black_pop",
            "H7X005": "asian_pop",
            "YEAR": "year",
        }
    )
    # 4.3 Other population = total minus (white + black + asian)
    .assign(
        other_pop=lambda df: df["total_pop"]
        - (df["white_pop"] + df["black_pop"] + df["asian_pop"])
    )
    # 4.4 Keep only the columns shared by every census year, in a fixed order
    .loc[:, ["FIPS5", "year", "total_pop", "white_pop", "black_pop", "asian_pop", "other_pop"]]
)
sf1_2010




Unnamed: 0,FIPS5,year,total_pop,white_pop,black_pop,asian_pop,other_pop
0,01001,2010,54571,42855,9643,474,1599
1,01003,2010,182265,156153,17105,1348,7659
2,01005,2010,27457,13180,12875,107,1295
3,01007,2010,22915,17381,5047,22,465
4,01009,2010,57322,53068,761,117,3376
...,...,...,...,...,...,...,...
3216,72145,2010,59662,46113,6843,74,6632
3217,72147,2010,9301,5456,2617,6,1222
3218,72149,2010,26073,21405,2212,11,2445
3219,72151,2010,37941,24888,5334,57,7662


In [40]:
# 5. 2020: read the county extract and reduce it to the shared race-count layout
sf1_2020 = (
    pd.read_csv(
        os.path.join(SF1_PATH, "nhgis0002_ds248_2020_county.csv"),
        usecols=["YEAR", "GEOID", "U7B001", "U7B003", "U7B004", "U7B006"],
    )
    # 5.1 Give the NHGIS codes the column names used by the other census years;
    #     GEOID becomes the (not yet trimmed) FIPS5 column
    .rename(
        columns={
            "YEAR": "year",
            "GEOID": "FIPS5",
            "U7B001": "total_pop",
            "U7B003": "white_pop",
            "U7B004": "black_pop",
            "U7B006": "asian_pop",
        }
    )
    # 5.2 Keep only the last 5 characters of the identifier, left-padding with zeros to width 5
    .assign(FIPS5=lambda df: df["FIPS5"].astype(str).str[-5:].str.zfill(5))
    # 5.3 Other population = total minus (white + black + asian)
    .assign(
        other_pop=lambda df: df["total_pop"]
        - (df["white_pop"] + df["black_pop"] + df["asian_pop"])
    )
    # 5.4 Keep only the columns shared by every census year, in a fixed order
    .loc[:, ["FIPS5", "year", "total_pop", "white_pop", "black_pop", "asian_pop", "other_pop"]]
)
sf1_2020

Unnamed: 0,FIPS5,year,total_pop,white_pop,black_pop,asian_pop,other_pop
0,01001,2020,58805,42160,11445,881,4319
1,01003,2020,231767,189399,18217,2067,22084
2,01005,2020,25223,11317,11933,117,1856
3,01007,2020,22293,16555,4413,32,1293
4,01009,2020,59134,50663,845,178,7448
...,...,...,...,...,...,...,...
3216,72145,2020,54414,10042,3491,28,40853
3217,72147,2020,8249,1551,1256,7,5435
3218,72149,2020,22093,3429,1380,10,17274
3219,72151,2020,30426,5028,2941,8,22449


In [41]:
# 6. Stack the five census-year frames row-wise into one long table;
#    ignore_index=True discards each frame's own index and renumbers the rows from 0
sf1_1980_2020 = pd.concat(
    objs=[
        sf1_1980,
        sf1_1990,
        sf1_2000,
        sf1_2010,
        sf1_2020,
    ],
    ignore_index=True,
)
sf1_1980_2020

Unnamed: 0,FIPS5,year,total_pop,white_pop,black_pop,asian_pop,other_pop
0,01001,1980,32259,24814,7225,82,138
1,01003,1980,78556,65840,12048,96,572
2,01005,1980,24756,13693,11003,25,35
3,01007,1980,15723,12029,3675,11,8
4,01009,1980,36459,35761,598,18,82
...,...,...,...,...,...,...,...
15856,72145,2020,54414,10042,3491,28,40853
15857,72147,2020,8249,1551,1256,7,5435
15858,72149,2020,22093,3429,1380,10,17274
15859,72151,2020,30426,5028,2941,8,22449


In [42]:
# 7. Add education data from the 1980, 1990, and 2000 censuses
#    Result: one row per (FIPS5, year) with the count in `bachelors_pop`.
sf1_education = (
    pd.read_excel(
        os.path.join(SF1_PATH, "sf1_education.xls"),
        skiprows=4,  # the first 4 rows of the sheet are skipped before the header row
        dtype={"FIPS Code": str},  # read codes as text so leading zeros are not lost
    )
    # 7.1 Shorten the three education headers to their census year
    .rename(
        columns={
            "FIPS Code": "FIPS5",
            "Four years of college or higher, 1980": "1980",
            "Bachelor's degree or higher, 1990": "1990",
            "Bachelor's degree or higher, 2000": "2000",
        }
    )
    .loc[:, ["FIPS5", "1980", "1990", "2000"]]
    # 7.2 Zero-pad the codes to 5 characters, matching how FIPS5 is built for the
    #     census frames, so the merge keys line up even if a code lost its leading zero
    .assign(FIPS5=lambda df: df["FIPS5"].str.zfill(5))
    # 7.3 Wide -> long: one row per (FIPS5, year)
    .melt(
        id_vars=["FIPS5"],
        var_name="year",
        value_name="bachelors_pop",
    )
    .assign(
        # 7.4 expand=False makes str.extract return a Series (not a one-column
        #     DataFrame), which is the right shape to assign to a single column;
        #     the integer year matches the dtype of `year` in the census frames
        year=lambda df: df["year"].str.extract(r'(\d{4})', expand=False).astype(int),
        bachelors_pop=lambda df: df["bachelors_pop"].astype(float),
    )
)
sf1_education

Unnamed: 0,FIPS5,year,bachelors_pop
0,00000,1980,21558480.0
1,01000,1980,270063.0
2,01001,1980,2117.0
3,01003,1980,5498.0
4,01005,1980,1300.0
...,...,...,...
9844,72145,2000,5263.0
9845,72147,2000,564.0
9846,72149,2000,1885.0
9847,72151,2000,2763.0


In [43]:
# 8. Merge the education data with the main dataframe
#    Left join on (FIPS5, year): every census row is kept, and rows without a
#    matching education record get a missing bachelors_pop.
#    This cell overwrites sf1_1980_2020 with a merge of itself, so any existing
#    `bachelors_pop` column is dropped first. Without that, re-running the cell
#    would produce `bachelors_pop_x` / `bachelors_pop_y` instead of one column.
#    On a first run the drop is a no-op (errors="ignore").
sf1_1980_2020 = (
    sf1_1980_2020
    .drop(columns=["bachelors_pop"], errors="ignore")
    .merge(sf1_education, on=["FIPS5", "year"], how="left")
)
sf1_1980_2020

Unnamed: 0,FIPS5,year,total_pop,white_pop,black_pop,asian_pop,other_pop,bachelors_pop
0,01001,1980,32259,24814,7225,82,138,2117.0
1,01003,1980,78556,65840,12048,96,572,5498.0
2,01005,1980,24756,13693,11003,25,35,1300.0
3,01007,1980,15723,12029,3675,11,8,433.0
4,01009,1980,36459,35761,598,18,82,1144.0
...,...,...,...,...,...,...,...,...
15856,72145,2020,54414,10042,3491,28,40853,
15857,72147,2020,8249,1551,1256,7,5435,
15858,72149,2020,22093,3429,1380,10,17274,
15859,72151,2020,30426,5028,2941,8,22449,


In [44]:
# Count the distinct non-missing FIPS5 codes in the merged table
sf1_1980_2020["FIPS5"].nunique(dropna=True)

3235

In [45]:
# 9. Spot check one county (FIPS code 06037 - Los Angeles County, CA):
#    select its rows, then order them chronologically
(
    sf1_1980_2020
    .loc[lambda df: df["FIPS5"] == "06037"]
    .sort_values(by="year")
)

Unnamed: 0,FIPS5,year,total_pop,white_pop,black_pop,asian_pop,other_pop,bachelors_pop
197,6037,1980,7477503,5073617,943968,434850,1025068,818658.0
3337,6037,1990,8863164,5035103,992974,954485,1880602,1223442.0
6480,6037,2000,9519338,4637062,930957,1137500,2813819,1462389.0
9623,6037,2010,9818605,4936599,856874,1346865,2678267,
12845,6037,2020,10014009,3259427,794364,1499984,4460234,


In [46]:
# 10. Write the merged table to sf1_1980_2020.csv inside the data directory,
#     leaving the row index out of the file
sf1_1980_2020.to_csv(
    path_or_buf=os.path.join(SF1_PATH, "sf1_1980_2020.csv"),
    index=False,
)