## Race Data

In [230]:
import pandas as pd
import numpy as np


### Import COC Mapping

In [231]:
# Location of the CoC-to-county mapping, relative to the working directory the
# notebook is run from (the same convention the output CSV at the end of this
# notebook uses), so the notebook does not depend on one user's home directory.
COC_MAPPING_PATH = "00_source_data/COC mapping.csv"

coc_mapping = pd.read_csv(COC_MAPPING_PATH)
coc_mapping.head()


Unnamed: 0,State,STNAME,CoC Code,Coc,CTYNAME,FIPS code
0,AZ,Arizona,AZ-502,"Phoenix,Mesa/Maricopa",Maricopa County,4013
1,CA,California,CA-600,Los Angeles City & County,Los Angeles County,6037
2,CA,California,CA-601,San Diego City and County,San Diego County,6073
3,CA,California,CA-500,San Jose/Santa Clara City & County,Santa Clara County,6085
4,CA,California,CA-609,San Bernardino City & County,San Bernardino County,6071


In [232]:
# Distinct state names present in the CoC mapping; used to filter census rows.
States = coc_mapping["STNAME"].unique()

# Census columns to load: identifier and overall-total columns first, then one
# MALE/FEMALE pair per race / ethnicity prefix (same order as the census files).
cols = [
    "STATE",
    "COUNTY",
    "STNAME",
    "CTYNAME",
    "YEAR",
    "AGEGRP",
    "TOT_POP",
    "TOT_MALE",
    "TOT_FEMALE",
] + [
    f"{prefix}_{sex}"
    for prefix in ("WA", "BA", "IA", "AA", "NA", "TOM", "NH", "H")
    for sex in ("MALE", "FEMALE")
]


### Import 2007 to 2010

In [233]:
# Download one intercensal (2000-2010) county file per numbered state file and
# keep only the rows needed for 2007-2010.
pop_07_10_subset = []
# File numbers in 1..56 that are skipped (presumably unused state FIPS codes,
# for which no file is published -- confirm).
missing_files = [3, 7, 14, 43, 52]
for i in range(1, 57):
    if i in missing_files:
        continue
    # File names carry a zero-padded two-digit number ("01", "09", "10", ...).
    url = (
        "https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/"
        f"intercensal/county/co-est00int-alldata-{i:02d}.csv"
    )
    pop_07_10_raw = pd.read_csv(url, encoding="ISO-8859-1", usecols=cols)
    # YEAR codes 9, 10, 11 and 13 are relabelled 2007-2010 later in the notebook.
    # AGEGRP == 0 is presumably the all-ages total row -- confirm against the
    # file layout.
    pop_07_10_filtered = pop_07_10_raw[
        (pop_07_10_raw["YEAR"].isin([9, 10, 11, 13]))
        & (pop_07_10_raw["STNAME"].isin(States))
        & (pop_07_10_raw["AGEGRP"] == 0)
    ]
    pop_07_10_subset.append(pop_07_10_filtered)

pop_07_10 = pd.concat(pop_07_10_subset)


### Import 2011 to 2020

In [234]:
url = "https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/counties/asrh/CC-EST2020-ALLDATA.csv"
chunk_size = 5000
# YEAR codes 4..13 are relabelled 2011..2020 later in the notebook.
# NOTE(review): confirm against the Census file layout that code 13 is the
# July 1, 2020 estimate rather than an April 1, 2020 figure.
years = list(range(4, 14))

# Read the file lazily, `chunk_size` rows at a time, loading only `cols`.
pop_11_20_raw = pd.read_csv(
    url, usecols=cols, encoding="ISO-8859-1", chunksize=chunk_size
)

# From every chunk keep the AGEGRP == 0 rows for the wanted YEAR codes and states.
pop_11_20_filtered = [
    chunk.loc[
        chunk["YEAR"].isin(years)
        & chunk["STNAME"].isin(States)
        & chunk["AGEGRP"].eq(0)
    ]
    for chunk in pop_11_20_raw
]

pop_11_20 = pd.concat(pop_11_20_filtered)


### Import 2021

In [235]:
url = "https://www2.census.gov/programs-surveys/popest/datasets/2020-2021/counties/asrh/cc-est2021-all.csv"
pop_21_raw = pd.read_csv(url, usecols=cols, encoding="ISO-8859-1")

# Rows to keep: YEAR code 3 (relabelled 2021 later in the notebook), AGEGRP 0,
# and only the states that appear in the CoC mapping.
is_wanted_row = (
    pop_21_raw["YEAR"].eq(3)
    & pop_21_raw["STNAME"].isin(States)
    & pop_21_raw["AGEGRP"].eq(0)
)
# Copy so later column assignments do not act on a view of the raw frame.
pop_21 = pop_21_raw.loc[is_wanted_row].copy()


### Merge Datasets

In [236]:
# Attach the 2007-2010 census rows to each CoC mapping entry. A left join keeps
# every mapping row; `indicator=True` adds a `_merge` column recording whether a
# census match was found.
pop_07_10_merged = coc_mapping.merge(
    pop_07_10,
    how="left",
    on=["STNAME", "CTYNAME"],
    indicator=True,
)
pop_07_10_merged.shape


(124, 30)

In [237]:
# Attach the 2011-2020 census rows to each CoC mapping entry (left join on state
# and county name; `_merge` records whether a census match was found).
pop_11_20_merged = coc_mapping.merge(
    pop_11_20,
    how="left",
    on=["STNAME", "CTYNAME"],
    indicator=True,
)
pop_11_20_merged.shape


(310, 30)

In [238]:
# Attach the 2021 census rows to each CoC mapping entry (left join on state and
# county name; `_merge` records whether a census match was found).
pop_21_merged = coc_mapping.merge(
    pop_21,
    how="left",
    on=["STNAME", "CTYNAME"],
    indicator=True,
)
pop_21_merged.shape


(31, 30)

______

### Clean Data sets

In [239]:
# Check for duplicates
def check_dups(data, name="data"):
    """Report whether ``data`` contains fully duplicated rows.

    This is a read-only check: it prints a short report and never modifies or
    returns the frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    name : str, optional
        Label used in the printed message to identify the frame.

    Returns
    -------
    None
    """
    # duplicated() flags every repeat of an earlier identical row.
    n_dups = int(data.duplicated().sum())
    if n_dups >= 1:
        print(f"duplicates found in {name}")
        print(f"- {n_dups} duplicate row(s) found (not removed)")
    else:
        print("No duplicates found")
    return None


def check_Na(data):
    """Print the total number of missing cells in ``data``.

    Counts missing values per column, adds the per-column counts together and
    prints the grand total. Returns None.
    """
    missing_per_column = data.isna().sum()
    total_missing = missing_per_column.sum()
    print(f"{total_missing} Nans found")
    return None


# Run the duplicate check and the missing-value check on each merged frame.
# Both helpers only print a report; the frames themselves are not modified.
for pop in [pop_07_10_merged, pop_11_20_merged, pop_21_merged]:
    check_dups(pop)
    check_Na(pop)


No duplicates found
0 Nans found
No duplicates found
0 Nans found
No duplicates found
0 Nans found


### Create totals

In [240]:
# MALE/FEMALE column pairs to be summed into per-group totals. Each prefix
# contributes its MALE column immediately followed by its FEMALE column.
cols_to_add = [
    f"{prefix}_{sex}"
    for prefix in ("WA", "BA", "IA", "AA", "NA", "TOM", "NH", "H")
    for sex in ("MALE", "FEMALE")
]

# Walk over the MALE/FEMALE column pairs and add each pair up.
def create_totals(data, columns=None):
    """Return a copy of ``data`` with one ``<prefix>_TOTAL`` column per pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns listed in ``columns``.
    columns : sequence of str, optional
        Column names laid out as consecutive (first, second) pairs. Defaults to
        the module-level ``cols_to_add``. A trailing unpaired name is ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left unmodified.
    """
    if columns is None:
        columns = cols_to_add
    columns = list(columns)

    # Work on a copy so the caller's frame is not mutated as a side effect.
    totaled = data.copy()
    for first_col, second_col in zip(columns[::2], columns[1::2]):
        # The prefix is the first two characters of the column name, so
        # "TOM_MALE" yields "TO_TOTAL" and "H_MALE" yields "H__TOTAL"; later
        # cells select the totals by exactly these names.
        prefix = first_col[:2]
        totaled[f"{prefix}_TOTAL"] = totaled[first_col] + totaled[second_col]
    return totaled


# Add the per-group total columns to each of the three merged frames.
pop_07_10_totaled, pop_11_20_totaled, pop_21_totaled = [
    create_totals(merged_frame)
    for merged_frame in (pop_07_10_merged, pop_11_20_merged, pop_21_merged)
]


### Rename years

In [241]:
# Census YEAR code -> calendar year for the 2000-2010 intercensal files.
year_labels = {9: 2007, 10: 2008, 11: 2009, 13: 2010}

conditions = [pop_07_10_totaled["YEAR"].eq(code) for code in year_labels]
values = list(year_labels.values())

# Rows matching none of the codes receive np.select's default value of 0.
pop_07_10_totaled["New_YEAR"] = np.select(conditions, values)
pop_07_10_totaled["New_YEAR"].value_counts()


2007    31
2008    31
2009    31
2010    31
Name: New_YEAR, dtype: int64

In [242]:
# Census YEAR codes 4..13 are labelled as calendar years 2011..2020.
# NOTE(review): confirm against the Census file layout that code 13 is the
# July 1, 2020 estimate rather than an April 1, 2020 figure.
year_labels = dict(zip(range(4, 14), range(2011, 2021)))

conditions = [pop_11_20_totaled["YEAR"].eq(code) for code in year_labels]
values = list(year_labels.values())

# Rows matching none of the codes receive np.select's default value of 0.
pop_11_20_totaled["New_YEAR"] = np.select(conditions, values)
pop_11_20_totaled["New_YEAR"].value_counts()


2011    31
2012    31
2013    31
2014    31
2015    31
2016    31
2017    31
2018    31
2019    31
2020    31
Name: New_YEAR, dtype: int64

In [243]:
# In the 2021 frame YEAR code 3 is labelled as calendar year 2021; any other
# code would receive np.select's default value of 0.
values = [2021]
conditions = [pop_21_totaled["YEAR"].eq(3)]

pop_21_totaled["New_YEAR"] = np.select(conditions, values)
pop_21_totaled["New_YEAR"].value_counts()


2021    31
Name: New_YEAR, dtype: int64

### Subset to identifier and total columns

In [244]:
# Columns carried into the final data set: CoC / county identifiers, the
# relabelled year, the overall population totals and the per-group totals.
cols_to_subset = [
    # identifiers
    "STNAME",
    "CoC Code",
    "Coc",
    "FIPS code",
    "CTYNAME",
    "New_YEAR",
    # overall totals
    "TOT_POP",
    "TOT_MALE",
    "TOT_FEMALE",
    # per-group totals built by create_totals
    "WA_TOTAL",
    "BA_TOTAL",
    "IA_TOTAL",
    "AA_TOTAL",
    "NA_TOTAL",
    "TO_TOTAL",
    "NH_TOTAL",
    "H__TOTAL",
]

# Independent copies, so the *_final frames do not share data with the
# *_totaled frames.
pop_07_10_final = pop_07_10_totaled.loc[:, cols_to_subset].copy()
pop_11_20_final = pop_11_20_totaled.loc[:, cols_to_subset].copy()
pop_21_final = pop_21_totaled.loc[:, cols_to_subset].copy()


___

### Final data set

In [261]:
# Readable names for the year, population and per-group total columns.
col_name = {
    "New_YEAR": "Year",
    "TOT_POP": "Population",
    "TOT_MALE": "Male",
    "TOT_FEMALE": "Female",
    "WA_TOTAL": "White",
    "BA_TOTAL": "Black or African American",
    "IA_TOTAL": "American Indian and Alaska Native",
    "AA_TOTAL": "Asian",
    "NA_TOTAL": "Native Hawaiian and Other Pacific Islander",
    "TO_TOTAL": "Two or More Races",
    "NH_TOTAL": "Non Hispanic",
    "H__TOTAL": "Hispanic",
}

# Stack the three periods, apply the readable names, then order the rows by
# county and year.
population_by_race = (
    pd.concat(
        [pop_07_10_final, pop_11_20_final, pop_21_final],
        ignore_index=True,
    )
    .rename(columns=col_name)
    .sort_values(by=["CTYNAME", "Year"])
)


In [264]:
# 🛑 check write path -- it is resolved relative to the current working directory.
# NOTE(review): to_csv's default also writes the row index as the first, unnamed
# CSV column; confirm that downstream readers expect it.
output_path = "20_intermediate_files/population_race.csv"
population_by_race.to_csv(output_path)