## **Population dataset**

In [319]:
import pandas as pd


## Import CoC data set

In [320]:
# Load the CoC-to-county mapping table and preview its first rows.
CoC_data = pd.read_csv(filepath_or_buffer="/00_source_data/COCMapping.csv")
CoC_data.head()


Unnamed: 0,State,STNAME,Coc,CTYNAME
0,AZ,Arizona,"Phoenix,Mesa/Maricopa",Maricopa County
1,CA,California,Los Angeles City & County,Los Angeles County
2,CA,California,San Diego City and County,San Diego County
3,CA,California,San Jose/Santa Clara City & County,Santa Clara County
4,CA,California,San Bernardino City & County,San Bernardino County


## Read Population data and Filter by states

In [321]:
# Distinct state names present in the CoC mapping; read_data filters on these.
states = CoC_data.loc[:, "STNAME"].unique()


def read_data(url, state_names=None):
    """Read county population estimates from a raw CSV file.

    Only the ``STNAME`` and ``CTYNAME`` columns plus every column whose name
    contains ``POPESTIMATE`` are loaded, and only rows whose ``STNAME`` is in
    the requested set of states are kept.

    Parameters
    ----------
    url : str or path-like
        Location of the CSV file; it is decoded as ``iso-8859-1``.
    state_names : iterable of str, optional
        State names to keep. When omitted, the notebook-level ``states``
        array is used, so existing ``read_data(url)`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the filtered rows, with columns in file order.
    """
    if state_names is None:
        # Resolved at call time so the notebook-level default keeps working.
        state_names = states

    def _is_wanted(column_name):
        """Keep the two key columns and every population-estimate column."""
        return column_name in ("STNAME", "CTYNAME") or "POPESTIMATE" in column_name

    # A callable `usecols` selects columns while parsing, so the file is read
    # once instead of being opened a second time just to inspect the header.
    tmp_data = pd.read_csv(url, usecols=_is_wanted, encoding="iso-8859-1")

    # `.copy()` detaches the result from `tmp_data` so later column
    # assignments on it do not trigger chained-assignment warnings.
    return tmp_data[tmp_data["STNAME"].isin(list(state_names))].copy()


In [322]:
# Each source file contributes a different span of years. The 2009 and 2022
# files are trimmed to the listed year columns; the 2020 file keeps every
# POPESTIMATE column that read_data returns.
pop_2007_2009 = read_data("/00_source_data/co-est2009-alldata.csv").loc[
    :, ["STNAME", "CTYNAME", "POPESTIMATE2007", "POPESTIMATE2008", "POPESTIMATE2009"]
]
pop_2010_2020 = read_data("/00_source_data/co-est2020-alldata.csv")
pop_2021_2022 = read_data("/00_source_data/co-est2022-alldata.csv").loc[
    :, ["STNAME", "CTYNAME", "POPESTIMATE2021", "POPESTIMATE2022"]
]


## Merge data sets with CoC data

In [323]:
def merge_data(data, coc_data=None):
    """Left-join a population table onto the CoC mapping and report problems.

    Parameters
    ----------
    data : pandas.DataFrame
        Population table containing the ``STNAME`` and ``CTYNAME`` key columns.
    coc_data : pandas.DataFrame, optional
        Mapping table used as the left side of the join. When omitted, the
        notebook-level ``CoC_data`` frame is used, so existing
        ``merge_data(data)`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The joined table with exact duplicate rows removed and the temporary
        ``_merge`` indicator column dropped.

    Notes
    -----
    Prints one status line for the match check and one for the duplicate
    check; it never raises on unmatched rows.
    """
    if coc_data is None:
        # Resolved at call time so the notebook-level default keeps working.
        coc_data = CoC_data

    data_merged = pd.merge(
        coc_data, data, on=["STNAME", "CTYNAME"], how="left", indicator=True
    )

    # With how="left", `_merge` is "both" for matched rows and "left_only"
    # for mapping rows that found no population record. The previous
    # try/except evaluated a comparison and discarded it, so the success
    # message was printed unconditionally.
    if (data_merged["_merge"] == "both").all():
        print("- successfully merged")
    else:
        print("unmerged elements detected")

    # Exact duplicate rows are counted before being dropped so the message
    # reports how many were removed.
    dup_count = int(data_merged.duplicated().sum())
    if dup_count >= 1:
        data_merged = data_merged.drop_duplicates().reset_index(drop=True)
        print(f"- {dup_count} duplicate(s) deleted")
    else:
        print("No duplicates found")

    return data_merged.drop("_merge", axis=1)
    

In [324]:
# Join each population table onto the CoC mapping. A label is printed before
# every call so the status lines emitted by merge_data can be attributed to
# the table they describe.
print("2007 to 2009")
pop_2007_2009_subset = merge_data(pop_2007_2009)
print("2010 to 2020")
pop_2010_2020_subset = merge_data(pop_2010_2020)
print("2021 to 2022")
pop_2021_2022_subset = merge_data(pop_2021_2022)


2007 to 2009
- successfully merged
- 1 duplicate(s) deleted
2010 to 2020
- successfully merged
- 1 duplicate(s) deleted
2021 to 2022
- successfully merged
- 1 duplicate(s) deleted


## Merge all 3 datasets

In [325]:
# Attach the 2010-2020 estimates to the 2007-2009 table by state and county.
# Both inputs come out of merge_data, so they share other columns inherited
# from the CoC mapping; pandas suffixes those with `_x`/`_y` in the result.
first_merge_final_data = pd.merge(
    pop_2007_2009_subset,
    pop_2010_2020_subset,
    on=["STNAME", "CTYNAME"],
    how="left",
    indicator=True,
)
# Every row must have found a partner. Reducing with `.all()` yields a single
# boolean; comparing the array of unique values to a string instead raises an
# ambiguous-truth-value ValueError whenever more than one value is present.
assert (
    first_merge_final_data["_merge"] == "both"
).all(), "rows from 2007-2009 have no 2010-2020 match"
first_merge_final_data = first_merge_final_data.drop("_merge", axis=1)


In [326]:
# Attach the 2021-2022 estimates to the combined 2007-2020 table by state and
# county. The `_merge` indicator column is left in place here.
second_merge_final_data = pd.merge(
    first_merge_final_data,
    pop_2021_2022_subset,
    on=["STNAME", "CTYNAME"],
    how="left",
    indicator=True,
)
# Every row must have found a partner. Reducing with `.all()` yields a single
# boolean; comparing the array of unique values to a string instead raises an
# ambiguous-truth-value ValueError whenever more than one value is present.
assert (
    second_merge_final_data["_merge"] == "both"
).all(), "rows from 2007-2020 have no 2021-2022 match"


## Clean Merged dataset

In [327]:
# Keep only the identifying columns and the yearly population estimates for
# 2007 through 2022, in chronological order.
all_pop_data = second_merge_final_data.loc[
    :,
    ["STNAME", "Coc", "CTYNAME"]
    + [f"POPESTIMATE{year}" for year in range(2007, 2023)],
]


In [328]:
# Yearly estimate columns, 2007 through 2022, to be unpivoted into rows.
populations_cols = [f"POPESTIMATE{year}" for year in range(2007, 2023)]

# Reshape from one column per year to one row per (county, year).
all_pop_data_melt = pd.melt(
    all_pop_data, id_vars=["STNAME", "Coc", "CTYNAME"], value_vars=populations_cols
)

# Each distinct county must contribute exactly one row per year. Counties are
# identified by the (STNAME, CTYNAME) pair: counting CTYNAME alone would make
# this check fail whenever two states share a county name.
assert len(populations_cols) * len(
    all_pop_data[["STNAME", "CTYNAME"]].drop_duplicates()
) == len(all_pop_data_melt), "melted row count does not equal counties x years"


## Rename melted columns and clean year labels

In [334]:
# Give the melted columns descriptive names, then reduce each year label to
# its trailing digits by removing the "POPESTIMATE" prefix.
all_pop_data_melt = all_pop_data_melt.rename(
    columns={"variable": "Year", "value": "Population"}
).assign(Year=lambda frame: frame["Year"].str.replace("POPESTIMATE", ""))
# Export is currently disabled:
# all_pop_data_melt.to_csv("/20_intermediate_files/population.csv")
all_pop_data_melt.head()


Unnamed: 0,STNAME,Coc,CTYNAME,Year,Population
0,Arizona,"Phoenix,Mesa/Maricopa",Maricopa County,2007,3872962
1,California,Los Angeles City & County,Los Angeles County,2007,9734701
2,California,San Diego City and County,San Diego County,2007,2975656
3,California,San Jose/Santa Clara City & County,Santa Clara County,2007,1723927
4,California,San Bernardino City & County,San Bernardino County,2007,1992989
