In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

CSV data downloaded from https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/state/asrh/SC-EST2020-AGESEX-CIV.csv

details about the table format can be found here: https://www2.census.gov/programs-surveys/popest/technical-documentation/file-layouts/2010-2020/sc-est2020-18+pop-res.pdf

In [2]:
pop_agesex_path = "state_pop_by_agesex.csv"

In [3]:
!curl -o {pop_agesex_path} https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/state/asrh/SC-EST2020-AGESEX-CIV.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 15950    0 15950    0     0   126k      0 --:--:-- --:--:-- --:--:--  135k
100 1297k    0 1297k    0     0  4723k      0 --:--:-- --:--:-- --:--:-- 4858k


In [4]:
data = pd.read_csv(pop_agesex_path)
pop2014 = data[data['SUMLEV'] == 40][['NAME', 'SEX', 'AGE', 'POPEST2014_CIV']]
pop2014.head()

Unnamed: 0,NAME,SEX,AGE,POPEST2014_CIV
261,Alabama,0,0,58335
262,Alabama,0,1,57721
263,Alabama,0,2,58621
264,Alabama,0,3,59479
265,Alabama,0,4,59524


In [5]:
abbrevs = pd.read_csv("../county_to_state_aggregation/state_abbreviations.csv")
abbrevs.head()

Unnamed: 0,State,Postal
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


# Gender stratification

In [6]:
fn = lambda state_df: state_df.groupby("SEX").agg({'POPEST2014_CIV': np.sum})
gender = pop2014[pop2014["SEX"] > 0].groupby(["NAME"]).apply(fn)

gender = gender.reset_index()
gender = gender.pivot(index="NAME", columns="SEX", values="POPEST2014_CIV")

gender.index = gender.index.rename("State")
gender = gender.rename(columns={1:'male', 2:'female'})

gender.to_csv("state_pop_by_gender_2014.csv")
gender.head()

SEX,male,female
State,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,4670522,4987132
Alaska,733848,696076
Arizona,6661362,6765100
Arkansas,2904282,3019690
California,38066644,38800530


# Age stratification

In [7]:
age_groups = [18, 45, 65, 75, 100]
fn = lambda state_df: state_df.groupby(pd.cut(state_df['AGE'], bins=age_groups, right=False)).agg({"POPEST2014_CIV": np.sum})
age = pop2014.groupby(["NAME"]).apply(fn)

age = age.reset_index()
age = age.pivot(index="NAME", columns='AGE', values="POPEST2014_CIV")

age.index = age.index.rename("State")
age.columns = ["18-44", "45-64", "65-74", "75+"]

age.to_csv("state_pop_by_age_group_2014.csv")
age.head()

Unnamed: 0_level_0,18-44,45-64,65-74,75+
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alabama,3393670,2567578,868434,615550
Alaska,531160,384772,94312,45696
Arizona,4802234,3253942,1236350,884624
Arkansas,2069850,1507858,539356,392032
California,29348878,19348970,5615772,4281790


# High school-aged population size (2013)

In [8]:
pop2013 = pd.read_csv(pop_agesex_path)
pop2013 = pop2013[pop2013['SUMLEV'] == 40][['NAME', 'SEX', 'AGE', 'POPEST2013_CIV']]

hs = pop2013[pop2013['AGE'].between(14, 18)].groupby("NAME").agg({'POPEST2013_CIV': np.sum})
hs = hs.merge(abbrevs, left_index=True, right_on='State')
hs = hs.rename(columns={"POPEST2013_CIV": "num HS age"})
hs.head()

Unnamed: 0,num HS age,State,Postal
0,638222,Alabama,AL
1,99028,Alaska,AK
2,901416,Arizona,AZ
3,394124,Arkansas,AR
4,5255478,California,CA


In [9]:
hs.to_csv("hs_aged_pop_2013.csv")

# Adults (18 or older) in 2014

In [10]:
adult = pop2014[pop2014['AGE'] >= 18].groupby("NAME").agg({'POPEST2014_CIV': np.sum})
adult = adult.merge(abbrevs, left_index=True, right_on='State')
adult = adult.rename(columns={"POPEST2014_CIV": "adult population"})
adult.head()

Unnamed: 0,adult population,State,Postal
0,17102886,Alabama,AL
1,2485864,Alaska,AK
2,23603612,Arizona,AZ
3,10433068,Arkansas,AR
4,135462584,California,CA


In [11]:
adult.to_csv("adult_pop_2014.csv")