# 02_clean_transform
### Census data (B25070)
- load raw data and merge
- clean, reshape, create metrics
- save ready-for-EDA tables to data_final

### SETUP / IMPORTS

In [1]:
import pandas as pd
import os
from dotenv import load_dotenv
from pathlib import Path
from haf.cleaning import rename_columns, cast_msa_codes, compute_total, filter_msas

In [2]:
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail early with an actionable message: Path(None) would otherwise raise an
# opaque "expected str, bytes or os.PathLike object, not NoneType" TypeError.
_project_root_env = os.getenv("PROJECT_ROOT")
if _project_root_env is None:
    raise RuntimeError(
        "PROJECT_ROOT is not set. Define it in the environment or in a .env "
        "file before running this notebook."
    )

# Root of the project checkout; every data directory is resolved relative to it.
PROJECT_ROOT = Path(_project_root_env)
# Directory holding the raw B25070_<year>.csv extracts read by this notebook.
DATA_RAW = PROJECT_ROOT / "data_raw"

### processes for single-year tables

In [3]:
# Year of the single-year B25070 extract to process; it selects which
# B25070_<year>.csv file is read from DATA_RAW.
year: int = 2023

In [8]:
# Location of the raw single-year extract for the selected `year`.
file_path = DATA_RAW.joinpath(f"B25070_{year}.csv")

# Load the extract unchanged; the preview shows the raw Census variable codes
# (B25070_001E ... B25070_011E) before any renaming.
df = pd.read_csv(filepath_or_buffer=file_path)
df.head()

Unnamed: 0,NAME,B25070_001E,B25070_002E,B25070_003E,B25070_004E,B25070_005E,B25070_006E,B25070_007E,B25070_008E,B25070_009E,B25070_010E,B25070_011E,metropolitan statistical area/micropolitan statistical area
0,"Aberdeen, WA Micro Area",7681.0,205.0,1523.0,725.0,615.0,1116.0,431.0,610.0,543.0,1512.0,401.0,10140
1,"Abilene, TX Metro Area",25067.0,522.0,1670.0,4174.0,2302.0,1957.0,1673.0,2712.0,3262.0,4861.0,1934.0,10180
2,"Adrian, MI Micro Area",7902.0,345.0,754.0,1255.0,432.0,451.0,1492.0,108.0,646.0,1443.0,976.0,10300
3,"Aguadilla, PR Metro Area",30177.0,657.0,1648.0,1811.0,2330.0,815.0,1343.0,1193.0,588.0,2924.0,16868.0,10380
4,"Akron, OH Metro Area",94990.0,3336.0,9661.0,11159.0,10435.0,9377.0,8051.0,6217.0,7753.0,23846.0,5155.0,10420


In [9]:
# Replace the raw Census variable codes with readable column names
# (e.g. NAME -> msa_name, as shown in the preview below).
renamed_df = df.pipe(rename_columns)
renamed_df.head()

Unnamed: 0,msa_name,total_households_raw,less_than_100,100_to_149,150_to_199,200_to_249,250_to_299,300_to_349,350_to_399,400_to_499,500_or_more,households_not_computed,msa_code
0,"Aberdeen, WA Micro Area",7681.0,205.0,1523.0,725.0,615.0,1116.0,431.0,610.0,543.0,1512.0,401.0,10140
1,"Abilene, TX Metro Area",25067.0,522.0,1670.0,4174.0,2302.0,1957.0,1673.0,2712.0,3262.0,4861.0,1934.0,10180
2,"Adrian, MI Micro Area",7902.0,345.0,754.0,1255.0,432.0,451.0,1492.0,108.0,646.0,1443.0,976.0,10300
3,"Aguadilla, PR Metro Area",30177.0,657.0,1648.0,1811.0,2330.0,815.0,1343.0,1193.0,588.0,2924.0,16868.0,10380
4,"Akron, OH Metro Area",94990.0,3336.0,9661.0,11159.0,10435.0,9377.0,8051.0,6217.0,7753.0,23846.0,5155.0,10420


In [11]:
# Feed the frame produced by the rename step. The variable defined above is
# `renamed_df`; the previous `df_renamed` does not exist on a fresh kernel and
# raised NameError under Restart & Run All.
# NOTE(review): cast_msa_codes presumably converts msa_code to string so it can
# be matched against string MSA codes later — confirm in haf.cleaning.
cast_df = cast_msa_codes(renamed_df)

# Inspect dtypes and non-null counts to verify the cast.
cast_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 530 entries, 0 to 529
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   msa_name                 530 non-null    object 
 1   total_households_raw     529 non-null    float64
 2   less_than_100            529 non-null    float64
 3   100_to_149               529 non-null    float64
 4   150_to_199               529 non-null    float64
 5   200_to_249               529 non-null    float64
 6   250_to_299               529 non-null    float64
 7   300_to_349               529 non-null    float64
 8   350_to_399               529 non-null    float64
 9   400_to_499               529 non-null    float64
 10  500_or_more              529 non-null    float64
 11  households_not_computed  529 non-null    float64
 12  msa_code                 530 non-null    object 
dtypes: float64(11), object(2)
memory usage: 54.0+ KB


In [15]:
# Derive the `total_households` column from the cast frame.
# NOTE(review): judging by the displayed output, this equals
# total_households_raw minus households_not_computed, and both source columns
# are dropped — confirm against haf.cleaning.compute_total.
computed_df = cast_df.pipe(compute_total)
computed_df.head()

Unnamed: 0,msa_name,less_than_100,100_to_149,150_to_199,200_to_249,250_to_299,300_to_349,350_to_399,400_to_499,500_or_more,msa_code,total_households
0,"Aberdeen, WA Micro Area",205.0,1523.0,725.0,615.0,1116.0,431.0,610.0,543.0,1512.0,10140,7280.0
1,"Abilene, TX Metro Area",522.0,1670.0,4174.0,2302.0,1957.0,1673.0,2712.0,3262.0,4861.0,10180,23133.0
2,"Adrian, MI Micro Area",345.0,754.0,1255.0,432.0,451.0,1492.0,108.0,646.0,1443.0,10300,6926.0
3,"Aguadilla, PR Metro Area",657.0,1648.0,1811.0,2330.0,815.0,1343.0,1193.0,588.0,2924.0,10380,13309.0
4,"Akron, OH Metro Area",3336.0,9661.0,11159.0,10435.0,9377.0,8051.0,6217.0,7753.0,23846.0,10420,89835.0


In [16]:
# MSA codes (as strings) of the metro areas kept for analysis.
# Names marked "verify" do not appear in the preview output of this notebook.
MSA_CODES = [
    "42660",  # presumably Seattle — verify
    "41860",  # presumably San Francisco — verify
    "35620",  # New York-Newark-Jersey City, NY-NJ
    "12060",  # Atlanta-Sandy Springs-Roswell, GA
    "38060",  # presumably Phoenix — verify
    "12420",  # Austin-Round Rock-San Marcos, TX
    "33460",  # Minneapolis-St. Paul-Bloomington, MN-WI
    "19820",  # Detroit-Warren-Dearborn, MI
    "41180",  # presumably St. Louis — verify
]

In [20]:
# Keep only the rows whose msa_code is one of the selected MSA_CODES.
filtered_df = computed_df.pipe(filter_msas, MSA_CODES)
filtered_df.head()

Unnamed: 0,msa_name,less_than_100,100_to_149,150_to_199,200_to_249,250_to_299,300_to_349,350_to_399,400_to_499,500_or_more,msa_code,total_households
28,"Atlanta-Sandy Springs-Roswell, GA Metro Area",21003.0,49743.0,75242.0,89727.0,81012.0,71338.0,50769.0,71975.0,222452.0,12060,733261.0
34,"Austin-Round Rock-San Marcos, TX Metro Area",13440.0,29605.0,49366.0,56499.0,48003.0,42110.0,31612.0,44189.0,101072.0,12420,415896.0
125,"Detroit-Warren-Dearborn, MI Metro Area",18544.0,37501.0,51746.0,56461.0,52906.0,44389.0,29309.0,39457.0,124771.0,19820,455084.0
307,"Minneapolis-St. Paul-Bloomington, MN-WI Metro ...",11449.0,40567.0,55334.0,65728.0,51044.0,40382.0,26336.0,38603.0,99337.0,33460,428780.0
336,"New York-Newark-Jersey City, NY-NJ Metro Area",172750.0,308006.0,412797.0,404722.0,381086.0,301108.0,218866.0,299777.0,983573.0,35620,3482685.0


### NEXT STEPS:
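- Try the combined function `clean_census_data`. Open question: do I still need to import the smaller modular functions (`rename_columns`, `cast_msa_codes`, `compute_total`, `filter_msas`) once it is used?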
- Merge single_year tables (see below)
- Continue cleaning with reshaping, buckets, etc.

In [None]:
# TODO: merge the single-year tables into one frame. Sketch (not yet runnable:
# `merged_df` and `single_year_df` are not defined anywhere in this notebook):
# merged_df = pd.concat([merged_df, single_year_df], ignore_index=True)
# Prefer collecting the per-year frames in a list and calling pd.concat once,
# rather than growing `merged_df` one year at a time.