# Imports

In [2]:
import pandas as pd

# poverty_county_2023.csv

In [3]:
df_poverty = pd.read_csv("data/poverty_county_2023.csv")

Filter povery rate (`PCTPOVALL_2023`) and ignore states (ending with `000`), as we only want county's in our data.

In [4]:
df_poverty = df_poverty[df_poverty["Attribute"] == "PCTPOVALL_2023"]
df_poverty = df_poverty[df_poverty["FIPS_Code"] % 1000 != 0]
df_poverty = df_poverty.drop(columns=["Attribute"])
df_poverty = df_poverty.rename(columns={"Value": "poverty_rate"})
df_poverty.head()

Unnamed: 0,FIPS_Code,Stabr,Area_Name,poverty_rate
61,1001,AL,Autauga County,11.7
86,1003,AL,Baldwin County,10.0
111,1005,AL,Barbour County,25.5
136,1007,AL,Bibb County,19.4
161,1009,AL,Blount County,12.8


# unemployment_county_2023.csv

In [5]:
df_unemployment = pd.read_csv("data/unemployment_county_2023.csv")

In [6]:
df_unemployment = df_unemployment[df_unemployment["Attribute"] == "Unemployment_rate_2023"]
df_unemployment = df_unemployment[df_unemployment["FIPS_Code"] % 1000 != 0]
df_unemployment = df_unemployment.drop(columns=["State", "Area_Name", "Attribute"])
df_unemployment = df_unemployment.rename(columns={"Value": "unemployment_rate"})
df_unemployment.head()

Unnamed: 0,FIPS_Code,unemployment_rate
293,1001,2.2
394,1003,2.3
495,1005,4.4
596,1007,2.5
697,1009,2.1


# ruralurban_county_2023.csv

In [7]:
df_rural = pd.read_csv("data/ruralurban_county_2023.csv", encoding="latin1")

In [8]:
df_rural = df_rural[df_rural["Attribute"] == "RUCC_2023"]
df_rural = df_rural.drop(columns=["State", "County_Name", "Attribute"])
df_rural = df_rural.rename(columns={"Value": "urban_rural_rate"})
df_rural.head()

Unnamed: 0,FIPS,urban_rural_rate
1,1001,2
4,1003,3
7,1005,6
10,1007,1
13,1009,1


# zillowhousevalue_county_2023.csv

In [9]:
df_house = pd.read_csv("data/zillowhousevalue_county_2023.csv")

Calculating the average housing price over all the months and saving it in `house_value`. Also in this dataset, the state- municipal codes are sepparated, so we want to combine them (see `FIPS`).

In [10]:
months = ["2023-01-31","2023-02-28","2023-03-31","2023-04-30","2023-05-31","2023-06-30","2023-07-31","2023-08-31","2023-09-30","2023-10-31","2023-11-30","2023-12-31","2024-01-31"]
df_house = df_house[["StateCodeFIPS", "MunicipalCodeFIPS", *months]]
df_house["house_value"] = df_house[months].mean(axis=1)
df_house["FIPS"] = (df_house["StateCodeFIPS"].astype(str) + df_house["MunicipalCodeFIPS"].astype(str).str.zfill(3)).astype(int)
df_house = df_house.drop(columns=months+["StateCodeFIPS", "MunicipalCodeFIPS"])
df_house.head()

Unnamed: 0,house_value,FIPS
0,815764.992054,6037
1,285912.561042,17031
2,280104.828264,48201
3,456931.128907,4013
4,858380.850497,6073


# geo_county_2025.csv

In [11]:
df_geo = pd.read_csv("data/geo_county_2025.csv")
df_geo = df_geo[["county_fips","lat","lng","population"]]
df_geo = df_geo.rename(columns={"county_fips": "FIPS"})
df_geo.head()

Unnamed: 0,FIPS,lat,lng,population
0,6037,34.3219,-118.2247,9848406
1,17031,41.8401,-87.8168,5185812
2,48201,29.8578,-95.3938,4758579
3,4013,33.349,-112.4915,4491987
4,6073,33.0343,-116.735,3282782


# Merge

Finally we merge all the different datasets by their Federal Information Processing Standard (`FIPS`). The `how="outer"` parameter

In [12]:
df = pd.merge(df_poverty, df_unemployment, on="FIPS_Code", how="outer")
df = pd.merge(df, df_rural, left_on="FIPS_Code", right_on="FIPS", how="outer")
df = df.drop(columns=["FIPS_Code"])
df = pd.merge(df, df_house, on="FIPS", how="outer")
df = pd.merge(df, df_geo, on="FIPS", how="outer")
df = df.set_index("FIPS")
df.head()

Unnamed: 0_level_0,Stabr,Area_Name,poverty_rate,unemployment_rate,urban_rural_rate,house_value,lat,lng,population
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1001.0,AL,Autauga County,11.7,2.2,2,232431.875906,32.5349,-86.6427,59285.0
1003.0,AL,Baldwin County,10.0,2.3,3,375064.36754,30.7277,-87.7226,239945.0
1005.0,AL,Barbour County,25.5,4.4,6,149074.674301,31.8696,-85.3932,24757.0
1007.0,AL,Bibb County,19.4,2.5,1,201633.074635,32.9986,-87.1265,22152.0
1009.0,AL,Blount County,12.8,2.1,1,222516.178747,33.9809,-86.5674,59292.0


In [13]:
df.to_csv("datatest/merged.csv")

In [14]:
print("Total number of rows:", df.shape[0])  # Display the number of rows

Total number of rows: 3250


In [18]:
filtered_df = df[df['Stabr'] != 'PR']

In [19]:
filtered_df.shape[0]

3250