In [None]:
import os

import geopandas as gpd
import pandas as pd

from src.utils import *
from src.main import *

# Display setting: do not truncate the column list when a frame is shown.
pd.set_option("display.max_columns", None)

# Add the path to the raw data folder here
BASE_FILE_PATH = "../../data/county_raw"

In [None]:
# Location of each raw input file, keyed by the label the loading cell uses.
datapaths = {
    # Energy resources
    "Wind": "wind/ez_gis.plant_power_eia_v8_wind.shp",
    "Solar": "solar/solar_raw.csv",

    # Social factors
    "GDP": "social factors/gdp_raw.csv",
    "education": "social factors/education_raw.csv",
    "private_schools": "social factors/private_school_raw.csv",
    "DEC_race": "social factors/race_dec_raw.csv",
    "ACS_race": "social factors/race_acs_raw.csv",
    "election": "social factors/election_raw.csv",
    "income": "social factors/income_raw.csv",
    "unemployment": "social factors/unemployment_raw.csv",
    "Population Data": "social factors/population_raw.csv",

    # Files stored in the "electric price" folder
    "NREL_Electric": "electric price/NREL_raw.csv",
    "Rural_Urban": "electric price/rural_urban_raw.csv",
}
# County bounding boxes; both FIPS code columns are read as strings
# (presumably so leading zeros survive -- confirm against the CSV).
bounding_box = pd.read_csv(
    "../../data/county_clean/county_bounding_boxes.csv",
    dtype={"FIPS State": str, "FIPS County": str},
)

In [None]:
# County reference table; both FIPS code columns are read as strings
# (presumably so leading zeros survive -- confirm against the CSV).
FIPS = pd.read_csv("../../data/extras/US_FIPS_Codes.csv", dtype={"FIPS State": str, "FIPS County": str})
# Build one table per factor. The loader functions are not defined in this
# notebook (presumably they come from the star imports at the top); each one
# receives its entry from `datapaths`, and a few also take `bounding_box`.
solar = get_solar(datapaths["Solar"], bounding_box, size='all')
wind = get_wind(datapaths["Wind"], bounding_box)
# Both education tables are derived from the same raw file.
edu_18_24 = get_education_18_24(datapaths["education"])
edu_25 = get_education_25_over(datapaths["education"])
priv_sch = get_no_priv_schools(datapaths["private_schools"])
race_dec = get_race_dec(datapaths["DEC_race"])
race_acs = get_race_acs(datapaths["ACS_race"])
# `elections` is indexed by party name further down ('democrat', 'republican', ...).
elections = get_election(datapaths["election"])
income = get_income(datapaths["income"])
unemployment = get_unemployment(datapaths["unemployment"])
electric = NREL_Electric(datapaths["NREL_Electric"])
# The GDP loader additionally takes the population file path.
gdp = get_GDP(datapaths["GDP"], bounding_box, datapaths["Population Data"])
rural_urban = get_rural_urban_coverage(datapaths["Rural_Urban"])
# Output folder for the per-factor "*_missing.csv" files written further down.
save_datapath = "missing_counties/"

In [None]:
# Registry of every factor table, keyed by a short label.
# The election results contribute one entry per party, in a fixed order.
dfs = {
    "solar": solar,
    "wind": wind,
    "edu_18_24": edu_18_24,
    "edu_25": edu_25,
    "priv_sch": priv_sch,
    "race_acs": race_acs,
    "race_dec": race_dec,
    **{
        label: elections[party]
        for label, party in (
            ("elections_demo", "democrat"),
            ("elections_repub", "republican"),
            ("elections_other", "other"),
            ("elections_green", "green"),
            ("elections_libert", "libertarian"),
        )
    },
    "income": income,
    "unemployment": unemployment,
    "electric": electric,
    "gdp": gdp,
    "rural_urban": rural_urban,
}

In [None]:
def get_missing_counties(data, base):
    """Return the rows of ``base`` whose county does not appear in ``data``.

    Counties are matched on the string key ``State + '_' + County Name``.
    Neither input frame is modified: the keys are built as standalone Series
    rather than being written into the frames as a helper column, so the
    caller's tables keep exactly the columns they came in with.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to check; must contain 'State' and 'County Name' columns.
    base : pandas.DataFrame
        Reference county list; must contain 'State' and 'County Name' columns.

    Returns
    -------
    pandas.DataFrame
        The rows of ``base`` with no matching key in ``data``. A
        'State_County' column, if ``base`` happens to carry one, is not
        included in the result.

    Raises
    ------
    KeyError
        If either frame lacks the 'State' or 'County Name' column.
    """
    base_keys = base['State'] + '_' + base['County Name']
    data_keys = data['State'] + '_' + data['County Name']

    not_in = base[~base_keys.isin(data_keys)]

    # A frame that already carries a 'State_County' helper column still gets
    # it stripped, so the returned columns are the same as they always were.
    return not_in.drop(columns=['State_County'], errors='ignore')

## Count the number of counties that don't have data at all for each factor

In [None]:
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists before anything is written.
os.makedirs(save_datapath, exist_ok=True)

# For every factor: the counties of the FIPS reference list that have no row
# in that factor's table.
dfs_missing = {
    name: get_missing_counties(data, FIPS) for name, data in dfs.items()
}

# Write one "<factor>_missing.csv" per factor and report how many counties it lists.
for name, data in dfs_missing.items():
    data.to_csv(os.path.join(save_datapath, name + "_missing.csv"), index=False)
    print(f"For {name}, {len(data)} counties are missing")

## Count rows containing zeros or NaNs in each DataFrame

In [None]:
# Plain-text summary of the per-table counts, built up one table at a time.
counts_str = ""

for name, df in dfs.items():
    # Everything except the 'State' and 'County Name' columns is inspected.
    values = df.drop(columns=['State', 'County Name'])

    # A row is counted once, however many zeros / NaNs it contains.
    rows_with_zero = int(values.eq(0).any(axis=1).sum())
    rows_with_nan = int(values.isna().any(axis=1).sum())

    print(
        f"{name}:",
        f"Rows with 0s: {rows_with_zero}",
        f"Rows with NaNs: {rows_with_nan}",
        "\n",
        sep="\n",
    )

    counts_str += f"{name}:\n Rows with 0s: {rows_with_zero}\n Rows with NaNs: {rows_with_nan}\n\n"