In [17]:
using CSV
using DataFrames

In [18]:
# Load the four derived source tables (census, fbi_cde, fema_eal,
# usda_food_insecure) for each geographic level. Every element is a DataFrame
# and the order of the tables is the same for all three levels.
dfs_county = map(
    path -> DataFrame(CSV.File(path)),
    [
        "derived_data/county/census.csv",
        "derived_data/county/fbi_cde.csv",
        "derived_data/county/fema_eal.csv",
        "derived_data/county/usda_food_insecure.csv",
    ],
);

dfs_zcta = map(
    path -> DataFrame(CSV.File(path)),
    [
        "derived_data/zcta/census.csv",
        "derived_data/zcta/fbi_cde.csv",
        "derived_data/zcta/fema_eal.csv",
        "derived_data/zcta/usda_food_insecure.csv",
    ],
);

dfs_tract = map(
    path -> DataFrame(CSV.File(path)),
    [
        "derived_data/tract/census.csv",
        "derived_data/tract/fbi_cde.csv",
        "derived_data/tract/fema_eal.csv",
        "derived_data/tract/usda_food_insecure.csv",
    ],
);

# Drop the STATE and COUNTYNAME columns from the 2nd (fbi_cde) and 3rd
# (fema_eal) county tables, in place.
# NOTE(review): presumably these columns would otherwise clash or be redundant
# when the tables are joined on "COUNTY" — confirm against the CSV schemas.
for k in 2:3
    select!(dfs_county[k], Not(["STATE", "COUNTYNAME"]))
end

# NOTE(review): this definition lives in the notebook's Main module and uses the
# same name as `Base.merge`; it is a separate function, not a new method of it.
"""
    merge(dfs, merge_col)

Left-join every table in `dfs` (from the second one on) onto the first table,
using `merge_col` as the join key, then replace `missing` entries with `NaN`
in every column of the result.

The first element of `dfs` is modified in place (`leftjoin!`, `replace!`) and
is the object that is returned.
"""
function merge(dfs, merge_col)
    merged = dfs[1]
    for k in 2:lastindex(dfs)
        leftjoin!(merged, dfs[k]; on=merge_col)
    end

    # Rows without a match in a joined table hold `missing`; swap those for NaN.
    foreach(names(merged)) do colname
        replace!(merged[!, colname], missing => NaN)
    end

    return merged
end

# One merged table per geographic level, joined on that level's key column.
df_county, df_zcta, df_tract = (
    merge(dfs_county, "COUNTY"),
    merge(dfs_zcta, "ZIP"),
    merge(dfs_tract, "TRACT"),
);

# Store "median_income" as a plain Float64 column in each merged table.
# NOTE(review): presumably needed because the column's element type is not
# already Float64 after the merge step — confirm.
for frame in (df_county, df_zcta, df_tract)
    frame[!, "median_income"] = convert.(Float64, frame[!, "median_income"])
end

In [19]:
# List of 34 variable names.
# NOTE(review): `vars` is not referenced by any other cell visible in this
# notebook export, and several entries (e.g. "snap_household_inv",
# "internet_household") do not match the df_county column names printed further
# down ("snap_household", "no_internet_household") — presumably left over from
# an earlier naming scheme; confirm before relying on it.
vars = [
    # household / work related names
    "single_parent_household",
    "youth_not_in_school",
    "sex_pay_gap",
    "race_pay_gap",
    "house_pay_gap",
    "snap_household_inv",
    "snap_vulnerabe_household_inv",
    "internet_household",
    "broadband_internet_household",
    "smartphone_household",
    "car_household",
    "plumbing_household",
    "kitchen_household",
    "less_than_60min_commute",
    "wfh",
    "public_transit_commute",
    "walking_commute",
    "biking_commute",
    "urban_household",
    "white_collar_workers",
    "employment_rate",
    "owner_occupied_rate",
    "median_income",
    # crime rates
    "Total Offenses Rate",
    "Crimes Against Persons Rate",
    "Crimes Against Property Rate",
    "Crimes Against Society Rate",
    # ALR_* columns
    "ALR_VALB",
    "ALR_VALP",
    "ALR_VALA",
    # la*share columns
    "lahunv1share",
    "lahunv10share",
    "lasnap1share",
    "lasnap10share",
]

34-element Vector{String}:
 "single_parent_household"
 "youth_not_in_school"
 "sex_pay_gap"
 "race_pay_gap"
 "house_pay_gap"
 "snap_household_inv"
 "snap_vulnerabe_household_inv"
 "internet_household"
 "broadband_internet_household"
 "smartphone_household"
 "car_household"
 "plumbing_household"
 "kitchen_household"
 ⋮
 "median_income"
 "Total Offenses Rate"
 "Crimes Against Persons Rate"
 "Crimes Against Property Rate"
 "Crimes Against Society Rate"
 "ALR_VALB"
 "ALR_VALP"
 "ALR_VALA"
 "lahunv1share"
 "lahunv10share"
 "lasnap1share"
 "lasnap10share"

In [20]:
# Columns for which orient() adds a sign-flipped "<name>_inv" companion column.
cols2inv = [
    "wfh",
    "public_transit_commute",
    "walking_commute",
    "biking_commute",
    "median_income",
]

"""
    orient(df)

For every name in the global `cols2inv`, add (or overwrite) a column named
`"<name>_inv"` that holds the values of column `<name>` multiplied by -1.

`df` is modified in place and returned.
"""
function orient(df)
    for source in cols2inv
        df[!, string(source, "_inv")] = df[!, source] .* -1
    end
    return df
end
    
# Columns that get_ntiles() ranks and that are kept (next to the key column)
# in the exported datasets. The "Excluded" notes list names that were commented
# out of the selection in the original notebook.
cols = [
    # Household / work indicators.
    # Excluded: "house_pay_inequity", "no_broadband_internet_household",
    #           "no_kitchen_household", "public_transit_commute_inv",
    #           "walking_commute_inv", "rural_household"
    "single_parent_household",
    "youth_not_in_school",
    "sex_pay_inequity",
    "race_pay_inequity",
    "snap_household",
    "snap_vulnerabe_household",
    "no_internet_household",
    "no_smartphone_household",
    "no_car_household",
    "no_plumbing_household",
    "over_60min_commute",
    "wfh_inv",
    "biking_commute_inv",
    "working_class_workers",
    "unemployment_rate",
    "renter_occupied",
    "median_income_inv",

    # Crime rates.
    # Excluded: "Total Offenses Rate"
    "Crimes Against Persons Rate",
    "Crimes Against Property Rate",
    "Crimes Against Society Rate",

    # ALR_* columns.
    # Excluded: "ALR_VALA"
    "ALR_VALB",
    "ALR_VALP",

    # la*share columns.
    # Excluded: "lahunv10share", "lasnap10share"
    "lahunv1share",
    "lasnap1share",
];

"""
    ntile(s)

Return, for every element of `s`, its normalized rank in `[0, 1]`.

The rank of a value is the number of elements of `s` that sort strictly before
it (equal values share the rank of their first occurrence in sorted order).
Ranks are divided by the largest rank, so the smallest value maps to `0.0` and
the largest value maps to `1.0`.

Edge cases:
- an empty `s` returns an empty `Vector{Float64}`;
- if all elements are equal (including a single element) every rank is `0`,
  and a vector of `0.0` is returned instead of dividing zero by zero.

The result is a new `Vector{Float64}` in the order of `s`; `s` is not modified.
"""
function ntile(s)
    isempty(s) && return Float64[]

    sorted = sort(s)

    # Zero-based position of the first occurrence of each distinct value.
    # `get!` only stores a rank the first time a value is seen, so ties keep
    # the rank of their first occurrence.
    rank = Dict{eltype(sorted),Float64}()
    for (pos, v) in enumerate(sorted)
        get!(rank, v, pos - 1)
    end

    top = maximum(values(rank))

    # All values identical: avoid 0 / 0 (NaN) and report the lowest rank.
    top == 0 && return zeros(Float64, length(s))

    return [rank[v] / top for v in s]
end;

"""
    get_ntiles(df)

Replace, in place, the non-NaN values of every column listed in the global
`cols` with their normalized rank as computed by `ntile`. NaN entries are left
untouched. Returns `df`.

A column that contains no non-NaN value is skipped, because there is nothing
to rank.
"""
function get_ntiles(df)
    for col in cols
        # Mask of the rows that hold an actual (non-NaN) value.
        present = .!isnan.(df[!, col])

        # Nothing to rank in an all-NaN column; do not hand `ntile` an empty vector.
        any(present) || continue

        df[present, col] = ntile(df[present, col])
    end

    return df
end;

In [21]:
# Add the sign-flipped "<name>_inv" columns to the table of every level.
df_county, df_zcta, df_tract = map(orient, (df_county, df_zcta, df_tract));

In [22]:
# Replace the selected columns of every level's table with normalized ranks.
df_county, df_zcta, df_tract = map(get_ntiles, (df_county, df_zcta, df_tract));

In [23]:
# Keep only the key column of each level followed by the columns in `cols`.
df_county, df_zcta, df_tract = map(
    ((frame, key),) -> frame[:, vcat(key, cols)],
    ((df_county, "COUNTY"), (df_zcta, "ZIP"), (df_tract, "TRACT")),
);

In [24]:
# Write one CSV file per geographic level.
for (path, frame) in (
    ("datasets/county_dataset.csv", df_county),
    ("datasets/zcta_dataset.csv", df_zcta),
    ("datasets/tract_dataset.csv", df_tract),
)
    CSV.write(path, frame)
end

In [99]:
# Print every column name of df_county wrapped in double quotes and followed by
# a comma and a space, all on one line (handy for pasting into a list literal).
for colname in names(df_county)
    print('"', colname, "\", ")
end

"single_parent_household", "youth_not_in_school", "sex_pay_inequity", "race_pay_inequity", "house_pay_inequity", "snap_household", "snap_vulnerabe_household", "no_internet_household", "no_broadband_internet_household", "no_smartphone_household", "no_car_household", "no_plumbing_household", "no_kitchen_household", "over_60min_commute", "wfh", "public_transit_commute", "walking_commute", "biking_commute", "rural_household", "working_class_workers", "unemployment_rate", "renter_occupied", "median_income", "COUNTY", "Total Offenses Rate", "Crimes Against Persons Rate", "Crimes Against Property Rate", "Crimes Against Society Rate", "ALR_VALB", "ALR_VALP", "ALR_VALA", "lahunv1share", "lahunv10share", "lasnap1share", "lasnap10share", "wfh_inv", "public_transit_commute_inv", "walking_commute_inv", "biking_commute_inv", "median_income_inv", 

In [75]:
# Show a copy of the "ALR_VALP_inv" column.
# NOTE(review): "ALR_VALP" is not listed in `cols2inv` as defined in this
# notebook, so orient() as shown does not create this column — presumably this
# cell reflects an earlier session state; confirm before re-running.
copy(df_county[!, "ALR_VALP_inv"])

3222-element Vector{Float64}:
 0.6754905014014326
 0.46652133291809406
 0.5066957334163812
 0.4447212706322018
 0.39084397383992525
 0.4484584241669262
 0.427281220803488
 0.27032077234506385
 0.7324820928059794
 0.26253503581438803
 0.436935534101526
 0.2516350046714419
 0.523512924322641
 ⋮
 0.25661787605107444
 0.4331983805668016
 0.3774525070071629
 0.37122391778262226
 0.41762690750545
 0.320149486141389
 0.31205232014948614
 0.3157894736842105
 0.03550295857988166
 0.3920896916848334
 0.4070383058237309
 0.23544067268763624