# Canadian Household Income Analysis

# Setup

In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import polars as pl
import pandas as pd
import numpy as np

Define some constants to make key column names easier to reference.

In [2]:
TOTAL_HOUSEHOLDS_COL = "HSBASHHD"
INSURANCE_COL = "HSEP001S"
PORTION_RET_INSUR_COL = "portion_retirement_insurance" # Our target column that we will calculate
INCOME_COL = "HSHNIAGG"

Read the data.

In [3]:
hs = pl.read_csv("./data/HouseholdSpend.csv")
ds = pl.read_csv("./data/DemoStats.csv")

# Check for initial null or NaN values
# Assert no NaNs in a DataFrame with float columns
assert not hs.drop_nans().shape[0] < hs.shape[0], "Spending DataFrame contains NaN values"
assert not ds.drop_nans().shape[0] < ds.shape[0], "Demographics DataFrame contains NaN values"
assert not hs.drop_nulls().shape[0] < hs.shape[0], "Spending DataFrame contains null values"
assert not ds.drop_nulls().shape[0] < ds.shape[0], "Demographics DataFrame contains null values"

display(hs.describe())
display(ds.describe())

statistic,CODE,GEO,HSBASHHD,HSHNIAGG,HSAGDISPIN,HSAGDISCIN,HSTT001,HSTE001,HSTX001,HSTC001,HSSH001S,HSFD001S,HSHO001S,HSHC001S,HSHF001S,HSTR001S,HSRE001S,HSPC001S,HSCL001S,HSED002S,HSRO001S,HSTA001S,HSGC001S,HSME001S,HSEP001S,HSMG001S,HSTE001ZBS,HSWH002S,HSWH028S,HSWH040S,HSWH041S,HSWH042S,HSSH001,HSSH002,HSSH003,HSSH004,…,HSTR002,HSTR003,HSTR004,HSTR005,HSTR006,HSTR007,HSTR008,HSTR009,HSTR058,HSTR010,HSTR011,HSTR012,HSTR014M,HSTR015,HSTR020,HSTR030,HSTR031,HSTR032,HSTR033,HSTR034,HSTR035,HSTR036,HSTR037,HSTR038,HSTR039,HSTR040,HSTR041,HSTR050,HSTR051,HSTR052,HSTR053,HSTR054,HSTR055,HSTR056,HSTR056A,HSTR056B,HSTR057
str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""868970""","""868970""",868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,…,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0
"""null_count""","""0""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,,18.432629,2253400.0,1717000.0,1234200.0,2400800.0,2191700.0,421335.834661,1606500.0,388812.651163,278016.994627,111401.557243,109343.827301,76103.411113,271135.371662,87929.804561,45315.650319,70798.452452,34398.504565,3570.867859,70468.350013,25849.158365,33310.372386,115213.19111,48732.91062,209040.481816,67544.21004,4806.924846,112565.256385,15089.24985,9034.840695,388812.651163,303764.492909,89251.190609,88053.450315,…,236398.761036,118477.006209,114562.677761,14888.288571,8607.895903,91066.493288,188.957063,83.539013,105.41805,3725.371385,3469.552047,2092.59778,1376.954267,255.819338,1504.664209,116417.090618,3287.664753,10103.061471,12676.188385,18214.142317,100.530695,63415.559732,2639.560863,4612.181943,1368.200459,379.01064,989.189819,34736.610626,5995.306708,2258.469958,915.416319,23038.220756,71.1409,1322.434304,831.592595,490.84171,1135.621681
"""std""",,,69.379951,8278700.0,6327600.0,4634000.0,9132900.0,8056200.0,1596900.0,5925500.0,1366100.0,1047100.0,420881.63654,429642.774397,290959.592849,1075100.0,327376.60947,163377.943862,249311.269506,114999.916794,14386.745375,264786.580294,94955.831503,127397.296924,431645.982804,206454.726959,1271300.0,341440.420828,40531.631801,898861.389623,51492.602769,35858.838458,1366100.0,1027000.0,292325.224526,287942.333453,…,990960.632583,507452.95803,494184.487741,58848.760548,45895.371562,398856.15598,794.022208,358.55285,497.256289,14112.764792,13213.369738,8173.498909,5705.767025,1422.161917,5673.303299,486744.921092,14400.182837,39378.141535,60281.31644,71889.568827,615.785308,284702.777168,13567.714202,14917.235034,5725.037991,2091.296014,3882.026395,101629.434794,19414.670337,7613.821326,5037.673958,71509.135272,257.018871,6638.799848,5076.885882,2097.649493,4203.801021
"""min""","""A0A0A0""","""FSALDU""",0.0,0.0,0.0,0.0,-24249.492537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-14599000.0,0.0,0.0,-16806000.0,-26676.98553,-204380.698719,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""25%""",,,3.0,267191.073885,214098.269455,150277.201174,288995.986572,265546.050861,33157.853004,207473.234744,45966.938994,37284.344091,14714.866683,14486.524941,8994.89881,35944.486804,10559.894359,5941.987115,8917.254867,3070.145183,443.75734,8651.284113,2763.922466,3866.706749,14171.577863,3756.238973,0.0,6565.759691,284.748359,-6243.536067,1559.30506,968.813853,45966.938994,36177.144193,4032.422898,4007.86451,…,31974.915025,14884.392499,14404.559008,1721.930707,889.08304,10994.748987,18.79587,8.307298,6.952265,341.867661,307.276114,167.899519,93.667955,6.870296,105.847834,16361.845444,313.292801,1178.97798,1513.451358,2469.125994,7.898357,8700.00923,189.507627,384.138866,151.543329,37.742767,93.143563,2811.475222,162.774495,167.312143,35.976385,1768.315493,6.114269,85.412388,36.712766,23.624462,88.826063
"""50%""",,,8.0,912463.575234,715484.197728,509690.363886,969018.390794,888496.551166,140149.72964,673798.921422,156532.519617,119802.611538,46963.310598,47206.797141,30088.717747,113572.988935,35444.228008,18884.714067,29071.994886,11240.721641,1429.571614,29355.141248,10235.100399,13337.104212,47773.521503,15219.81177,50392.064279,24481.506093,1049.907105,18375.896387,5767.899271,3728.618986,156532.519617,122522.885155,25032.303773,24717.905952,…,99636.881325,47255.263545,45702.223467,5675.289883,3012.454702,35561.278678,64.954671,29.399074,27.016134,1221.710075,1114.524975,634.507481,383.183443,48.33781,455.550063,50448.828495,1010.89752,4106.746145,4918.767677,7819.010503,30.350255,26710.649778,726.944444,1601.499714,495.907153,121.108084,342.930165,10954.693342,1156.778626,701.484161,161.412101,7057.834978,23.787995,343.285857,161.941917,117.546716,354.380653
"""75%""",,,18.0,2301700.0,1761200.0,1266300.0,2467700.0,2227300.0,405799.58862,1637500.0,397364.857056,283335.041457,112572.36426,111185.41082,76441.320833,271458.988702,89461.62396,45934.093846,72432.352457,33143.645302,3561.588287,72326.651697,26341.436173,33958.583529,118261.893216,45160.792428,224546.728503,66462.341916,2965.62507,138367.839985,15217.963496,9463.184174,397364.857056,309061.190349,74342.008286,73421.345491,…,234841.660337,116289.221612,112200.441102,15018.869906,8150.993847,87545.046991,180.270628,78.649214,86.006103,3560.668285,3296.700024,1929.993628,1207.25064,209.470357,1429.641014,116348.736875,2834.456579,10240.10095,12318.235716,18680.935475,86.966083,61627.089698,2153.664949,4785.370267,1321.914033,341.421503,954.217924,34053.319991,4960.248059,2008.694424,540.584951,22586.713725,63.703739,1023.247916,544.115663,420.149131,1009.684299
"""max""","""Y1A7A4""","""FSALDU""",7751.0,1165100000.0,886990000.0,643320000.0,1376100000.0,1103600000.0,221460000.0,801000000.0,200960000.0,143900000.0,55996000.0,48493000.0,38124000.0,138490000.0,40329000.0,31215000.0,33139000.0,17300000.0,2412200.0,35725000.0,16681000.0,17424000.0,63203000.0,45674000.0,272500000.0,68501000.0,6024300.0,190590000.0,6760500.0,5210200.0,200960000.0,154210000.0,38349000.0,37640000.0,…,127960000.0,70898000.0,68704000.0,8707900.0,10164000.0,52694000.0,98537.879231,43886.395799,62217.543041,2147700.0,2144300.0,1345900.0,895523.577009,272687.971232,1100600.0,58696000.0,1923000.0,5452200.0,9119300.0,8533900.0,128411.834084,37197000.0,2916500.0,1814100.0,1022500.0,406708.260287,615838.990473,12478000.0,2320100.0,1070700.0,653408.354038,9950600.0,29234.466882,1491200.0,1207800.0,397905.142758,733300.179849


statistic,CODE,GEO,ECYASQKM,ECYALSQKM,ECYBASPOP,ECYBASHHD,ECYBASHPOP,ECYBAS12P,ECYBAS15P,ECYBAS18P,ECYBAS19P,ECYBAS12HP,ECYBAS15HP,ECYBAS18HP,ECYBAS19HP,ECYBASTNGH,ECYBASADUH,ECYBASCF,ECYBASCFH,ECYBASKID,ECYBASLF,ECYPTAPOP,ECYPTA_0_4,ECYPTA_5_9,ECYPTA1014,ECYPTA1519,ECYPTA2024,ECYPTA2529,ECYPTA3034,ECYPTA3539,ECYPTA4044,ECYPTA4549,ECYPTA5054,ECYPTA5559,ECYPTA6064,ECYPTA6569,…,ECYRIMINDI,ECYRIMNEPA,ECYRIMPAKI,ECYRIMSRI,ECYRIMSASO,ECYRIMOCE,ECYRIMAUSS,ECYRIMOCEO,ECYPIMHPOP,ECYPIMNI,ECYPIMIM,ECYPIMP01,ECYPIM0110,ECYPIM1115,ECYPIM1621,ECYPIM22CY,ECYPIMNPER,ECYAIMHPOP,ECYAIMNI,ECYAIMIM,ECYAIM_0_5,ECYAIM_514,ECYAIM1524,ECYAIM2544,ECYAIM45P,ECYAIMNPER,ECYGENHPOP,ECYGEN1GEN,ECYGEN2GEN,ECYGEN3GEN,ECYTCAHPOP,ECYTCACIT,ECYTCA_U18,ECYTCA_18P,ECYNCANCIT,ECYNCA_U18,ECYNCA_18P
str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""868970""","""868970""",868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,…,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0
"""null_count""","""0""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,,0.0,0.0,46.684199,18.432629,45.881762,41.254934,39.613594,37.819792,37.323999,40.336732,38.832472,37.263988,36.746572,3.072744,24.49923,12.76093,12.051364,12.740845,24.633679,46.684199,2.181318,2.390635,2.498651,2.672259,3.213171,3.308621,3.406466,3.258362,3.117668,2.858167,2.776588,2.859335,3.10898,2.80384,…,0.477122,0.005692,0.038764,0.008754,0.00028,0.007272,0.005106,0.002166,45.881762,31.717238,11.757474,4.721479,2.368195,1.391643,1.640656,1.635501,2.407051,45.881762,31.717238,11.757474,1.066154,2.036562,2.230265,5.299374,1.125119,2.407051,45.881762,14.168175,7.448384,24.265204,45.881762,40.528679,7.761493,32.767185,5.353084,0.85628,4.496803
"""std""",,,0.0,0.0,174.837877,69.379951,171.817976,152.539099,145.876662,139.320601,137.611402,149.038058,143.038376,137.181937,135.319262,12.468123,87.647782,50.126862,47.828225,49.02113,89.352097,174.837877,9.518254,10.400337,10.705186,10.30089,10.888878,10.894537,11.699239,11.371185,11.099312,10.485757,10.544898,11.657788,13.69638,12.729886,…,2.516032,0.123928,0.364373,0.142995,0.02439,0.118581,0.094428,0.057987,171.817976,149.721097,32.868441,14.28478,7.28958,4.605282,5.813031,5.761745,10.479555,171.817976,149.721097,32.868441,3.400699,6.112712,6.550715,14.905824,3.936661,10.479555,171.817976,40.185434,24.588753,131.818196,171.817976,163.866165,34.782061,130.449848,17.893635,3.211073,15.020402
"""min""","""A0A0A0""","""FSALDU""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""25%""",,,0.0,0.0,8.0,3.0,7.0,7.0,7.0,6.0,6.0,7.0,7.0,6.0,6.0,0.0,4.0,2.0,2.0,2.0,4.0,8.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,4.0,7.0,7.0,1.0,6.0,0.0,0.0,0.0
"""50%""",,,0.0,0.0,21.0,8.0,21.0,19.0,19.0,18.0,17.0,19.0,18.0,17.0,17.0,1.0,11.0,6.0,5.0,5.0,11.0,21.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,15.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,21.0,15.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,21.0,3.0,3.0,11.0,21.0,19.0,3.0,16.0,1.0,0.0,1.0
"""75%""",,,0.0,0.0,47.0,18.0,46.0,41.0,40.0,38.0,38.0,41.0,39.0,38.0,37.0,3.0,25.0,13.0,12.0,12.0,25.0,47.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,32.0,11.0,5.0,2.0,1.0,1.0,1.0,1.0,46.0,32.0,11.0,1.0,2.0,2.0,5.0,1.0,1.0,46.0,13.0,8.0,23.0,46.0,41.0,8.0,33.0,4.0,1.0,4.0
"""max""","""Y1A7A4""","""FSALDU""",0.0,0.0,23507.0,7751.0,23378.0,20141.0,19020.0,17849.0,17487.0,19950.0,18891.0,17781.0,17436.0,2169.0,11623.0,6837.0,6487.0,8218.0,13551.0,23507.0,1241.0,1530.0,1791.0,1871.0,1954.0,1200.0,1478.0,1437.0,1811.0,1723.0,1617.0,1545.0,1393.0,1293.0,…,326.0,29.0,66.0,28.0,14.0,22.0,18.0,13.0,23378.0,19212.0,4372.0,2055.0,1061.0,666.0,407.0,813.0,894.0,23378.0,19212.0,4372.0,508.0,971.0,964.0,1663.0,402.0,894.0,23378.0,4874.0,4694.0,14518.0,23378.0,22529.0,5461.0,17068.0,1333.0,318.0,1164.0


Drop the zip code and geography level columns - these are just IDs that won't help our model.

In [4]:
hs.drop_in_place("CODE")
hs.drop_in_place("GEO")
ds.drop_in_place("GEO")
ds.drop_in_place("CODE")


CODE
str
"""A0A0A0"""
"""A0A1A0"""
"""A0A1B0"""
"""A0A1C0"""
"""A0A1E0"""
…
"""Y1A6W1"""
"""Y1A7A1"""
"""Y1A7A2"""
"""Y1A7A3"""


## Feature Engineering

Currently all the household spending variables represent total spending for entire neighbourhood; we will convert them to average per household spending which is probably a better predictor of per household insurance contribution. We don't want to confuse the model with some neighbourhoods having dramatically higher spending only because of high population.

Then we will construct our target variable, the portion of income a household spends on insurance and retirement savings.

In [5]:
total_households = hs.select(TOTAL_HOUSEHOLDS_COL)
hs.drop_in_place(TOTAL_HOUSEHOLDS_COL)
hs = hs.with_columns(
    pl.all() / total_households[TOTAL_HOUSEHOLDS_COL]
)

# Construct our target variable: portion of income spent on insurance and retirement
hs = hs.with_columns(
    (hs[INSURANCE_COL] / hs[INCOME_COL]).alias(PORTION_RET_INSUR_COL)
)

hs = hs.fill_nan(0) # replace NaNs resulting from zip codes with no households; we will remove these later
hs.describe()

statistic,HSHNIAGG,HSAGDISPIN,HSAGDISCIN,HSTT001,HSTE001,HSTX001,HSTC001,HSSH001S,HSFD001S,HSHO001S,HSHC001S,HSHF001S,HSTR001S,HSRE001S,HSPC001S,HSCL001S,HSED002S,HSRO001S,HSTA001S,HSGC001S,HSME001S,HSEP001S,HSMG001S,HSTE001ZBS,HSWH002S,HSWH028S,HSWH040S,HSWH041S,HSWH042S,HSSH001,HSSH002,HSSH003,HSSH004,HSSH053,HSSH054,HSSH005,…,HSTR003,HSTR004,HSTR005,HSTR006,HSTR007,HSTR008,HSTR009,HSTR058,HSTR010,HSTR011,HSTR012,HSTR014M,HSTR015,HSTR020,HSTR030,HSTR031,HSTR032,HSTR033,HSTR034,HSTR035,HSTR036,HSTR037,HSTR038,HSTR039,HSTR040,HSTR041,HSTR050,HSTR051,HSTR052,HSTR053,HSTR054,HSTR055,HSTR056,HSTR056A,HSTR056B,HSTR057,portion_retirement_insurance
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,…,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",113083.263311,86004.969701,61756.463377,124100.46217,110659.096698,21329.761528,81120.980552,19451.348892,14051.852189,5640.647583,5453.737856,3894.12655,13714.912167,4475.787785,2307.920882,3620.452159,1831.592429,180.070429,3552.323879,1258.671821,1687.535929,5767.651152,2440.703466,13441.365471,3679.226495,183.21153,8340.110651,767.014484,471.802312,19451.348892,15057.07041,3572.375804,3527.188921,3503.827985,23.360936,8.074081,…,5998.314594,5794.098686,754.827871,489.202334,4550.06848,10.131729,4.580577,5.551153,194.084179,180.690133,108.968722,71.721411,13.394046,71.132071,5989.931861,164.19669,523.370725,666.971454,967.513306,5.106435,3239.682473,127.909632,224.4644,70.716746,19.397331,51.319415,1655.533641,256.831429,101.69908,44.400143,1130.306389,3.304134,67.110823,42.685376,24.425447,51.881644,0.048015
"""std""",95888.112232,61585.741431,47271.395164,106423.582737,96165.001769,34249.337102,57520.845902,14437.227215,8735.079763,4150.74371,3486.151988,3706.807942,9177.026733,3721.066535,1627.426249,2964.140116,2626.886746,143.91476,2806.49432,896.55783,1738.505479,3501.73853,5563.278449,23340.657412,3268.849224,363.169609,20937.345568,875.526983,401.521054,14437.227215,10180.499762,3356.026359,3310.699569,3284.343463,36.142203,26.417176,…,4397.121656,4230.065806,621.170324,541.99211,3477.680417,12.036328,5.169206,9.316355,223.83379,214.586982,136.121851,94.865981,19.929029,83.535532,3689.98299,172.769453,370.270476,599.671703,676.483186,6.464252,2048.639749,158.369118,216.206197,66.25426,19.125852,55.045244,1714.792859,293.847957,113.030572,94.403067,1334.561812,2.892185,131.658002,108.022754,33.055399,51.994555,0.022317
"""min""",0.0,0.0,0.0,-24249.492537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-713372.555881,0.0,0.0,-821073.240312,-6402.348112,-21679.582212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""25%""",73127.635536,60115.360364,41531.846159,73269.917426,72382.541976,8272.243558,57945.224346,12655.415474,10652.687589,4175.338127,4131.35436,2285.338647,9472.642127,2736.240782,1622.154486,2405.924724,669.953215,114.432011,2262.510282,707.929534,1031.779193,3744.966973,863.763135,0.0,1368.972054,66.964758,-1342.756767,392.621006,235.464322,12655.415474,10160.037047,1025.873981,1020.371721,1018.227082,0.697442,0.262822,…,3580.051946,3455.204029,415.054067,162.159547,2541.290061,3.971605,1.645518,1.304341,74.45857,65.497224,34.255887,19.944268,0.827326,27.365728,4236.838064,70.623326,341.409674,339.145739,595.707259,1.566959,2137.112083,39.65448,97.032845,32.311686,8.001997,20.05245,684.36025,31.349019,42.196381,6.679889,412.544377,1.409769,20.144985,8.370629,3.945077,21.681811,0.039442
"""50%""",105518.335923,83307.457445,58778.608128,113891.309542,103292.475452,15610.903736,79140.170527,18369.143203,14157.421684,5462.094183,5463.240541,3520.890143,13559.25216,4147.454981,2236.969766,3387.950894,1289.236969,165.502325,3350.386084,1155.836663,1515.147671,5776.975331,1733.92237,8083.290954,3163.885399,125.759282,3163.054053,674.824496,436.90708,18369.143203,14472.843288,2667.499848,2633.007704,2619.007433,5.869349,2.598953,…,5674.482391,5489.955337,661.997138,351.978012,4260.42475,7.593488,3.303306,3.154613,144.521725,131.856312,75.751763,47.436984,6.417425,52.078122,6064.64217,104.745439,520.526665,580.22602,907.327164,3.407644,3225.71175,86.680592,191.456219,57.892544,12.44561,40.766558,1295.724858,146.591663,82.502736,18.106371,853.162573,2.696411,37.525668,17.488713,15.835964,40.908232,0.051997
"""75%""",142149.482269,108671.653657,77968.111932,159468.59394,138045.168772,25721.610617,101931.492258,25263.28767,17642.35633,6890.344213,6785.958768,4917.635412,17680.653551,5748.992286,2925.140756,4536.287478,2300.070615,231.29232,4592.127959,1726.235089,2075.518211,7838.505508,2809.386535,20693.84129,5163.760766,197.675522,13820.106465,991.037259,657.058631,25263.28767,19635.451266,5306.417259,5233.279191,5196.709294,33.455138,8.257677,…,7959.819968,7692.13778,994.766152,645.352644,6107.840209,12.843005,5.92594,6.681901,251.409835,234.78952,147.243044,90.92053,18.259162,93.679454,7772.5892,186.766599,689.220259,865.282575,1295.772351,6.595273,4269.965274,165.890106,310.540148,94.816481,24.562612,68.147281,2313.104392,393.628202,130.260078,43.64381,1519.098779,4.451897,62.724912,35.017269,32.134493,66.985569,0.062119
"""max""",7270000.0,4072600.0,3303800.0,6882200.0,7135000.0,3126400.0,3702000.0,1007600.0,538237.627175,344355.052512,197378.71681,272936.955798,535182.209863,247795.220702,105537.174494,216990.127662,249818.77172,10452.544728,236228.228001,74161.998085,138862.42004,201084.264762,817688.884688,2768000.0,103427.199404,33074.809974,2742600.0,80632.148718,14931.316191,1007600.0,739870.985769,241808.709393,239609.371991,239407.836089,1867.077059,4723.175301,…,293724.242998,276127.105616,41888.35566,39957.46051,251790.316824,824.100219,544.712467,673.36081,25537.027954,25433.863411,17655.602112,8094.590322,1474.977482,5983.712563,211397.412746,7337.435685,22692.312704,45226.872387,44471.351116,716.708352,101907.088828,13167.704698,13807.872756,6295.665656,1307.036666,4988.62899,114145.307319,13032.149094,8066.810543,11356.73863,96997.1598,141.980793,7699.87487,7006.249091,2635.985182,3635.902611,0.15988


## Feature Selection

Let's remove some redundant features. We'll first drop features highly correlated with our target variable. Then, we'll read in the metadata file for the dataset, which is structured like a pre-order depth first search tree traversal. We'll use this metadata to identify and delete the non-"leaf node" features that don't give us any new info. For example, "Total Population" is redundant when we have "Total Male Population", "Total Female Population" and "Total Other Population". 

In [6]:
# drop columns used to calculate target or highly correlated with the target (related to income or insurance spend)
columns_to_drop = [INCOME_COL, "HSAGDISPIN", "HSAGDISCIN", INSURANCE_COL]
for col in columns_to_drop:
    hs.drop_in_place(col)


In [7]:
def get_columns_to_drop(metadata_file):
    # Read the metadata file
    df = pl.read_csv(metadata_file)
    
    #Initialize lists for variables to drop
    non_leaf_variables = []
    summary_variables = []
    
    # process rows in order to identify non-leaf nodes
    for i in range(len(df)):
        var = df['Variable'][i]
        hier_level = df['Hierarchy Level'][i]
        

        # Check if this is a summary variable (ends with AVG or MED)
        if var.endswith('AVG') or var.endswith('MED'):
            summary_variables.append(var)
            continue
        
        # Check if the next row exists and is a direct child
        if i < len(df) - 1 and df['Hierarchy Level'][i+1] == hier_level + 1:
            # This is a parent node (has at least one child)
            non_leaf_variables.append(var)
    
    #  Combine all var to drop
    columns_to_drop = list(set(non_leaf_variables +  summary_variables))
    
    return columns_to_drop


ds_columns_to_drop = get_columns_to_drop("data\DemoStats 2024 - Metadata.csv")
hs_columns_to_drop = get_columns_to_drop("data\HouseholdSpend 2024 - Metadata.csv")

# set aside insurance and retirement - this is our target variable

dropped_hs_columns = []
for col in hs_columns_to_drop:
    if col in hs.columns:
        hs.drop_in_place(col)
        dropped_hs_columns.append(col)
print(f"Dropped columns from hs: {', '.join(dropped_hs_columns)}")

dropped_ds_columns = []
for col in ds_columns_to_drop:
    if col in ds.columns:
        ds.drop_in_place(col)
        dropped_ds_columns.append(col)
print(f"Dropped columns from ds: {', '.join(dropped_ds_columns)}")


display(hs.describe())
display(ds.describe())

  ds_columns_to_drop = get_columns_to_drop("data\DemoStats 2024 - Metadata.csv")
  hs_columns_to_drop = get_columns_to_drop("data\HouseholdSpend 2024 - Metadata.csv")


Dropped columns from hs: HSHE011, HSTR050, HSCS011, HSSH010, HSFD990, HSSH030, HSHO001, HSCC001, HSSH031, HSHC001, HSTR039, HSCS001, HSHF005, HSHF002, HSSH041, HSHF001, HSSH040, HSTC001, HSSH050, HSTR002, HSTR008, HSHO014, HSHC022, HSHO010, HSSH003, HSSH016, HSSH047, HSSH002, HSTR004, HSHC005, HSHE001, HSSH037, HSTR011, HSHC002, HSCS003, HSTR010, HSTE001, HSTT001, HSWH042S, HSCC002, HSFD991, HSHC006, HSTR003, HSTE001ZBS, HSTR030, HSCC013, HSSH032, HSSH036, HSSH033, HSSH034, HSHE002, HSSH001, HSHC010, HSSH012, HSSH046, HSFD001, HSTR056, HSSH035, HSSH004, HSHO003, HSSH012B, HSHE010, HSHO018, HSHE020, HSTR001, HSHE012, HSHC004
Dropped columns from ds: ECYTCAHPOP, ECYTIMEU, ECYHNIHHD, ECYPNIHP15, ECYRIMWAF, ECYTIMSAF, ECYRIMHPOP, ECYEDAHPWK, ECYTIMEA, ECYRIMEEU, ECYPTAMED, ECYMOTSING, ECYMARNMCL, ECYRELCHR, ECYRIMAM, ECYRIMOCE, ECYPMAPOP, ECYRIMA, ECYHTY1PH, ECYHTYFHT, ECYPMAMED, ECYHTYHHD, ECYACTHPL, ECYVISHPOP, ECYTIMNAF, ECYEDUHP15, ECYMTNMED, ECYHFSC, ECYTIMSA, ECYPTAAVG, ECYHTY1FH, EC

statistic,HSTX001,HSSH001S,HSFD001S,HSHO001S,HSHC001S,HSHF001S,HSTR001S,HSRE001S,HSPC001S,HSCL001S,HSED002S,HSRO001S,HSTA001S,HSGC001S,HSME001S,HSMG001S,HSWH002S,HSWH028S,HSWH040S,HSWH041S,HSSH053,HSSH054,HSSH005,HSSH006,HSSH007,HSSH011,HSSH014,HSSH013,HSSH015,HSSH017,HSSH018,HSSH019,HSSH021,HSSH020,HSSH022,HSRM002A,…,HSHE011B,HSHE031,HSHE012M,HSHE032,HSHE013,HSHE015,HSHE016,HSHE021,HSHE023,HSTR005,HSTR006,HSTR007,HSTR009,HSTR058,HSTR012,HSTR014M,HSTR015,HSTR020,HSTR031,HSTR032,HSTR033,HSTR034,HSTR035,HSTR036,HSTR037,HSTR038,HSTR040,HSTR041,HSTR051,HSTR052,HSTR053,HSTR054,HSTR055,HSTR056A,HSTR056B,HSTR057,portion_retirement_insurance
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,…,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",21329.761528,19451.348892,14051.852189,5640.647583,5453.737856,3894.12655,13714.912167,4475.787785,2307.920882,3620.452159,1831.592429,180.070429,3552.323879,1258.671821,1687.535929,2440.703466,3679.226495,183.21153,8340.110651,767.014484,3503.827985,23.360936,8.074081,32.17807,4.934732,5734.197578,2039.657603,224.346864,450.89118,688.498413,42.610658,57.327097,22.868613,19.337479,154.819705,36.180773,…,50.669429,47.839276,48.257366,24.473367,196.022564,116.300456,314.245848,123.612696,218.320846,754.827871,489.202334,4550.06848,4.580577,5.551153,108.968722,71.721411,13.394046,71.132071,164.19669,523.370725,666.971454,967.513306,5.106435,3239.682473,127.909632,224.4644,19.397331,51.319415,256.831429,101.69908,44.400143,1130.306389,3.304134,42.685376,24.425447,51.881644,0.048015
"""std""",34249.337102,14437.227215,8735.079763,4150.74371,3486.151988,3706.807942,9177.026733,3721.066535,1627.426249,2964.140116,2626.886746,143.91476,2806.49432,896.55783,1738.505479,5563.278449,3268.849224,363.169609,20937.345568,875.526983,3284.343463,36.142203,26.417176,38.451252,11.039197,5211.756831,2124.710405,429.364967,417.349717,761.746528,40.965083,55.488358,29.495715,33.196496,323.674195,65.65381,…,58.162434,61.261459,55.079125,43.653086,179.932695,121.896706,467.674284,173.012319,308.065888,621.170324,541.99211,3477.680417,5.169206,9.316355,136.121851,94.865981,19.929029,83.535532,172.769453,370.270476,599.671703,676.483186,6.464252,2048.639749,158.369118,216.206197,19.125852,55.045244,293.847957,113.030572,94.403067,1334.561812,2.892185,108.022754,33.055399,51.994555,0.022317
"""min""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-821073.240312,-6402.348112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""25%""",8272.243558,12655.415474,10652.687589,4175.338127,4131.35436,2285.338647,9472.642127,2736.240782,1622.154486,2405.924724,669.953215,114.432011,2262.510282,707.929534,1031.779193,863.763135,1368.972054,66.964758,-1342.756767,392.621006,1018.227082,0.697442,0.262822,1.466898,0.420496,1979.851196,803.162301,33.183931,200.385121,252.912423,16.327042,19.132909,4.387385,3.180641,14.227029,8.461287,…,11.786362,8.570196,10.828031,1.906987,105.855615,55.070358,62.622815,4.26806,64.93797,415.054067,162.159547,2541.290061,1.645518,1.304341,34.255887,19.944268,0.827326,27.365728,70.623326,341.409674,339.145739,595.707259,1.566959,2137.112083,39.65448,97.032845,8.001997,20.05245,31.349019,42.196381,6.679889,412.544377,1.409769,8.370629,3.945077,21.681811,0.039442
"""50%""",15610.903736,18369.143203,14157.421684,5462.094183,5463.240541,3520.890143,13559.25216,4147.454981,2236.969766,3387.950894,1289.236969,165.502325,3350.386084,1155.836663,1515.147671,1733.92237,3163.885399,125.759282,3163.054053,674.824496,2619.007433,5.869349,2.598953,19.812133,1.576386,5042.410649,1757.735701,90.81336,442.696186,545.144445,35.285086,49.064541,14.777471,10.228909,58.967873,22.751678,…,32.806617,29.17687,32.719464,9.930335,172.364567,96.705751,191.197661,32.58439,156.145562,661.997138,351.978012,4260.42475,3.303306,3.154613,75.751763,47.436984,6.417425,52.078122,104.745439,520.526665,580.22602,907.327164,3.407644,3225.71175,86.680592,191.456219,12.44561,40.766558,146.591663,82.502736,18.106371,853.162573,2.696411,17.488713,15.835964,40.908232,0.051997
"""75%""",25721.610617,25263.28767,17642.35633,6890.344213,6785.958768,4917.635412,17680.653551,5748.992286,2925.140756,4536.287478,2300.070615,231.29232,4592.127959,1726.235089,2075.518211,2809.386535,5163.760766,197.675522,13820.106465,991.037259,5196.709294,33.455138,8.257677,48.373973,4.634995,8317.980422,2810.96174,226.345837,617.921788,930.506308,57.592803,80.353802,30.693925,22.758953,176.533874,43.865413,…,70.643358,64.115551,67.825458,29.309771,256.502914,152.363281,408.697915,217.781427,297.466038,994.766152,645.352644,6107.840209,5.92594,6.681901,147.243044,90.92053,18.259162,93.679454,186.766599,689.220259,865.282575,1295.772351,6.595273,4269.965274,165.890106,310.540148,24.562612,68.147281,393.628202,130.260078,43.64381,1519.098779,4.451897,35.017269,32.134493,66.985569,0.062119
"""max""",3126400.0,1007600.0,538237.627175,344355.052512,197378.71681,272936.955798,535182.209863,247795.220702,105537.174494,216990.127662,249818.77172,10452.544728,236228.228001,74161.998085,138862.42004,817688.884688,103427.199404,33074.809974,2742600.0,80632.148718,239407.836089,1867.077059,4723.175301,2018.188124,567.65581,431146.427245,126501.015067,26581.158561,28099.258635,68319.676587,2649.200838,3907.01858,4283.505337,3760.923512,38449.444531,6921.904262,…,4031.862921,2864.974592,2963.306302,3511.165337,12898.986191,8331.291257,42750.155217,9752.413471,29623.725246,41888.35566,39957.46051,251790.316824,544.712467,673.36081,17655.602112,8094.590322,1474.977482,5983.712563,7337.435685,22692.312704,45226.872387,44471.351116,716.708352,101907.088828,13167.704698,13807.872756,1307.036666,4988.62899,13032.149094,8066.810543,11356.73863,96997.1598,141.980793,7006.249091,2635.985182,3635.902611,0.15988


statistic,ECYASQKM,ECYALSQKM,ECYBASPOP,ECYBASHHD,ECYBASHPOP,ECYBAS12P,ECYBAS15P,ECYBAS18P,ECYBAS19P,ECYBAS12HP,ECYBAS15HP,ECYBAS18HP,ECYBAS19HP,ECYBASTNGH,ECYBASADUH,ECYBASCF,ECYBASCFH,ECYBASKID,ECYBASLF,ECYPTA_0_4,ECYPTA_5_9,ECYPTA1014,ECYPTA1519,ECYPTA2024,ECYPTA2529,ECYPTA3034,ECYPTA3539,ECYPTA4044,ECYPTA4549,ECYPTA5054,ECYPTA5559,ECYPTA6064,ECYPTA6569,ECYPTA7074,ECYPTA7579,ECYPTA8084,…,ECYRIMCHIN,ECYRIMHONG,ECYRIMJAPA,ECYRIMSKOR,ECYRIMEAO,ECYRIMPHIL,ECYRIMVIET,ECYRIMSEAO,ECYRIMBANG,ECYRIMINDI,ECYRIMNEPA,ECYRIMPAKI,ECYRIMSRI,ECYRIMSASO,ECYRIMAUSS,ECYRIMOCEO,ECYPIMNI,ECYPIMP01,ECYPIM0110,ECYPIM1115,ECYPIM1621,ECYPIM22CY,ECYPIMNPER,ECYAIMNI,ECYAIM_0_5,ECYAIM_514,ECYAIM1524,ECYAIM2544,ECYAIM45P,ECYAIMNPER,ECYGEN1GEN,ECYGEN2GEN,ECYGEN3GEN,ECYTCA_U18,ECYTCA_18P,ECYNCA_U18,ECYNCA_18P
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,…,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",0.0,0.0,46.684199,18.432629,45.881762,41.254934,39.613594,37.819792,37.323999,40.336732,38.832472,37.263988,36.746572,3.072744,24.49923,12.76093,12.051364,12.740845,24.633679,2.181318,2.390635,2.498651,2.672259,3.213171,3.308621,3.406466,3.258362,3.117668,2.858167,2.776588,2.859335,3.10898,2.80384,2.278714,1.770631,1.112368,…,0.115093,0.022437,0.004211,0.020825,0.005081,0.090905,0.023022,0.010255,0.016249,0.477122,0.005692,0.038764,0.008754,0.00028,0.005106,0.002166,31.717238,4.721479,2.368195,1.391643,1.640656,1.635501,2.407051,31.717238,1.066154,2.036562,2.230265,5.299374,1.125119,2.407051,14.168175,7.448384,24.265204,7.761493,32.767185,0.85628,4.496803
"""std""",0.0,0.0,174.837877,69.379951,171.817976,152.539099,145.876662,139.320601,137.611402,149.038058,143.038376,137.181937,135.319262,12.468123,87.647782,50.126862,47.828225,49.02113,89.352097,9.518254,10.400337,10.705186,10.30089,10.888878,10.894537,11.699239,11.371185,11.099312,10.485757,10.544898,11.657788,13.69638,12.729886,10.420684,8.140557,5.13421,…,0.751078,0.268443,0.077976,0.225336,0.09944,0.712533,0.228892,0.125798,0.233412,2.516032,0.123928,0.364373,0.142995,0.02439,0.094428,0.057987,149.721097,14.28478,7.28958,4.605282,5.813031,5.761745,10.479555,149.721097,3.400699,6.112712,6.550715,14.905824,3.936661,10.479555,40.185434,24.588753,131.818196,34.782061,130.449848,3.211073,15.020402
"""min""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""25%""",0.0,0.0,8.0,3.0,7.0,7.0,7.0,6.0,6.0,7.0,7.0,6.0,6.0,0.0,4.0,2.0,2.0,2.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,6.0,0.0,0.0
"""50%""",0.0,0.0,21.0,8.0,21.0,19.0,19.0,18.0,17.0,19.0,18.0,17.0,17.0,1.0,11.0,6.0,5.0,5.0,11.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,3.0,11.0,3.0,16.0,0.0,1.0
"""75%""",0.0,0.0,47.0,18.0,46.0,41.0,40.0,38.0,38.0,41.0,39.0,38.0,37.0,3.0,25.0,13.0,12.0,12.0,25.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,1.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0,5.0,2.0,1.0,1.0,1.0,1.0,32.0,1.0,2.0,2.0,5.0,1.0,1.0,13.0,8.0,23.0,8.0,33.0,1.0,4.0
"""max""",0.0,0.0,23507.0,7751.0,23378.0,20141.0,19020.0,17849.0,17487.0,19950.0,18891.0,17781.0,17436.0,2169.0,11623.0,6837.0,6487.0,8218.0,13551.0,1241.0,1530.0,1791.0,1871.0,1954.0,1200.0,1478.0,1437.0,1811.0,1723.0,1617.0,1545.0,1393.0,1293.0,1193.0,1139.0,700.0,…,84.0,54.0,11.0,33.0,18.0,240.0,34.0,14.0,40.0,326.0,29.0,66.0,28.0,14.0,18.0,13.0,19212.0,2055.0,1061.0,666.0,407.0,813.0,894.0,19212.0,508.0,971.0,964.0,1663.0,402.0,894.0,4874.0,4694.0,14518.0,5461.0,17068.0,318.0,1164.0


## Missing Values and Imputation

Let's join the two datasets together so we have a single feature matrix.

In [8]:
data = pl.concat([hs, ds], how="horizontal")
display(data.describe())

statistic,HSTX001,HSSH001S,HSFD001S,HSHO001S,HSHC001S,HSHF001S,HSTR001S,HSRE001S,HSPC001S,HSCL001S,HSED002S,HSRO001S,HSTA001S,HSGC001S,HSME001S,HSMG001S,HSWH002S,HSWH028S,HSWH040S,HSWH041S,HSSH053,HSSH054,HSSH005,HSSH006,HSSH007,HSSH011,HSSH014,HSSH013,HSSH015,HSSH017,HSSH018,HSSH019,HSSH021,HSSH020,HSSH022,HSRM002A,…,ECYRIMCHIN,ECYRIMHONG,ECYRIMJAPA,ECYRIMSKOR,ECYRIMEAO,ECYRIMPHIL,ECYRIMVIET,ECYRIMSEAO,ECYRIMBANG,ECYRIMINDI,ECYRIMNEPA,ECYRIMPAKI,ECYRIMSRI,ECYRIMSASO,ECYRIMAUSS,ECYRIMOCEO,ECYPIMNI,ECYPIMP01,ECYPIM0110,ECYPIM1115,ECYPIM1621,ECYPIM22CY,ECYPIMNPER,ECYAIMNI,ECYAIM_0_5,ECYAIM_514,ECYAIM1524,ECYAIM2544,ECYAIM45P,ECYAIMNPER,ECYGEN1GEN,ECYGEN2GEN,ECYGEN3GEN,ECYTCA_U18,ECYTCA_18P,ECYNCA_U18,ECYNCA_18P
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,…,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0,868970.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",21329.761528,19451.348892,14051.852189,5640.647583,5453.737856,3894.12655,13714.912167,4475.787785,2307.920882,3620.452159,1831.592429,180.070429,3552.323879,1258.671821,1687.535929,2440.703466,3679.226495,183.21153,8340.110651,767.014484,3503.827985,23.360936,8.074081,32.17807,4.934732,5734.197578,2039.657603,224.346864,450.89118,688.498413,42.610658,57.327097,22.868613,19.337479,154.819705,36.180773,…,0.115093,0.022437,0.004211,0.020825,0.005081,0.090905,0.023022,0.010255,0.016249,0.477122,0.005692,0.038764,0.008754,0.00028,0.005106,0.002166,31.717238,4.721479,2.368195,1.391643,1.640656,1.635501,2.407051,31.717238,1.066154,2.036562,2.230265,5.299374,1.125119,2.407051,14.168175,7.448384,24.265204,7.761493,32.767185,0.85628,4.496803
"""std""",34249.337102,14437.227215,8735.079763,4150.74371,3486.151988,3706.807942,9177.026733,3721.066535,1627.426249,2964.140116,2626.886746,143.91476,2806.49432,896.55783,1738.505479,5563.278449,3268.849224,363.169609,20937.345568,875.526983,3284.343463,36.142203,26.417176,38.451252,11.039197,5211.756831,2124.710405,429.364967,417.349717,761.746528,40.965083,55.488358,29.495715,33.196496,323.674195,65.65381,…,0.751078,0.268443,0.077976,0.225336,0.09944,0.712533,0.228892,0.125798,0.233412,2.516032,0.123928,0.364373,0.142995,0.02439,0.094428,0.057987,149.721097,14.28478,7.28958,4.605282,5.813031,5.761745,10.479555,149.721097,3.400699,6.112712,6.550715,14.905824,3.936661,10.479555,40.185434,24.588753,131.818196,34.782061,130.449848,3.211073,15.020402
"""min""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-821073.240312,-6402.348112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""25%""",8272.243558,12655.415474,10652.687589,4175.338127,4131.35436,2285.338647,9472.642127,2736.240782,1622.154486,2405.924724,669.953215,114.432011,2262.510282,707.929534,1031.779193,863.763135,1368.972054,66.964758,-1342.756767,392.621006,1018.227082,0.697442,0.262822,1.466898,0.420496,1979.851196,803.162301,33.183931,200.385121,252.912423,16.327042,19.132909,4.387385,3.180641,14.227029,8.461287,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,6.0,0.0,0.0
"""50%""",15610.903736,18369.143203,14157.421684,5462.094183,5463.240541,3520.890143,13559.25216,4147.454981,2236.969766,3387.950894,1289.236969,165.502325,3350.386084,1155.836663,1515.147671,1733.92237,3163.885399,125.759282,3163.054053,674.824496,2619.007433,5.869349,2.598953,19.812133,1.576386,5042.410649,1757.735701,90.81336,442.696186,545.144445,35.285086,49.064541,14.777471,10.228909,58.967873,22.751678,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,3.0,11.0,3.0,16.0,0.0,1.0
"""75%""",25721.610617,25263.28767,17642.35633,6890.344213,6785.958768,4917.635412,17680.653551,5748.992286,2925.140756,4536.287478,2300.070615,231.29232,4592.127959,1726.235089,2075.518211,2809.386535,5163.760766,197.675522,13820.106465,991.037259,5196.709294,33.455138,8.257677,48.373973,4.634995,8317.980422,2810.96174,226.345837,617.921788,930.506308,57.592803,80.353802,30.693925,22.758953,176.533874,43.865413,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0,5.0,2.0,1.0,1.0,1.0,1.0,32.0,1.0,2.0,2.0,5.0,1.0,1.0,13.0,8.0,23.0,8.0,33.0,1.0,4.0
"""max""",3126400.0,1007600.0,538237.627175,344355.052512,197378.71681,272936.955798,535182.209863,247795.220702,105537.174494,216990.127662,249818.77172,10452.544728,236228.228001,74161.998085,138862.42004,817688.884688,103427.199404,33074.809974,2742600.0,80632.148718,239407.836089,1867.077059,4723.175301,2018.188124,567.65581,431146.427245,126501.015067,26581.158561,28099.258635,68319.676587,2649.200838,3907.01858,4283.505337,3760.923512,38449.444531,6921.904262,…,84.0,54.0,11.0,33.0,18.0,240.0,34.0,14.0,40.0,326.0,29.0,66.0,28.0,14.0,18.0,13.0,19212.0,2055.0,1061.0,666.0,407.0,813.0,894.0,19212.0,508.0,971.0,964.0,1663.0,402.0,894.0,4874.0,4694.0,14518.0,5461.0,17068.0,318.0,1164.0


Drop rows where total population is 0 or > 5% of the values are negative/invalid. These represent empty postal codes that will only add noise to our data.
Also drop columns where all values are zero or > 5% of the values are negative/invalid.

In [9]:
# Drop rows where total population is 0 or has too many invalid values
invalid_row_threshold = 0.05  # 5% threshold

# filter rows where > 5% of values are invalid
data = data.filter(
    ~(pl.sum_horizontal(pl.all().lt(0)) / pl.sum_horizontal(pl.all().is_not_null()) > invalid_row_threshold)
)

# filter out rows where total population is 0
data = data.filter(~pl.all_horizontal(pl.all().eq(0)))

# filter out columns where > 5% of values are invalid
numeric_cols = [s.name for s in data]
cols_to_drop = [col for col in numeric_cols if (data[col] < 0).sum() / hs.height > invalid_row_threshold]
data.drop(cols_to_drop)

# drop all zero columns
data.drop(
    [col for col in hs.columns if (data[col] == 0).all()]
)
display(data.describe())

statistic,HSTX001,HSSH001S,HSFD001S,HSHO001S,HSHC001S,HSHF001S,HSTR001S,HSRE001S,HSPC001S,HSCL001S,HSED002S,HSRO001S,HSTA001S,HSGC001S,HSME001S,HSMG001S,HSWH002S,HSWH028S,HSWH040S,HSWH041S,HSSH053,HSSH054,HSSH005,HSSH006,HSSH007,HSSH011,HSSH014,HSSH013,HSSH015,HSSH017,HSSH018,HSSH019,HSSH021,HSSH020,HSSH022,HSRM002A,…,ECYRIMCHIN,ECYRIMHONG,ECYRIMJAPA,ECYRIMSKOR,ECYRIMEAO,ECYRIMPHIL,ECYRIMVIET,ECYRIMSEAO,ECYRIMBANG,ECYRIMINDI,ECYRIMNEPA,ECYRIMPAKI,ECYRIMSRI,ECYRIMSASO,ECYRIMAUSS,ECYRIMOCEO,ECYPIMNI,ECYPIMP01,ECYPIM0110,ECYPIM1115,ECYPIM1621,ECYPIM22CY,ECYPIMNPER,ECYAIMNI,ECYAIM_0_5,ECYAIM_514,ECYAIM1524,ECYAIM2544,ECYAIM45P,ECYAIMNPER,ECYGEN1GEN,ECYGEN2GEN,ECYGEN3GEN,ECYTCA_U18,ECYTCA_18P,ECYNCA_U18,ECYNCA_18P
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,…,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",23716.777979,21628.151935,15624.396837,6271.893204,6064.066385,4329.918771,15249.749813,4976.673791,2566.200615,4025.617442,2036.566181,200.222135,3949.864917,1399.529952,1876.388299,2713.842916,4090.969221,203.714757,9273.453544,852.851177,3895.941841,25.975261,8.977653,35.779122,5.486978,6375.912392,2267.915957,249.453551,501.350471,765.548362,47.379222,63.74258,25.427842,21.501539,172.145599,40.229768,…,0.127973,0.024948,0.004682,0.023155,0.005649,0.101079,0.025598,0.011402,0.018068,0.530517,0.006329,0.043102,0.009734,0.000311,0.005677,0.002408,35.266718,5.249861,2.633219,1.547382,1.824262,1.81853,2.676424,35.266718,1.185468,2.264474,2.479854,5.892428,1.251032,2.676424,15.753737,8.281933,26.980726,8.630083,36.434165,0.952107,5.000041
"""std""",35322.476515,13589.668736,7763.405219,3898.41362,3132.451504,3659.395109,8380.73619,3592.093264,1510.666713,2852.777939,2693.572255,137.820392,2680.974863,834.6471,1733.86223,5802.795805,3193.234969,377.459188,21880.939129,882.681155,3235.185242,37.209337,27.710174,38.924482,11.509627,5109.857075,2121.775199,445.782966,410.335941,765.642896,40.497009,54.905236,30.038027,34.333695,336.907528,68.043464,…,0.790949,0.282955,0.08221,0.237497,0.104841,0.750662,0.241223,0.132601,0.24606,2.647741,0.130664,0.383978,0.150753,0.025719,0.099556,0.061141,157.479696,14.970532,7.641126,4.831262,6.102293,6.048143,11.017723,157.479696,3.566164,6.405522,6.862595,15.606206,4.13208,11.017723,42.078629,25.794672,138.734736,36.574361,137.069161,3.372485,15.758954
"""min""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-821073.240312,-6402.348112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""25%""",10498.595672,14442.835295,11815.362034,4599.260584,4582.547079,2678.14736,10816.930251,3188.144857,1818.227831,2719.19476,846.466374,130.370373,2610.192584,841.984769,1183.785599,1107.230563,1986.635006,85.139467,-2097.88165,488.512808,1369.190266,1.496704,0.653707,4.198616,0.705314,2947.778185,1107.49378,47.332156,291.320585,343.995394,22.427703,28.9986,7.327878,5.120193,24.263824,12.930038,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,6.0,2.0,8.0,0.0,0.0
"""50%""",17235.016465,19582.375755,14804.825194,5709.235539,5707.056476,3763.53348,14316.81845,4423.603708,2358.290213,3588.746201,1443.813025,176.500259,3572.369327,1254.679404,1612.897131,1918.089082,3502.900283,137.870828,4914.77265,728.934757,3047.070221,8.981196,3.37688,24.378353,1.928907,5647.382616,1949.224702,108.06905,477.360776,607.931777,39.076064,54.593906,17.244342,12.013737,73.114026,25.950957,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,1.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,1.0,2.0,0.0,0.0,4.0,3.0,12.0,4.0,18.0,0.0,1.0
"""75%""",27240.41929,26166.258428,18077.052441,7089.160972,6959.162525,5111.168398,18192.557487,5968.796612,3016.032446,4693.872597,2461.281676,240.492639,4753.612022,1798.77881,2151.938969,2970.328409,5443.644912,208.326192,15354.690961,1034.662008,5550.419313,37.94069,9.24881,52.690943,5.283945,8744.276893,2947.282358,254.571574,639.740573,987.940772,60.750192,84.589699,33.166144,24.904155,200.865239,47.252261,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0,5.0,2.0,1.0,1.0,1.0,1.0,35.0,1.0,2.0,3.0,6.0,1.0,1.0,15.0,9.0,25.0,8.0,37.0,1.0,4.0
"""max""",3126400.0,1007600.0,538237.627175,344355.052512,197378.71681,272936.955798,535182.209863,247795.220702,105537.174494,216990.127662,249818.77172,10452.544728,236228.228001,74161.998085,138862.42004,817688.884688,103427.199404,33074.809974,2742600.0,80632.148718,239407.836089,1867.077059,4723.175301,2018.188124,567.65581,431146.427245,126501.015067,26581.158561,28099.258635,68319.676587,2649.200838,3907.01858,4283.505337,3760.923512,38449.444531,6921.904262,…,84.0,54.0,11.0,33.0,18.0,240.0,34.0,14.0,40.0,326.0,29.0,66.0,28.0,14.0,18.0,13.0,19212.0,2055.0,1061.0,666.0,407.0,813.0,894.0,19212.0,508.0,971.0,964.0,1663.0,402.0,894.0,4874.0,4694.0,14518.0,5461.0,17068.0,318.0,1164.0


Some of the spending variables have negative values which is obviously not valid. We will impute them with the median. We can see from the display() output that some columns' max spending values are in the 9-figure range. This is way outside a typical neighbourhood's spend in any category. We will cap extreme outliers by truncating all values above 3 * IQR for all the columns in the spending dataframe.

Finally, we will standardize the data using z-scores.

In [None]:
class NegativeValueImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.medians = None
        
    def fit(self, X, y=None):
        # X can be a DataFrame or np  array
        if isinstance(X, pd.DataFrame):
            self.medians = {}
            for col in X.columns:
                # Calculate median of non-negative values
                non_negative_values = X.loc[X[col] >= 0, col]
                self.medians[col] = non_negative_values.median() if len(non_negative_values) > 0 else 0
        else:
            self.medians = []
            for i in range(X.shape[1]):
                non_negative_values = X[:, i][X[:, i] >= 0]
                self.medians.append(np.median(non_negative_values) if len(non_negative_values) > 0 else 0)
                
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        
        if isinstance(X_copy, pd.DataFrame):
            for col in X_copy.columns:
                mask = X_copy[col] < 0
                X_copy.loc[mask, col] = self.medians[col]
        else:  # numpy array
            for i in range(X_copy.shape[1]):
                mask = X_copy[:, i] < 0
                X_copy[mask, i] = self.medians[i]
                

        return X_copy

class IQRClippingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, multiplier= 3):
        self.multiplier = multiplier
        self.upper_bounds_ = {  }   


    def fit(self, X, y=None):
        df = pl.DataFrame(X)
          
        for col in df.columns:
            q1 = df[col].quantile(.25)
            q3 = df[col].quantile(0.75)
            iqr = q3 - q1
            self.upper_bounds_[col] = q3 + self.multiplier * iqr
        
        return self
    
    def transform(self, X):
        df = pl.DataFrame(X)    
         
        for col in df.columns:  
            if col in self.upper_bounds_:
                upper_bound = self.upper_bounds_[ col]
                df = df.with_columns(
                     pl.when(pl.col(col) > upper_bound)
                    .then(upper_bound)
                    .otherwise(pl.col(col)) 
                    .alias(col) )
        
        return  df.to_numpy()

Before fitting NegativeValueImputer:
shape: (9, 758)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ HSTX001   ┆ HSSH001S  ┆ HSFD001S  ┆ … ┆ ECYTCA_U1 ┆ ECYTCA_18 ┆ ECYNCA_U1 ┆ ECYNCA_1 │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ 8         ┆ P         ┆ 8         ┆ 8P       │
│ str       ┆ f64       ┆ f64       ┆ f64       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆ f64       ┆ f64       ┆ f64       ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ count     ┆ 781511.0  ┆ 781511.0  ┆ 781511.0  ┆ … ┆ 781511.0  ┆ 781511.0  ┆ 781511.0  ┆ 781511.0 │
│ null_coun ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ … ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ 0.0      │
│ t         ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
│ mean      ┆ 23716.777 ┆ 21628.151 ┆ 

AssertionError: StandardScaler failed: Standard deviation is not approximately 1.

In [11]:
y = data[PORTION_RET_INSUR_COL]
X = data.drop(PORTION_RET_INSUR_COL)

# Impute negative values and clip outliers
# Construct the pipeline
preprocess = Pipeline([
    ('impute_negatives', NegativeValueImputer()),
    ('clip_outliers', IQRClippingTransformer(multiplier=3)),
    ('standardize', StandardScaler())
])

# Apply the pipeline to the data
X_processed = preprocess.fit_transform(X.to_pandas())

# Convert back to Polars DataFrame
X_processed = pl.DataFrame(X_processed)

# set columns
X_processed.columns = X.columns

display(X_processed.describe())

statistic,HSTX001,HSSH001S,HSFD001S,HSHO001S,HSHC001S,HSHF001S,HSTR001S,HSRE001S,HSPC001S,HSCL001S,HSED002S,HSRO001S,HSTA001S,HSGC001S,HSME001S,HSMG001S,HSWH002S,HSWH028S,HSWH040S,HSWH041S,HSSH053,HSSH054,HSSH005,HSSH006,HSSH007,HSSH011,HSSH014,HSSH013,HSSH015,HSSH017,HSSH018,HSSH019,HSSH021,HSSH020,HSSH022,HSRM002A,…,ECYRIMCHIN,ECYRIMHONG,ECYRIMJAPA,ECYRIMSKOR,ECYRIMEAO,ECYRIMPHIL,ECYRIMVIET,ECYRIMSEAO,ECYRIMBANG,ECYRIMINDI,ECYRIMNEPA,ECYRIMPAKI,ECYRIMSRI,ECYRIMSASO,ECYRIMAUSS,ECYRIMOCEO,ECYPIMNI,ECYPIMP01,ECYPIM0110,ECYPIM1115,ECYPIM1621,ECYPIM22CY,ECYPIMNPER,ECYAIMNI,ECYAIM_0_5,ECYAIM_514,ECYAIM1524,ECYAIM2544,ECYAIM45P,ECYAIMNPER,ECYGEN1GEN,ECYGEN2GEN,ECYGEN3GEN,ECYTCA_U18,ECYTCA_18P,ECYNCA_U18,ECYNCA_18P
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,…,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0,781511.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",-1.8349e-16,-6.2607e-16,7.8823e-16,-8.8348e-16,-1.3151e-15,-1.0947e-17,-3.6429e-16,-2.1915e-16,6.5338e-16,7.7154e-16,6.0534e-17,7.4539e-16,-3.979e-16,-1.0128e-16,-6.9953e-16,-1.2278e-16,-1.0738e-16,-5.4481e-16,1.9517e-16,-1.3947e-16,6.4734e-17,-1.1767e-16,-8.46e-17,1.6702e-16,-1.1843e-16,8.8642e-16,5.1257e-16,4.8914e-17,-2.2826e-16,-4.0484e-16,-4.087e-16,-1.674e-16,1.3078e-16,-9.6229e-17,-3.6795e-17,1.6911e-17,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.2226e-17,3.4904e-17,-6.8671e-17,1.2478e-16,4.4987e-17,1.5009e-16,4.1814e-17,7.2226e-17,1.4241e-16,-3.3013e-17,-8.9646e-18,-3.4377e-17,-1.7691e-16,4.1814e-17,-6.1207e-17,-3.3549e-18,7.0008e-18,2.5112e-17,-8.4591e-17,4.055e-18,-4.125e-17
"""std""",1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001
"""min""",-1.316828,-2.168847,-2.867458,-2.675862,-2.818175,-1.910454,-2.458588,-2.002602,-2.463781,-2.203985,-1.241381,-2.05306,-2.10579,-1.872657,-1.978248,-1.305182,-1.371866,-1.306016,-1.366635,-1.695644,-1.239219,-0.747991,-0.769673,-0.942896,-0.791453,-1.400408,-1.41785,-0.860757,-1.65337,-1.306939,-1.325703,-1.32756,-1.014225,-0.930242,-0.810018,-1.055725,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.000403,-0.699706,-0.641108,-0.602974,-0.658345,-0.690702,-0.613178,-1.000403,-0.640055,-0.682589,-0.627615,-0.66943,-0.567226,-0.613178,-0.714771,-0.781651,-0.979982,-0.921649,-0.98155,-0.535991,-0.677304
"""25%""",-0.675451,-0.692639,-0.663305,-0.656996,-0.64806,-0.682618,-0.686374,-0.684897,-0.682345,-0.670482,-0.689643,-0.685687,-0.683362,-0.738198,-0.667411,-0.69685,-0.698974,-0.659953,-0.580336,-0.65748,-0.802346,-0.703797,-0.697278,-0.831686,-0.659849,-0.746753,-0.705134,-0.66135,-0.668527,-0.701901,-0.686015,-0.714028,-0.709569,-0.68324,-0.679925,-0.678457,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.701587,-0.699706,-0.641108,-0.602974,-0.658345,-0.690702,-0.613178,-0.701587,-0.640055,-0.682589,-0.627615,-0.66943,-0.567226,-0.613178,-0.653454,-0.666877,-0.675724,-0.635706,-0.707202,-0.535991,-0.677304
"""50%""",-0.263912,-0.167324,-0.105621,-0.169767,-0.115536,-0.185007,-0.112963,-0.174265,-0.153211,-0.180095,-0.300285,-0.20186,-0.159022,-0.182148,-0.192245,-0.25135,-0.1854,-0.259813,-0.224511,-0.188171,-0.266977,-0.482799,-0.3957,-0.297179,-0.431538,-0.148129,-0.163446,-0.405469,-0.039597,-0.237676,-0.211166,-0.1725,-0.297293,-0.350691,-0.418009,-0.298538,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.328068,-0.524269,-0.641108,-0.602974,-0.658345,-0.690702,-0.613178,-0.328068,-0.640055,-0.682589,-0.311187,-0.370737,-0.567226,-0.613178,-0.469502,-0.437328,-0.371465,-0.349763,-0.364268,-0.535991,-0.468767
"""75%""",0.347335,0.505617,0.504812,0.435958,0.477412,0.432835,0.522027,0.464385,0.491222,0.443146,0.362915,0.469316,0.484698,0.550951,0.40465,0.326769,0.471949,0.274822,0.22525,0.415675,0.531777,0.372299,0.254588,0.452744,0.19448,0.538591,0.478844,0.211736,0.509347,0.430704,0.407029,0.46213,0.364654,0.271151,0.266942,0.322982,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.306915,0.177483,0.123553,0.108925,0.039128,0.022574,0.056192,0.306915,0.121478,0.137998,0.321668,0.226648,0.18885,0.056192,0.204988,0.251317,0.287762,0.222123,0.287307,0.320253,0.156842
"""max""",3.415693,4.100385,4.009164,3.714818,3.853829,3.779197,4.14723,3.912231,4.011922,3.784032,3.520591,3.934326,3.988876,4.418399,3.620835,3.397627,3.984715,3.079148,2.642009,3.635138,4.534145,3.600585,3.110187,4.306034,2.757464,4.394623,4.030777,2.830992,4.042969,3.828519,3.68616,3.990603,3.587323,3.134326,3.107544,3.327301,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.332421,2.809052,2.417538,2.24462,2.131545,2.162404,2.064304,3.332421,2.406076,2.59976,3.169517,2.914884,2.457078,2.064304,2.780311,3.005899,3.178219,2.795608,3.270835,2.888984,2.659282
