# Predicting median household income from health data

What are the weights of different health data in predicting median household income in California?

Can we use a model trained on California data to predict median household income in Colorado from health data?

- Health data is taken from the City Health Dashboard. Read more about the metrics here: https://www.cityhealthdashboard.com/metrics.
- Household income data is taken from the American Community Survey.

In [1]:
import json

import numpy as np
import pandas as pd

import sklearn.preprocessing
import sklearn.linear_model

import holoviews as hv
import colorcet as cc

from health import *

# Select Bokeh as the HoloViews plotting backend for every plot in this notebook.
hv.extension('bokeh')

In [2]:
# First two entries of the Glasbey palette; `red` is the colour used for the
# Colorado (test-set) plots further down.
blue, red = cc.b_glasbey_hv[:2]

# Load, clean, and join data - California

*Drop 'Limited access to healthy foods' (15% of this data is missing).*

In [3]:
# Load the California health metrics and household incomes and join them
# into a single frame, one row per `stcotr_fips` value.
health_file = 'CA_health_data_by_census_tract.txt'
# 'Limited access to healthy foods' is removed up front because too much of it
# is missing (see the note above this cell).
health_data = load_health_data(health_file).drop(columns=['Limited access to healthy foods'])

income_data_file = 'CA_household_income_by_census_tract.csv'
income_data = load_hinc_data(income_data_file)

# Name the join key explicitly instead of letting pandas infer it from the
# columns the two frames happen to share; an accidental extra shared column
# would otherwise silently become part of the key.
# NOTE(review): assumes the health data carries a 'stcotr_fips' column -- confirm
# against load_health_data.
df = pd.merge(
    health_data,
    income_data[['median income', 'stcotr_fips']],
    on='stcotr_fips',
)

## Missing values

Let's drop missing values.

In [4]:
# Drop every row that contains a missing value and report how many were lost.
n = len(df)
print(f'Original number of data points: {n}.')

df.dropna(inplace=True)

n_dropped = n - len(df)
print(f'Number of rows dropped: {n_dropped}, {100 * n_dropped / n:.1f}%.')

Original number of data points: 6075.
Number of rows dropped: 407, 6.7%.


## Correlations

Income distributions tend to be log-normal, so let's look at correlations of health data with both household income and $\log_{10}$(household income).

In [5]:
# log10 of the raw median income; shown in the correlation plots and dropped
# again before the feature matrix is built.
df['log10(median income)'] = np.log10(df['median income'].to_numpy())

In [6]:
# Median income rescaled to thousands.
df['median income (thousands)'] = df['median income'].div(1000)

In [7]:
# Map each column name to a copy with spaces turned into newlines, so that long
# names wrap in the correlation grid.
column_renaming = {name: '\n'.join(name.split(' ')) for name in df.columns}

In [8]:
# Dataset for the correlation grid: everything except the raw income and the
# identifier column, with the wrapped column names.
plot_columns = df.drop(columns=['median income', 'stcotr_fips']).rename(columns=column_renaming)
ds = hv.Dataset(data=plot_columns)

In [9]:
# Build the full scatter matrix once and slice it twice. The original built the
# identical matrix separately for each slice, doubling the most expensive step
# of this cell.
correlation_grid = hv.operation.gridmatrix(ds, chart_type=hv.Points)

# One slice per income column (linear and log10), stacked in a single column.
(
    correlation_grid[:, 'median\nincome\n(thousands)'] +
    correlation_grid[:, 'log10(median\nincome)']
).cols(
    1
)

# Model building

We will use ridge regression with built-in cross-validation, provided by the sklearn library.

In [10]:
# Features: every remaining column once the identifier and all income-derived
# columns are removed.
non_feature_columns = [
    'median income',
    'median income (thousands)',
    'log10(median income)',
    'stcotr_fips',
]
X = df.drop(columns=non_feature_columns)

# Target: log10 of the median income in thousands, shaped as a column vector.
y = np.log10(df['median income (thousands)'].to_numpy().reshape(-1, 1))

## Preprocessing

In [11]:
# Fit the scaler on the California features and scale them in one step. The
# fitted `transformer` is reused unchanged on the Colorado features later.
transformer = sklearn.preprocessing.RobustScaler()
X_scaled = transformer.fit_transform(X)

## Model training

In [12]:
# Candidate regularisation strengths: 13 powers of ten from 1e-6 to 1e6.
candidate_alphas = np.logspace(start=-6, stop=6, num=13)
model = sklearn.linear_model.RidgeCV(alphas=candidate_alphas)

model.fit(X=X_scaled, y=y)

RidgeCV(alphas=array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01,
       1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06]))

In [13]:
# Regularisation strength that RidgeCV selected from the candidate alphas.
model.alpha_

1.0

# Performance on training data

In [14]:
# Score of the fitted model on the same California data it was trained on.
train_score = model.score(X=X_scaled, y=y)
train_score

0.9490166406907407

In [15]:
# Model predictions (log10 of income in thousands) for the training features.
y_pred = model.predict(X=X_scaled)

In [16]:
# Predicted-vs-actual plot for the training data, titled with the training score.
plot = plot_prediction_vs_actual(y_pred, y, 'median income (thousands)', log10=True)
train_title = f'score: {train_score:.2f}'
plot.opts(
    hv.opts.Layout(title=train_title),
    hv.opts.Points(alpha=0.4),
)

In [17]:
# Write the training-data plot to disk, with the toolbar switched off.
california_figure = plot.opts(
    hv.opts.Layout(title=f'score: {train_score:.2f}', toolbar=None),
    hv.opts.Points(alpha=0.4),
)
hv.save(california_figure, 'images/california-income-results.png')

We tend to overpredict high incomes. This may be because the highest income category is >\$250,000, which we truncated to \$250,000.

# Feature weights

In [18]:
# Metric names ordered from the most negative to the most positive weight.
# `model.coef_` is indexed with [0] because it holds a single row of weights here.
metrics = X.columns
ascending_order = np.argsort(model.coef_[0])
metrics_sorted = np.asarray(metrics)[ascending_order].tolist()

In [19]:
# The weights themselves in ascending order, matching `metrics_sorted`.
weights = np.sort(model.coef_[0]).tolist()

In [20]:
# Pair each metric with its weight for plotting.
df_weights = pd.DataFrame(
    data={'metrics': metrics_sorted, 'weights': weights},
    columns=['metrics', 'weights'],
)

In [21]:
# Bar chart of the signed feature weights, one bar per metric.
feature_weights = hv.Bars(df_weights, kdims='metrics')
bar_options = dict(frame_width=800, frame_height=400, xrotation=45)
feature_weights.opts(**bar_options)

In [22]:
# Write the signed-weights bar chart to disk, with the toolbar switched off.
feature_weights_figure = feature_weights.opts(
    frame_width=800,
    frame_height=400,
    xrotation=45,
    toolbar=None,
)
hv.save(feature_weights_figure, 'images/feature-weights.png')

In [23]:
# Metric names ordered by the magnitude of their weight, smallest first.
metrics = X.columns
magnitude_order = np.argsort(np.abs(model.coef_[0]))
metrics_abs_sorted = np.asarray(metrics)[magnitude_order].tolist()

In [24]:
# Weight magnitudes in ascending order, matching `metrics_abs_sorted`.
abs_weights = np.sort(np.abs(model.coef_[0])).tolist()

In [25]:
# Pair each metric with the absolute value of its weight for plotting.
df_abs_weights = pd.DataFrame(
    data={'metrics': metrics_abs_sorted, 'weights (absolute value)': abs_weights},
    columns=['metrics', 'weights (absolute value)'],
)

In [26]:
# Bar chart of the weight magnitudes, one bar per metric.
feature_abs_weights = hv.Bars(df_abs_weights, kdims='metrics')
abs_bar_options = dict(frame_width=800, frame_height=400, xrotation=45)
feature_abs_weights.opts(**abs_bar_options)

In [27]:
# Write the weight-magnitude bar chart to disk, with the toolbar switched off.
feature_abs_weights_figure = feature_abs_weights.opts(
    frame_width=800,
    frame_height=400,
    xrotation=45,
    toolbar=None,
)
hv.save(feature_abs_weights_figure, 'images/feature-abs-weights.png')

# Apply to test data

Can we use what we learned in California to predict median income in Colorado?

In [28]:
# Load the Colorado health metrics and household incomes and join them into a
# single frame, mirroring the California preparation.
test_health_file = 'CO_health_data_by_census_tract.txt'
# Drop the same metric that was dropped from the California data so that both
# feature sets line up.
test_health_data = load_health_data(test_health_file).drop(columns=['Limited access to healthy foods'])

test_income_data_file = 'CO_household_income_by_census_tract.csv'
test_income_data = load_hinc_data(test_income_data_file)

# Name the join key explicitly instead of letting pandas infer it from the
# columns the two frames happen to share.
# NOTE(review): assumes the health data carries a 'stcotr_fips' column -- confirm
# against load_health_data.
test_df = pd.merge(
    test_health_data,
    test_income_data[['median income', 'stcotr_fips']],
    on='stcotr_fips',
)

In [29]:
# log10 of the raw Colorado median income, for the correlation plots.
test_df['log10(median income)'] = np.log10(test_df['median income'].to_numpy())

In [30]:
# Colorado median income rescaled to thousands.
test_df['median income (thousands)'] = test_df['median income'].div(1000)

In [31]:
# Drop every Colorado row that contains a missing value and report the loss.
n = len(test_df)
print(f'Original number of data points: {n}.')

test_df.dropna(inplace=True)

n_dropped = n - len(test_df)
print(f'Number of rows dropped: {n_dropped}, {100 * n_dropped / n:.1f}%.')

Original number of data points: 804.
Number of rows dropped: 111, 13.8%.


## Correlations

In [32]:
# Dataset for the Colorado correlation grid, built the same way as the
# California one and reusing its column renaming.
test_plot_columns = test_df.drop(columns=['median income', 'stcotr_fips']).rename(columns=column_renaming)
test_ds = hv.Dataset(data=test_plot_columns)

In [33]:
# Build the full scatter matrix once and slice it twice. The original built the
# identical matrix separately for each slice.
test_correlation_grid = hv.operation.gridmatrix(test_ds, chart_type=hv.Points)

# One slice per income column (linear and log10), drawn in red and stacked in
# a single column.
(
    test_correlation_grid[:, 'median\nincome\n(thousands)'] +
    test_correlation_grid[:, 'log10(median\nincome)']
).opts(
    hv.opts.Points(color=red),
    hv.opts.Histogram(color=red),
).cols(
    1
)

In [34]:
# Colorado features: every remaining column once the identifier and all
# income-derived columns are removed.
test_non_feature_columns = [
    'median income',
    'median income (thousands)',
    'log10(median income)',
    'stcotr_fips',
]
X_test = test_df.drop(columns=test_non_feature_columns)

# Colorado target: log10 of the median income in thousands, as a column vector.
y_test_actual = np.log10(test_df['median income (thousands)'].to_numpy().reshape(-1, 1))

## Preprocessing

In [35]:
# Scale the Colorado features with the scaler fitted on California; it is
# deliberately not refitted here.
X_test_scaled = transformer.transform(X=X_test)

## Predict test data

In [36]:
# Predictions of the California-trained model for the Colorado features.
y_test_pred = model.predict(X=X_test_scaled)

In [37]:
# Score of the California-trained model on the unadjusted Colorado targets.
test_score = model.score(X=X_test_scaled, y=y_test_actual)
test_score

0.8969461153830847

In [38]:
# Predicted-vs-actual plot for Colorado, drawn in red and titled with the test score.
# The label now reads 'median income (thousands)', matching the training plot:
# both targets are log10 of the income in thousands, so the previous
# 'median income' label misstated the units.
# NOTE(review): assumes plot_prediction_vs_actual only uses `label` for
# display text -- confirm in the health module.
plot_test = plot_prediction_vs_actual(
    y_test_pred,
    y_test_actual,
    label='median income (thousands)',
    log10=True,
)
plot_test.opts(
    hv.opts.Layout(title=f'score: {test_score:.2f}'),
    hv.opts.Points(color=red, alpha=0.4),
    hv.opts.Distribution(color=red)
)

In [39]:
# Write the Colorado predicted-vs-actual plot to disk, with the toolbar switched off.
colorado_figure = plot_test.opts(
    hv.opts.Layout(title=f'score: {test_score:.2f}', toolbar=None),
    hv.opts.Points(color=red, alpha=0.4),
    hv.opts.Distribution(color=red),
)
hv.save(colorado_figure, 'images/colorado-income-results.png')

The model scored pretty well on the Colorado data, but it looks like there might be some systematic error...

In [40]:
# Overlay the California and Colorado target distributions for comparison.
california_dist = hv.Distribution(y, kdims=['log10(income)'], label='California')
colorado_dist = hv.Distribution(y_test_actual, kdims=['log10(income)'], label='Colorado')
(california_dist * colorado_dist).opts(
    legend_position='right',
    frame_width=400,
    frame_height=300,
)

In [41]:
# Rebuild the California/Colorado overlay and write it to disk.
california_dist = hv.Distribution(y, kdims=['log10(income)'], label='California')
colorado_dist = hv.Distribution(y_test_actual, kdims=['log10(income)'], label='Colorado')
income_overlay = (california_dist * colorado_dist).opts(
    legend_position='right',
    frame_width=400,
    frame_height=300,
)
hv.save(income_overlay, 'images/income-distributions.png')

Indeed, the whole distribution of Colorado incomes is shifted to the left of the California distribution, which may help explain why our model consistently predicts a higher income than the true value.

In [42]:
# Median of the Colorado targets, i.e. of the per-row log10(income in thousands)
# values, not a separately published state figure.
colorado_state_median = np.median(a=y_test_actual)
colorado_state_median

1.8251858585595182

In [43]:
# Median of the California targets, i.e. of the per-row log10(income in thousands)
# values, not a separately published state figure.
california_state_median = np.median(a=y)
california_state_median

1.8504991994262454

In [44]:
# Offset between the two medians in log10 units; negative when the Colorado
# median is below the California median.
delta = colorado_state_median - california_state_median

## Correct Colorado data by difference in state median household income from California state median

In [45]:
# Shift every Colorado target by the median offset so that the adjusted
# Colorado median coincides with the California median.
y_test_adj = y_test_actual - delta

In [46]:
# Overlay the California targets with the median-adjusted Colorado targets.
california_dist = hv.Distribution(y, kdims=['log10(income)'], label='California')
colorado_adj_dist = hv.Distribution(
    y_test_adj,
    kdims=['log10(income)'],
    label='Colorado (adjusted to California median)',
)
(california_dist * colorado_adj_dist).opts(
    legend_position='right',
    frame_width=400,
    frame_height=300,
)

In [47]:
# Score of the unchanged model against the median-adjusted Colorado targets.
# NOTE: this rebinds `test_score`; re-running the earlier unadjusted plot cells
# after this one would title them with the adjusted score.
test_score = model.score(X=X_test_scaled, y=y_test_adj)
test_score

0.9343895929208701

In [48]:
# Predicted-vs-actual plot against the median-adjusted Colorado targets.
# The label now reads 'median income (thousands)', matching the training plot:
# the targets are log10 of the income in thousands, so the previous
# 'median income' label misstated the units.
# NOTE(review): assumes plot_prediction_vs_actual only uses `label` for
# display text -- confirm in the health module.
plot_test_adj = plot_prediction_vs_actual(
    y_test_pred,
    y_test_adj,
    label='median income (thousands)',
    log10=True,
)
plot_test_adj.opts(
    hv.opts.Layout(title=f'score: {test_score:.2f}'),
    hv.opts.Points(color=red, alpha=0.4),
    hv.opts.Distribution(color=red)
)

In [49]:
# Write the adjusted Colorado plot to disk, with the toolbar switched off.
colorado_adjusted_figure = plot_test_adj.opts(
    hv.opts.Layout(title=f'score: {test_score:.2f}', toolbar=None),
    hv.opts.Points(color=red, alpha=0.4),
    hv.opts.Distribution(color=red),
)
hv.save(colorado_adjusted_figure, 'images/colorado-income-adjusted-results.png')

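This noticeably improves the score on the Colorado data (from 0.90 to 0.93). Note that the model itself is unchanged: the shift is computed from the actual Colorado incomes, so to use it for real predictions it would have to come from an independent estimate of the difference between the two states' median incomes.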