In [1]:
import pandas as pd
import numpy as np
import sqlite3
#
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer #required for IterativeImputer import below
from sklearn.impute import SimpleImputer, IterativeImputer

In [2]:
db_file = "life_expectancy.db"
db_file_geo_full = "life_expectancy_geo_full.db"
table_geo = "life_expectancy_geography"
table_demo = "life_expectancy_demography"
table_geo_full = "life_expectancy_geography_full"
# Load data from SQLite
conn1 = sqlite3.connect(db_file)
conn2 = sqlite3.connect(db_file_geo_full)
df_geo = pd.read_sql(f"SELECT * FROM {table_geo}", conn1)
df_demo = pd.read_sql(f"SELECT * FROM {table_demo}", conn1)
df_geo_full = pd.read_sql(f"SELECT * FROM {table_geo_full}", conn2)
conn1.close()
conn2.close()

In [3]:
df_geo['State_County'] = df_geo['State'] + '|' + df_geo['County']
df_geo_full['State_County_Census'] = df_geo_full['State'] + '|' + df_geo_full['County'] + '|' + df_geo_full['CensusTract'].astype(str)

In [4]:
#prepare for training model
X_geo = ['State_County', 'LifeExpectancyStandardError']
y_geo = 'LifeExpectancy'
#Repeat process with demography data
X_demo = ['Year', 'Race', 'Sex', 'AgeAdjustedDeathRate']
y_demo   = 'LifeExpectancy'
#Use imputation to handle NaN's in y_geo_full
numeric_cols = ["LifeExpectancy", "LifeExpectancyLow", "LifeExpectancyHigh", "LifeExpectancyStandardError"]
imputer_y = IterativeImputer(estimator=BayesianRidge(), random_state=42)
imputed = imputer_y.fit_transform(df_geo_full[numeric_cols])
df_geo_full[numeric_cols] = imputed
#repeat with full geography data
X_geo_full = ['State_County_Census', 'LifeExpectancyStandardError','LifeExpectancyLow','LifeExpectancyHigh']
y_geo_full = 'LifeExpectancy'

In [5]:
# Define model-based imputers
iter_imputer = IterativeImputer(estimator=BayesianRidge(), random_state=42)

geo_preprocessor_full = ColumnTransformer(transformers=[
    ('statecountycensus', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),   # categorical still needs simple imputer
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]), ['State_County_Census']),
    
    ('lifeexpectlow', Pipeline([
        ('imputer', IterativeImputer(estimator=BayesianRidge(), random_state=42)),
        ('scaler', StandardScaler())
    ]), ['LifeExpectancyLow']),
    
    ('lifeexpecthigh', Pipeline([
        ('imputer', IterativeImputer(estimator=BayesianRidge(), random_state=42)),
        ('scaler', StandardScaler())
    ]), ['LifeExpectancyHigh']),
    ('stderr', Pipeline([
        ('imputer', IterativeImputer(estimator=BayesianRidge(), random_state=42)),
        ('scaler', StandardScaler())
    ]), ['LifeExpectancyStandardError']),
])

In [6]:
geo_preprocessor = ColumnTransformer(transformers=[
    ('statecounty', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]), ['State_County']),
    ('stderr', Pipeline([
        ('imputer', IterativeImputer(estimator=BayesianRidge(), random_state=42)),
        ('scaler', StandardScaler())
    ]), ['LifeExpectancyStandardError'])
])

In [7]:
demo_preprocessor = ColumnTransformer(transformers=[
    ('year', Pipeline([
        ('imputer', IterativeImputer(estimator=BayesianRidge(), random_state=42)),
        ('scaler', StandardScaler())
    ]), ['Year']),
    ('race', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]), ['Race']),
    ('sex', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]), ['Sex']),
    ('deathrate', Pipeline([
        ('imputer', IterativeImputer(estimator=BayesianRidge(), random_state=42)),
        ('scaler', StandardScaler())
    ]), ['AgeAdjustedDeathRate'])
])

In [8]:
geo_model = Pipeline(steps=[
    ('preprocessor', geo_preprocessor),
    ('regressor', LinearRegression())
])

In [9]:
demo_model = Pipeline(steps=[
    ('preprocessor', demo_preprocessor),
    ('regressor', LinearRegression())
])

In [10]:
geo_full_model = Pipeline(steps=[
    ('preprocessor', geo_preprocessor_full),
    ('regressor', LinearRegression())
])

In [11]:
#Train demography model
demo_model.fit(df_demo[X_demo], df_demo[y_demo])

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('year', ...), ('race', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,estimator,BayesianRidge()
,missing_values,
,sample_posterior,False
,max_iter,10
,tol,0.001
,n_nearest_features,
,initial_strategy,'mean'
,fill_value,
,imputation_order,'ascending'
,skip_complete,False

0,1,2
,max_iter,300
,tol,0.001
,alpha_1,1e-06
,alpha_2,1e-06
,lambda_1,1e-06
,lambda_2,1e-06
,alpha_init,
,lambda_init,
,compute_score,False
,fit_intercept,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,estimator,BayesianRidge()
,missing_values,
,sample_posterior,False
,max_iter,10
,tol,0.001
,n_nearest_features,
,initial_strategy,'mean'
,fill_value,
,imputation_order,'ascending'
,skip_complete,False

0,1,2
,max_iter,300
,tol,0.001
,alpha_1,1e-06
,alpha_2,1e-06
,lambda_1,1e-06
,lambda_2,1e-06
,alpha_init,
,lambda_init,
,compute_score,False
,fit_intercept,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [12]:
# Train geographic life expectancy prediction model
geo_model.fit(df_geo[X_geo], df_geo[y_geo])

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('statecounty', ...), ('stderr', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,estimator,BayesianRidge()
,missing_values,
,sample_posterior,False
,max_iter,10
,tol,0.001
,n_nearest_features,
,initial_strategy,'mean'
,fill_value,
,imputation_order,'ascending'
,skip_complete,False

0,1,2
,max_iter,300
,tol,0.001
,alpha_1,1e-06
,alpha_2,1e-06
,lambda_1,1e-06
,lambda_2,1e-06
,alpha_init,
,lambda_init,
,compute_score,False
,fit_intercept,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [13]:
# Train full geography data life expectancy prediction model
geo_full_model.fit(df_geo_full[X_geo_full], df_geo_full[y_geo_full])

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('statecountycensus', ...), ('lifeexpectlow', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,estimator,BayesianRidge()
,missing_values,
,sample_posterior,False
,max_iter,10
,tol,0.001
,n_nearest_features,
,initial_strategy,'mean'
,fill_value,
,imputation_order,'ascending'
,skip_complete,False

0,1,2
,max_iter,300
,tol,0.001
,alpha_1,1e-06
,alpha_2,1e-06
,lambda_1,1e-06
,lambda_2,1e-06
,alpha_init,
,lambda_init,
,compute_score,False
,fit_intercept,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,estimator,BayesianRidge()
,missing_values,
,sample_posterior,False
,max_iter,10
,tol,0.001
,n_nearest_features,
,initial_strategy,'mean'
,fill_value,
,imputation_order,'ascending'
,skip_complete,False

0,1,2
,max_iter,300
,tol,0.001
,alpha_1,1e-06
,alpha_2,1e-06
,lambda_1,1e-06
,lambda_2,1e-06
,alpha_init,
,lambda_init,
,compute_score,False
,fit_intercept,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,estimator,BayesianRidge()
,missing_values,
,sample_posterior,False
,max_iter,10
,tol,0.001
,n_nearest_features,
,initial_strategy,'mean'
,fill_value,
,imputation_order,'ascending'
,skip_complete,False

0,1,2
,max_iter,300
,tol,0.001
,alpha_1,1e-06
,alpha_2,1e-06
,lambda_1,1e-06
,lambda_2,1e-06
,alpha_init,
,lambda_init,
,compute_score,False
,fit_intercept,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False
