In [41]:
import pandas as pd
import sqlite3
import numpy as np
#
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer #required for IterativeImputer import
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.ensemble import StackingRegressor

In [42]:
#will be class variables defined in constructor
global STACKING_MODEL, ALL_FEATURES
iter_imputer = IterativeImputer(estimator=BayesianRidge(), random_state=42)
simple_imputer = SimpleImputer(strategy='most_frequent')
m_encoder = OneHotEncoder(handle_unknown='ignore')

In [43]:
#function in class to load data into dataframes
db_file = "life_expectancy_tables.db"
conn = sqlite3.connect(db_file)
sex_year = "ssa_life_expectancy_sex_year"
sex_state = "life_expectancy_sex_state"
state_county = "life_expectancy_state_county"
race_sex = "life_expectancy_race_sex"
race_sex_year = "life_expectancy_race_sex_year"
df_sex_year = pd.read_sql(f"SELECT * FROM {sex_year}", conn)
df_sex_state = pd.read_sql(f"SELECT * FROM {sex_state}", conn)
df_state_county = pd.read_sql(f"SELECT * FROM {state_county}", conn)
df_race_sex = pd.read_sql(f"SELECT * FROM {race_sex}", conn)
df_race_sex_year = pd.read_sql(f"SELECT * FROM {race_sex_year}", conn)
conn.close()

In [44]:
#function to create master dataset
dataset_list = []
dataset_list.append(df_sex_year)
dataset_list.append(df_sex_state)
dataset_list.append(df_state_county)
dataset_list.append(df_race_sex)
dataset_list.append(df_race_sex_year)
full_dataset = pd.concat(dataset_list, ignore_index=True, sort=False)
full_dataset = full_dataset.dropna(subset=['LifeExpectancy'])

In [45]:
#function to train data for meta-model
X = full_dataset.drop('LifeExpectancy', axis=1)
y = full_dataset['LifeExpectancy']
ALL_FEATURES = X.columns.tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
#function to create ssa_life_expectancy_sex_year pipeline
sex_year_transformer = ColumnTransformer(transformers=[
    ('year', Pipeline([
        ('imputer', iter_imputer),    
        ('scaler', StandardScaler())
    ]), ['Year']),
    ('sex', Pipeline([
        ('imputer', simple_imputer),
        ('encoder', m_encoder)
    ]), ['Sex'])
],
remainder='drop'                                           
)

sex_year_pipeline = Pipeline(steps=[
    ('transformer', sex_year_transformer),
    ('model', LinearRegression())
])

In [47]:
#function to create life_expectancy_sex_state pipeline
sex_state_transformer = ColumnTransformer(transformers=[
    ('state', Pipeline([
        ('imputer', simple_imputer),    
        ('encoder', m_encoder)
    ]), ['State']),
    ('sex', Pipeline([
        ('imputer', simple_imputer),
        ('encoder', m_encoder)
    ]), ['Sex'])
],
remainder='drop'  
)

sex_state_pipeline = Pipeline(steps=[
    ('transformer', sex_state_transformer),
    ('model', LinearRegression())
])

In [48]:
#function to create life_expectancy_state_county pipeline
state_county_transformer = ColumnTransformer(transformers=[
    ('state', Pipeline([
        ('imputer', simple_imputer),    
        ('encoder', m_encoder)
    ]), ['State']),
    ('county', Pipeline([
        ('imputer', simple_imputer),
        ('encoder', m_encoder)
    ]), ['County'])  
],
remainder='drop'  
)

state_county_pipeline = Pipeline(steps=[
    ('transformer', state_county_transformer),
    ('model', LinearRegression())
])

In [49]:
#function to create life_expectancy_race_sex pipeline
race_sex_transformer = ColumnTransformer(transformers=[
    ('year', Pipeline([
        ('imputer', iter_imputer),    
        ('scaler', StandardScaler())
    ]), ['Year']),
    ('sex', Pipeline([
        ('imputer', simple_imputer),
        ('encoder', m_encoder)
    ]), ['Sex']),
    ('race', Pipeline([
        ('imputer', simple_imputer),
        ('encoder', m_encoder)
    ]), ['Race']),
    ('deathrate', Pipeline([
        ('imputer', iter_imputer),    
        ('scaler', StandardScaler())
    ]), ['AgeAdjustedDeathRate'])
],
remainder='drop'  
)

race_sex_pipeline = Pipeline(steps=[
    ('transformer', race_sex_transformer),
    ('model', LinearRegression())
])

In [50]:
#function to create life_expectancy_race_sex_year pipeline
race_sex_year_transformer = ColumnTransformer(transformers=[
    ('year', Pipeline([
        ('imputer', iter_imputer),    
        ('scaler', StandardScaler())
    ]), ['Year']),
    ('sex', Pipeline([
        ('imputer', simple_imputer),
        ('encoder', m_encoder)
    ]), ['Sex']),
    ('race', Pipeline([
        ('imputer', simple_imputer),
        ('encoder', m_encoder)
    ]), ['Race'])
],
remainder='drop'  
)

race_sex_year_pipeline = Pipeline(steps=[
    ('transformer', race_sex_year_transformer),
    ('model', LinearRegression())
])

In [51]:
#function to create the Stacking Ensemble Model
estimators = [
    ('sex_year_model', sex_year_pipeline),
    ('sex_state_model', sex_state_pipeline),
    ('state_county_model', state_county_pipeline),
    ('race_sex_model', race_sex_pipeline),
    ('race_sex_year_model', race_sex_year_pipeline)
]
# Create Linear Regression meta-model that learns how to combine predictions from the 4 base models
stacking_regressor = StackingRegressor(
    estimators=estimators,
    final_estimator=LinearRegression(),
    cv=5, #5-fold cross validation is used to generate the base model predictions. Sckikit-learn standard
    passthrough=False
)

In [52]:
#function to train the Stacking Ensemble Model
stacking_regressor.fit(X_train, y_train)
STACKING_MODEL = stacking_regressor

In [53]:
#will not be part of object-oriented program, just to test the model
year = 1970
sex = 'Black'
race = 'Male'
state = 'Mississippi'
county = 'Hinds County'

In [54]:
#driver function to run predictions
# it initiates the class into an object. ALL_FEATURES will be available as a class variable
#i.e.
#def predict_life_expectancy(state: str, county: str, year: int, race: str, sex: str) -> float:
#  predictor = life_expectancy_predictor_engine()
#  return predictor.make_prediction(prediction_input)
def predictor (year, sex, race, state, county):
    row = {col: np.nan for col in ALL_FEATURES}
    if 'Year' in row: row['Year'] = year
    if 'Sex' in row: row['Sex'] = sex
    if 'Race' in row: row['Race'] = race
    if 'State' in row: row['State'] = state
    if 'County' in row: row['County'] = county

    return pd.DataFrame([row])

In [55]:
#will not be part of object-oriented program, just to test the model

_input = predictor(year, sex, race, state, county)
prediction = STACKING_MODEL.predict(_input)
print(prediction)

[71.45864755]


In [56]:
year = 1970
sex = 'White'
race = 'Female'
state = 'Florida'
county = 'Broward'
_input = predictor(year, sex, race, state, county)
print(STACKING_MODEL.predict(_input))

[71.91585254]
