In [1]:
import pandas as pd
import sagemaker
from sagemaker.sklearn.estimator import SKLearn
import boto3
import re
import pathlib
from sklearn.model_selection import train_test_split
from sagemaker.tuner import IntegerParameter, HyperparameterTuner, ContinuousParameter, CategoricalParameter
from utils.utils import get_logger, load_data
from utils.load_config_file import load_config_file
from warnings import filterwarnings
filterwarnings("ignore")

logger = get_logger(name=pathlib.Path("ltv-models-training.ipynb"))

CONFIG_PATH = "config/config.ini"

pd.set_option('display.max_columns', 500)

In [2]:
## Get data from s3
s3_bucket = 'hc-data-science'
bucket_path = f"pre-conversion-ma-ltv/data/ma_ltv_merged.csv"
data_bucket_path = 's3://{}/{}'.format(s3_bucket, bucket_path)

data = pd.read_csv(data_bucket_path, low_memory=False)
data.head()

Unnamed: 0,application_id,owner_email,application_name,policy_id,owner_id,owner_phone,parent_application_id,sk_referral_flag,bk_product_type,carrier,lead_id,first_name,last_name,submitted_weekday,submitted_day,submitted_month,submitted_year,area_code,age,age_range,jrn_boberdoo_amount,jrn_boberdoo_source,jrn_boberdoo_lead_type,jrn_id,jrn_date,jrn_tracking_file_path,jrn_event_date,jrn_firstparty,jrn_tcpa_universal_id,jrn_is_first_party,jrn_error,jrn_url,jrn_year,jrn_month,jrn_day,jrn_hour,jrn_request_f_name,jrn_request_l_name,jrn_request_email,jrn_request_phone1,jrn_request_address1,jrn_request_tcpa_universal_id,jrn_request_provider,jrn_request_age,jrn_request_dob,jrn_request_smoker,jrn_response_audit_authentic,jrn_response_audit_consumer_five_minutes,jrn_response_audit_consumer_hour,jrn_response_audit_consumer_twelve_hours,jrn_response_audit_consumer_twelve_consumer_day,jrn_response_audit_consumer_week,jrn_response_audit_data_integrity,jrn_response_audit_fields_email,jrn_response_audit_fields_f_name,jrn_response_audit_fields_l_name,jrn_response_audit_fields_phone1,jrn_response_audit_fields_address1,jrn_response_audit_device_five_minutes,jrn_response_audit_device_hour,jrn_response_audit_device_twelve_hours,jrn_response_audit_device_day,jrn_response_audit_device_week,jrn_response_audit_consumer_dupe_check,jrn_response_audit_entity_value,jrn_response_audit_ip_five_minutes,jrn_response_audit_ip_hour,jrn_response_audit_ip_twelve_hours,jrn_response_audit_ip_day,jrn_response_audit_ip_week,jrn_response_audit_lead_age,jrn_response_audit_age,jrn_response_audit_lead_duration,jrn_response_audit_duration,jrn_response_audit_lead_dupe_check,jrn_response_audit_lead_dupe,jrn_response_audit_lead_five_minutes,jrn_response_audit_lead_hour,jrn_response_audit_lead_twelve_hours,jrn_response_audit_lead_day,jrn_response_audit_lead_week,jrn_response_audit_market_leadid_tcpa_disclosure,jrn_response_audit_market_leadid_tcpa_prominence,jrn_response_audit_market_leadid_tcpa_contrast,jrn_response_audit_market_leadid_tcpa_visibility,jrn_response_audit_market_leadid_tcpa_stored,jrn_response_audit_market_leadid_tcpa_capture,jrn_response_audit_market_leadid_tcpa_result,jrn_response_audit_market_leadid_result,jrn_response_audit_market_result,jrn_response_audit_url_value,jrn_response_audit_result,jrn_response_audit_token,zcta_latitude,zcta_longitude,zcta_cdc_all_teeth_lost,zcta_cdc_annual_checkup,zcta_cdc_arthritis,zcta_cdc_binge_drinking,zcta_cdc_cancer_except_skin,zcta_cdc_cervical_cancer_screening,zcta_cdc_cholesterol_screening,zcta_cdc_chronic_kidney_disease,zcta_cdc_colorectal_cancer_screening,zcta_cdc_copd,zcta_cdc_core_preventive_services_for_older_men,zcta_cdc_core_preventive_services_for_older_women,zcta_cdc_coronary_heart_disease,zcta_cdc_current_asthma,zcta_cdc_current_smoking,zcta_cdc_dental_visit,zcta_cdc_depression,zcta_cdc_diabetes,zcta_cdc_general_health,zcta_cdc_health_insurance,zcta_cdc_high_blood_pressure,zcta_cdc_high_cholesterol,zcta_cdc_mammography,zcta_cdc_mental_health,zcta_cdc_obesity,zcta_cdc_physical_health,zcta_cdc_physical_inactivity,zcta_cdc_sleep_lt_7_hours,zcta_cdc_stroke,zcta_cdc_taking_bp_medication,zcta_cms_mapd_aetna_mrkt_share,zcta_cms_mapd_bcbs_mrkt_share,zcta_cms_mapd_cigna_mrkt_share,zcta_cms_mapd_humana_mrkt_share,zcta_cms_mapd_kaiser_mrkt_share,zcta_cms_mapd_lis_mrkt_share,zcta_cms_mapd_mrkt_leader,zcta_cms_mapd_other_mrkt_share,zcta_cms_mapd_penetration_2021,zcta_cms_mapd_penetration_2022,zcta_cms_mapd_penetration_pct_change,zcta_cms_mapd_priority_mrkt_share,zcta_cms_mapd_uhc_mrkt_share,zcta_cms_mapd_wellcare_mrkt_share,zcta_nyt_pct_dem,zcta_nyt_pct_gop,zcta_usc_housing_units,zcta_usc_land_area_m2,zcta_usc_median_home_value,zcta_usc_median_household_income,zcta_usc_occupied_housing_units,zcta_usc_pct_20_24,zcta_usc_pct_25_34,zcta_usc_pct_35_44,zcta_usc_pct_45_54,zcta_usc_pct_55_59,zcta_usc_pct_60_64,zcta_usc_pct_65_74,zcta_usc_pct_65_over,zcta_usc_pct_american_indian_alaska_native,zcta_usc_pct_asian,zcta_usc_pct_black,zcta_usc_pct_hispanic_latino,zcta_usc_pct_native_hawaiian_pacific_islander,zcta_usc_pct_white,zcta_usc_population_density_per_km2,zcta_usc_sex_ratio,zcta_usc_total_population,tu_DEMO_INCOME_DOLLARS,tu_DEMO_CHILDREN_YES,tu_DEMO_CHILDREN_NO,tu_DEMO_AFFILIATION_CONSERVATIVE,tu_DEMO_AFFILIATION_LIBERAL,tu_DEMO_EDUCATION_YEARS,tu_DEMO_HOMEOWNER_YES,tu_DEMO_HOMEOWNER_NO,tu_DEMO_HOMEVALUE_DOLLARS,tu_DEMO_RESIDENT_YEARS,tu_DEMO_OCCUPATION_FIRST,tu_STATUS_COUNT,tu_STATUS_LATENCY,tu_SUB_USER_ID,tu_GROUP_ID,tu_ACCOUNT,tu_CONTACT_SCORE,tu_CREDIT_SCORE,post_raw_application_id,post_raw_application_name,post_raw_policy_id,post_raw_carrier,post_raw_zcta,LTV,zip,state,city,gender
0,01218aa1-14ee-4844-85ed-a1a24941ccf1,,Wellcare No Premium Open (PPO),34055980-504-001-000,7xw4v80fm74,5203314000.0,01218aa1-14ee-4844-85ed-a1a24941ccf1,0.0,O65,O65 - Wellcare,,william,pinnell,3.0,2.0,12.0,2021.0,520.0,76.0,75 to 85,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,32.33942,-110.984997,9.8,77.0,26.6,15.5,9.2,85.4,92.0,3.4,69.1,6.6,35.8,27.3,6.6,9.4,12.2,69.8,18.2,8.9,15.3,11.7,32.7,35.7,76.8,12.8,26.2,12.3,19.2,30.9,3.4,78.1,3.89,0.02,0.91,14.95,0.0,0.06,uhc,0.8,5333.0,5511.0,5.49,,77.82,1.62,58.57,39.9,17325.0,47146858.0,285800.0,62544.0,15591.0,6.9,14.5,10.6,9.8,5.5,7.2,13.3,27.2,0.6,6.6,1.5,18.9,0.0,82.0,729.91,92.2,34413.0,40000.0,0.0,100.0,0.0,0.0,13.0,100.0,0.0,150000.0,44.0,professional,17.0,0.277553,5349.0,pickup,5349.0,930.0,715.0,01218aa1-14ee-4844-85ed-a1a24941ccf1,Wellcare No Premium Open (PPO),34055980-504-001-000,O65 - Wellcare,85704,509.25,85704.0,AZ,TUCSON,M
1,0123cdb6-24f2-451f-b4c6-a34b9832eac1,olesauceda@gmail.com,Wellcare No Premium (HMO),C4053348601-H5294-011-000,1v20gr1ta31,9562455000.0,0123cdb6-24f2-451f-b4c6-a34b9832eac1,0.0,O65,O65 - Wellcare,,olegario,sauceda,0.0,14.0,2.0,2022.0,956.0,78.0,75 to 85,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,26.285511,-97.471595,20.0,75.0,22.8,17.7,6.4,79.5,87.9,4.5,50.9,8.6,19.5,21.6,8.6,8.4,17.3,46.2,18.1,19.5,34.6,43.1,37.6,35.4,74.7,15.2,41.6,18.2,37.3,38.4,4.6,75.7,9.5,0.02,19.03,10.14,0.0,0.14,uhc,0.01,5895.0,6299.0,8.7,,58.19,3.11,56.11,42.94,3419.0,241166274.0,118900.0,40299.0,2259.0,3.0,10.5,9.0,12.1,8.1,3.6,15.0,23.0,0.4,0.4,0.0,80.5,0.0,76.1,28.67,92.0,6915.0,,,,,,,,,,,,,,,,,,,0123cdb6-24f2-451f-b4c6-a34b9832eac1,Wellcare No Premium (HMO),C4053348601-H5294-011-000,O65 - Wellcare,78583,442.5,78583.0,TX,,M
2,01c1b965-9369-42e2-8d86-cecad10badee,,AARP Medicare Advantage Plan 1 (HMO-POS),OEC_SFTYX3HC6A1LT,4g10u97qr03,5735628000.0,01c1b965-9369-42e2-8d86-cecad10badee,0.0,O65,O65 - United Health Care,,lillian,mitchell,1.0,18.0,1.0,2022.0,573.0,89.0,More than 85,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,37.861318,-90.587878,23.2,71.7,27.1,17.9,6.5,81.6,84.7,2.7,57.9,8.9,33.1,32.8,6.3,10.9,25.1,53.1,27.0,9.6,20.8,17.1,29.8,31.1,73.5,18.4,36.2,14.9,35.4,36.0,3.3,74.5,4.5,0.96,,53.41,0.0,0.06,humana,0.0,4105.0,4535.0,11.43,,39.95,1.18,25.16,73.25,608.0,2818635.0,47200.0,39625.0,492.0,9.9,8.6,10.3,20.5,5.0,3.6,6.9,8.6,0.0,0.0,0.0,0.3,0.0,99.0,522.24,69.6,1472.0,,,,,,,,,,,,,,,,,,,01c1b965-9369-42e2-8d86-cecad10badee,AARP Medicare Advantage Plan 1 (HMO-POS),OEC_SFTYX3HC6A1LT,O65 - United Health Care,63653,984.245833,63653.0,MO,,F
3,04615383-867d-469d-876b-8ac397481f3d,noemail@yahoo.com,Wellcare Giveback (HMO),34755918-444-193-000,6fh5j36rw62,3862660000.0,04615383-867d-469d-876b-8ac397481f3d,0.0,O65,O65 - Wellcare,,clinton,jr,1.0,29.0,3.0,2022.0,386.0,66.0,65 to 75,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29.232739,-81.068471,24.2,76.4,29.5,18.2,7.5,80.0,89.1,4.0,62.4,11.7,27.4,25.9,8.5,9.7,25.9,56.0,21.9,14.7,27.2,24.3,38.9,33.2,78.4,19.5,32.2,18.9,33.3,42.7,5.1,75.2,1.31,1.18,0.83,61.59,0.0,0.05,humana,0.55,5559.0,5725.0,5.83,,31.55,2.99,42.47,56.54,13098.0,31068944.0,120000.0,40659.0,11249.0,4.5,12.5,13.8,11.2,6.7,8.3,11.4,20.0,0.1,3.6,29.0,9.8,0.0,60.8,909.78,94.3,28266.0,,,,,,,,,,,,,,,,,,,04615383-867d-469d-876b-8ac397481f3d,Wellcare Giveback (HMO),34755918-444-193-000,O65 - Wellcare,32117,0.0,32117.0,FL,,M
4,0484f9eb-864e-45ac-81b3-f56af9ec7608,,HumanaChoice SNP-DE H5216-292 (PPO D-SNP),,6p77p48ar59,6625880000.0,0484f9eb-864e-45ac-81b3-f56af9ec7608,0.0,O65,O65 - Humana,,bobbie,flowers,0.0,14.0,3.0,2022.0,662.0,79.0,75 to 85,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,33.739586,-90.523058,23.3,83.6,30.7,10.9,6.5,85.7,90.4,4.2,61.5,8.4,32.2,24.0,7.5,11.7,21.0,47.2,18.8,17.2,28.4,22.2,50.6,36.8,76.4,17.2,48.4,15.6,42.8,42.8,5.6,82.6,,0.0,,66.67,,0.1,humana,0.0,2930.0,4000.0,37.42,,25.51,7.82,70.03,28.91,1256.0,140237704.0,87000.0,42616.0,1098.0,10.2,16.6,9.7,12.2,6.3,4.1,9.8,15.8,0.0,0.0,79.4,1.0,0.0,20.1,21.25,80.9,2980.0,,,,,,,,,,,,,,,,,,,0484f9eb-864e-45ac-81b3-f56af9ec7608,HumanaChoice SNP-DE H5216-292 (PPO D-SNP),,O65 - Humana,38771,0.0,38771.0,MS,,F


#### CatBoost Data Preprocessing

In [3]:
## Post-process the Dataset
from utils.post_processing_utils import process_catboost

X_train, X_test = process_catboost(data=data, config_path=CONFIG_PATH, for_training=False)


# logger.info(
#     f"Splitted input dataset shapes are: \
#         X_train= {X_train.shape}, y_train= {y_train.shape},\
#             X_test= {X_test.shape}, y_test= {y_test.shape}"
# )

In [4]:
X_train.columns

Index(['submitted_weekday', 'submitted_day', 'submitted_month',
       'submitted_year', 'area_code', 'age', 'age_range',
       'jrn_boberdoo_amount', 'jrn_boberdoo_source', 'jrn_boberdoo_lead_type',
       ...
       'tu_DEMO_OCCUPATION_FIRST', 'tu_STATUS_COUNT', 'tu_STATUS_LATENCY',
       'tu_CONTACT_SCORE', 'tu_CREDIT_SCORE', 'zip', 'state', 'city', 'gender',
       'LTV'],
      dtype='object', length=138)

In [5]:
## Also store Train and Test datasets in s3
# Train
bucket_path = f"pre-conversion-ma-ltv/data/post-processed/ma_ltv_train.csv"
train_data_path = 's3://{}/{}'.format(s3_bucket, bucket_path)
X_train.to_csv(train_data_path, index=False)
# Test
bucket_path = f"pre-conversion-ma-ltv/data/post-processed/ma_ltv_test.csv"
test_data_path = 's3://{}/{}'.format(s3_bucket, bucket_path)
X_test.to_csv(test_data_path, index=False)

In [6]:
experiment_name = 'rb_test1'
# s3_bucket = "s3://hc-prd-mlflow-bucket"
tracking_uri = "https://mlflow.healthcare.com/"

sess = sagemaker.Session(default_bucket=s3_bucket)
subnets = ['subnet-0b8fee7c', 'subnet-da08a7f1', 'subnet-e008a7cb']
security_group_ids = ['sg-e6d64f82']
role = 'arn:aws:iam::915124832670:role/hc-sagemaker-default-execution-role' # Local
current_user_arn = boto3.resource('iam').CurrentUser().arn # Local


In [7]:
current_user_arn

'arn:aws:iam::915124832670:user/rutvik.bhende'

In [8]:
hyperparameters = {
    'tracking_uri': tracking_uri,
    'experiment_name': experiment_name,
    'user_arn': current_user_arn,
    'target': "LTV",
    'iterations': 566,
    'learning_rate': 0.01,
    'depth': 14,
    'loss_function': "RMSE",
}

hyperparameter_ranges = {
    'iterations': IntegerParameter(566),
    'depth': IntegerParameter(12),
    'learning_rate': CategoricalParameter([0.01]),
    'loss_function': CategoricalParameter(["MAE"]), 
}


metric_definitions = [
    {'Name': 'MAE_train', 'Regex': "MAE_train: ([0-9]*.[0-9]*)"},
    {'Name': 'MAE_test', 'Regex': "MAE_test: ([0-9]*.[0-9]*)"},
    {'Name': 'RMSE_train', 'Regex': "RMSE_train: ([0-9]*.[0-9]*)"},
    {'Name': 'RMSE_test', 'Regex': "RMSE_test: ([0-9]*.[0-9]*)"},
    {'Name': 'R2_score_train', 'Regex': "R2_score_train: ([0-9]*.[0-9]*)"},
    {'Name': 'R2_score_test', 'Regex': "R2_score_test: ([0-9]*.[0-9]*)"},
    {'Name': 'test_preds_mean', 'Regex': "test_preds_mean: ([0-9]*.[0-9]*)"},
]

objective_metric_name = 'RMSE_test'
objective_type = 'Minimize'

estimator = SKLearn(
    entry_point='train_catboost.py',
    source_dir='models_py',
    role=role,
    subnets=subnets,
    security_group_ids=security_group_ids,
    metric_definitions=metric_definitions,
    hyperparameters=hyperparameters,
    instance_count=1,
    instance_type='ml.c5.xlarge',
    framework_version='0.23-1',
    py_version='py3',
)

In [9]:
estimator.fit({'train':train_data_path, 'test': test_data_path})

2022-06-24 17:05:19 Starting - Starting the training job...
2022-06-24 17:05:44 Starting - Preparing the instances for trainingProfilerReport-1656090319: InProgress
.........
2022-06-24 17:07:16 Downloading - Downloading input data...
2022-06-24 17:07:52 Training - Downloading the training image...
2022-06-24 17:08:12 Training - Training image download completed. Training in progress.[34m2022-06-24 17:08:15,141 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-06-24 17:08:15,144 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-06-24 17:08:15,156 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-06-24 17:08:15,822 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/miniconda3/bin/python -m pip install -r requirements.txt[0m
[34mCollecting mlflow==1.26.0
  Downloading mlflow-1.26.0-py3-none-any.whl (17.8 MB)

UnexpectedStatusException: Error for Training job sagemaker-scikit-learn-2022-06-24-17-05-14-673: Failed. Reason: ClientError: Please use an instance type with more memory, or reduce the size of training data processed on an instance.

In [None]:
tuner = HyperparameterTuner(
    estimator,
    objective_metric_name,
    hyperparameter_ranges,
    metric_definitions,
    max_jobs=10,
    max_parallel_jobs=4,
    objective_type=objective_type,
    base_tuning_job_name='mlflow-ma-ltv-preconv'
)

In [None]:
# tuner.fit({'train':train_data_path, 'test': test_data_path})