In [1]:
import pandas as pd
import sagemaker
from sagemaker.sklearn.estimator import SKLearn
import boto3
import re
import pathlib
from sklearn.model_selection import train_test_split
from sagemaker.tuner import IntegerParameter, HyperparameterTuner, ContinuousParameter, CategoricalParameter
from utils.utils import get_logger, load_data
from utils.load_config_file import load_config_file
from warnings import filterwarnings
filterwarnings("ignore")

logger = get_logger(name=pathlib.Path("ltv-models-training.ipynb"))

CONFIG_PATH = "config/config.ini"

pd.set_option('display.max_columns', 500)

In [16]:
## Get data from s3
s3_bucket = 'hc-data-science'
bucket_path = f"pre-conversion-ma-ltv/data/ma_ltv_merged.csv"
data_bucket_path = 's3://{}/{}'.format(s3_bucket, bucket_path)

data = pd.read_csv(data_bucket_path, low_memory=False)
data.head()

Unnamed: 0,application_id,owner_email,application_name,policy_id,owner_id,owner_phone,app_zip_code,pol_zip_code,parent_application_id,sk_referral_flag,bk_product_type,carrier,bk_gender,bk_state,lead_id,first_name,last_name,submitted_weekday,submitted_day,submitted_month,submitted_year,area_code,age,age_range,jrn_state,jrn_boberdoo_amount,jrn_boberdoo_source,jrn_boberdoo_lead_type,jrn_id,jrn_date,jrn_tracking_file_path,jrn_event_date,jrn_firstparty,jrn_tcpa_universal_id,jrn_is_first_party,jrn_error,jrn_url,jrn_year,jrn_month,jrn_day,jrn_hour,jrn_request_f_name,jrn_request_l_name,jrn_request_email,jrn_request_phone1,jrn_request_address1,jrn_request_city,jrn_request_state,jrn_request_tcpa_universal_id,jrn_request_provider,jrn_request_age,jrn_request_dob,jrn_request_smoker,jrn_request_gender,jrn_response_audit_authentic,jrn_response_audit_consumer_five_minutes,jrn_response_audit_consumer_hour,jrn_response_audit_consumer_twelve_hours,jrn_response_audit_consumer_twelve_consumer_day,jrn_response_audit_consumer_week,jrn_response_audit_data_integrity,jrn_response_audit_fields_email,jrn_response_audit_fields_f_name,jrn_response_audit_fields_l_name,jrn_response_audit_fields_phone1,jrn_response_audit_fields_city,jrn_response_audit_fields_state,jrn_response_audit_fields_address1,jrn_response_audit_device_five_minutes,jrn_response_audit_device_hour,jrn_response_audit_device_twelve_hours,jrn_response_audit_device_day,jrn_response_audit_device_week,jrn_response_audit_consumer_dupe_check,jrn_response_audit_entity_value,jrn_response_audit_ip_five_minutes,jrn_response_audit_ip_hour,jrn_response_audit_ip_twelve_hours,jrn_response_audit_ip_day,jrn_response_audit_ip_week,jrn_response_audit_lead_age,jrn_response_audit_age,jrn_response_audit_lead_duration,jrn_response_audit_duration,jrn_response_audit_lead_dupe_check,jrn_response_audit_lead_dupe,jrn_response_audit_lead_five_minutes,jrn_response_audit_lead_hour,jrn_response_audit_lead_twelve_hours,jrn_response_audit_lead_day,jrn_response_audit_lead_week,jrn_response_audit_market_leadid_tcpa_disclosure,jrn_response_audit_market_leadid_tcpa_prominence,jrn_response_audit_market_leadid_tcpa_contrast,jrn_response_audit_market_leadid_tcpa_visibility,jrn_response_audit_market_leadid_tcpa_stored,jrn_response_audit_market_leadid_tcpa_capture,jrn_response_audit_market_leadid_tcpa_result,jrn_response_audit_market_leadid_result,jrn_response_audit_market_result,jrn_response_audit_url_value,jrn_response_audit_result,jrn_response_audit_token,zcta_latitude,zcta_longitude,zcta_state,zcta_cdc_all_teeth_lost,zcta_cdc_annual_checkup,zcta_cdc_arthritis,zcta_cdc_binge_drinking,zcta_cdc_cancer_except_skin,zcta_cdc_cervical_cancer_screening,zcta_cdc_cholesterol_screening,zcta_cdc_chronic_kidney_disease,zcta_cdc_colorectal_cancer_screening,zcta_cdc_copd,zcta_cdc_core_preventive_services_for_older_men,zcta_cdc_core_preventive_services_for_older_women,zcta_cdc_coronary_heart_disease,zcta_cdc_current_asthma,zcta_cdc_current_smoking,zcta_cdc_dental_visit,zcta_cdc_depression,zcta_cdc_diabetes,zcta_cdc_general_health,zcta_cdc_health_insurance,zcta_cdc_high_blood_pressure,zcta_cdc_high_cholesterol,zcta_cdc_mammography,zcta_cdc_mental_health,zcta_cdc_obesity,zcta_cdc_physical_health,zcta_cdc_physical_inactivity,zcta_cdc_sleep_lt_7_hours,zcta_cdc_stroke,zcta_cdc_taking_bp_medication,zcta_cms_mapd_aetna_mrkt_share,zcta_cms_mapd_bcbs_mrkt_share,zcta_cms_mapd_cigna_mrkt_share,zcta_cms_mapd_humana_mrkt_share,zcta_cms_mapd_kaiser_mrkt_share,zcta_cms_mapd_lis_mrkt_share,zcta_cms_mapd_mrkt_leader,zcta_cms_mapd_other_mrkt_share,zcta_cms_mapd_penetration_2021,zcta_cms_mapd_penetration_2022,zcta_cms_mapd_penetration_pct_change,zcta_cms_mapd_priority_mrkt_share,zcta_cms_mapd_uhc_mrkt_share,zcta_cms_mapd_wellcare_mrkt_share,zcta_nyt_pct_dem,zcta_nyt_pct_gop,zcta_usc_housing_units,zcta_usc_land_area_m2,zcta_usc_median_home_value,zcta_usc_median_household_income,zcta_usc_occupied_housing_units,zcta_usc_pct_20_24,zcta_usc_pct_25_34,zcta_usc_pct_35_44,zcta_usc_pct_45_54,zcta_usc_pct_55_59,zcta_usc_pct_60_64,zcta_usc_pct_65_74,zcta_usc_pct_65_over,zcta_usc_pct_american_indian_alaska_native,zcta_usc_pct_asian,zcta_usc_pct_black,zcta_usc_pct_hispanic_latino,zcta_usc_pct_native_hawaiian_pacific_islander,zcta_usc_pct_white,zcta_usc_population_density_per_km2,zcta_usc_sex_ratio,zcta_usc_total_population,tu_CITY,tu_STATE,tu_ZIP,tu_DEMO_INCOME_DOLLARS,tu_DEMO_CHILDREN_YES,tu_DEMO_CHILDREN_NO,tu_DEMO_AFFILIATION_CONSERVATIVE,tu_DEMO_AFFILIATION_LIBERAL,tu_DEMO_EDUCATION_YEARS,tu_DEMO_HOMEOWNER_YES,tu_DEMO_HOMEOWNER_NO,tu_DEMO_HOMEVALUE_DOLLARS,tu_DEMO_RESIDENT_YEARS,tu_DEMO_OCCUPATION_FIRST,tu_STATUS_COUNT,tu_STATUS_LATENCY,tu_SUB_USER_ID,tu_GROUP_ID,tu_ACCOUNT,tu_CONTACT_SCORE,tu_CREDIT_SCORE,post_raw_application_id,post_raw_model_predicted_duration,LTV,post_raw_medicare_number,post_raw_policy_id
0,0001b60b-18da-471e-a8d8-8f9ddc79951e,,Wellcare Dual Access (HMO D-SNP),21157188-444-124-000,2nu8wn0gm33,8504662000.0,32507.0,,0001b60b-18da-471e-a8d8-8f9ddc79951e,0,O65,O65 - Wellcare,F,FL,,annie,powell,4,5,11,2021,850.0,84,75 to 85,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,30.339189,-87.378231,FL,15.2,76.8,27.1,17.9,7.2,83.6,89.3,3.1,61.6,8.9,38.8,26.3,6.6,9.5,19.9,61.0,21.6,11.9,21.3,19.9,34.4,31.3,74.0,17.1,29.9,15.0,27.9,39.9,3.7,73.1,3.83,0.89,7.3,31.8,0.0,0.08,uhc,0.31,4059.0,4350.0,8.31,0.0,48.71,7.16,41.63,56.74,18716.0,72190100.0,172200.0,51842.0,13631.0,8.3,14.0,9.4,12.8,7.6,8.4,10.8,16.0,0.4,3.2,16.3,5.8,0.2,75.2,466.67,101.0,33689.0,,,,,,,,,,,,,,,,,,,,,,0001b60b-18da-471e-a8d8-8f9ddc79951e,15.0,996.25,2NU8WN0GM33,21157188-444-124-000
1,00042fc5-f3cf-4c54-b746-41c77dbe51db,,HumanaChoice H5216-248 (PPO),00013228007K_PPO,1dr1xm2ke15,5409600000.0,24422.0,,00042fc5-f3cf-4c54-b746-41c77dbe51db,0,O65,O65 - Humana,F,VA,,rhonda,hoke,3,2,12,2021,540.0,56,Less than 65,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,37.832856,-79.750441,VA,20.7,80.3,36.2,13.4,9.1,83.2,90.6,4.2,60.2,12.3,30.1,29.1,9.9,10.8,23.0,58.7,21.6,15.7,27.1,14.7,44.0,37.4,76.2,16.9,38.6,18.3,35.9,39.4,5.3,81.2,15.84,2.5,,34.98,,0.05,uhc,0.01,2419.0,2824.67,19.76,,46.67,,26.72,72.08,3111.0,205949300.0,108300.0,43977.0,2616.0,5.5,13.6,7.8,9.4,7.9,9.7,13.5,25.6,0.0,0.2,9.2,0.9,0.0,87.4,28.91,88.6,5953.0,CLIFTON FORGE,VA,24422.0,85000.0,0.0,100.0,,,13.0,100.0,0.0,100000.0,12.0,,17.0,0.360621,5349.0,pickup,5349.0,42.0,556.0,,,,,
2,0006a78d-2b0f-44e4-a1cb-e1f5a3416ff9,richard_kelly2007@yahoo.com,AARP Medicare Advantage Plan 2 (HMO),OEC_SFTCF04JQ36A0,1rj6fq6nh72,9412650000.0,34221.0,,0006a78d-2b0f-44e4-a1cb-e1f5a3416ff9,0,O65,O65 - United Health Care,M,FL,129566671.0,richard,kelly,1,19,4,2022,941.0,65,65 to 75,FL,13.72,500.0,Exclusive,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,27.581152,-82.551825,FL,16.6,,,,,83.4,,,64.1,,30.2,28.4,,,,59.5,,,,,,,74.8,,,,,36.3,,,20.63,0.36,0.34,40.56,,0.04,humana,0.6,4345.0,4552.0,8.72,0.0,36.07,1.44,41.56,57.61,22155.0,139606000.0,198100.0,57820.0,17455.0,5.2,11.6,11.3,12.2,6.5,8.0,13.7,25.2,0.3,1.0,17.0,16.9,0.0,75.4,364.29,96.5,50857.0,,,,,,,,,,,,,,,,,,,,,,0006a78d-2b0f-44e4-a1cb-e1f5a3416ff9,19.0,946.083333,1RJ6FQ6NH72,OEC_SFTCF04JQ36A0
3,000b17c2-9ddc-45b8-818a-6dad511fe5de,clindabrown225@gmail.com,Humana Gold Plus H1951-048 (HMO),,6y25k28gu56,2252537000.0,70737.0,,000b17c2-9ddc-45b8-818a-6dad511fe5de,0,O65,O65 - Humana,F,LA,24912031.0,clinda,brown,5,26,3,2022,225.0,65,65 to 75,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,30.226092,-90.92498,LA,14.9,80.3,23.7,21.7,6.1,86.5,88.9,2.7,65.9,6.6,27.8,24.2,5.5,9.1,21.0,59.0,24.7,9.6,19.1,14.0,34.6,33.8,78.5,17.5,37.6,12.7,30.2,37.5,3.1,71.8,2.35,0.06,,92.65,,0.06,humana,0.41,6245.0,6440.0,6.63,,0.0,4.52,32.73,65.28,17887.0,139842400.0,205300.0,74863.0,16185.0,6.7,16.4,12.6,13.9,7.2,4.9,9.1,12.8,0.1,1.0,24.9,6.3,0.1,67.3,319.96,100.7,44744.0,GONZALES,LA,70737.0,,,,,,,,,,,,10.0,0.342807,5349.0,pickup,5349.0,239.0,490.0,,,,,
4,000c6270-f162-4f5f-a0bb-104ccdb5274a,012345@noreply.com,Humana Gold Plus H0028-029 (HMO),00018705403K_HMO,8gf3jg1da73,3613193000.0,78102.0,,000c6270-f162-4f5f-a0bb-104ccdb5274a,0,O65,O65 - Humana,F,TX,,adela,casarez,0,7,3,2022,361.0,72,65 to 75,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,28.415392,-97.714457,TX,17.9,69.2,19.7,19.9,4.9,79.7,84.4,3.1,51.7,6.5,20.8,19.5,5.9,8.3,20.9,47.7,19.3,13.5,27.3,37.4,33.7,33.2,74.1,15.7,41.3,14.6,35.2,38.8,3.3,70.6,0.0,,,43.82,,0.05,uhc,0.01,5202.0,5618.5,9.51,,55.89,0.56,25.55,73.42,8690.0,1363098000.0,95600.0,46089.0,7012.0,8.4,18.7,15.6,12.6,6.0,4.3,6.6,11.4,0.7,0.7,8.3,59.0,0.0,78.9,20.73,169.3,28253.0,,,,,,,,,,,,,,,,,,,,,,,,,,


In [17]:
data.pol_zip_code



0            NaN
1            NaN
2            NaN
3            NaN
4            NaN
          ...   
21295        NaN
21296        NaN
21297    77053.0
21298        NaN
21299    45154.0
Name: pol_zip_code, Length: 21300, dtype: float64

#### CatBoost Data Preprocessing

In [18]:
## Post-process the Dataset
from utils.post_processing_utils import process_catboost

X_train, X_test = process_catboost(data=data, config_path=CONFIG_PATH)


# logger.info(
#     f"Splitted input dataset shapes are: \
#         X_train= {X_train.shape}, y_train= {y_train.shape},\
#             X_test= {X_test.shape}, y_test= {y_test.shape}"
# )

In [19]:
## Also store Train and Test datasets in s3
# Train
bucket_path = f"pre-conversion-ma-ltv/data/post-processed/ma_ltv_train.csv"
train_data_path = 's3://{}/{}'.format(s3_bucket, bucket_path)
X_train.to_csv(train_data_path, index=False)
# Test
bucket_path = f"pre-conversion-ma-ltv/data/post-processed/ma_ltv_test.csv"
test_data_path = 's3://{}/{}'.format(s3_bucket, bucket_path)
X_test.to_csv(test_data_path, index=False)

In [20]:
config = load_config_file(config_path=CONFIG_PATH)
force_categorical = config["force_categorical"]
for cat in force_categorical:
    feats = [col for col in X_train.columns if cat in col]
    print(feats)

['app_zip_code']
['pol_zip_code']
['submitted_weekday', 'submitted_day', 'jrn_day', 'jrn_response_audit_consumer_twelve_consumer_day', 'jrn_response_audit_device_day', 'jrn_response_audit_ip_day', 'jrn_response_audit_lead_day']
['submitted_year', 'jrn_year']


In [24]:
print(X_train.LTV.dtype)

float64


In [21]:
experiment_name = 'rb_test1'
# s3_bucket = "s3://hc-prd-mlflow-bucket"
tracking_uri = "https://mlflow.healthcare.com/"

sess = sagemaker.Session(default_bucket=s3_bucket)
subnets = ['subnet-0b8fee7c', 'subnet-da08a7f1', 'subnet-e008a7cb']
security_group_ids = ['sg-e6d64f82']
role = 'arn:aws:iam::915124832670:role/hc-sagemaker-default-execution-role' # Local
current_user_arn = boto3.resource('iam').CurrentUser().arn # Local


In [15]:
current_user_arn

'arn:aws:iam::915124832670:user/rutvik.bhende'

In [22]:
hyperparameters = {
    'tracking_uri': tracking_uri,
    'experiment_name': experiment_name,
    'user_arn': current_user_arn,
    'target': "LTV",
    'iterations': 400,
    'learning_rate': 0.01,
    'depth': 13,
    'loss_function': "RMSE",
}

hyperparameter_ranges = {
    'iterations': IntegerParameter(200,300),
    'depth': IntegerParameter(12,13),
    'learning_rate': CategoricalParameter([0.01, 0.02]),
    'loss_function': CategoricalParameter(["RMSE", "MAE"]), 
}


metric_definitions = [
    {'Name': 'MAE_train', 'Regex': "MAE_train: ([0-9]*.[0-9]*)"},
    {'Name': 'MAE_test', 'Regex': "MAE_test: ([0-9]*.[0-9]*)"},
    {'Name': 'RMSE_train', 'Regex': "RMSE_train: ([0-9]*.[0-9]*)"},
    {'Name': 'RMSE_test', 'Regex': "RMSE_test: ([0-9]*.[0-9]*)"},
    {'Name': 'R2_score_train', 'Regex': "R2_score_train: ([0-9]*.[0-9]*)"},
    {'Name': 'R2_score_test', 'Regex': "R2_score_test: ([0-9]*.[0-9]*)"},
    {'Name': 'test_preds_mean', 'Regex': "test_preds_mean: ([0-9]*.[0-9]*)"},
]

objective_metric_name = 'RMSE_test'
objective_type = 'Minimize'

estimator = SKLearn(
    entry_point='train_catboost.py',
    source_dir='models_py',
    role=role,
    subnets=subnets,
    security_group_ids=security_group_ids,
    metric_definitions=metric_definitions,
    hyperparameters=hyperparameters,
    instance_count=1,
    instance_type='ml.c5.xlarge',
    framework_version='0.23-1',
    py_version='py3',
)

In [23]:
estimator.fit({'train':train_data_path, 'test': test_data_path})

2022-06-10 16:54:20 Starting - Starting the training job...
2022-06-10 16:54:46 Starting - Preparing the instances for trainingProfilerReport-1654880059: InProgress
......
2022-06-10 16:55:49 Downloading - Downloading input data...
2022-06-10 16:56:10 Training - Downloading the training image...
2022-06-10 16:56:47 Training - Training image download completed. Training in progress..[34m2022-06-10 16:56:47,779 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-06-10 16:56:47,782 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-06-10 16:56:47,790 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-06-10 16:56:48,635 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/miniconda3/bin/python -m pip install -r requirements.txt[0m
[34mCollecting mlflow==1.26.0[0m
[34m  Downloading mlflow-1.26.0-py3-none-any.whl (1

UnexpectedStatusException: Error for Training job sagemaker-scikit-learn-2022-06-10-16-54-12-922: Failed. Reason: AlgorithmError: framework error: 
Traceback (most recent call last):
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_containers/_trainer.py", line 84, in train
    entrypoint()
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/training.py", line 39, in main
    train(environment.Environment())
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/training.py", line 35, in train
    runner_type=runner.ProcessRunnerType)
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/entry_point.py", line 100, in run
    wait, capture_error
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/process.py", line 291, in run
    cwd=environment.code_dir,
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/process.py", line 208, in check_error
    info=extra_info,
sagemaker_training.errors.ExecuteUserScriptError: ExecuteUserScriptError:
ExitCode 1
ErrorMessage ""
Command "/miniconda3/bin/python train_catboost.py --depth 13 --e

In [10]:
tuner = HyperparameterTuner(
    estimator,
    objective_metric_name,
    hyperparameter_ranges,
    metric_definitions,
    max_jobs=10,
    max_parallel_jobs=4,
    objective_type=objective_type,
    base_tuning_job_name='mlflow-ma-ltv-preconv'
)

In [11]:
tuner.fit({'train':train_data_path, 'test': test_data_path})

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


..................................................................................................................................................................................................................!
