In [1]:
!pip install awswrangler

Collecting awswrangler
  Downloading awswrangler-2.8.0-py3-none-any.whl (179 kB)
[K     |████████████████████████████████| 179 kB 27.0 MB/s eta 0:00:01
[?25hCollecting pymysql<1.1.0,>=0.9.0
  Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 4.1 MB/s  eta 0:00:01
Collecting pg8000<1.20.0,>=1.16.0
  Downloading pg8000-1.19.5-py3-none-any.whl (34 kB)
Collecting redshift-connector~=2.0.0
  Downloading redshift_connector-2.0.881-py3-none-any.whl (91 kB)
[K     |████████████████████████████████| 91 kB 3.3 MB/s  eta 0:00:01
Collecting scramp==1.4.0
  Downloading scramp-1.4.0-py3-none-any.whl (8.4 kB)
Installing collected packages: scramp, redshift-connector, pymysql, pg8000, awswrangler
Successfully installed awswrangler-2.8.0 pg8000-1.19.5 pymysql-1.0.2 redshift-connector-2.0.881 scramp-1.4.0


In [2]:
# imports
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.amazon.amazon_estimator import get_image_uri
import awswrangler as wr
import pandas as pd
import sagemaker
import boto3
bucket = sagemaker.Session().default_bucket()
bucket ## bucket asignado al equipo acorde a nuestro SSO 

# config
sess = sagemaker.session.Session(default_bucket= 's3://{}'.format(bucket))  ##seteado de bucket a usar
role = sagemaker.get_execution_role()   ## levantamiento de roles
region = boto3.Session().region_name    
smclient = boto3.Session().client('sagemaker')

bucket

'sagemaker-us-east-1-058528764918'

### >> Habilitación de rutas

In [3]:
prefix = 'vpc'     # subcarpeta de guardado
model_prefix = 'model'
output_path = 's3://{}/{}/victor'.format(bucket, prefix)
print(output_path)

# rutas de train y validacion acorde al formato de lectura de XGBOOST
train_fname = "{}/train.csv".format(output_path)
print(train_fname)

val_fname = "{}/validation_set.csv".format(output_path)
print(val_fname)

model_output = "{}/{}/ouput".format(output_path, model_prefix)
print(model_output)

s3://sagemaker-us-east-1-058528764918/vpc/victor
s3://sagemaker-us-east-1-058528764918/vpc/victor/train.csv
s3://sagemaker-us-east-1-058528764918/vpc/victor/validation_set.csv
s3://sagemaker-us-east-1-058528764918/vpc/victor/model/ouput


### >> Modelado

In [4]:
container = get_image_uri(region, 'xgboost', '0.90-1')

xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    instance_count=1, 
                                    instance_type='ml.m5.2xlarge',  ### 
                                    output_path=model_output,
                                    sagemaker_session=sess)

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [5]:
xgb.set_hyperparameters(eval_metric = 'auc',   #'rmse',
                        objective = 'binary:logistic', #'reg:squarederror',          ---ojo se cambian ya que ahora el objetivo es clasiificacion
                        early_stopping_rounds = 75,
                        num_round = 100)

In [6]:
hyperparameter_ranges = {
    'eta': ContinuousParameter(0.1, 0.5),
    'min_child_weight': ContinuousParameter(0, 120),
    'alpha': ContinuousParameter(0, 1000),
    'max_depth': IntegerParameter(0, 10),
    'gamma': ContinuousParameter(0, 5),
    'colsample_bytree': ContinuousParameter(0.5, 1),
    'lambda': ContinuousParameter(0, 1000),
    'max_delta_step': IntegerParameter(0, 10)
}

In [7]:
objective_metric_name = 'validation:auc' #'validation:rmse' --- ojo tambien cambia ya que este caso es clasiicaicon

In [8]:
tuner = HyperparameterTuner(
    xgb,
    objective_metric_name,
    hyperparameter_ranges,
    max_jobs=20,
    max_parallel_jobs=8,
    objective_type='Maximize',   #Minimize OJO!!! era mínimo ya que mientras menor RMSE mejor modelo de regresion, acá cambia /// mayor AUc mejor modelo 
    base_tuning_job_name='xgboost-vpc'
)

In [9]:
#s3_input_train = sagemaker.s3_input(s3_data=train_fname, content_type='csv')
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=train_fname, content_type='csv')   # OJO!! revisar cambio por la version

#s3_input_validation = sagemaker.s3_input(s3_data=val_fname, content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=val_fname, content_type='csv') # OJO!! revisar cambio por la version

In [10]:
tuner.fit(
    {
        'train': s3_input_train, 
        'validation': s3_input_validation
    }, 
    include_cls_metadata=False
)

......................................................................................................................................!


In [11]:
tuner

<sagemaker.tuner.HyperparameterTuner at 0x7fe2fddaa978>

In [12]:
print(tuner.latest_tuning_job.name)

xgboost-vpc-210604-1710


In [13]:
print(dir(tuner))

['DEFAULT_ESTIMATOR_CLS_NAME', 'DEFAULT_ESTIMATOR_MODULE', 'SAGEMAKER_ESTIMATOR_CLASS_NAME', 'SAGEMAKER_ESTIMATOR_MODULE', 'TUNING_JOB_NAME_MAX_LENGTH', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_add_estimator', '_attach_with_training_details', '_attach_with_training_details_list', '_create_warm_start_tuner', '_current_job_name', '_ensure_last_tuning_job', '_extract_hyperparameters_from_parameter_ranges', '_fit_with_estimator', '_fit_with_estimator_dict', '_get_best_training_job', '_hyperparameter_ranges', '_hyperparameter_ranges_dict', '_prepare_estimator', '_prepare_estimator_cls', '_prepare_estimator_for_tuning', '_prepare_estimator_from_job_description', '_prepare_for_tuning', '_prepa

In [14]:
tuner.describe()

{'HyperParameterTuningJobName': 'xgboost-vpc-210604-1710',
 'HyperParameterTuningJobArn': 'arn:aws:sagemaker:us-east-1:058528764918:hyper-parameter-tuning-job/xgboost-vpc-210604-1710',
 'HyperParameterTuningJobConfig': {'Strategy': 'Bayesian',
  'HyperParameterTuningJobObjective': {'Type': 'Maximize',
   'MetricName': 'validation:auc'},
  'ResourceLimits': {'MaxNumberOfTrainingJobs': 20,
   'MaxParallelTrainingJobs': 8},
  'ParameterRanges': {'IntegerParameterRanges': [{'Name': 'max_depth',
     'MinValue': '0',
     'MaxValue': '10',
     'ScalingType': 'Auto'},
    {'Name': 'max_delta_step',
     'MinValue': '0',
     'MaxValue': '10',
     'ScalingType': 'Auto'}],
   'ContinuousParameterRanges': [{'Name': 'eta',
     'MinValue': '0.1',
     'MaxValue': '0.5',
     'ScalingType': 'Auto'},
    {'Name': 'min_child_weight',
     'MinValue': '0',
     'MaxValue': '120',
     'ScalingType': 'Auto'},
    {'Name': 'alpha',
     'MinValue': '0',
     'MaxValue': '1000',
     'ScalingType': '

In [15]:
tuner.best_training_job()
#xgboost-vpc-210603-1725-019-d2684b91

'xgboost-vpc-210604-1710-016-8f8a7268'

In [16]:
#response = client.create_model(
#    ModelName='string',

In [None]:
imagen_entrenamiento = '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:0.90-1-cpu-py3'

In [17]:
nombre_exportado = HyperparameterTuner.attach(tuner.latest_tuning_job.name).best_training_job()
nombre_exportado

'xgboost-vpc-210604-1710-016-8f8a7268'

In [18]:
from sagemaker.estimator import Estimator, Transformer
model = Estimator.attach(nombre_exportado)


2021-06-04 17:18:12 Starting - Preparing the instances for training
2021-06-04 17:18:12 Downloading - Downloading input data
2021-06-04 17:18:12 Training - Training image download completed. Training in progress.
2021-06-04 17:18:12 Uploading - Uploading generated training model
2021-06-04 17:18:12 Completed - Training job completed


In [19]:
print(dir(model))

['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_cache', '_abc_negative_cache', '_abc_negative_cache_version', '_abc_registry', '_compilation_job_name', '_compiled_models', '_current_job_name', '_enable_network_isolation', '_ensure_base_job_name', '_ensure_latest_training_job', '_get_or_create_name', '_prepare_collection_configs', '_prepare_debugger_for_training', '_prepare_debugger_rules', '_prepare_for_training', '_prepare_init_params_from_job_description', '_prepare_profiler_for_training', '_prepare_profiler_rules', '_prepare_rules', '_set_default_rule_config', '_set_source_s3_uri', 'attach', 'base_job_name', 'checkpoint_local_path', 'checkpoint_s3_uri', 'code_cha

In [20]:
model.image_uri

'683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:0.90-1-cpu-py3'

In [21]:
tuning_job_name = 'xgboost-vpc-210603-1725'
tuner_metrics = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)
tuner_metrics.dataframe().sort_values(['FinalObjectiveValue'], ascending=False).head(100)

Unnamed: 0,alpha,colsample_bytree,eta,gamma,lambda,max_delta_step,max_depth,min_child_weight,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
1,49.633127,0.997583,0.185674,0.469707,159.932396,9.0,6.0,35.207694,xgboost-vpc-210603-1725-019-d2684b91,Completed,0.998713,2021-06-03 17:34:55+00:00,2021-06-03 17:35:49+00:00,54.0
10,609.467112,0.660791,0.457639,4.465166,303.262474,1.0,8.0,56.450272,xgboost-vpc-210603-1725-010-619b6e12,Completed,0.998663,2021-06-03 17:31:41+00:00,2021-06-03 17:32:41+00:00,60.0
15,334.637573,0.960773,0.10017,0.209516,733.645119,1.0,1.0,59.560175,xgboost-vpc-210603-1725-005-5357197d,Completed,0.998663,2021-06-03 17:28:07+00:00,2021-06-03 17:28:59+00:00,52.0
2,563.568674,0.888819,0.131467,0.175517,455.098412,9.0,8.0,56.154959,xgboost-vpc-210603-1725-018-110291d2,Completed,0.998663,2021-06-03 17:34:57+00:00,2021-06-03 17:35:49+00:00,52.0
3,601.715647,0.80457,0.139556,4.057048,677.346947,0.0,9.0,81.855643,xgboost-vpc-210603-1725-017-23bd9894,Completed,0.998663,2021-06-03 17:35:05+00:00,2021-06-03 17:35:58+00:00,53.0
4,525.055491,0.978388,0.318267,2.330974,308.694603,8.0,4.0,3.576737,xgboost-vpc-210603-1725-016-fd16b8e4,Completed,0.998663,2021-06-03 17:32:07+00:00,2021-06-03 17:33:02+00:00,55.0
5,407.410227,0.595652,0.344273,3.183177,930.287621,8.0,9.0,18.346024,xgboost-vpc-210603-1725-015-726ee6a2,Completed,0.998663,2021-06-03 17:31:48+00:00,2021-06-03 17:33:24+00:00,96.0
6,192.333323,0.77035,0.308965,4.755083,29.214623,6.0,8.0,68.623123,xgboost-vpc-210603-1725-014-224f628b,Completed,0.998663,2021-06-03 17:31:27+00:00,2021-06-03 17:32:18+00:00,51.0
7,454.437815,0.926783,0.330784,1.529277,480.752709,4.0,1.0,72.534256,xgboost-vpc-210603-1725-013-023272cc,Completed,0.998663,2021-06-03 17:31:30+00:00,2021-06-03 17:32:27+00:00,57.0
8,192.333323,0.77035,0.316965,4.855083,29.214623,6.0,8.0,71.023123,xgboost-vpc-210603-1725-012-0da367ef,Completed,0.998663,2021-06-03 17:31:14+00:00,2021-06-03 17:32:08+00:00,54.0
