## Script para procesamiento
Para crear un Job de procesamiento de Amazon SageMaker, primero crearemos un script de Python, al cual nombraremos data_prep.py, que tendrá toda la lógica necesaria para realizar las mismas transformaciones que el Jupyter Notebook de ejemplo descargado en la Introducción.

In [1]:
import os
import datetime
import sagemaker
import sagemaker_utils
import numpy as np
import matplotlib.pyplot as plt
from time import gmtime, strftime
from sklearn.metrics import confusion_matrix
from sagemaker import Session, get_execution_role
from sagemaker.estimator import Estimator
from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput
from sagemaker.tuner import HyperparameterTuner, ContinuousParameter, IntegerParameter, CategoricalParameter
from sagemaker.inputs import TrainingInput, CreateModelInput, TransformInput
from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CreateModelStep, TransformStep
from sagemaker.workflow.parameters import ParameterString, ParameterFloat
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.properties import PropertyFile
from sagemaker.model_metrics import MetricsSource, ModelMetrics
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import JsonGet

# Bare expression: it has no effect here because it is not the last statement
# of the cell.
sagemaker.__version__

# SageMaker session shared by the rest of the notebook.
session = Session()
# The execution role is not looked up here.
# sagemaker_role = get_execution_role()

# Path of the churn dataset file (presumably relative to the notebook; confirm).
data_file = "Data sets/churn.txt"

# AWS context taken from the session.
region = session.boto_region_name
account_id = session.account_id()
bucket = session.default_bucket()

# Key prefixes, all rooted at `prefix`.
prefix = "churn-clf"
datasets_prefix = f"{prefix}/datasets"
processed_data_prefix = f"{prefix}/processed"
eval_prefix = f"{prefix}/eval"
transformed_data_prefix = f"{prefix}/transformed"
images_directory = f"{prefix}/images"
code_prefix = f"{prefix}/code"
model_prefix = f"{prefix}/models"


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Local path where the data-preparation script is written by the
# `%%writefile` cell.
data_prep_script_file = 'code/data_prep.py'
# `%%writefile` does not create missing parent folders, so make sure the
# target directory exists before the script is written.
os.makedirs(os.path.dirname(data_prep_script_file), exist_ok=True)

In [2]:
%%writefile $data_prep_script_file
import argparse
import pickle
import os
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

def to_pkl(data, file):
    """Pickle ``data`` into the file at path ``file``.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(file, mode='wb') as handle:
        handle.write(pickle.dumps(data))
        
def normalize_column_name(column):
    """Return ``column`` as text with each non-alphanumeric character replaced by ``_``."""
    return ''.join(c if c.isalnum() else '_' for c in str(column))


if __name__ == '__main__':
    # Script body: read the raw churn file from the processing input folder,
    # clean and encode it, split it into train/test sets and write the
    # resulting CSV files plus the fitted encoder to the output folder.
    script_name = os.path.basename(__file__)

    print(f'INFO: {script_name}: Iniciando la preparación de los datos')

    parser = argparse.ArgumentParser()
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--data-file', type=str, default='train.csv')
    # Output file names get defaults so that a missing argument no longer ends
    # in a late TypeError when the output paths are built.
    parser.add_argument('--train-data-file', type=str, default='train_data.csv')
    parser.add_argument('--train-target-file', type=str, default='train_target.csv')
    parser.add_argument('--test-data-file', type=str, default='test_data.csv')
    parser.add_argument('--test-target-file', type=str, default='test_target.csv')
    parser.add_argument('--encoder-file', type=str, default='encoder.pkl')
    # Optional seed for a reproducible split; None keeps the random behaviour.
    parser.add_argument('--random-state', type=int, default=None)

    # Unknown arguments are ignored rather than rejected.
    args, _ = parser.parse_known_args()

    print(f'INFO: {script_name}: Parámetros recibidos: {args}')

    input_path = '/opt/ml/processing/input'
    output_path = '/opt/ml/processing/output'

    data_path = os.path.join(input_path, args.data_file)

    # Load the dataset
    data = pd.read_csv(data_path)

    # Replace special characters and spaces in the column names with underscores
    data.columns = [normalize_column_name(column) for column in data.columns]

    # Column selection; copy so the in-place edits below work on an
    # independent frame instead of a slice of the original one.
    columns = ['State', 'Account_Length', 'Area_Code', 'Int_l_Plan', 'VMail_Plan', 'VMail_Message',
               'Day_Mins', 'Day_Calls', 'Eve_Mins', 'Eve_Calls', 'Night_Mins', 'Night_Calls',
               'Intl_Mins', 'Intl_Calls', 'CustServ_Calls', 'Churn_']
    data = data[columns].copy()

    # Remove the '.' from the values of the Churn_ column and rename it to
    # Churn. regex=False makes the literal match explicit instead of relying
    # on the pandas default for `regex`, which differs between versions.
    data['Churn_'] = data['Churn_'].str.replace('.', '', regex=False)
    data.rename(columns={'Churn_': 'Churn'}, inplace=True)

    # One-hot encoding of the categorical variables
    columns = ['State', 'Area_Code']
    encoder = OneHotEncoder().fit(data[columns])

    transformed = encoder.transform(data[columns]).toarray()

    data.drop(columns, axis=1, inplace=True)
    # NOTE(review): `get_feature_names` no longer exists in scikit-learn >= 1.2
    # (`get_feature_names_out` replaces it and names the columns differently
    # for DataFrame input). It is kept to preserve the current output column
    # names; confirm the scikit-learn version installed in the processing image.
    encoded = pd.DataFrame(transformed, columns=encoder.get_feature_names(), index=data.index)
    data = pd.concat([data, encoded], axis=1)

    # Replace yes/no with 1/0 in the Int_l_Plan and VMail_Plan columns
    data['Int_l_Plan'] = data['Int_l_Plan'].map(dict(yes=1, no=0))
    data['VMail_Plan'] = data['VMail_Plan'].map(dict(yes=1, no=0))

    # Replace True/False with 1/0 in the Churn column
    data['Churn'] = data['Churn'].map({'True': 1, 'False': 0})

    # Separate the label (target) from the rest of the data
    target = data[['Churn']]
    data.drop(['Churn'], axis=1, inplace=True)

    # Split into train and test sets. The test fraction comes from --test-size
    # (default 0.1) and the split is stratified to keep the class proportions.
    train_data, test_data, train_target, test_target = train_test_split(
        data, target, stratify=target, test_size=args.test_size, random_state=args.random_state)

    print('Train: {0} records with clasess: 0={1[0]}% and 1={1[1]}%'.format(train_target.shape[0],
                                             round(train_target['Churn'].value_counts(normalize=True) * 100, 1)))

    print('Test: {0} records with clasess: 0={1[0]}% and 1={1[1]}%'.format(test_target.shape[0],
                                             round(test_target['Churn'].value_counts(normalize=True) * 100, 1)))

    # Make sure every output sub-folder exists before writing into it
    for folder in ('train_data', 'train_target', 'test_data', 'test_target', 'encoder'):
        os.makedirs(os.path.join(output_path, folder), exist_ok=True)

    # Save the resulting dataframes and the encoder
    train_data.to_csv(os.path.join(output_path, 'train_data', args.train_data_file), index=False)
    train_target.to_csv(os.path.join(output_path, 'train_target', args.train_target_file), index=False)
    test_data.to_csv(os.path.join(output_path, 'test_data', args.test_data_file), index=False)
    test_target.to_csv(os.path.join(output_path, 'test_target', args.test_target_file), index=False)
    to_pkl(encoder, os.path.join(output_path, 'encoder', args.encoder_file))

    print(f'INFO: {script_name}: Finalizando la preparación de los datos')


Writing $data_prep_script_file


Y subimos el script creado a un bucket de Amazon S3.

In [5]:
# Upload the local script to the code prefix of the bucket using the project
# helper `sagemaker_utils.upload`. The returned value is later used as the
# `source` of the "code" ProcessingInput, so it is presumably the S3 URI of the
# uploaded file (confirm against the helper).
data_prep_script_path = sagemaker_utils.upload(data_prep_script_file, f's3://{bucket}/{code_prefix}')

Uploading: 100%|██████████| 4.10k/4.10k [00:01<00:00, 3.87kB/s]


Especificamos las dependencias requeridas para cada uno de los contenedores Docker que crearemos.

In [7]:
# IAM role ARN handed to the Processor as its `role`.
# NOTE(review): hard-coded, account-specific value; it must be replaced when
# the notebook is run in a different AWS account.
sagemaker_role = "arn:aws:iam::829825986145:role/service-role/AmazonSageMaker-ExecutionRole-20220424T173630"

Al inicio del código podemos observar que se definen los siguientes parámetros, los cuales se recibirán como argumentos de la línea de comandos; de esta forma podremos usar estos valores como mejor nos convenga en nuestro programa.  

Adicionalmente vemos las rutas que utilizamos para cargar el dataset (archivo churn.txt) y posteriormente guardar los DataFrames y el encoder creado.  

Y por último, podemos observar que la lógica que hemos incorporado en el script para la preparación de los datos es prácticamente la misma que la del Jupyter Notebook descargado.



**Esta ejecución se debe hacer después de la generación de las imágenes, para tener las URIs de las imágenes**  
La traemos de Sagemaker_pipelines, previamente ejecutado.

In [9]:
# Container image metadata keyed by stage ("Processing", "Training",
# "Inference"): pinned libraries, build id and image URI, plus extra build
# settings for the inference image.
docker_images = {
    "Processing": {
        "libraries": {
            "pandas": "1.2.4",
            "numpy": "1.20.2",
            "scikit-learn": "0.24.2",
        },
        "build_id": "churn-clf-processing-build-image:7e3a3c1e-4038-4a84-aae6-408606f73789",
        "image_uri": "829825986145.dkr.ecr.us-east-1.amazonaws.com/churn-clf-processing:latest",
    },
    "Training": {
        "libraries": {
            "pandas": "1.2.4",
            "numpy": "1.20.2",
            "scikit-learn": "0.24.2",
            "sagemaker-training": "3.9.2",
        },
        "build_id": "churn-clf-training-build-image:acd0f05b-4dba-48d4-85ca-c06c2addfd4e",
        "image_uri": "829825986145.dkr.ecr.us-east-1.amazonaws.com/churn-clf-training:latest",
    },
    "Inference": {
        "libraries": {
            "pandas": "1.2.4",
            "numpy": "1.20.2",
            "scikit-learn": "0.24.2",
            "multi-model-server": "1.1.8",
            "sagemaker-inference": "1.5.11",
            "boto3": "1.21.43",
            "itsdangerous": "2.0.1",
        },
        "dependencies": [("serving", "/opt/ml/serving")],
        "others": [
            "RUN pip install -e /opt/ml/serving",
            "LABEL com.amazonaws.sagemaker.capabilities.multi-models=false",
            "LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true",
        ],
        "entrypoint": ["python", "/opt/ml/serving/custom_inference/serving.py"],
        "cmd": ["serve"],
        "build_id": "churn-clf-inference-build-image:c096f995-2a79-4ea7-8364-0aa4afa43752",
        "image_uri": "829825986145.dkr.ecr.us-east-1.amazonaws.com/churn-clf-inference:latest",
    },
}

In [10]:
# Processor for the data-preparation job: it uses the "Processing" image and
# starts the script with python3 from /opt/ml/processing/input/code.
processor = Processor(
    role=sagemaker_role,
    image_uri=docker_images["Processing"]["image_uri"],
    entrypoint=[
        "python3",
        f"/opt/ml/processing/input/code/{os.path.basename(data_prep_script_file)}",
    ],
    instance_type="ml.m5.4xlarge",
    instance_count=1,
    volume_size_in_gb=5,
    max_runtime_in_seconds=2 * 60 * 60,  # two hours
)

Previo a ejecutar nuestro Job de procesamiento, definimos las siguientes variables para más adelante poder reutilizarlas.

In [12]:
# File names passed to the data-preparation script as its output arguments.
# Training split:
train_data_file = "train_data.csv"
train_target_file = "train_target.csv"
# Test split:
test_data_file = "test_data.csv"
test_target_file = "test_target.csv"
# Pickled encoder:
encoder_file = "encoder.pkl"

Y finalmente ejecutamos el Job utilizando el método **run** del objeto creado mediante la clase **Processor**.  
Debemos pasar las rutas de los buckets de Amazon S3 tanto para **inputs** (entradas) como para **outputs** (salidas). De esta forma SageMaker sabe de dónde tomar los datos de entrada y en dónde colocar los archivos resultantes de ejecutar el Job de procesamiento.

In [13]:
# Keyword arguments for `processor.run`: where the job reads its data and code
# from, where each output folder is uploaded, and the command-line arguments
# handed to the data-preparation script.
data_prep_parameters = {
    "inputs": [
        ProcessingInput(
            input_name="input",
            source=f"s3://{bucket}/{datasets_prefix}",
            destination="/opt/ml/processing/input",
        ),
        ProcessingInput(
            input_name="code",
            source=data_prep_script_path,
            destination="/opt/ml/processing/input/code",
        ),
    ],
    # One output per folder; each folder name doubles as the output name and
    # as the last segment of the S3 destination.
    "outputs": [
        ProcessingOutput(
            output_name=output_name,
            source=f"/opt/ml/processing/output/{output_name}",
            destination=f"s3://{bucket}/{processed_data_prefix}/{output_name}",
        )
        for output_name in (
            "train_data",
            "train_target",
            "test_data",
            "test_target",
            "encoder",
        )
    ],
    "arguments": [
        "--test-size", "0.1",
        "--data-file", "churn.txt",
        "--train-data-file", train_data_file,
        "--train-target-file", train_target_file,
        "--test-data-file", test_data_file,
        "--test-target-file", test_target_file,
        "--encoder-file", encoder_file,
    ],
}

processor.run(**data_prep_parameters)



Job Name:  churn-clf-processing-2022-05-01-17-44-57-933
Inputs:  [{'InputName': 'input', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-829825986145/churn-clf/datasets', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-829825986145/churn-clf/code/data_prep.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-829825986145/churn-clf/processed/train_data', 'LocalPath': '/opt/ml/processing/output/train_data', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'train_target', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-

In [15]:
# Ask the project helper for the location of the job's 'train_data' output.
# The cell result shown for this call is the S3 URI of that output.
sagemaker_utils.get_processor_output_path(processor, 'train_data')

's3://sagemaker-us-east-1-829825986145/churn-clf/processed/train_data'