# Libraries


In [1]:
import sagemaker
import ipykernel
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd

# Configuración Sesión de SageMaker

In [2]:
sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = 'bucketprojectsagemaker' #S3 Instance



# Extracción de datos

In [None]:
df = pd.read_csv("./datasets/info.csv")
#Ver datos
df.head(10)

Unnamed: 0,ID_DETALLE,NOMCIENTIFICO,ALTURATOTAL,VOLUMENCOMER,DENSIDAD,ESTADO_COPA,ESTADO_FUSTE,ESTADO_RAIZ,EST_RAIZ_GEN,NIVEL_RIESGO,ESTADO_FITO,TRATAMIENTO
0,266,Sambucus nigra,3.0,0.0,Medio,Bueno,Bueno,No apreciable,Bueno,Ninguna,Bueno,Conservacion
1,267,Tecoma stans,2.5,0.0,Denso,Bueno,Bueno,No apreciable,Bueno,Ninguna,Bueno,Conservacion
2,268,Fraxinus chinensis,4.5,0.0,Ralo,Regular,Bueno,No apreciable,Bueno,Ninguna,Regular,Poda Estructural
3,269,Acacia melanoxylon,3.0,0.003,Muy Ralo,Malo,Regular,Raices Descubiertas,Regular,Ninguna,Malo,Tala
4,270,Acacia decurrens,9.5,0.665,Medio,Regular,Regular,Raices Descubiertas,Regular,Moderada,Regular,Tala
5,271,Acacia calamifolia,2.3,0.001,Muy Ralo,Regular,Regular,Raices Descubiertas,Regular,Ninguna,Regular,Tala
6,272,Sambucus nigra,3.5,0.008,Medio,Regular,Regular,Raices Descubiertas,Regular,Ninguna,Regular,Tala
7,273,Sambucus nigra,2.0,0.001,Denso,Bueno,Regular,Raices Descubiertas,Regular,Ninguna,Regular,Tala
8,274,Prunus capuli,3.5,0.0,Denso,Regular,Bueno,No apreciable,Bueno,Ninguna,Regular,Poda Estructural
9,275,Sambucus nigra,1.6,0.0,Medio,Regular,Regular,Raices Descubiertas,Regular,Ninguna,Regular,Tala


In [4]:
df.columns

Index(['ID_DETALLE', 'NOMCIENTIFICO', 'ALTURATOTAL', 'VOLUMENCOMER',
       'DENSIDAD', 'ESTADO_COPA', 'ESTADO_FUSTE', 'ESTADO_RAIZ',
       'EST_RAIZ_GEN', 'NIVEL_RIESGO', 'ESTADO_FITO', 'TRATAMIENTO'],
      dtype='object')

In [5]:
df.shape

(312923, 12)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312923 entries, 0 to 312922
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ID_DETALLE     312923 non-null  int64  
 1   NOMCIENTIFICO  312922 non-null  object 
 2   ALTURATOTAL    312923 non-null  float64
 3   VOLUMENCOMER   312923 non-null  float64
 4   DENSIDAD       312923 non-null  object 
 5   ESTADO_COPA    312923 non-null  object 
 6   ESTADO_FUSTE   312923 non-null  object 
 7   ESTADO_RAIZ    312923 non-null  object 
 8   EST_RAIZ_GEN   312923 non-null  object 
 9   NIVEL_RIESGO   312923 non-null  object 
 10  ESTADO_FITO    305418 non-null  object 
 11  TRATAMIENTO    312923 non-null  object 
dtypes: float64(2), int64(1), object(9)
memory usage: 28.6+ MB


# Normalización y Categorización de datos

In [7]:
#Eliminar datos Nulos
df= df[['ID_DETALLE','TRATAMIENTO', 'ALTURATOTAL', 'DENSIDAD', 'ESTADO_COPA', 'ESTADO_FUSTE', 'EST_RAIZ_GEN', 'NIVEL_RIESGO', 'ESTADO_FITO']].dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 305418 entries, 0 to 312918
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   ID_DETALLE    305418 non-null  int64  
 1   TRATAMIENTO   305418 non-null  object 
 2   ALTURATOTAL   305418 non-null  float64
 3   DENSIDAD      305418 non-null  object 
 4   ESTADO_COPA   305418 non-null  object 
 5   ESTADO_FUSTE  305418 non-null  object 
 6   EST_RAIZ_GEN  305418 non-null  object 
 7   NIVEL_RIESGO  305418 non-null  object 
 8   ESTADO_FITO   305418 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 23.3+ MB


In [8]:
densidad_type = {'Denso':0,'Medio':1, 'Muy Ralo':2, 'Ralo':3}
estado_types = {'Bueno':0,'Regular':1, 'Malo':2, 'Suprimido':4}
riesgo_types = {'Moderada':0,'Ninguna':1, 'Severa':2}

df['DENSIDAD'] = df['DENSIDAD'].map(densidad_type).astype(int) 
df['ESTADO_COPA'] = df['ESTADO_COPA'].map(estado_types).astype(int)  
df['ESTADO_FUSTE'] = df['ESTADO_FUSTE'].map(estado_types).astype(int) 
df['EST_RAIZ_GEN'] = df['EST_RAIZ_GEN'].map(estado_types).astype(int)  
df['ESTADO_FITO'] = df['ESTADO_FITO'].map(estado_types).astype(int)  
df['NIVEL_RIESGO'] = df['NIVEL_RIESGO'].map(riesgo_types).astype(int) 

tratamientos_types = {'Tala':0, 'Conservacion':1, 'Traslado':2, 'Poda Formacion':3, 'Poda Control de Altura':4, 'Poda Realce':5, 
                    'Poda Aclareo':6, 'Poda Equilibrio':7, 'Poda Mejoramiento':8, 'Poda Estructural':9, 'Poda Saneamiento':10, 
                      'Poda Radicular':11, 'Tratamiento Integral':12, 'Tratamiento Especial':13}
 
df['TRATAMIENTO'] = df['TRATAMIENTO'].map(tratamientos_types).astype(int)  

In [9]:
df.describe()

Unnamed: 0,ID_DETALLE,TRATAMIENTO,ALTURATOTAL,DENSIDAD,ESTADO_COPA,ESTADO_FUSTE,EST_RAIZ_GEN,NIVEL_RIESGO,ESTADO_FITO
count,305418.0,305418.0,305418.0,305418.0,305418.0,305418.0,305418.0,305418.0,305418.0
mean,1044613000000.0,3.554568,8.261619,0.899917,0.808387,0.753659,0.40385,0.651861,0.958156
std,703049200000.0,4.553216,5.658934,0.915717,0.706314,0.793375,0.615848,0.751888,0.42539
min,266.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0
25%,492931.2,0.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,1463652000000.0,1.0,7.0,1.0,1.0,1.0,0.0,0.0,1.0
75%,1539292000000.0,9.0,11.0,1.0,1.0,1.0,1.0,1.0,1.0
max,1675971000000.0,13.0,89.8,3.0,2.0,4.0,2.0,2.0,2.0


In [10]:
df.head(10)

Unnamed: 0,ID_DETALLE,TRATAMIENTO,ALTURATOTAL,DENSIDAD,ESTADO_COPA,ESTADO_FUSTE,EST_RAIZ_GEN,NIVEL_RIESGO,ESTADO_FITO
0,266,1,3.0,1,0,0,0,1,0
1,267,1,2.5,0,0,0,0,1,0
2,268,9,4.5,3,1,0,0,1,1
3,269,0,3.0,2,2,1,1,1,2
4,270,0,9.5,1,1,1,1,0,1
5,271,0,2.3,2,1,1,1,1,1
6,272,0,3.5,1,1,1,1,1,1
7,273,0,2.0,0,0,1,1,1,1
8,274,9,3.5,0,1,0,0,1,1
9,275,0,1.6,1,1,1,1,1,1


# Seleccionar conjuntos de datos de entrenamiento y prueba

In [11]:
X = df[['ALTURATOTAL', 'DENSIDAD', 'ESTADO_COPA', 'ESTADO_FUSTE', 'EST_RAIZ_GEN', 'NIVEL_RIESGO', 'ESTADO_FITO']].values
y = df[['TRATAMIENTO']].values

In [12]:
# 80% -> Data Training | 20% -> Data Testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(244334, 7)
(61084, 7)
(244334, 1)
(61084, 1)


In [13]:
features = list({'ALTURATOTAL', 'DENSIDAD', 'ESTADO_COPA', 'ESTADO_FUSTE', 
                 'EST_RAIZ_GEN', 'NIVEL_RIESGO', 'ESTADO_FITO'})
label = 'TRATAMIENTO'
trainX = pd.DataFrame(X_train)
trainX[label] = y_train

testX = pd.DataFrame(X_test)
testX[label] = y_test

print(trainX.shape)
print(testX.shape)

(244334, 8)
(61084, 8)


In [14]:
trainX.head()

Unnamed: 0,0,1,2,3,4,5,6,TRATAMIENTO
0,6.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,2.0,0.0,0.0,0.0,0.0,1.0,1.0,1
2,6.0,0.0,1.0,0.0,0.0,0.0,1.0,9
3,26.0,1.0,1.0,1.0,1.0,0.0,1.0,0
4,5.3,0.0,0.0,0.0,0.0,1.0,1.0,11


In [15]:
#Datos Nulos
trainX.isnull().sum()

0              0
1              0
2              0
3              0
4              0
5              0
6              0
TRATAMIENTO    0
dtype: int64

# Almacenamiento de datos de Entrenamniento y Prueba

In [16]:
#Creacion de CSV
trainX.to_csv("train-V-1.csv", index=False)
testX.to_csv("test-V-1.csv", index=False)

In [17]:
#Almacenar datos en el S3
sk_prefix = "sagemaker/tratamientos/sklearncontainer"
trainpath = sess.upload_data(
    path = "train-V-1.csv", bucket=bucket, key_prefix=sk_prefix
)

testpath = sess.upload_data(
    path = "test-V-1.csv", bucket=bucket, key_prefix=sk_prefix
)

# Creación de Script para SageMaker

In [18]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO
import argparse
import joblib
import os
import numpy as np
import pandas as pd

def model_fn(model_dir):
  clf = joblib.load(os.path.join(model_dir, "model.joblib"))
  return clf

if __name__ == "__main__":
  print("[INFO] Extracting arguments")
  parser = argparse.ArgumentParser()

  # Hyperparameters
  parser.add_argument("--n_estimators", type=int, default=100)
  parser.add_argument("--random_state", type=int, default=0)

  # Directorios
  parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
  parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
  parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
  parser.add_argument("--train-file", type=str, default="train-V-1.csv")
  parser.add_argument("--test-file", type=str, default="test-V-1.csv")
  args, _ = parser.parse_known_args()

  #Versiones de scikit-learn y joblib
  print("Scikit-learn Version:", sklearn.__version__)
  print("Joblib Version:", joblib.__version__)

  # Carga los datos de entrenamiento y prueba desde los archivos CSV especificados
  print("[INFO] Reading data")
  train_df = pd.read_csv(os.path.join(args.train, args.train_file))
  test_df = pd.read_csv(os.path.join(args.test, args.test_file))

  # Obtiene las columnas
  features = list(train_df.columns)
  label = features.pop(-1)

  # Conjuntos de entrenamiento y prueba
  print("Building training and testing datasets")
  X_train = train_df[features]
  X_test = test_df[features]
  y_train = train_df[label]
  y_test = test_df[label]

  print("Training Data")
  print(X_train.shape)
  print(y_train.shape)
  print()

  print("Testing Data")
  print(X_test.shape)
  print(y_test.shape)
  print()

  print("Training RandomForest Model")
  model = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
  model.fit(X_train, y_train)


  model_path = os.path.join(args.model_dir, "model.joblib")
  joblib.dump(model, model_path)


  y_pred_test = model.predict(X_test)
  test_acc = accuracy_score(y_test, y_pred_test)
  test_rep = classification_report(y_test, y_pred_test)


  print("RESULTS FOR TESTING DATA")
  print("Total Rows are: ", X_test.shape[0])
  print("[TESTING] Model Accuracy is: ", test_acc)
  print("[TESTING] Testing Report:")
  print(test_rep)


Overwriting script.py


# Configuración de entrenameinto del modelo

In [19]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    entry_point="script.py",
    role="arn:aws:iam::445567096458:role/service-role/SageMaker-SageMaker_Pruebas",
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    hyperparameters={
        "n_estimators": 100,
        "random_state": 0,
    },
    use_spot_instances = True,
    max_wait = 7200,
    max_run = 3600
)



In [20]:
# launch training job, with asynchronous call
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)


Using provided s3_resource


INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2024-12-07-01-12-16-244


2024-12-07 01:12:19 Starting - Starting the training job...
2024-12-07 01:12:36 Starting - Preparing the instances for training...
2024-12-07 01:13:07 Downloading - Downloading input data...
2024-12-07 01:13:32 Downloading - Downloading the training image...
2024-12-07 01:14:18 Training - Training image download completed. Training in progress..2024-12-07 01:14:22,316 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-12-07 01:14:22,320 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-12-07 01:14:22,364 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-12-07 01:14:22,530 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-12-07 01:14:22,543 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-12-07 01:14:22,557 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-12-07 01:

# Ubicación del modelo

In [21]:
sklearn_estimator.latest_training_job.wait(logs="None")
artifact = sm_boto3.describe_training_job(
    TrainingJobName = sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]
print("Model artifact " + artifact)


2024-12-07 01:14:46 Starting - Preparing the instances for training
2024-12-07 01:14:46 Downloading - Downloading the training image
2024-12-07 01:14:46 Training - Training image download completed. Training in progress.
2024-12-07 01:14:46 Uploading - Uploading generated training model
2024-12-07 01:14:46 Completed - Training job completed
Model artifact s3://sagemaker-us-east-1-445567096458/RF-custom-sklearn-2024-12-07-01-12-16-244/output/model.tar.gz


# Creación un modelo de SageMaker a partir del artifact

In [22]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

model_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model = SKLearnModel(
    name=model_name,
    model_data=artifact,
    role="arn:aws:iam::445567096458:role/service-role/SageMaker-Datos",
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)
model

<sagemaker.sklearn.model.SKLearnModel at 0x27229612248>

# Despliegue del modelo

In [23]:
endpoint_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("EndpointName={}".format(endpoint_name))

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name
)
endpoint_name

EndpointName=Custom-sklearn-model-2024-12-07-01-15-15


INFO:sagemaker:Creating model with name: Custom-sklearn-model-2024-12-07-01-15-15
INFO:sagemaker:Creating endpoint-config with name Custom-sklearn-model-2024-12-07-01-15-15
INFO:sagemaker:Creating endpoint with name Custom-sklearn-model-2024-12-07-01-15-15


-------!

'Custom-sklearn-model-2024-12-07-01-15-15'

In [24]:
#Eliminar Endpoint
sm_boto3.delete_endpoint(Endpoint = endpoint_name)

ParamValidationError: Parameter validation failed:
Missing required parameter in input: "EndpointName"
Unknown parameter in input: "Endpoint", must be one of: EndpointName