In [1]:
import sagemaker
import boto3
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
import pandas as pd
from sagemaker.serializers import CSVSerializer
import sklearn.metrics as metrics
import numpy as np

In [None]:
#Modelagem utilizando o algoritmo XGBoost

In [2]:
bucket = 's3://datascience-sagemaker-laurawindlin'
role = sagemaker.get_execution_role()
treinamento = f'{bucket}/treinamento.csv'
validacao = f'{bucket}/validacao.csv'
saida = f'{bucket}/saida'

tipo_instancia = 'ml.m5.large'

container = '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest'

base_name = 'faltas-xgboost'

hp = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight":"6",
    "subsample":"0.7",
    "objective":"binary:logistic",
    "num_round": "50"
}

In [None]:
#Construindo o estimador

In [3]:
estimador = sagemaker.estimator.Estimator(
    base_name=base_name,
    image_uri=container,
    hyperparameters = hp,
    role=role,
    instance_count=1,
    instance_type=tipo_instancia,
    volume_size=5,
    output_path=saida
)

In [4]:
treinamento_input = TrainingInput(treinamento, content_type='csv')
validacao_input = TrainingInput(validacao, content_type='csv')

In [None]:
#Implantação do modelo no SageMaker

In [5]:
estimador.fit({
    'train':treinamento_input,
    'validation':validacao_input
})

2022-05-02 16:14:23 Starting - Starting the training job...ProfilerReport-1651508062: InProgress
...
2022-05-02 16:14:55 Starting - Preparing the instances for training......
2022-05-02 16:16:21 Downloading - Downloading input data......
2022-05-02 16:17:21 Training - Downloading the training image.....[34mArguments: train[0m
[34m[2022-05-02:16:18:01:INFO] Running standalone xgboost training.[0m
[34m[2022-05-02:16:18:01:INFO] File size need to be processed in the node: 0.03mb. Available memory size in the node: 300.18mb[0m
[34m[2022-05-02:16:18:01:INFO] Determined delimiter of CSV input is ','[0m
[34m[16:18:01] S3DistributionType set as FullyReplicated[0m
[34m[16:18:01] 518x8 matrix with 4144 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2022-05-02:16:18:01:INFO] Determined delimiter of CSV input is ','[0m
[34m[16:18:01] S3DistributionType set as FullyReplicated[0m
[34m[16:18:01] 148x8 matrix with 1184 entries loaded from /o

In [6]:
predictor = estimador.deploy(
    initial_instance_count=1, 
    instance_type=tipo_instancia,
    serializer=CSVSerializer(), 
    endpoint_name='datascience-xgboost-ac'
)

-----!

In [None]:
#Avalição do modelo com base de testes

In [8]:
dados_teste = pd.read_csv('dados/teste.csv')

predictions = []
for index, row in dados_teste.iterrows():
    p = predictor.predict(row[1:])
    predictions.append(float(p.decode('utf8')))

predictions = np.array(predictions)

In [9]:
predictions

array([0.13265353, 0.03146214, 0.03146214, 0.03146214, 0.03819143,
       0.05735774, 0.03146214, 0.03146214, 0.03819143, 0.03146214,
       0.05735774, 0.03819143, 0.03146214, 0.03146214, 0.03146214,
       0.03146214, 0.03146214, 0.13265353, 0.03146214, 0.05735774,
       0.13265353, 0.03819143, 0.03146214, 0.03146214, 0.03819143,
       0.03146214, 0.05735774, 0.13265353, 0.03819143, 0.05735774,
       0.03146214, 0.13265353, 0.05735774, 0.03146214, 0.06979008,
       0.03819143, 0.03146214, 0.03146214, 0.03819143, 0.05735774,
       0.06979008, 0.03146214, 0.03146214, 0.05735774, 0.03819143,
       0.03146214, 0.03819143, 0.03146214, 0.03146214, 0.03819143,
       0.03146214, 0.13265353, 0.03146214, 0.03146214, 0.03146214,
       0.03146214, 0.03146214, 0.03146214, 0.06979008, 0.03146214,
       0.05735774, 0.05735774, 0.05735774, 0.03819143, 0.05735774,
       0.03146214, 0.13265353, 0.03146214, 0.03146214, 0.03146214,
       0.11120456, 0.03819143, 0.05735774, 0.03146214])

In [10]:
predictions = np.where(predictions > 0.5, 1, 0)
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

In [11]:
print(metrics.classification_report(dados_teste.iloc[:, 0], predictions))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97        70
           1       0.00      0.00      0.00         4

    accuracy                           0.95        74
   macro avg       0.47      0.50      0.49        74
weighted avg       0.89      0.95      0.92        74



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
