# Laboratório 3.4 ML

### Importando os dados

In [1]:
import warnings, requests, zipfile, io
warnings.simplefilter('ignore')
import pandas as pd
from scipy.io import arff
import boto3

In [2]:
f_zip = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00212/vertebral_column_data.zip'
r = requests.get(f_zip, stream=True)
Vertebral_zip = zipfile.ZipFile(io.BytesIO(r.content))
Vertebral_zip.extractall()

In [3]:
data = arff.loadarff('column_2C_weka.arff')
df = pd.DataFrame(data[0])

In [4]:
class_mapper = {b'Abnormal':1,b'Normal':0}
df['class']=df['class'].replace(class_mapper)

### Explorando os dados

In [5]:
df.shape

(310, 7)

In [6]:
df.columns

Index(['pelvic_incidence', 'pelvic_tilt', 'lumbar_lordosis_angle',
       'sacral_slope', 'pelvic_radius', 'degree_spondylolisthesis', 'class'],
      dtype='object')

### Preparando dados

In [7]:
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]

#### A última coluna (class) foi movida para primeira posição para que XGboost tenha o valor alvo nessa posição

In [8]:
## Verificando

In [9]:
df.columns

Index(['class', 'pelvic_incidence', 'pelvic_tilt', 'lumbar_lordosis_angle',
       'sacral_slope', 'pelvic_radius', 'degree_spondylolisthesis'],
      dtype='object')

### Dividindo o conjunto de dados para treinamento utilizando a função train_test_split da biblioteca scikit-learn

In [10]:
from sklearn.model_selection import train_test_split
train, test_and_validate = train_test_split(df, test_size=0.2, random_state=42, stratify=df['class'])

#### Dividindo o conjunto de teste e validação em duas partes iguais

In [11]:
test, validate = train_test_split(test_and_validate, test_size=0.5, random_state=42, stratify=test_and_validate['class'])

#### Examinando o conjunto de dados

In [12]:
print(train.shape)
print(test.shape)
print(validate.shape)

(248, 7)
(31, 7)
(31, 7)


In [13]:
## Verificando a distribuição das classes

In [14]:
print(train['class'].value_counts())
print(test['class'].value_counts())
print(validate['class'].value_counts())

1    168
0     80
Name: class, dtype: int64
1    21
0    10
Name: class, dtype: int64
1    21
0    10
Name: class, dtype: int64


### Uploading para o Amazon S3

In [16]:
bucket='c30911a393409l787677t1w293575557795-labbucket-1d4z9im0gu8no'

prefix='lab3'

train_file='vertebral_train.csv'
test_file='vertebral_test.csv'
validate_file='vertebral_validate.csv'

import os

s3_resource = boto3.Session().resource('s3')
def upload_s3_csv(filename, folder, dataframe):
    csv_buffer = io.StringIO()
    dataframe.to_csv(csv_buffer, header=False, index=False)
    s3_resource.Bucket(bucket).Object(os.path.join(prefix, folder, filename)).put(Body=csv_buffer.getvalue())

In [17]:
## Acima foi definida uma função para fazer o upload dos arquivos para o tipo csv.

In [18]:
upload_s3_csv(train_file, 'train', train)
upload_s3_csv(test_file, 'test', test)
upload_s3_csv(validate_file, 'validate', validate)

### Treinando o modelo

In [19]:
## Obtendo a URI do container no XGboost

In [20]:
import boto3
from sagemaker.image_uris import retrieve
container = retrieve('xgboost',boto3.Session().region_name,'1.0-1')

In [21]:
## Definindo hiperparâmetros

In [22]:
hyperparams={"num_round":"42",
             "eval_metric": "auc",
             "objective": "binary:logistic"}

In [23]:
## Usando a função estimator para configurar o modelo

In [24]:
import sagemaker
s3_output_location="s3://{}/{}/output/".format(bucket,prefix)
xgb_model=sagemaker.estimator.Estimator(container,
                                       sagemaker.get_execution_role(),
                                       instance_count=1,
                                       instance_type='ml.m4.xlarge',
                                       output_path=s3_output_location,
                                        hyperparameters=hyperparams,
                                        sagemaker_session=sagemaker.Session())

In [27]:
## Canais para alimentar dados no modelo

In [26]:
train_channel = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/train/".format(bucket,prefix,train_file),
    content_type='text/csv')

validate_channel = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/validate/".format(bucket,prefix,validate_file),
    content_type='text/csv')

data_channels = {'train': train_channel, 'validation': validate_channel}

In [28]:
## Usando função fit para treinar o modelo

In [30]:
xgb_model.fit(inputs=data_channels, logs=False)


2021-06-21 19:44:49 Starting - Starting the training job..
2021-06-21 19:45:00 Starting - Launching requested ML instances.................
2021-06-21 19:46:30 Starting - Preparing the instances for training...............
2021-06-21 19:47:53 Downloading - Downloading input data..
2021-06-21 19:48:09 Training - Downloading the training image.......
2021-06-21 19:48:49 Training - Training image download completed. Training in progress.
2021-06-21 19:48:55 Uploading - Uploading generated training model
2021-06-21 19:48:58 Failed - Training job failed


UnexpectedStatusException: Error for Training job sagemaker-xgboost-2021-06-21-19-44-49-254: Failed. Reason: AlgorithmError: framework error: 
Traceback (most recent call last):
  File "/miniconda3/lib/python3.6/site-packages/sagemaker_xgboost_container/algorithm_mode/train.py", line 226, in train_job
    verbose_eval=False)
  File "/miniconda3/lib/python3.6/site-packages/xgboost/training.py", line 209, in train
    xgb_model=xgb_model, callbacks=callbacks)
  File "/miniconda3/lib/python3.6/site-packages/xgboost/training.py", line 74, in _train_internal
    bst.update(dtrain, i, obj)
  File "/miniconda3/lib/python3.6/site-packages/xgboost/core.py", line 1248, in update
    dtrain.handle))
  File "/miniconda3/lib/python3.6/site-packages/xgboost/core.py", line 189, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [19:48:53] /workspace/src/objective/regression_obj.cu:102: label must be in [0,1] for logistic regression
Stack trace:
  [bt] (0) /miniconda3/lib/python3.6/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x54) [0x7f90cf495614]