Clasificación del dataset del iris usando boosted trees
===

* *30 min* | Última modificación: Abril 6, 2020.

## Importación de librerías

In [1]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
import tensorflow as tf

%matplotlib inline


print(tf.__version__)

#
# Establece el nivel de reporte en
# pantalla de TensorFlow
#
import logging

logger = tf.get_logger().setLevel(logging.ERROR)

  import pandas.util.testing as tm


2.4.1


## Carga y configuración del dataset

In [2]:
col_names = ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth", "Species"]

target_dimensions = ["Setosa", "Versicolor", "Virginica"]

training_data_path = tf.keras.utils.get_file(
    "iris_ training.csv",
    "https://storage.googleapis.com/download.tensorflow.org/data/iris_training.csv",
)

test_data_path = tf.keras.utils.get_file(
    "iris_test.csv",
    "https://storage.googleapis.com/download.tensorflow.org/data/iris_test.csv",
)

training = pd.read_csv(training_data_path, names=col_names, header=0)
training = training[training["Species"] >= 1]
training["Species"] = training["Species"].replace([1, 2], [0, 1])
training.reset_index(drop=True, inplace=True)

test = pd.read_csv(test_data_path, names=col_names, header=0)
test = test[test["Species"] >= 1]
test["Species"] = test["Species"].replace([1, 2], [0, 1])
test.reset_index(drop=True, inplace=True)

df = pd.concat([training, test], axis=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 21
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SepalLength  100 non-null    float64
 1   SepalWidth   100 non-null    float64
 2   PetalLength  100 non-null    float64
 3   PetalWidth   100 non-null    float64
 4   Species      100 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 4.7 KB


## Conjuntos de entrenamiento y validación

In [3]:
from sklearn.model_selection import train_test_split

#
#  Partición de los datos. La función retorno un objeto
#  pandas.DataFrame para X y un objeto pandas.Series para
#  y
#
X_train, X_test, y_train, y_test = train_test_split(
    df[[c for c in df.columns if c != "Species"]], df["Species"], test_size=0.2
)

## Escalamiento de los datos

In [4]:
from sklearn import preprocessing

#
#  El preprocesador retorna una numpy.ndarray y debe
#  transformarse nuevamente en un pandas.DataFrame
#
scaler = preprocessing.StandardScaler().fit(X_train)

X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns).astype(
    np.float32
)

X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns).astype(
    np.float32
)

## Especificación de los parámetros del estimador

In [5]:
def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32):
    def input_function():
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        if shuffle:
            ds = ds.shuffle(1000)
        ds = ds.batch(batch_size).repeat(num_epochs)
        return ds

    return input_function


train_input_fn = make_input_fn(X_train, y_train, num_epochs=1, shuffle=False)

test_input_fn = make_input_fn(X_test, y_test, num_epochs=1, shuffle=False)

In [6]:
#
#  Crea una lista con los tipos de las columnas
#  del dataframe de entrada
#
feature_columns = [tf.feature_column.numeric_column(m) for m in X_train.columns]

In [7]:
btree_model = tf.estimator.BoostedTreesClassifier(
    feature_columns=feature_columns, n_batches_per_layer=1
)
btree_model.train(train_input_fn)

<tensorflow_estimator.python.estimator.canned.boosted_trees.BoostedTreesClassifier at 0x7f77ac596390>

## Pronósticos

In [8]:
train_predictions = btree_model.predict(train_input_fn)
train_predictions = pd.Series(
    [p["classes"][0].decode("utf-8") for p in train_predictions]
)

test_predictions = btree_model.predict(test_input_fn)
test_predictions = pd.Series(
    [p["classes"][0].decode("utf-8") for p in test_predictions]
)

## Métricas de error

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score


def calculate_errors_and_r2(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred.astype("int64"))
    precision = precision_score(y_true, y_pred.astype("int64"))
    recall = recall_score(y_true, y_pred.astype("int64"))
    return accuracy, precision, recall


(
    train_accuracy_score,
    train_precision_score,
    train_recall_score,
) = calculate_errors_and_r2(y_train, train_predictions)

(
    test_accuracy_score,
    test_precision_score,
    test_recall_score,
) = calculate_errors_and_r2(y_test, test_predictions)

print("Training Data Accuracy = ", train_accuracy_score)
print("Training Data Score = ", train_precision_score)
print("Training Data Recall = ", train_recall_score)
print()
print("Testing Data Accuracy = ", test_accuracy_score)
print("Testing Data Score = ", test_precision_score)
print("Testing Data Recall = ", test_recall_score)

Training Data Accuracy =  0.9375
Training Data Score =  0.9743589743589743
Training Data Recall =  0.9047619047619048

Testing Data Accuracy =  0.9
Testing Data Score =  0.8
Testing Data Recall =  1.0
