https://autokeras.com/tutorial/structured_data_classification/

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import autokeras as ak

In [2]:
# Titanic CSV files, given as paths relative to the working directory.
# Both files carry the "survived" label column used by the cells below.
test_file_path = "data/titanic_eval.csv"
train_file_path = "data/titanic_train.csv"

In [3]:
# Step two: search for a model with StructuredDataClassifier.

# Build the classifier; max_trials=20 lets the search try up to 20 models.
clf = ak.StructuredDataClassifier(max_trials=20, overwrite=True)

# Train straight from the CSV file, naming the label column.
# `epochs` is intentionally left unspecified so that AutoKeras chooses an
# adaptive number of epochs (pass e.g. epochs=10 to fix it).
clf.fit(x=train_file_path, y="survived")

# Run inference on the evaluation file with the best model found.
predicted_y = clf.predict(x=test_file_path)

# Score the best model on the evaluation file and print the result.
print(clf.evaluate(x=test_file_path, y="survived"))

Trial 20 Complete [00h 00m 03s]
val_accuracy: 0.8782608509063721

Best val_accuracy So Far: 0.895652174949646
Total elapsed time: 00h 04m 01s
INFO:tensorflow:Oracle triggered exit
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
INFO:tensorflow:Assets written to: ./structured_data_classifier/best_model/assets
[0.5085175037384033, 0.7689393758773804]


In [4]:
# Besides CSV paths, the data can be supplied as pandas.DataFrame /
# pandas.Series or as numpy.ndarray; this cell walks through each form.

# Start from pandas: the features are a DataFrame ...
x_train = pd.read_csv(train_file_path)
print(type(x_train))  # pandas.DataFrame
# ... and popping the label column out of it leaves the labels as a Series.
y_train = x_train.pop("survived")
print(type(y_train))  # pandas.Series

# A one-column pandas.DataFrame works for y_train as well.
y_train = y_train.to_frame()
print(type(y_train))  # pandas.DataFrame

# numpy.ndarray is accepted too, for both x_train and y_train.
x_train, y_train = x_train.to_numpy(), y_train.to_numpy()
print(type(x_train))  # numpy.ndarray
print(type(y_train))  # numpy.ndarray

# Testing data stays in pandas form (DataFrame features, Series labels).
x_test = pd.read_csv(test_file_path)
y_test = x_test.pop("survived")

# max_trials=3: the search tries 3 different models.
clf = ak.StructuredDataClassifier(max_trials=3, overwrite=True)

# Train on the in-memory arrays.
clf.fit(x=x_train, y=y_train, epochs=10)

# Run inference with the best model found.
predicted_y = clf.predict(x=x_test)

# Score the best model on the testing data and print the result.
print(clf.evaluate(x=x_test, y=y_test))

Trial 3 Complete [00h 00m 01s]
val_accuracy: 0.852173924446106

Best val_accuracy So Far: 0.886956512928009
Total elapsed time: 00h 00m 04s
INFO:tensorflow:Oracle triggered exit
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: ./structured_data_classifier/best_model/assets
[0.4502401351928711, 0.7840909361839294]


In [5]:
# Converting numpy.ndarray data to tf.data.Dataset.
#
# The feature arrays are cast to the builtin `str` dtype before being wrapped.
# `str` replaces `np.unicode`, which was only a deprecated alias of `str`
# (deprecated in NumPy 1.20, removed in 1.24), so the resulting arrays are
# the same while the cell keeps working on current NumPy releases.
train_set = tf.data.Dataset.from_tensor_slices((x_train.astype(str), y_train))
test_set = tf.data.Dataset.from_tensor_slices(
    (x_test.to_numpy().astype(str), y_test)
)

# max_trials=3: the search tries 3 different models.
clf = ak.StructuredDataClassifier(overwrite=True, max_trials=3)

# Feed the tensorflow Dataset to the classifier. Each dataset element is a
# (features, label) pair, so no separate label argument is passed.
clf.fit(train_set, epochs=10)

# Predict with the best model
predicted_y = clf.predict(test_set)

# Evaluate the best model with testing data
print(clf.evaluate(test_set))

Trial 3 Complete [00h 00m 01s]
val_accuracy: 0.8608695864677429

Best val_accuracy So Far: 0.8608695864677429
Total elapsed time: 00h 00m 05s
INFO:tensorflow:Oracle triggered exit
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: ./structured_data_classifier/best_model/assets
[0.4587315022945404, 0.7954545617103577]


In [6]:
# Column names and column types can be given explicitly.

# Names of the feature columns, in order.
feature_columns = [
    "sex",
    "age",
    "n_siblings_spouses",
    "parch",
    "fare",
    "class",
    "deck",
    "embark_town",
    "alone",
]

# Build the classifier. Only "sex" and "fare" are given an explicit type
# here; max_trials=10 lets the search try up to 10 different models.
clf = ak.StructuredDataClassifier(
    column_names=feature_columns,
    column_types={"sex": "categorical", "fare": "numerical"},
    max_trials=10,
    overwrite=True,
)
clf

<autokeras.tasks.structured_data.StructuredDataClassifier at 0x7f8c147a4c70>

In [7]:
# By default, AutoKeras uses the last 20% of the training data as validation
# data. Passing validation_split=0.15 makes it split off the last 15% instead.
clf.fit(x_train, y_train, epochs=10, validation_split=0.15)

Trial 10 Complete [00h 00m 01s]
val_accuracy: 0.8795180916786194

Best val_accuracy So Far: 0.891566276550293
Total elapsed time: 00h 00m 13s
INFO:tensorflow:Oracle triggered exit
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: ./structured_data_classifier/best_model/assets


In [8]:
# You can also use your own validation set instead of splitting it from the
# training data.

# Index at which the data is cut: rows [0, split) are used for training and
# rows [split, end) for validation.
split = 500

# Slice into NEW names instead of rebinding x_train / y_train. Rebinding made
# this cell non-idempotent: on a second run x_train already held only `split`
# rows, so x_val / y_val came out empty, and later cells silently saw the
# truncated arrays.
x_train_sub, y_train_sub = x_train[:split], y_train[:split]
x_val, y_val = x_train[split:], y_train[split:]

clf.fit(
    x_train_sub,
    y_train_sub,
    # Use your own validation set
    validation_data=(x_val, y_val),
    epochs=10,
)

In [9]:
# Export the best model found by AutoKeras as a Keras Model.

model = clf.export_model()
model.summary()
print(x_train.dtype)

# A numpy array of object (mixed type) dtype is not supported by the exported
# model, so convert it to strings first. The builtin `str` is used instead of
# `np.unicode`: the latter was just a deprecated alias of `str` (deprecated in
# NumPy 1.20, removed in 1.24) and raises AttributeError on current NumPy.
model.predict(x_train.astype(str))[:5]

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 9)]               0         
_________________________________________________________________
multi_category_encoding (Mul (None, 9)                 0         
_________________________________________________________________
normalization (Normalization (None, 9)                 19        
_________________________________________________________________
dense (Dense)                (None, 32)                320       
_________________________________________________________________
re_lu (ReLU)                 (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               8448      
_________________________________________________________________
re_lu_1 (ReLU)               (None, 256)               0     

array([[0.12436253],
       [0.94959366],
       [0.51434654],
       [0.91824704],
       [0.10626236]], dtype=float32)