https://autokeras.com/tutorial/structured_data_regression/

In [15]:
import numpy as np
import pandas as pd
import tensorflow as tf
import autokeras as ak

In [16]:
# Read the California housing table; the "ID" column becomes the row index.
df = pd.read_csv(filepath_or_buffer="data/california_housing.csv", index_col="ID")
# Show the first five rows as a quick look at the columns.
df.head(n=5)

Unnamed: 0_level_0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [17]:
# Write the first 90% of the rows to a training CSV and the remaining rows to
# an evaluation CSV. Rows are split by position, without shuffling.
prefix = "data/california_housing_"
train_file_path = f"{prefix}train.csv"
test_file_path = f"{prefix}eval.csv"

train_size = int(len(df) * 0.9)
train_rows = df.iloc[:train_size]
eval_rows = df.iloc[train_size:]

# index=False: the frame's index is not written to the CSV files.
train_rows.to_csv(train_file_path, index=False)
eval_rows.to_csv(test_file_path, index=False)

In [18]:
# Step two: run the StructuredDataRegressor, reading the training and
# evaluation data directly from the CSV files written earlier.

# Build the regressor; max_trials=3 means it tries 3 different models.
reg = ak.StructuredDataRegressor(max_trials=3, overwrite=True)

# Name of the label column in both CSV files.
label_column = "Price"

# Train from the path of the training CSV plus the label column name.
reg.fit(train_file_path, label_column, epochs=10)

# Predict with the best model
predicted_y = reg.predict(test_file_path)

# Evaluate the best model on the evaluation CSV and print the result.
test_metrics = reg.evaluate(test_file_path, label_column)
print(test_metrics)

Trial 3 Complete [00h 00m 05s]
val_loss: 1.0086475610733032

Best val_loss So Far: 0.90126633644104
Total elapsed time: 00h 00m 16s
INFO:tensorflow:Oracle triggered exit
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: ./structured_data_regressor/best_model/assets
[0.5895569920539856, 0.5895569920539856]


In [19]:
# How the data can be prepared with numpy.ndarray, pandas.DataFrame, and
# tensorflow.data.Dataset. This cell walks through the pandas and NumPy forms.

# Features as a pandas.DataFrame, labels as a pandas.Series.
x_train = pd.read_csv(train_file_path)
print(type(x_train))  # pandas.DataFrame
y_train = x_train.pop("Price")  # pop() also removes "Price" from x_train
print(type(y_train))  # pandas.Series

# The labels can also be given as a pandas.DataFrame.
y_train = y_train.to_frame()
print(type(y_train))  # pandas.DataFrame

# numpy.ndarray works as well, for both features and labels.
x_train, y_train = x_train.to_numpy(), y_train.to_numpy()
for converted in (x_train, y_train):
    print(type(converted))  # numpy.ndarray

# Testing data: a feature DataFrame and a label Series.
x_test = pd.read_csv(test_file_path)
y_test = x_test.pop("Price")

# It tries 3 different models.
reg = ak.StructuredDataRegressor(overwrite=True, max_trials=3)

# Train on the in-memory training data.
reg.fit(x_train, y_train, epochs=10)

# Predict with the best model
predicted_y = reg.predict(x_test)

# Evaluate the best model on the testing data and print the result.
print(reg.evaluate(x_test, y_test))

Trial 3 Complete [00h 00m 06s]
val_loss: 0.9246858358383179

Best val_loss So Far: 0.8997812867164612
Total elapsed time: 00h 00m 19s
INFO:tensorflow:Oracle triggered exit
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: ./structured_data_regressor/best_model/assets
[0.603534460067749, 0.603534460067749]


In [20]:
# How to wrap the (features, labels) pairs in tf.data.Dataset objects and
# train, predict and evaluate from those.
to_dataset = tf.data.Dataset.from_tensor_slices
train_set = to_dataset((x_train, y_train))
test_set = to_dataset((x_test, y_test))

reg = ak.StructuredDataRegressor(overwrite=True, max_trials=3)

# Each Dataset was built from an (x, y) pair, so no separate label argument
# is passed to fit() or evaluate().
reg.fit(train_set, epochs=10)

# Predict with the best model
predicted_y = reg.predict(test_set)

# Evaluate the best model on the testing data and print the result.
print(reg.evaluate(test_set))

Trial 3 Complete [00h 00m 05s]
val_loss: 1.0187060832977295

Best val_loss So Far: 0.8860920667648315
Total elapsed time: 00h 00m 17s
INFO:tensorflow:Oracle triggered exit
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: ./structured_data_regressor/best_model/assets
[0.6516181826591492, 0.6516181826591492]


In [21]:
# Column names and column types can also be given explicitly.

# Names of the eight feature columns, in order.
feature_columns = [
    "MedInc",
    "HouseAge",
    "AveRooms",
    "AveBedrms",
    "Population",
    "AveOccup",
    "Latitude",
    "Longitude",
]
# Types are declared for two of the columns only.
declared_types = {"MedInc": "numerical", "Latitude": "numerical"}

# Initialize the structured data regressor
reg = ak.StructuredDataRegressor(
    column_names=feature_columns,
    column_types=declared_types,
    max_trials=10,  # It tries 10 different models
    overwrite=True,
)
reg

<autokeras.tasks.structured_data.StructuredDataRegressor at 0x7f96785f3cd0>

In [22]:
# By default, AutoKeras uses the last 20% of the training data as validation
# data. Here the training data is split so that the last 15% is used instead.
fit_options = {"validation_split": 0.15, "epochs": 10}
reg.fit(x_train, y_train, **fit_options)

Trial 10 Complete [00h 00m 08s]
val_loss: 0.6923542618751526

Best val_loss So Far: 0.6894374489784241
Total elapsed time: 00h 01m 11s
INFO:tensorflow:Oracle triggered exit
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: ./structured_data_regressor/best_model/assets


In [23]:
# You can also use your own validation set instead of splitting it from the
# training data.
#
# The split is stored under new names rather than written back into
# x_train / y_train. Reassigning those here would shrink them each time the
# cell runs, and a second run would produce an empty validation set.
split = 500  # number of leading rows used for training
x_fit, y_fit = x_train[:split], y_train[:split]
x_val, y_val = x_train[split:], y_train[split:]
reg.fit(
    x_fit,
    y_fit,
    # Use your own validation set
    validation_data=(x_val, y_val),
    epochs=10,
)

In [25]:
# The best model found by AutoKeras can be exported as a Keras Model.
model = reg.export_model()
model.summary()

# NumPy arrays of object (mixed-type) dtype are not supported;
# convert them to unicode or float first.
predictions = model.predict(x_train)
# Show the predictions for the first five rows.
predictions[:5]

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 8)]               0         
_________________________________________________________________
multi_category_encoding (Mul (None, 8)                 0         
_________________________________________________________________
normalization (Normalization (None, 8)                 17        
_________________________________________________________________
dense (Dense)                (None, 32)                288       
_________________________________________________________________
re_lu (ReLU)                 (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               8448      
_________________________________________________________________
re_lu_1 (ReLU)               (None, 256)               0     

array([[5.0555844],
       [5.4558573],
       [4.6001453],
       [3.627623 ],
       [2.8417006]], dtype=float32)