Licensed under the MIT License.

Copyright (c) 2021-2031. All rights reserved.

# Model Selection with AutoKeras

* It will try to find the optimal neural network architecture & hyperparameters for you.
* Tutorials: https://autokeras.com/tutorial/overview/

In [39]:
import autokeras
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, balanced_accuracy_score

### Regression
* AutoKeras `StructuredDataRegressor`: https://autokeras.com/structured_data_regressor/
* AutoKeras is too slow when the dataset has either a large number of features or a large number of records, so many features are removed here

In [57]:
# Load the preprocessed dataset from the pipeline output folder
# (presumably written by the Luigi pipeline -- confirm against that project).
# NOTE(review): unpickling can execute arbitrary code, so only read pickle
# files that come from a trusted source.
df = pd.read_pickle('../luigi_pipeline/output/preprocessed_data.pkl')
# Sanity check: (rows, columns) of the loaded frame.
print(df.shape)

# Preview the first rows as the cell's rich output.
df.head()

(693861, 22)


Unnamed: 0,Store,Date,Year,Month,Quarter,Customers_larger_than_3000,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,...,Promo2SinceWeek,Promo2SinceYear,PromoInterval,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,2015-07-31,2015,7,941,0.0,2,0,1270.0,9,...,-1,-1,0,5,5263,555,1,1,0,1
1,1,2015-07-30,2015,7,940,0.0,2,0,1270.0,9,...,-1,-1,0,4,5020,546,1,1,0,1
2,1,2015-07-29,2015,7,939,0.0,2,0,1270.0,9,...,-1,-1,0,3,4782,523,1,1,0,1
3,1,2015-07-28,2015,7,938,0.0,2,0,1270.0,9,...,-1,-1,0,2,5011,560,1,1,0,1
4,1,2015-07-27,2015,7,937,0.0,2,0,1270.0,9,...,-1,-1,0,1,6102,612,1,1,0,1


In [58]:
# Shrink the feature set so the AutoKeras search stays tractable:
# remove every category-typed column, but keep 'Year' because it is
# needed later to split the data into train and test periods.
categorical_columns = df.select_dtypes(include='category').columns
cat_cols = [name for name in categorical_columns if name != 'Year']
df.drop(columns=cat_cols, inplace=True)

df.head()

Unnamed: 0,Date,Year,Customers_larger_than_3000,CompetitionDistance,Sales,Customers
0,2015-07-31,2015,0.0,1270.0,5263,555
1,2015-07-30,2015,0.0,1270.0,5020,546
2,2015-07-29,2015,0.0,1270.0,4782,523
3,2015-07-28,2015,0.0,1270.0,5011,560
4,2015-07-27,2015,0.0,1270.0,6102,612


In [59]:
# Time-based hold-out: rows before 2015 are used for training,
# rows from 2015 for testing.
# 'Year' is compared as a string, so the '<' check relies on the
# lexicographic ordering of four-digit year labels.
year_labels = df['Year'].astype(str)
train_df = df.loc[year_labels < '2015']
test_df = df.loc[year_labels == '2015']

# 'Sales' is the regression target; 'Date' and 'Year' are excluded from the features.
non_feature_cols = ['Sales', 'Date', 'Year']

# Rebuild a clean 0..n-1 index on every piece so features and targets align by position.
X_train = train_df.drop(columns=non_feature_cols).reset_index(drop=True)
X_test = test_df.drop(columns=non_feature_cols).reset_index(drop=True)
y_train = train_df['Sales'].reset_index(drop=True)
y_test = test_df['Sales'].reset_index(drop=True)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
X_train.head()

(532529, 3) (161332, 3) (532529,) (161332,)


Unnamed: 0,Customers_larger_than_3000,CompetitionDistance,Customers
0,0.0,1270.0,327
1,0.0,1270.0,703
2,0.0,1270.0,700
3,0.0,1270.0,0
4,0.0,1270.0,684


In [60]:
# Architecture / hyperparameter search for the Sales regression.
# The budget is deliberately tiny (3 trials, 3 epochs each) because the
# training set is large and every trial is slow.
ak_regressor = autokeras.StructuredDataRegressor(
    max_trials=3,
    tuner='bayesian',
)
ak_regressor.fit(x=X_train, y=y_train, epochs=3)

Trial 3 Complete [00h 07m 41s]
val_loss: 7056720.0

Best val_loss So Far: 7056720.0
Total elapsed time: 00h 44m 37s
INFO:tensorflow:Oracle triggered exit
Epoch 1/3
Epoch 2/3
Epoch 3/3
INFO:tensorflow:Assets written to: .\structured_data_regressor\best_model\assets


<tensorflow.python.keras.callbacks.History at 0x23f56af1ac8>

In [61]:
# Score the searched model on the 2015 hold-out set with R^2.
y_pred_regression = ak_regressor.predict(X_test)
# predict() output is flattened to 1-D to match y_test.
# The builtin `float` replaces the `np.float` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24 (it was only ever an alias for `float`).
print(r2_score(y_test, y_pred_regression.flatten().astype(float)))

-0.2587499301929723


In [62]:
# Export the regressor's model via export_model() (per the AutoKeras docs this
# is the best model found by the search) and print its layer-by-layer summary.
keras_export_regression = ak_regressor.export_model()
keras_export_regression.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 3)]               0         
_________________________________________________________________
multi_category_encoding (Mul (None, 3)                 0         
_________________________________________________________________
normalization (Normalization (None, 3)                 7         
_________________________________________________________________
dense (Dense)                (None, 1024)              4096      
_________________________________________________________________
re_lu (ReLU)                 (None, 1024)              0         
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0     

### Classification
* AutoKeras `StructuredDataClassifier`: https://autokeras.com/structured_data_classifier/

In [10]:
# Leaf dataset: 'species' is the class label, every other column is a feature.
df30 = pd.read_csv('../../crystal_ball/data_collector/structured_data/leaf.csv')

X30 = df30.drop(columns='species')
y30 = df30['species']

# Stratified 80/20 split with a fixed seed so the class proportions are
# preserved in both partitions and the split is reproducible.
splits = train_test_split(
    X30,
    y30,
    test_size=0.2,
    random_state=10,
    shuffle=True,
    stratify=y30,
)

# Give every partition a fresh 0..n-1 index.
X_train30, X_test30, y_train30, y_test30 = (
    part.reset_index(drop=True) for part in splits
)

print(X_train30.shape, X_test30.shape, y_train30.shape, y_test30.shape)
print(y_train30.nunique(), y_test30.nunique())

(272, 15) (68, 15) (272,) (68,)
30 30


In [11]:
# Architecture / hyperparameter search for the species classifier.
# The dataset is small, so a larger budget is affordable here:
# up to 10 trials, each trained for up to 100 epochs.
ak_classifier = autokeras.StructuredDataClassifier(
    max_trials=10,
)
ak_classifier.fit(x=X_train30, y=y_train30, epochs=100)

Trial 10 Complete [00h 00m 39s]
val_accuracy: 0.75

Best val_accuracy So Far: 0.9375
Total elapsed time: 00h 06m 38s
INFO:tensorflow:Oracle triggered exit
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/

<tensorflow.python.keras.callbacks.History at 0x23f9dc72908>

In [21]:
# Score the searched classifier on the held-out split with balanced accuracy.
y_pred_classification = ak_classifier.predict(X_test30)
# predict() output is flattened to 1-D and cast to float before scoring.
# The builtin `float` replaces the `np.float` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24 (it was only ever an alias for `float`).
print(balanced_accuracy_score(y_test30, y_pred_classification.flatten().astype(float)))

0.8444444444444446


In [22]:
# Export the classifier's model via export_model() (per the AutoKeras docs this
# is the best model found by the search) and print its layer-by-layer summary.
keras_export_classification = ak_classifier.export_model()
keras_export_classification.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 15)]              0         
_________________________________________________________________
multi_category_encoding (Mul (None, 15)                0         
_________________________________________________________________
normalization (Normalization (None, 15)                31        
_________________________________________________________________
dense (Dense)                (None, 32)                512       
_________________________________________________________________
re_lu (ReLU)                 (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
re_lu_1 (ReLU)               (None, 32)                0     