In [1]:
import sys
# Show the interpreter version, platform and executable path in use.
print(sys.version, sys.platform, sys.executable)

3.7.6 (default, Aug  9 2020, 21:13:30) 
[Clang 11.0.3 (clang-1103.0.32.62)] darwin /Users/layne/.pyenv/versions/3.7.6/envs/jupyterlab/bin/python3.7


In [2]:
import os
# NOTE(review): hard-coded absolute path to one user's machine; this cell
# fails anywhere else. Consider a configurable project directory.
os.chdir('/Users/layne/Desktop/pydatasci')
# Display the resulting working directory.
os.getcwd()

'/Users/layne/Desktop/pydatasci'

In [3]:
import h5py
import pydatasci as pds
from pydatasci import aidb
import os, sqlite3, io, gzip 
import pandas as pd
import numpy as np
import pyarrow
from pyarrow import csv as pc

from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

import keras
from keras import metrics
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.constraints import maxnorm
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier

import matplotlib.pyplot as plt



In [4]:
from importlib import reload

# Delete the existing database, reload the aidb module, then create the
# database again (one statement per line instead of a `;` chain).
aidb.delete_db(True)
reload(aidb)
aidb.create_db()


=> Success - deleted database file at path:
/Users/layne/Library/Application Support/pydatasci/aidb.sqlite3


=> Success - created database file for machine learning metrics at path:
/Users/layne/Library/Application Support/pydatasci/aidb.sqlite3


=> Success - created the following tables within database:
['algorithm', 'batch', 'dataset', 'featureset', 'fold', 'foldset', 'hyperparamcombo', 'hyperparamset', 'job', 'label', 'preprocess', 'result', 'splitset']



---

In [5]:
# Create the Dataset from the iris TSV file via aidb.Dataset.from_file.
dataset = aidb.Dataset.from_file(
    path='data/iris.tsv',
    file_format='tsv',
    name='tab-separated plants',
    perform_gzip=True,
    dtype='float64',
)

In [6]:
# Column used as the label and excluded from the featureset.
label_name = "target"

In [7]:
# Make a label from the single column named by `label_name`.
label = dataset.make_label(columns=[label_name])

In [8]:
# Make a featureset from the dataset, excluding the label column.
featureset = dataset.make_featureset(exclude_columns=[label_name])

In [9]:
# Make a splitset tied to the label, with test size 0.20 and validation size 0.12.
splitset = featureset.make_splitset(
    label_id=label.id,
    size_test=0.20,
    size_validation=0.12,
)

In [10]:
#foldset = splitset.make_foldset(fold_count=6)

# aidb

### Define Preprocess (Optional)

TODO: Make sure the preprocessing packages are imported uniformly, e.g.:
`from sklearn.preprocessing import *`

In [11]:
# Encoder for the features; passed to aidb.Preprocess.from_splitset as `encoder_features`.
encoder_features = StandardScaler()

In [12]:
# Encoder for the labels; passed to aidb.Preprocess.from_splitset as `encoder_labels`.
# sparse=False asks for a dense array rather than a sparse matrix.
encoder_labels = OneHotEncoder(sparse=False)

In [13]:
#params_encode_labels = {"sparse": [False]}

### Define Algorithm

In [14]:
def function_model_build(**hyperparameters):
    """Build and compile a three-layer dense Keras classifier.

    The network takes 4 input values, has a fixed 9-unit first hidden layer
    and ends in a 3-unit softmax output layer.

    Keyword arguments read:
        l2_neuron_count: unit count of the second hidden layer.
        optimizer: optimizer handed to ``model.compile``.

    Returns:
        The compiled ``Sequential`` model.

    Raises:
        KeyError: if either keyword above is missing.
    """
    second_layer_units = hyperparameters['l2_neuron_count']
    chosen_optimizer = hyperparameters['optimizer']

    hidden_layer_options = dict(activation='relu', kernel_initializer='he_uniform')

    model = Sequential()
    # First hidden layer; it also fixes the input shape.
    model.add(Dense(9, input_shape=(4,), name='fc1', **hidden_layer_options))
    model.add(Dense(second_layer_units, name='fc2', **hidden_layer_options))
    model.add(Dense(3, activation='softmax', name='output'))

    model.compile(
        optimizer=chosen_optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [15]:
def function_model_train(model, samples_train, samples_evaluate, **hyperparameters):
    """Fit ``model`` on the training samples, validating on the evaluation samples.

    Args:
        model: object exposing a Keras-style ``fit`` method.
        samples_train: mapping with ``"features"`` and ``"labels"`` entries
            used for fitting.
        samples_evaluate: mapping with ``"features"`` and ``"labels"`` entries
            passed to ``fit`` as ``validation_data``.
        **hyperparameters: must contain ``epochs``; may contain ``batch_size``
            (defaults to 3, the value that used to be hard-coded, so existing
            hyperparameter sets behave exactly as before).

    Returns:
        The same ``model`` object after ``fit`` has been called on it.

    Raises:
        KeyError: if ``epochs`` is missing, or a samples mapping lacks a key.
    """
    validation_pair = (samples_evaluate["features"], samples_evaluate["labels"])
    model.fit(
        samples_train["features"],
        samples_train["labels"],
        validation_data=validation_pair,
        verbose=0,
        # Tunable through the hyperparameter grid; falls back to the old constant.
        batch_size=hyperparameters.get('batch_size', 3),
        epochs=hyperparameters['epochs'],
    )
    return model

In [16]:
def function_model_evaluate(model, samples_evaluate, **hyperparameters):
    """Evaluate ``model`` on the evaluation samples and return the raw result.

    Args:
        model: object exposing a Keras-style ``evaluate`` method.
        samples_evaluate: mapping with ``"features"`` and ``"labels"`` entries.
        **hyperparameters: accepted for a uniform signature; not read here.

    Returns:
        Whatever ``model.evaluate(features, labels, verbose=0)`` returns.
        A previously disabled debug print treated index 0 as the loss and
        index 1 as the accuracy.
    """
    features = samples_evaluate["features"]
    labels = samples_evaluate["labels"]
    return model.evaluate(features, labels, verbose=0)

In [17]:
# hyperparameters = {
#     "l1_neuron_count": [9, 18]
#     , "l2_neuron_count": [9, 18]
#     , "optimizer": ["adamax", "adam"]
#     , "epochs": [30, 60, 90]
#     , "batch_size": [3, 5]
# }

In [18]:
# Grid of values to combine: 2 layer sizes x 2 optimizers x 2 epoch counts.
hyperparameters = dict(
    l2_neuron_count=[9, 18],
    optimizer=["adamax", "adam"],
    epochs=[10, 20],
)

### Stage the Experiment and Parameters

In [19]:
# Create the Algorithm from the three model functions defined above.
algorithm = aidb.Algorithm.create(
    description="dense, 2 layers, medium height",
    function_model_build=function_model_build,
    function_model_train=function_model_train,
    function_model_evaluate=function_model_evaluate,
)

In [20]:
# Attach the feature and label encoders to the splitset.
preprocess = aidb.Preprocess.from_splitset(
    splitset_id=splitset.id,
    description="standard scaling on features",
    encoder_features=encoder_features,
    encoder_labels=encoder_labels,
)

In [21]:
# Create the Hyperparamset for the algorithm and preprocess from the grid above.
hyperparamset = aidb.Hyperparamset.from_algorithm(
    algorithm_id=algorithm.id,
    preprocess_id=preprocess.id,
    description="experimenting with number of epochs",
    hyperparameters=hyperparameters,
)

In [22]:
# Display hyperparamcombo_count (8 in the recorded output, matching the
# 2 x 2 x 2 values listed in `hyperparameters`).
hyperparamset.hyperparamcombo_count

8

In [23]:
# Create the Batch for the algorithm, splitset and hyperparamset; no foldset is used.
batch = aidb.Batch.from_algorithm(
    algorithm_id=algorithm.id,
    splitset_id=splitset.id,
    hyperparamset_id=hyperparamset.id,
    foldset_id=None,  # would be foldset.id if the foldset cell were enabled
    only_folded_training=False,
)

In [24]:
# Display batch.job_count (8 in the recorded output).
batch.job_count

8

In [25]:
# Run the batch's jobs with verbose output turned off.
batch.run_jobs(verbose=False)

🔮 Training Models 🔮: 100%|██████████████████████████████████████████| 8/8 [00:10<00:00,  1.29s/it]


In [26]:
# Display the status of each job in the batch.
# NOTE(review): the recorded output lists all 8 jobs as 'Queued' even though
# the progress bar above reports 8/8 — confirm statuses are updated after a run.
batch.get_statuses()

{1: 'Queued',
 2: 'Queued',
 3: 'Queued',
 4: 'Queued',
 5: 'Queued',
 6: 'Queued',
 7: 'Queued',
 8: 'Queued'}

In [27]:
#batch.stop_jobs()

In [33]:
# Fetch the model from the first result of the job at index 4 of batch.jobs.
batch.jobs[4].results[0].get_model()

<tensorflow.python.keras.engine.sequential.Sequential at 0x14f50b590>

---

---

---

### Data to use

In [11]:
# Export the splitset; the result is indexed below as samples[split]["features" | "labels"].
samples = splitset.to_numpy()

In [12]:
# Pull the feature arrays for each of the three splits.
train_features, validation_features, test_features = (
    samples[split]["features"] for split in ("train", "validation", "test")
)

In [13]:
# Pull the label arrays for each of the three splits.
train_labels, validation_labels, test_labels = (
    samples[split]["labels"] for split in ("train", "validation", "test")
)

https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing

---

In [14]:
# Scratch check: a list that starts with a None placeholder followed by the stand-in fold names.
fset = ['a', 'b', 'c']
folds = [None] + fset

TODO: Jobs need a `Job.from_hyperparamset()` method.

---

^ Alternatively, pass in a dictionary of values for that keyword.

^ Or just pass the params directly into the encoder function if you don't want to hypertune on them.

In [None]:
# Draft (never executed): apply keyword parameters to the encoder.
# NOTE(review): neither `encoder` nor `encode_features_kwargs` is assigned above
# this cell in the visible notebook, and the kwargs name says "features" while
# the result is stored as `encoder_labels` — confirm which is intended.
encoder_labels = encoder.set_params(**encode_features_kwargs)

In [None]:
# Fit the label encoder on the training labels only and keep what `fit` returns.
encoder_labels_trained = encoder_labels.fit(train_labels)

In [None]:
# Apply the fitted label encoder to each split's labels, in train/validation/test order.
train_labels, validation_labels, test_labels = (
    encoder_labels_trained.transform(split_labels)
    for split_labels in (train_labels, validation_labels, test_labels)
)

Create the preprocess object.

In [None]:
# Draft (never executed): create a Preprocess that carries the label encoder.
# NOTE(review): the executed cell earlier in this notebook uses
# aidb.Preprocess.from_splitset(..., encoder_features=..., encoder_labels=...);
# confirm that `create(encode_labels_function=...)` is still a valid API.
preprocess = aidb.Preprocess.create(
    encode_labels_function = encoder
)

In [None]:
# Draft (never executed): configure the encoder stored on the preprocess object.
# NOTE(review): the names disagree — the variable says "feature", the attribute
# says "labels", the kwargs say "features" — and `preprocess_features_kwargs`
# is not assigned in the visible notebook. Confirm the intended target.
feature_encoder = preprocess.preprocess_labels_function.set_params(**preprocess_features_kwargs)

In [None]:
# Fit the configured encoder on the training labels and keep what `fit` returns.
encoder_labels_trained = feature_encoder.fit(train_labels)

In [None]:
# Run each split's labels through the fitted encoder, in train/validation/test order.
train_labels, validation_labels, test_labels = map(
    encoder_labels_trained.transform,
    (train_labels, validation_labels, test_labels),
)

^^^ Now do the same, but with kwargs stored in the hyperparamset attribute.

---

---

# Normal

ToDo - Pandas labels probably want to be a Series too, not a full DataFrame? Maybe add an `.as_series()` method?

In [10]:
# Export the splitset; indexed below as samples[split]["features" | "labels"].
samples = splitset.to_numpy()

In [11]:
# Feature arrays for the train and test splits.
train_features, test_features = (
    samples[split]["features"] for split in ("train", "test")
)

In [12]:
# Label arrays for the train and test splits.
train_labels, test_labels = (
    samples[split]["labels"] for split in ("train", "test")
)

In [13]:
# One-hot encode the labels. The encoder is fitted on the training labels only
# and then reused for the test labels, so both splits share one
# category-to-column mapping. Re-fitting on the test split (as this cell did
# before) can yield a different column order or count when the splits do not
# contain the same categories.
encoder = OneHotEncoder(sparse=False)
train_labels = encoder.fit_transform(train_labels)
# With default settings, a test category never seen in training raises here
# instead of being silently re-mapped.
test_labels = encoder.transform(test_labels)

In [14]:
# Single-hidden-layer classifier: 4 inputs -> 13 relu units -> 3-way softmax.
model = Sequential([
    Dense(13, input_shape=(4,), activation='relu', kernel_initializer='he_uniform', name='fc1'),
    Dense(3, activation='softmax', name='output'),
])
model.compile(
    optimizer='adamax',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [15]:
# `summary()` prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line after the table.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
fc1 (Dense)                  (None, 13)                65        
_________________________________________________________________
output (Dense)               (None, 3)                 42        
Total params: 107
Trainable params: 107
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
# Train, keeping the History object that `fit` returns: the plotting cell
# further down reads `history.history[...]`, which needs this name to exist.
history = model.fit(
    train_features,
    train_labels,
    validation_data=(test_features, test_labels),
    verbose=0,
    batch_size=6,
    epochs=60,
)

<tensorflow.python.keras.callbacks.History at 0x147f834d0>

In [17]:
# In-memory binary buffer that the model is saved into below.
buffer_h5 = io.BytesIO()

In [18]:
# Serialise the model, optimizer state included, into the in-memory buffer as HDF5.
model.save(buffer_h5, include_optimizer=True, save_format='h5')

In [21]:
# Raw contents of the buffer. Stored as `model_bytes` rather than `bytes` so
# the builtin `bytes` type is not shadowed for the rest of the kernel session.
model_bytes = buffer_h5.getvalue()

In [19]:
# Open the in-memory buffer as a read-only HDF5 file handle.
h5_file = h5py.File(buffer_h5, mode='r')

In [20]:
# Load the model back from the open HDF5 handle, compiling it as well;
# the cell displays the loaded model object.
load_model(h5_file, compile=True)

<tensorflow.python.keras.engine.sequential.Sequential at 0x148673cd0>

In [83]:
# Check whether the buffer contents are recognised as HDF5 (recorded result: False).
# NOTE(review): `buffer` is not assigned anywhere in the visible notebook (the
# buffer above is `buffer_h5`), and h5py.is_hdf5 appears to expect a file path
# rather than raw bytes, which would explain the False — confirm.
h5py.is_hdf5(
    buffer.getvalue()
)

False

In [None]:
# Display the model's `history` attribute.
model.history

In [None]:
# Display the model's `weights` attribute.
model.weights

In [None]:
# Display the concrete class of the model object.
type(model)

In [None]:
# List the attribute names available on the model (inspection only, no training).
dir(model)

In [None]:
# Plot loss and accuracy per epoch for the most recent `model.fit` call.
# The curves are read from `model.history` (the History object Keras stores on
# the model during `fit`), so this cell does not depend on another cell having
# captured the return value of `fit` in a separate variable.
training_curves = model.history.history

fig, (ax_loss, ax_accuracy) = plt.subplots(2, 1)

ax_loss.set_title('Loss')
ax_loss.plot(training_curves['loss'], label='train')
# The validation data passed to `fit` was the test split, hence the label.
ax_loss.plot(training_curves['val_loss'], label='test')
ax_loss.legend()

ax_accuracy.set_title('Accuracy')
ax_accuracy.plot(training_curves['accuracy'], label='train')
ax_accuracy.plot(training_curves['val_accuracy'], label='test')
ax_accuracy.set_xlabel('epoch')
ax_accuracy.legend()

# Same enlargement as before: stretch the subplot area beyond the default figure box.
fig.subplots_adjust(top=2, right=2)

plt.show()

---