# Setup

#### Dependencies

In [1]:
pip uninstall pydatasci -y

Uninstalling pydatasci-0.0.56:
  Successfully uninstalled pydatasci-0.0.56
Note: you may need to restart the kernel to use updated packages.


In [2]:
%%capture
pip install --upgrade pydatasci

In [3]:
#! jupyter labextension install jupyterlab-plotly

In [4]:
%%capture
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import *

from keras.metrics import *
from keras.models import Sequential
from keras.callbacks import History
from keras.layers import Dense, Dropout

#### Create the database.

In [5]:
import pydatasci as pds

# Ensure the application folder and config file exist; the recorded output
# shows both steps are skipped when they are already present.
pds.create_folder()
pds.create_config()

from importlib import reload
from pydatasci import aidb

# Start from a clean slate: delete the existing database file, reload the
# module, then create a fresh database and its tables.
# (The positional `True` passed to delete_db is presumably a confirmation
# flag - TODO confirm against the pydatasci docs.)
aidb.delete_db(True)
reload(aidb)
aidb.create_db()


=> Success - the following file path already exists on your system:
/Users/layne/Library/Application Support/pydatasci/


=> Info - skipping folder creation as folder already exists at file path:
/Users/layne/Library/Application Support/pydatasci/


=> Success - the following file path already exists on your system:
/Users/layne/Library/Application Support/pydatasci/


=> Info - skipping as config file already exists at path:
/Users/layne/Library/Application Support/pydatasci/config.json


=> Success - deleted database file at path:
/Users/layne/Library/Application Support/pydatasci/aidb.sqlite3


=> Success - created database file for machine learning metrics at path:
/Users/layne/Library/Application Support/pydatasci/aidb.sqlite3


=> Success - created the following tables within database:
['algorithm', 'batch', 'dataset', 'featureset', 'fold', 'foldset', 'hyperparamcombo', 'hyperparamset', 'job', 'label', 'preprocess', 'result', 'splitset']



---

# Data

#### Ingest file, dataframe, or array.

In [6]:
import os

# Resolve the Desktop folder under the current user's home directory instead
# of hard-coding one machine's absolute path. The relative data path used
# when reading the TSV file is resolved against this working directory.
os.chdir(os.path.join(os.path.expanduser('~'), 'Desktop'))

In [8]:
# Read the tab-separated sample data (path is relative to the working
# directory) and preview the first rows.
iris_path = 'pydatasci/data/double_iris.tsv'
df = pd.read_csv(iris_path, sep='\t')
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [9]:
# Register the DataFrame as a pydatasci Dataset, declared as TSV with
# gzip requested via perform_gzip.
dataset = aidb.Dataset.from_pandas(
    dataframe=df,
    file_format='tsv',
    name='tab-separated plants',
    perform_gzip=True,
)

#### Select features and labels.

In [10]:
# The "target" column is the label; the featureset is built by excluding
# that same column from the dataset.
label_name = "target"
label = dataset.make_label(columns=[label_name])
featureset = dataset.make_featureset(exclude_columns=[label_name])

#### Assign sample IDs to training, validation, and test splits.

In [11]:
# Create the splitset for this featureset/label pair.
# size_test / size_validation are presumably fractions of the samples
# (0.20 and 0.12) - TODO confirm against the pydatasci docs.
splitset = featureset.make_splitset(
    label_id=label.id,
    size_test=0.20,
    size_validation=0.12,
)

In [12]:
# NOTE(review): this cell repeats the preceding cell's make_splitset call
# with the same arguments and rebinds `splitset` to the new result.
# Presumably an accidental duplicate that leaves an extra splitset record
# behind - confirm and remove if it is not intentional.
splitset = featureset.make_splitset(
	label_id = label.id
	, size_test = 0.20
	, size_validation = 0.12
)

In [13]:
# Build a foldset with fold_count=3 from the splitset (presumably used for
# cross-validation - TODO confirm against the pydatasci docs).
foldset = splitset.make_foldset(fold_count=3)

In [14]:
# Encoder for the feature columns: scikit-learn's StandardScaler with
# default settings. It is handed to aidb.Preprocess.from_splitset later.
encoder_features = StandardScaler()

In [15]:
# Encoder for the label column: one-hot encoding with sparse=False so the
# encoded labels come back as a dense array rather than a sparse matrix.
# NOTE(review): newer scikit-learn releases rename this argument to
# `sparse_output` - confirm against the installed version before upgrading.
encoder_labels = OneHotEncoder(sparse=False)

In [16]:
# Attach the feature and label encoders to the splitset as a Preprocess
# record.
preprocess = aidb.Preprocess.from_splitset(
    splitset_id=splitset.id,
    description="Scale features and OHE labels.",
    encoder_features=encoder_features,
    encoder_labels=encoder_labels,
)

---

# Algorithm

### Define model to be trained.

In [17]:
# Candidate values per hyperparameter. `l2_neuron_count` and `optimizer` are
# read by function_model_build; `epochs` is read by function_model_train.
hyperparameters = dict(
    l2_neuron_count=[13, 9],
    optimizer=["adamax"],
    epochs=[60, 30],
)

In [18]:
def function_model_build(**hyperparameters):
    """Build and compile the Keras classifier for one hyperparameter combination.

    The network is a Sequential stack of three Dense layers: a 13-unit ReLU
    layer taking 4 input features, a ReLU layer whose width comes from
    ``hyperparameters['l2_neuron_count']``, and a 3-unit softmax layer named
    ``'output'``.

    Args:
        **hyperparameters: must contain ``'l2_neuron_count'`` (units in the
            second layer) and ``'optimizer'`` (passed to ``compile``).

    Returns:
        The compiled model, using categorical cross-entropy loss and tracking
        accuracy.

    Raises:
        KeyError: if a required hyperparameter is missing.
    """
    layers = [
        Dense(13, input_shape=(4,), activation='relu', kernel_initializer='he_uniform'),
        Dense(
            hyperparameters['l2_neuron_count'],
            activation='relu',
            kernel_initializer='he_uniform',
        ),
        Dense(3, activation='softmax', name='output'),
    ]
    network = Sequential(layers)

    network.compile(
        optimizer=hyperparameters['optimizer'],
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [19]:
def function_model_train(model, samples_train, samples_evaluate, **hyperparameters):
    """Fit `model` on the training samples, validating on the evaluation samples.

    Args:
        model: object exposing a Keras-style ``fit`` method.
        samples_train: mapping with ``"features"`` and ``"labels"`` entries
            used for fitting.
        samples_evaluate: mapping with ``"features"`` and ``"labels"`` entries
            passed as ``validation_data``.
        **hyperparameters: must contain ``'epochs'``.

    Returns:
        The same `model` object after ``fit`` has been called on it.

    Raises:
        KeyError: if ``'epochs'`` or a required sample key is missing.
    """
    train_features, train_labels = samples_train["features"], samples_train["labels"]
    eval_features, eval_labels = samples_evaluate["features"], samples_evaluate["labels"]

    # Fixed mini-batch size of 3, silent training, and an explicit History
    # callback alongside the tunable epoch count.
    model.fit(
        train_features,
        train_labels,
        validation_data=(eval_features, eval_labels),
        verbose=0,
        batch_size=3,
        epochs=hyperparameters['epochs'],
        callbacks=[History()],
    )
    return model

In [20]:
def function_model_predict(model, samples_predict):
    """Predict class indices and class probabilities for the given samples.

    Args:
        model: object exposing ``predict(features)``.
        samples_predict: mapping with a ``'features'`` entry.

    Returns:
        A ``(predictions, probabilities)`` tuple: `probabilities` is whatever
        ``model.predict`` returned, and `predictions` is the argmax of it
        along the last axis.
    """
    features = samples_predict['features']
    class_probabilities = model.predict(features)
    # Index of the highest-probability entry along the last axis.
    predicted_classes = np.argmax(class_probabilities, axis=-1)
    return predicted_classes, class_probabilities

In [21]:
def function_model_loss(model, samples_evaluate):
    """Return the loss of `model` on the given evaluation samples.

    Args:
        model: object exposing a Keras-style
            ``evaluate(features, labels, verbose=...)`` method.
        samples_evaluate: mapping with ``'features'`` and ``'labels'`` entries.

    Returns:
        The loss value reported by ``model.evaluate``.
    """
    results = model.evaluate(samples_evaluate['features'], samples_evaluate['labels'], verbose=0)
    # `evaluate` yields a bare scalar when no metrics are compiled and a
    # sequence with the loss first otherwise. Taking the first element (rather
    # than unpacking exactly two values) keeps this working for any number of
    # compiled metrics.
    if np.ndim(results) == 0:
        return results
    return results[0]

### Stage the model.

In [22]:
# Register the four model functions defined in this notebook (build, train,
# predict, loss) as a Keras multi-class classification algorithm.
algorithm = aidb.Algorithm.create(
    library="Keras",
    analysis_type="classification_multi",
    description="dense, 2 layers, medium height",
    function_model_build=function_model_build,
    function_model_train=function_model_train,
    function_model_predict=function_model_predict,
    function_model_loss=function_model_loss,
)

In [23]:
# Attach the hyperparameter candidates to the algorithm and preprocess
# records.
hyperparamset = aidb.Hyperparamset.from_algorithm(
    algorithm_id=algorithm.id,
    preprocess_id=preprocess.id,
    description="experimenting with number of epochs",
    hyperparameters=hyperparameters,
)

---

# Hypertune

In [24]:
# Assemble the batch of training jobs from the algorithm, splitset,
# hyperparamset and foldset created earlier in the notebook.
# only_folded_training is explicitly disabled.
batch = aidb.Batch.from_algorithm(
    algorithm_id=algorithm.id,
    splitset_id=splitset.id,
    hyperparamset_id=hyperparamset.id,
    foldset_id=foldset.id,
    only_folded_training=False,
)

In [25]:
# Show the status of every job in the batch; the recorded output is a dict
# mapping each job number to a status string.
batch.get_statuses()

{1: 'Not yet started',
 2: 'Not yet started',
 3: 'Not yet started',
 4: 'Not yet started',
 5: 'Not yet started',
 6: 'Not yet started',
 7: 'Not yet started',
 8: 'Not yet started',
 9: 'Not yet started',
 10: 'Not yet started',
 11: 'Not yet started',
 12: 'Not yet started',
 13: 'Not yet started',
 14: 'Not yet started',
 15: 'Not yet started',
 16: 'Not yet started'}

In [26]:
# Start the batch's training jobs with verbose output disabled.
# NOTE(review): the recorded output stops at 8/16 with `KeyError: 0` raised
# from `foldset.to_numpy(fold_index=...)[0]` inside pydatasci's background
# process, so the remaining jobs appear not to have run in this session.
batch.run_jobs(verbose=False)

🔮 Training Models 🔮:  50%|████████████████████▌                    | 8/16 [00:26<00:26,  3.32s/it]
Process aidb_batch_1:
Traceback (most recent call last):
  File "/Users/layne/.pyenv/versions/3.7.6/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/layne/.pyenv/versions/3.7.6/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/layne/.pyenv/versions/3.7.6/envs/pydatasci_test/lib/python3.7/site-packages/pydatasci/aidb/__init__.py", line 1247, in background_proc
    j.run(verbose=verbose)
  File "/Users/layne/.pyenv/versions/3.7.6/envs/pydatasci_test/lib/python3.7/site-packages/pydatasci/aidb/__init__.py", line 1478, in run
    fold_samples_np = foldset.to_numpy(fold_index=fold.fold_index)[0]
KeyError: 0


In [29]:
#batch.stop_jobs()

In [28]:
# Tabulate the metrics recorded so far; the recorded output has one row per
# job and split with roc_auc, accuracy, precision, recall, f1 and loss.
batch.metrics_to_pandas()

Unnamed: 0,job_id,split,roc_auc,accuracy,precision,recall,f1,loss
0,1,test,0.972917,0.916667,0.917293,0.916667,0.916615,0.24883
1,1,validation,0.965278,0.888889,0.898268,0.888889,0.890747,0.292155
2,1,train,0.994197,0.955882,0.95629,0.955882,0.955985,0.140673
3,2,test,0.9675,0.9,0.902357,0.9,0.899749,0.269888
4,2,validation,0.954861,0.861111,0.874074,0.861111,0.858907,0.318518
5,2,train,0.991097,0.946078,0.946623,0.946078,0.946052,0.192092
6,3,test,0.988333,0.95,0.956522,0.95,0.949717,0.216586
7,3,validation,0.981481,0.888889,0.898268,0.888889,0.890747,0.272348
8,3,train,0.992971,0.965686,0.96597,0.965686,0.965793,0.165158
9,4,test,0.92625,0.833333,0.835017,0.833333,0.832916,0.497565


In [27]:
#batch.stop_jobs()

In [28]:
# Plot the learning curve for the first result of the first job.
# NOTE(review): the recorded run raised `IndexError: list index out of range`
# here - confirm the job actually has results before indexing.
batch.jobs[0].results[0].plot_learning_curve()

IndexError: list index out of range

In [None]:
# Plot the ROC curve for the first result of the job at index 6 in
# batch.jobs (this cell has no recorded output).
batch.jobs[6].results[0].plot_roc_curve()

In [None]:
# Plot the precision-recall curve for the first result of the first job
# (this cell has no recorded output).
batch.jobs[0].results[0].plot_precision_recall()

In [None]:
# Plot the confusion matrix for the first result of the first job
# (this cell has no recorded output).
batch.jobs[0].results[0].plot_confusion_matrix()

In [None]:
# Plot batch-wide performance with max_loss=0.40 and min_metric_2=0.85.
# Presumably these thresholds filter which models are shown - TODO confirm
# which metric `min_metric_2` refers to in the pydatasci docs.
batch.plot_performance(max_loss=0.40, min_metric_2=0.85)