In [1]:
import optuna
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import toolkit.filter

import tensorflow as tf
from tensorflow.keras.backend import clear_session
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

from optuna.integration import KerasPruningCallback


In [2]:
# Turn on memory growth for every GPU that TensorFlow reports, and echo the
# resulting setting; when no GPU is listed, only print a notice.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if not physical_devices:
    print("Not enough GPU hardware devices available")
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)
    print('memory growth:', tf.config.experimental.get_memory_growth(gpu))

Not enough GPU hardware devices available


In [3]:
class DatasetClass:
    """Loader for paired ``<label>_nx.csv`` / ``<label>_ny.csv`` files.

    Every dataset label maps to two CSV files inside ``data_path``; both are
    read with their first column as the index.
    """

    def __init__(self, data_path):
        """Store the directory prefix used to build every CSV path.

        Args:
            data_path: Directory that contains the ``*_nx.csv`` / ``*_ny.csv`` files.
        """
        self.path = data_path

    def __label_list(self, dataset_label):
        """Wrap a single label string in a list; pass other iterables through."""
        # Iterating a bare string would yield its characters, not the label.
        if isinstance(dataset_label, str):
            return [dataset_label]
        return dataset_label

    def __reflect_index(self, data, index):
        """Return the columns ``index`` of ``data``; ``None`` keeps every column."""
        # Identity test on purpose: ``index != None`` is evaluated element-wise
        # for a NumPy array index and cannot be used as an ``if`` condition.
        if index is not None:
            data = data[:, index]
        return data

    def __load_df(self, data_label):
        """Read the x and y CSV files of one label as DataFrames."""
        data_x = pd.read_csv(f"{self.path}/{data_label}_nx.csv", index_col=0)
        data_y = pd.read_csv(f"{self.path}/{data_label}_ny.csv", index_col=0)
        return data_x, data_y

    def __load_data(self, data_label, x_index, y_index):
        """Read one label and return its column-selected value arrays."""
        data_x, data_y = self.__load_df(data_label)
        data_x = self.__reflect_index(data_x.values, x_index)
        data_y = self.__reflect_index(data_y.values, y_index)
        return data_x, data_y

    def __load_stack(self, dataset_list, x_index, y_index):
        """Load several labels and stack their rows in the given order.

        Raises:
            ValueError: If ``dataset_list`` is empty.
        """
        parts_x, parts_y = [], []
        for label in dataset_list:
            tmp_x, tmp_y = self.__load_data(label, x_index, y_index)
            parts_x.append(tmp_x)
            parts_y.append(tmp_y)
        if not parts_x:
            raise ValueError("dataset_list must contain at least one dataset label.")
        if len(parts_x) == 1:
            # A single dataset is returned untouched (no copy, no reshaping).
            return parts_x[0], parts_y[0]
        # Stacking the collected parts once keeps repeated labels intact and
        # avoids re-copying the accumulated array on every iteration.
        return np.vstack(parts_x), np.vstack(parts_y)

    def __load_dict(self, dataset_list, x_index, y_index):
        """Load several labels into two dicts keyed by label."""
        data_x, data_y = {}, {}
        for label in self.__label_list(dataset_list):
            tmp_x, tmp_y = self.__load_data(label, x_index, y_index)
            data_x[label] = tmp_x
            data_y[label] = tmp_y
        return data_x, data_y

    def get_data(self, dataset_label, x_index=None, y_index=None, dict_type:bool=False):
        """Load x/y value arrays for one or more dataset labels.

        Args:
            dataset_label: A single label string or an iterable of labels.
            x_index: Column selector applied to the x values; ``None`` keeps all.
            y_index: Column selector applied to the y values; ``None`` keeps all.
            dict_type: When true, return two dicts keyed by label instead of
                arrays. A single label string yields one-entry dicts.

        Returns:
            ``(data_x, data_y)``: arrays for a single label, row-stacked arrays
            for several labels, or dicts of arrays when ``dict_type`` is true.

        Raises:
            ValueError: If an empty collection of labels is given in array mode.
        """
        if dict_type:
            return self.__load_dict(dataset_label, x_index, y_index)
        if isinstance(dataset_label, str):
            return self.__load_data(dataset_label, x_index, y_index)
        return self.__load_stack(dataset_label, x_index, y_index)

    def get_dataframe(self, dataset_list):
        """Return two dicts mapping each label to its x and y DataFrames.

        Args:
            dataset_list: A single label string or an iterable of labels.
        """
        data_x = {}
        data_y = {}
        for label in self.__label_list(dataset_list):
            tmp_x, tmp_y = self.__load_df(label)
            data_x[label] = tmp_x
            data_y[label] = tmp_y
        return data_x, data_y


def set_index(model_type):
    """Return the input and target column indices for a model type.

    Args:
        model_type: Model tag such as ``"prop.1"`` or any tag containing
            ``"conv."``. Its last character must be a digit; that digit minus
            one becomes the target column.

    Returns:
        ``(x_index, y_index)``: a list of input column indices and a
        one-element list holding the target column index.

    Raises:
        ValueError: If ``model_type`` is not a known model, or its last
            character is not a digit.
    """
    # NOTE(review): "prop.2" and "prop.3" use the same columns - confirm this is intended.
    prop_columns = {
        "prop.1": [0, 3, 4, 6],
        "prop.2": [1, 2, 5, 6],
        "prop.3": [1, 2, 5, 6],
        "prop.4": [0, 3, 4, 5, 6],
    }
    if "conv." in model_type:
        x_index = list(range(7))
    elif model_type in prop_columns:
        x_index = prop_columns[model_type]
    else:
        # Fail here with a clear message instead of printing and then hitting
        # an unrelated error on the unassigned x_index further down.
        raise ValueError(f"<< {model_type} >> This model is not exist.")
    y_index = [int(model_type[-1]) - 1]
    return x_index, y_index


def set_dataset_label(model_type):
    """Return the training and test dataset labels for a model type.

    Args:
        model_type: Model tag whose last character is a digit from 1 to 4.

    Returns:
        ``(learn, test)``: the dataset label used for training and the one
        used for evaluation.

    Raises:
        ValueError: If the last character is not a digit, or the digit has no
            dataset assignment.
    """
    type_id = int(model_type[-1])
    # type id -> (learn label, test label)
    label_pairs = {
        1: ("ms1a", "ms2a"),
        2: ("ms3a", "ms1a"),
        3: ("ms2a", "ms3a"),
        4: ("ms3a", "ms1a"),
    }
    if type_id not in label_pairs:
        # Without this guard an unmapped id fell through to an UnboundLocalError.
        raise ValueError(
            f"<< {model_type} >> has no dataset assignment (type id {type_id})."
        )
    learn, test = label_pairs[type_id]
    return learn, test


In [4]:
# Model to tune and the directory holding its CSV datasets.
model_tag = "prop.1"
data_path = "./dataset/norms"
dataset = DatasetClass(data_path)

# Column selection and learn/test dataset labels both follow from the model tag.
x_index, y_index = set_index(model_tag)
learn_list, test_list  = set_dataset_label(model_tag)

# Training data and evaluation data, restricted to the selected columns.
learn_x, learn_y = dataset.get_data(learn_list, x_index, y_index)
test_x, test_y = dataset.get_data(test_list, x_index, y_index)

print(f"x_index: {x_index}, y_index: [{y_index[0]}]")
print(f"learn list: {learn_list}, test_list: {test_list}")


x_index: [0, 3, 4, 6], y_index: [0]
learn list: ms1a, test_list: ms2a


In [5]:
# Version suffix of the study, and whether to resume it instead of creating it.
study_label = "ver1.0"
STUDY_LOADING = True

# One SQLite file per model tag; the study name combines tag and version label.
storage_path = f"sqlite:///optimize_{model_tag}.db"
study_name = model_tag + "_" + study_label

# study load or create
# Arguments are passed by keyword: positional arguments to optuna.load_study
# are deprecated in newer Optuna releases.
if STUDY_LOADING:
    study = optuna.load_study(study_name=study_name, storage=storage_path, pruner=optuna.pruners.MedianPruner())
else:
    study = optuna.create_study(study_name=study_name, storage=storage_path, direction="minimize", pruner=optuna.pruners.MedianPruner())


In [6]:
# Budget that objective() uses to bound num_unit2 for a given num_unit1.
element = 169

# Number of models trained and scored per trial.
samples = 51

# Training settings handed to model.fit().
epochs = 200
batch_size = 512

# Sizes taken from the loaded data: input columns, and the number of test rows
# (used as the evaluation batch size).
input_unit = learn_x.shape[1]
testdata_size = len(test_x)

# Applied to the per-trial score list before the mean is computed.
filter = toolkit.filter.Filter_with_IQR()

def objective(trial):
    """Optuna objective: filtered mean test loss of a two-hidden-layer network.

    Suggests the two hidden-layer widths, trains ``samples`` independently
    initialised models on ``learn_x`` / ``learn_y``, evaluates each one on
    ``test_x`` / ``test_y``, filters the scores and returns their mean.

    Args:
        trial: The Optuna trial used to suggest ``num_unit1`` and ``num_unit2``.

    Returns:
        Mean of the filtered scores (the study minimises this value).

    Raises:
        ValueError: If ``element`` is too small to allow even one unit per
            hidden layer for the current ``input_unit``.
    """
    # buf_unit2 below is the largest num_unit2 that satisfies
    #   input_unit*num_unit1 + num_unit1*num_unit2 + num_unit2 <= element
    # (presumably a cap on the number of connection weights - TODO confirm).
    # It stays >= 1 only while num_unit1 <= (element - 1) / (input_unit + 1),
    # so the upper bound is derived from that instead of being fixed at 33,
    # which only holds for input_unit == 4. The historical cap of 33 is kept.
    max_unit1 = min(33, (element - 1) // (input_unit + 1))
    if max_unit1 < 1:
        raise ValueError(
            f"element={element} leaves no room for a hidden unit with input_unit={input_unit}."
        )
    max_unit2 = 100

    num_unit1 = trial.suggest_int("num_unit1", 1, max_unit1)
    buf_unit2 = int((element - input_unit * num_unit1)/(num_unit1 + 1))
    max_unit2 = min(max_unit2, buf_unit2)
    num_unit2 = trial.suggest_int("num_unit2", 1, max_unit2)

    score_list = [np.nan] * samples
    for i in range(samples):
        # Reset the Keras session so models from earlier samples do not pile up.
        clear_session()
        # Only the first layer declares its input size; later layers infer it.
        model = Sequential([
            Dense(input_dim=input_unit, units=num_unit1, activation="tanh", kernel_initializer="glorot_uniform"),
            Dense(units=num_unit2, activation="tanh", kernel_initializer="glorot_uniform"),
            Dense(units=1, kernel_initializer="glorot_uniform")
        ])
        model.compile(loss="mse", optimizer=Adam())
        model.fit(learn_x, learn_y, batch_size=batch_size, epochs=epochs, verbose=0)
        # The whole test set is evaluated as a single batch.
        score = model.evaluate(test_x, test_y, batch_size=testdata_size, verbose=0)
        score_list[i] = score
        # "\r" with end="" rewrites the same progress line for every sample.
        print(f"\r#{trial.number:2}: units={[num_unit1, num_unit2]}, sample {i}/{samples}, score:{score:.4e}", end="")

    # NOTE(review): presumably IQR-based outlier removal, going by the class
    # name - confirm against toolkit.filter.
    df_score = filter.filtering(score_list)
    count, mean, std = df_score.describe().loc[["count","mean","std"]]
    print(f"\r#{trial.number:2}: units={[num_unit1, num_unit2]}: samples={int(count)}/{samples}, mean={mean:.4e}, std={std:.4e}\n")
    return mean

In [None]:
# Run 81 trials of the objective on the study prepared above.
study.optimize(func=objective, n_trials=81)

# Two blank lines, then the completion banner.
print("\n", "*** All Trial are finished!! ***", sep="\n")

#20: units=[28, 1]: samples=43/51, mean=1.3345e-06, std=7.0898e-07



[I 2020-07-21 03:21:42,902] Finished trial#20 with value: 1.334495314542733e-06 with parameters: {'num_unit1': 28, 'num_unit2': 1}. Best is trial#17 with value: 1.0068103907517272e-06.


#21: units=[28, 1]: samples=45/51, mean=1.7548e-06, std=1.1218e-06



[I 2020-07-21 03:28:28,365] Finished trial#21 with value: 1.754778986019624e-06 with parameters: {'num_unit1': 28, 'num_unit2': 1}. Best is trial#17 with value: 1.0068103907517272e-06.


#22: units=[28, 1]: samples=46/51, mean=1.5790e-06, std=1.0969e-06



[I 2020-07-21 03:35:33,225] Finished trial#22 with value: 1.5790023826041206e-06 with parameters: {'num_unit1': 28, 'num_unit2': 1}. Best is trial#17 with value: 1.0068103907517272e-06.


#23: units=[33, 1]: samples=41/51, mean=1.7804e-06, std=1.6962e-06



[I 2020-07-21 03:42:47,323] Finished trial#23 with value: 1.7803761436176146e-06 with parameters: {'num_unit1': 33, 'num_unit2': 1}. Best is trial#17 with value: 1.0068103907517272e-06.


#24: units=[25, 2]: samples=48/51, mean=9.3521e-07, std=7.0317e-07



[I 2020-07-21 03:50:05,057] Finished trial#24 with value: 9.352120089456406e-07 with parameters: {'num_unit1': 25, 'num_unit2': 2}. Best is trial#24 with value: 9.352120089456406e-07.


#25: units=[27, 2]: samples=51/51, mean=1.2696e-06, std=9.5663e-07



[I 2020-07-21 03:57:52,005] Finished trial#25 with value: 1.2695639011532618e-06 with parameters: {'num_unit1': 27, 'num_unit2': 2}. Best is trial#24 with value: 9.352120089456406e-07.


#26: units=[21, 3]: samples=46/51, mean=9.9928e-07, std=6.9808e-07



[I 2020-07-21 04:05:07,885] Finished trial#26 with value: 9.99280052251033e-07 with parameters: {'num_unit1': 21, 'num_unit2': 3}. Best is trial#24 with value: 9.352120089456406e-07.


#27: units=[20, 3]: samples=49/51, mean=1.5360e-06, std=8.9144e-07



[I 2020-07-21 04:11:57,395] Finished trial#27 with value: 1.5360486616533326e-06 with parameters: {'num_unit1': 20, 'num_unit2': 3}. Best is trial#24 with value: 9.352120089456406e-07.


#28: units=[21, 3]: samples=50/51, mean=1.1283e-06, std=8.4585e-07



[I 2020-07-21 04:19:39,238] Finished trial#28 with value: 1.1282584540595053e-06 with parameters: {'num_unit1': 21, 'num_unit2': 3}. Best is trial#24 with value: 9.352120089456406e-07.


#29: units=[14, 4]: samples=50/51, mean=1.3416e-06, std=1.0075e-06



[I 2020-07-21 04:26:44,877] Finished trial#29 with value: 1.3415541158678933e-06 with parameters: {'num_unit1': 14, 'num_unit2': 4}. Best is trial#24 with value: 9.352120089456406e-07.


#30: units=[31, 1]: samples=47/51, mean=1.2293e-06, std=8.3678e-07



[I 2020-07-21 04:34:10,392] Finished trial#30 with value: 1.229282175551001e-06 with parameters: {'num_unit1': 31, 'num_unit2': 1}. Best is trial#24 with value: 9.352120089456406e-07.


#31: units=[20, 4]: samples=49/51, mean=1.2925e-06, std=7.2233e-07



[I 2020-07-21 04:40:59,793] Finished trial#31 with value: 1.292515790106447e-06 with parameters: {'num_unit1': 20, 'num_unit2': 4}. Best is trial#24 with value: 9.352120089456406e-07.


#32: units=[19, 4]: samples=47/51, mean=1.0516e-06, std=8.0734e-07



[I 2020-07-21 04:48:29,978] Finished trial#32 with value: 1.0516333617260898e-06 with parameters: {'num_unit1': 19, 'num_unit2': 4}. Best is trial#24 with value: 9.352120089456406e-07.


#33: units=[26, 2], sample 37/51, score:3.7068e-06