In [1]:
import optuna
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import toolkit.filter

import tensorflow as tf
from tensorflow.keras.backend import clear_session
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

from optuna.integration import KerasPruningCallback


In [2]:
# Turn on memory growth for every GPU TensorFlow can see and echo the
# resulting setting; report when no GPU is listed at all.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if not physical_devices:
    print("Not enough GPU hardware devices available")
else:
    for device in physical_devices:
        tf.config.experimental.set_memory_growth(device, True)
        print('memory growth:', tf.config.experimental.get_memory_growth(device))

Not enough GPU hardware devices available


In [3]:
class DatasetClass:
    """Loader for paired ``<label>_nx.csv`` / ``<label>_ny.csv`` files in one directory.

    Each label names two CSV files under ``data_path``; both are read with their
    first column as the row index. ``get_data`` returns NumPy arrays (optionally
    restricted to selected columns), ``get_dataframe`` returns the raw DataFrames.
    """

    def __init__(self, data_path):
        """Remember the directory that contains the CSV files."""
        self.path = data_path

    def __reflect_index(self, data, index):
        """Return ``data[:, index]``, or ``data`` unchanged when ``index`` is None."""
        # Identity test on purpose: ``index != None`` is evaluated element-wise
        # for a NumPy index array, which makes the ``if`` raise ValueError.
        if index is not None:
            data = data[:, index]
        return data

    def __load_df(self, data_label):
        """Read the x and y CSV files of one label as DataFrames."""
        data_x = pd.read_csv(f"{self.path}/{data_label}_nx.csv", index_col=0)
        data_y = pd.read_csv(f"{self.path}/{data_label}_ny.csv", index_col=0)
        return data_x, data_y

    def __load_data(self, data_label, x_index, y_index):
        """Load one label as arrays and apply the column selections."""
        data_x, data_y = self.__load_df(data_label)
        data_x = self.__reflect_index(data_x.values, x_index)
        data_y = self.__reflect_index(data_y.values, y_index)
        return data_x, data_y

    def __load_stack(self, dataset_list, x_index, y_index):
        """Load every label in ``dataset_list`` and stack the arrays row-wise.

        Raises:
            ValueError: if ``dataset_list`` is empty.
        """
        parts_x, parts_y = [], []
        for label in dataset_list:
            tmp_x, tmp_y = self.__load_data(label, x_index, y_index)
            parts_x.append(tmp_x)
            parts_y.append(tmp_y)
        if not parts_x:
            raise ValueError("dataset_list must contain at least one dataset label")
        if len(parts_x) == 1:
            # A single label is returned as loaded, without going through vstack.
            return parts_x[0], parts_y[0]
        # Stacking once at the end keeps labels that occur more than once; looking
        # up the label's position to detect "first item" would restart the stack
        # whenever the first label is repeated.
        return np.vstack(parts_x), np.vstack(parts_y)

    def __load_dict(self, dataset_list, x_index, y_index):
        """Load every label into ``{label: array}`` dictionaries (x and y)."""
        data_x, data_y = {}, {}
        for label in dataset_list:
            tmp_x, tmp_y = self.__load_data(label, x_index, y_index)
            data_x[label] = tmp_x
            data_y[label] = tmp_y
        return data_x, data_y

    def get_data(self, dataset_label, x_index=None, y_index=None, dict_type:bool=False):
        """Load x/y arrays for one label or a collection of labels.

        Args:
            dataset_label: a single label string, or an iterable of labels.
            x_index: column selection applied to x (``None`` keeps all columns).
            y_index: column selection applied to y (``None`` keeps all columns).
            dict_type: when True, return ``{label: array}`` dictionaries instead
                of arrays; a single label string is treated as a one-item list.

        Returns:
            ``(data_x, data_y)``: arrays for a single label, row-stacked arrays
            for several labels, or two dictionaries when ``dict_type`` is True.
        """
        single_label = isinstance(dataset_label, str)
        if dict_type:
            # Wrap a lone string so it is not iterated character by character.
            labels = [dataset_label] if single_label else dataset_label
            return self.__load_dict(labels, x_index, y_index)
        if single_label:
            return self.__load_data(dataset_label, x_index, y_index)
        return self.__load_stack(dataset_label, x_index, y_index)

    def get_dataframe(self, dataset_list):
        """Return ``{label: DataFrame}`` dictionaries (x and y) for the given labels.

        A single label string is treated as a one-item list.
        """
        if isinstance(dataset_list, str):
            dataset_list = [dataset_list]
        data_x = {}
        data_y = {}
        for label in dataset_list:
            tmp_x, tmp_y = self.__load_df(label)
            data_x[label] = tmp_x
            data_y[label] = tmp_y
        return data_x, data_y


def set_index(model_type):
    """Return the input and output column indices used by a model variant.

    Args:
        model_type: model tag such as ``"prop.3"`` or a tag containing
            ``"conv."``; its last character must be a digit.

    Returns:
        ``(x_index, y_index)``: ``x_index`` is the list of input columns
        (all seven columns for ``conv.`` models), ``y_index`` is a one-element
        list holding the trailing digit of ``model_type`` minus one.

    Raises:
        ValueError: if ``model_type`` is not a known variant (previously this
            printed a message and then failed on an unbound ``x_index``), or
            if its last character is not a digit.
    """
    # NOTE(review): "prop.2" and "prop.3" share the same columns in the
    # original code; kept as-is, confirm this is intended.
    prop_columns = {
        "prop.1": [0, 3, 4, 6],
        "prop.2": [1, 2, 5, 6],
        "prop.3": [1, 2, 5, 6],
        "prop.4": [0, 3, 4, 5, 6],
    }
    if "conv." in model_type:
        x_index = list(range(7))
    elif model_type in prop_columns:
        # Copy so callers can mutate the result without touching the table.
        x_index = list(prop_columns[model_type])
    else:
        raise ValueError(f"<< {model_type} >> This model is not exist.")
    y_index = [int(model_type[-1]) - 1]
    return x_index, y_index


def set_dataset_label(model_type):
    """Return the (learn, test) dataset labels for a model variant.

    The variant is identified by the trailing digit of ``model_type``.

    Args:
        model_type: model tag whose last character is a digit from 1 to 4.

    Returns:
        ``(learn, test)``: the dataset label used for training and the one
        used for evaluation.

    Raises:
        ValueError: if the last character is not a digit, or the digit is not
            one of 1-4 (previously the latter failed on unbound locals).
    """
    # type id -> (learn label, test label); ids 2 and 4 share the same pair.
    splits = {
        1: ("ms1a", "ms2a"),
        2: ("ms3a", "ms1a"),
        3: ("ms2a", "ms3a"),
        4: ("ms3a", "ms1a"),
    }
    type_id = int(model_type[-1])
    if type_id not in splits:
        raise ValueError(
            f"no dataset labels defined for model type {model_type!r} "
            f"(type id {type_id}; expected 1-4)"
        )
    learn, test = splits[type_id]
    return learn, test


In [4]:
# Model variant to optimise and the directory holding its dataset CSV files.
model_tag = "prop.3"
data_path = "./dataset/norms"
dataset = DatasetClass(data_path)

# Columns and dataset labels that belong to this variant.
x_index, y_index = set_index(model_tag)
learn_list, test_list = set_dataset_label(model_tag)

# Training and evaluation arrays, restricted to the selected columns.
learn_x, learn_y = dataset.get_data(learn_list, x_index, y_index)
test_x, test_y = dataset.get_data(test_list, x_index, y_index)

print(f"x_index: {x_index}, y_index: [{y_index[0]}]")
print(f"learn list: {learn_list}, test_list: {test_list}")


x_index: [1, 2, 5, 6], y_index: [2]
learn list: ms2a, test_list: ms3a


In [5]:
# Study identity: one SQLite file per model tag, one study per (tag, label).
study_label = "ver1.0"
STUDY_LOADING = True  # True: resume the stored study; False: create a new one

storage_path = f"sqlite:///optimize_{model_tag}.db"
study_name = model_tag + "_" + study_label

# study load or create
if STUDY_LOADING:
    # study_name/storage are passed by keyword: newer Optuna releases deprecate
    # positional use of these parameters, and keywords work on older releases too.
    study = optuna.load_study(study_name=study_name, storage=storage_path, pruner=optuna.pruners.MedianPruner())
else:
    study = optuna.create_study(study_name=study_name, storage=storage_path, direction="minimize", pruner=optuna.pruners.MedianPruner())


In [6]:
# Training settings shared by every trial.
epochs = 200
batch_size = 256
samples = 51      # models trained and scored per trial
element = 1369    # budget term that bounds the second hidden layer's width in objective()

# Sizes taken from the loaded arrays.
input_unit = learn_x.shape[1]
testdata_size = test_x.shape[0]

# Score filter applied to each trial's scores before they are summarised.
# NOTE: the name shadows the built-in filter(); kept because objective() reads it.
filter = toolkit.filter.Filter_with_IQR()

def objective(trial):
    """Optuna objective: mean filtered test loss of a two-hidden-layer tanh network.

    Two hidden-layer widths are suggested per trial. For each of ``samples``
    repetitions a fresh model is trained on ``learn_x``/``learn_y`` and its MSE
    on ``test_x``/``test_y`` is recorded; the scores are passed through
    ``filter.filtering`` and the mean of the result is returned.

    Args:
        trial: the Optuna trial used to suggest ``num_unit1`` and ``num_unit2``.

    Returns:
        The ``mean`` entry of ``describe()`` on the filtered scores.
    """
    max_unit1 = 100
    max_unit2 = 100

    num_unit1 = trial.suggest_int("num_unit1", 1, max_unit1)
    # Upper bound for the second layer so that
    # input_unit*num_unit1 + num_unit2*(num_unit1 + 1) stays within `element`.
    buf_unit2 = int((element - input_unit * num_unit1)/(num_unit1 + 1))
    max_unit2 = min(max_unit2, buf_unit2)
    num_unit2 = trial.suggest_int("num_unit2", 1, max_unit2)

    score_list = []
    for i in range(samples):
        # Drop the previous model's Keras state before building the next one.
        clear_session()
        model = Sequential([
            Dense(input_dim=input_unit, units=num_unit1, activation="tanh", kernel_initializer="glorot_uniform"),
            Dense(input_dim=num_unit1, units=num_unit2, activation="tanh", kernel_initializer="glorot_uniform"),
            Dense(input_dim=num_unit2, units=1, kernel_initializer="glorot_uniform")
        ])
        model.compile(loss="mse", optimizer=Adam())
        model.fit(learn_x, learn_y, batch_size=batch_size, epochs=epochs, verbose=0)
        # The whole test set is evaluated as a single batch.
        score = model.evaluate(test_x, test_y, batch_size=testdata_size, verbose=0)
        score_list.append(score)
        # i + 1: this line is printed after sample i has finished, so report
        # the number of completed samples rather than the zero-based index.
        print(f"\r#{trial.number:2}: units={[num_unit1, num_unit2]}, sample {i + 1}/{samples}, score:{score:.4e}", end="")

    # NOTE(review): filter.filtering is assumed to return a pandas Series of the
    # retained scores (describe().loc[...] is unpacked into three values) - confirm.
    df_score = filter.filtering(score_list)
    count, mean, std = df_score.describe().loc[["count","mean","std"]]
    print(f"\r#{trial.number:2}: units={[num_unit1, num_unit2]}: samples={int(count)}/{samples}, mean={mean:.4e}, std={std:.4e}\n")
    return mean

In [None]:
# Run 84 trials of the objective on the study, then print a completion banner
# preceded by two blank lines.
study.optimize(objective, n_trials=84)

print("\n\n*** All Trial are finished!! ***")

#17: units=[17, 59]: samples=43/51, mean=2.0779e-06, std=1.2117e-06



[I 2020-07-21 03:28:29,292] Finished trial#17 with value: 2.0779156353603677e-06 with parameters: {'num_unit1': 17, 'num_unit2': 59}. Best is trial#0 with value: 1.0673152356265414e-06.


#18: units=[44, 13]: samples=48/51, mean=1.8554e-06, std=1.5459e-06



[I 2020-07-21 03:35:54,853] Finished trial#18 with value: 1.8553518075966242e-06 with parameters: {'num_unit1': 44, 'num_unit2': 13}. Best is trial#0 with value: 1.0673152356265414e-06.


#19: units=[65, 6]: samples=44/51, mean=1.3051e-06, std=9.7157e-07



[I 2020-07-21 03:42:59,929] Finished trial#19 with value: 1.3050804978123862e-06 with parameters: {'num_unit1': 65, 'num_unit2': 6}. Best is trial#0 with value: 1.0673152356265414e-06.


#20: units=[1, 67]: samples=51/51, mean=8.9138e-04, std=9.8736e-04



[I 2020-07-21 03:49:37,267] Finished trial#20 with value: 0.0008913789588444063 with parameters: {'num_unit1': 1, 'num_unit2': 67}. Best is trial#0 with value: 1.0673152356265414e-06.


#21: units=[53, 14]: samples=46/51, mean=1.2199e-06, std=8.4790e-07



[I 2020-07-21 03:57:21,911] Finished trial#21 with value: 1.2199054140182852e-06 with parameters: {'num_unit1': 53, 'num_unit2': 14}. Best is trial#0 with value: 1.0673152356265414e-06.


#22: units=[31, 15]: samples=44/51, mean=1.2915e-06, std=5.5354e-07



[I 2020-07-21 04:04:27,745] Finished trial#22 with value: 1.2914748349534089e-06 with parameters: {'num_unit1': 31, 'num_unit2': 15}. Best is trial#0 with value: 1.0673152356265414e-06.


#23: units=[54, 6]: samples=46/51, mean=1.5190e-06, std=1.0042e-06



[I 2020-07-21 04:11:46,889] Finished trial#23 with value: 1.519028694597973e-06 with parameters: {'num_unit1': 54, 'num_unit2': 6}. Best is trial#0 with value: 1.0673152356265414e-06.


#24: units=[92, 10]: samples=46/51, mean=8.0754e-07, std=4.5781e-07



[I 2020-07-21 04:19:47,690] Finished trial#24 with value: 8.075403389076873e-07 with parameters: {'num_unit1': 92, 'num_unit2': 10}. Best is trial#24 with value: 8.075403389076873e-07.


#25: units=[91, 10]: samples=43/51, mean=1.4615e-06, std=1.3648e-06



[I 2020-07-21 04:27:46,945] Finished trial#25 with value: 1.4614552002893655e-06 with parameters: {'num_unit1': 91, 'num_unit2': 10}. Best is trial#24 with value: 8.075403389076873e-07.


#26: units=[73, 12]: samples=44/51, mean=1.3499e-06, std=1.1349e-06



[I 2020-07-21 04:35:14,715] Finished trial#26 with value: 1.3498607574743717e-06 with parameters: {'num_unit1': 73, 'num_unit2': 12}. Best is trial#24 with value: 8.075403389076873e-07.


#27: units=[89, 7]: samples=44/51, mean=1.0971e-06, std=8.2160e-07



[I 2020-07-21 04:42:50,551] Finished trial#27 with value: 1.0971213936531141e-06 with parameters: {'num_unit1': 89, 'num_unit2': 7}. Best is trial#24 with value: 8.075403389076873e-07.


#28: units=[100, 7]: samples=46/51, mean=1.4874e-06, std=1.3365e-06



[I 2020-07-21 04:50:53,548] Finished trial#28 with value: 1.4873873343800448e-06 with parameters: {'num_unit1': 100, 'num_unit2': 7}. Best is trial#24 with value: 8.075403389076873e-07.


#29: units=[88, 7], sample 21/51, score:3.8898e-07