In [1]:
import optuna
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import toolkit.filter

import tensorflow as tf
from tensorflow.keras.backend import clear_session
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

from optuna.integration import KerasPruningCallback


In [2]:
# Ask TensorFlow to grow GPU memory on demand for every GPU it can see,
# and echo the resulting setting; report when no GPU is visible.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if not physical_devices:
    print("Not enough GPU hardware devices available")
else:
    for gpu in physical_devices:
        tf.config.experimental.set_memory_growth(gpu, True)
        print('memory growth:', tf.config.experimental.get_memory_growth(gpu))

Not enough GPU hardware devices available


In [3]:
class DatasetClass:
    """Loader for paired ``<label>_nx.csv`` / ``<label>_ny.csv`` files in one directory.

    Each dataset label maps to two CSV files under ``data_path``; the first
    CSV column is read as the DataFrame index.
    """

    def __init__(self, data_path):
        """Remember the directory that contains the CSV files."""
        self.path = data_path

    @staticmethod
    def __as_label_list(labels):
        """Wrap a single label string in a list; pass other iterables through."""
        # A bare string would otherwise be iterated character by character.
        if isinstance(labels, str):
            return [labels]
        return labels

    def __reflect_index(self, data, index):
        """Return ``data[:, index]``, or ``data`` unchanged when ``index`` is None."""
        # Identity test on purpose: ``index != None`` is evaluated element-wise
        # for ndarray indices and makes the ``if`` raise ValueError.
        if index is not None:
            data = data[:, index]
        return data

    def __load_df(self, data_label):
        """Read the x and y CSV files of one label as DataFrames."""
        data_x = pd.read_csv(f"{self.path}/{data_label}_nx.csv", index_col=0)
        data_y = pd.read_csv(f"{self.path}/{data_label}_ny.csv", index_col=0)
        return data_x, data_y

    def __load_data(self, data_label, x_index, y_index):
        """Read one label as ndarrays, keeping only the requested columns."""
        data_x, data_y = self.__load_df(data_label)
        data_x = self.__reflect_index(data_x.values, x_index)
        data_y = self.__reflect_index(data_y.values, y_index)
        return data_x, data_y

    def __load_stack(self, dataset_list, x_index, y_index):
        """Load every label in ``dataset_list`` and stack the rows vertically.

        Raises:
            ValueError: if ``dataset_list`` is empty.
        """
        parts_x, parts_y = [], []
        # Collect first, stack once: no per-iteration ``list.index`` lookup, and
        # a label that appears more than once is stacked instead of resetting
        # the accumulated data.
        for label in dataset_list:
            tmp_x, tmp_y = self.__load_data(label, x_index, y_index)
            parts_x.append(tmp_x)
            parts_y.append(tmp_y)
        if not parts_x:
            raise ValueError("dataset_list must contain at least one dataset label")
        if len(parts_x) == 1:
            # A single dataset is returned as loaded (no extra stacking step).
            return parts_x[0], parts_y[0]
        return np.vstack(parts_x), np.vstack(parts_y)

    def __load_dict(self, dataset_list, x_index, y_index):
        """Load every label into ``{label: ndarray}`` dictionaries for x and y."""
        data_x, data_y = {}, {}
        for label in dataset_list:
            tmp_x, tmp_y = self.__load_data(label, x_index, y_index)
            data_x[label] = tmp_x
            data_y[label] = tmp_y
        return data_x, data_y

    def get_data(self, dataset_label, x_index=None, y_index=None, dict_type:bool=False):
        """Load x/y data for one label or several labels.

        Args:
            dataset_label: a single label string, or an iterable of labels.
            x_index: column selector applied to the x values (None keeps all).
            y_index: column selector applied to the y values (None keeps all).
            dict_type: when True, return ``{label: ndarray}`` dictionaries
                instead of arrays.

        Returns:
            ``(data_x, data_y)``: ndarrays (rows of all labels stacked when
            several labels are given), or dictionaries when ``dict_type`` is True.

        Raises:
            ValueError: if an empty label list is given with ``dict_type=False``.
        """
        if dict_type:
            labels = self.__as_label_list(dataset_label)
            return self.__load_dict(labels, x_index, y_index)
        # isinstance also accepts str subclasses, which ``type(x) == str`` misses.
        if isinstance(dataset_label, str):
            return self.__load_data(dataset_label, x_index, y_index)
        return self.__load_stack(dataset_label, x_index, y_index)

    def get_dataframe(self, dataset_list):
        """Return ``{label: DataFrame}`` dictionaries for the x and y CSV files.

        ``dataset_list`` may be a single label string or an iterable of labels.
        """
        data_x = {}
        data_y = {}
        for label in self.__as_label_list(dataset_list):
            tmp_x, tmp_y = self.__load_df(label)
            data_x[label] = tmp_x
            data_y[label] = tmp_y
        return data_x, data_y


def set_index(model_type):
    """Return the input/output column indices used by a model variant.

    Args:
        model_type: model tag such as ``"prop.2"`` or a tag containing
            ``"conv."``; its last character must be a digit.

    Returns:
        ``(x_index, y_index)``: list of input column indices, and a
        one-element list holding the output column index (last digit - 1).

    Raises:
        ValueError: if ``model_type`` is not a known variant, or its last
            character is not a digit.
    """
    prop_columns = {
        "prop.1": [0, 3, 4, 6],
        "prop.2": [1, 2, 5, 6],
        # NOTE(review): identical to prop.2 - confirm this is intended.
        "prop.3": [1, 2, 5, 6],
        "prop.4": [0, 3, 4, 5, 6],
    }
    if "conv." in model_type:
        x_index = list(range(7))
    elif model_type in prop_columns:
        # Copy so callers cannot mutate the lookup table.
        x_index = list(prop_columns[model_type])
    else:
        # Fail here with a clear message instead of printing and then hitting
        # an UnboundLocalError on ``x_index`` at the return statement.
        raise ValueError(f"<< {model_type} >> This model is not exist.")
    y_index = [int(model_type[-1])-1]
    return x_index, y_index


def set_dataset_label(model_type):
    """Return the (learning, test) dataset labels for a model variant.

    The variant is identified by the last character of ``model_type``,
    which must be a digit from 1 to 4.

    Raises:
        ValueError: if the last character is not a digit, or the digit has
            no dataset assignment.
    """
    labels_by_type = {
        1: ("ms1a", "ms2a"),
        2: ("ms3a", "ms1a"),
        3: ("ms2a", "ms3a"),
        # NOTE(review): same pair as type 2 - confirm this is intended.
        4: ("ms3a", "ms1a"),
    }
    type_id = int(model_type[-1])
    if type_id not in labels_by_type:
        # Previously this fell through to an UnboundLocalError at the return.
        raise ValueError(
            f"<< {model_type} >> has no dataset assignment (type id {type_id})."
        )
    learn, test = labels_by_type[type_id]
    return learn, test


In [4]:
# Model variant to optimise and the directory holding the dataset CSV files.
model_tag = "prop.2"
data_path = "./dataset/norms"

# Column selection and learn/test dataset labels both follow from the model tag.
x_index, y_index = set_index(model_tag)
learn_list, test_list = set_dataset_label(model_tag)

# Load the learning and test arrays restricted to the selected columns.
dataset = DatasetClass(data_path)
learn_x, learn_y = dataset.get_data(learn_list, x_index=x_index, y_index=y_index)
test_x, test_y = dataset.get_data(test_list, x_index=x_index, y_index=y_index)

print(f"x_index: {x_index}, y_index: [{y_index[0]}]")
print(f"learn list: {learn_list}, test_list: {test_list}")


x_index: [1, 2, 5, 6], y_index: [1]
learn list: ms3a, test_list: ms1a


In [5]:
# Version suffix of the study, and whether to resume it from storage (True)
# or create it from scratch (False).
study_label = "ver1.0"
STUDY_LOADING = True

# One SQLite database per model variant; the study name is "<model_tag>_<label>".
storage_path = f"sqlite:///optimize_{model_tag}.db"
study_name = model_tag + "_" + study_label

# study load or create
if STUDY_LOADING:
    # Keyword arguments: newer Optuna releases deprecate passing study_name and
    # storage positionally to load_study; keywords work on every version.
    study = optuna.load_study(
        study_name=study_name,
        storage=storage_path,
        pruner=optuna.pruners.MedianPruner(),
    )
else:
    study = optuna.create_study(
        study_name=study_name,
        storage=storage_path,
        direction="minimize",
        pruner=optuna.pruners.MedianPruner(),
    )


In [6]:
# Training settings read by objective().
batch_size = 512   # mini-batch size passed to model.fit
epochs = 200       # epochs per trained model
samples = 51       # models trained and scored per trial

# Budget objective() uses to cap the second hidden layer:
# num_unit2 <= (element - input_unit * num_unit1) / (num_unit1 + 1)
element = 713

# Sizes taken from the loaded arrays.
input_unit = learn_x.shape[1]   # number of input columns
testdata_size = len(test_x)     # number of test rows; used as the evaluation batch size

# NOTE(review): presumably an IQR-based outlier filter for the per-trial scores -
# confirm in toolkit.filter. The name shadows the builtin `filter`; it is kept
# because objective() refers to it by this name.
filter = toolkit.filter.Filter_with_IQR()

def objective(trial):
    """Optuna objective: mean filtered test loss of a two-hidden-layer network.

    Suggests the two hidden-layer widths, trains ``samples`` independent
    models with those widths, evaluates each on the test data, passes the
    scores through ``filter.filtering`` and returns the mean of what remains.

    Args:
        trial: the ``optuna`` trial used to suggest ``num_unit1`` / ``num_unit2``.

    Returns:
        Mean of the filtered scores (MSE on the test data).

    Raises:
        ValueError: if ``element`` is too small for any network with at
            least one unit in each hidden layer.
    """
    max_unit1 = 100
    max_unit2 = 100

    # The cap on num_unit2 below is (element - input_unit*n1) / (n1 + 1).
    # It stays >= 1 only while n1 <= (element - 1) // (input_unit + 1);
    # beyond that suggest_int would be asked for an empty range (low=1 > high).
    feasible_unit1 = (element - 1) // (input_unit + 1)
    if feasible_unit1 < 1:
        raise ValueError(
            f"element={element} is too small for a two-hidden-layer network "
            f"with {input_unit} inputs"
        )
    max_unit1 = min(max_unit1, feasible_unit1)

    num_unit1 = trial.suggest_int("num_unit1", 1, max_unit1)
    buf_unit2 = int((element - input_unit * num_unit1)/(num_unit1 + 1))
    max_unit2 = min(max_unit2, buf_unit2)
    num_unit2 = trial.suggest_int("num_unit2", 1, max_unit2)

    score_list = [np.nan] * samples
    for i in range(samples):
        # Drop the previous model's Keras state before building the next one.
        clear_session()
        model = Sequential([
            Dense(input_dim=input_unit, units=num_unit1, activation="tanh", kernel_initializer="glorot_uniform"),
            Dense(input_dim=num_unit1, units=num_unit2, activation="tanh", kernel_initializer="glorot_uniform"),
            Dense(input_dim=num_unit2, units=1, kernel_initializer="glorot_uniform")
        ])
        model.compile(loss="mse", optimizer=Adam())
        model.fit(learn_x, learn_y, batch_size=batch_size, epochs=epochs, verbose=0)
        # Whole test set in a single batch.
        score = model.evaluate(test_x, test_y, batch_size=testdata_size, verbose=0)
        score_list[i] = score
        # i + 1: report completed samples so the counter reaches samples/samples.
        print(f"\r#{trial.number:2}: units={[num_unit1, num_unit2]}, sample {i + 1}/{samples}, score:{score:.4e}", end="")

    # NOTE(review): filter.filtering is assumed to return a pandas Series of
    # the retained scores - confirm in toolkit.filter.
    df_score = filter.filtering(score_list)
    count, mean, std = df_score.describe().loc[["count","mean","std"]]
    print(f"\r#{trial.number:2}: units={[num_unit1, num_unit2]}: samples={int(count)}/{samples}, mean={mean:.4e}, std={std:.4e}\n")
    return mean

In [None]:
# Run the hyper-parameter search: 87 trials of `objective` on the study
# loaded or created in the previous cell.
study.optimize(func=objective, n_trials=87)

# Blank lines, then a completion banner after the last trial.
print("\n")
print("*** All Trial are finished!! ***")

#15: units=[84, 1]: samples=43/51, mean=1.3846e-07, std=1.9550e-07



[I 2020-07-21 03:27:57,109] Finished trial#15 with value: 1.384553360540475e-07 with parameters: {'num_unit1': 84, 'num_unit2': 1}. Best is trial#10 with value: 4.271637743524792e-08.


#16: units=[79, 2]: samples=44/51, mean=2.7409e-07, std=4.2932e-07



[I 2020-07-21 03:36:55,634] Finished trial#16 with value: 2.7409129968711004e-07 with parameters: {'num_unit1': 79, 'num_unit2': 2}. Best is trial#10 with value: 4.271637743524792e-08.


#17: units=[2, 32]: samples=41/51, mean=2.0962e-07, std=2.6346e-07



[I 2020-07-21 03:43:37,308] Finished trial#17 with value: 2.0962414687434168e-07 with parameters: {'num_unit1': 2, 'num_unit2': 32}. Best is trial#10 with value: 4.271637743524792e-08.


#18: units=[98, 1]: samples=44/51, mean=1.4980e-07, std=2.5123e-07



[I 2020-07-21 03:52:20,617] Finished trial#18 with value: 1.4980233278081492e-07 with parameters: {'num_unit1': 98, 'num_unit2': 1}. Best is trial#10 with value: 4.271637743524792e-08.


#19: units=[84, 3]: samples=46/51, mean=4.1300e-07, std=5.4101e-07



[I 2020-07-21 04:01:45,507] Finished trial#19 with value: 4.1300111308650855e-07 with parameters: {'num_unit1': 84, 'num_unit2': 3}. Best is trial#10 with value: 4.271637743524792e-08.


#20: units=[75, 4]: samples=43/51, mean=1.9752e-07, std=2.4957e-07



[I 2020-07-21 04:10:42,332] Finished trial#20 with value: 1.9751569607780216e-07 with parameters: {'num_unit1': 75, 'num_unit2': 4}. Best is trial#10 with value: 4.271637743524792e-08.


#21: units=[100, 1]: samples=41/51, mean=9.1624e-08, std=1.4747e-07



[I 2020-07-21 04:19:39,480] Finished trial#21 with value: 9.162390722550748e-08 with parameters: {'num_unit1': 100, 'num_unit2': 1}. Best is trial#10 with value: 4.271637743524792e-08.


#22: units=[90, 1]: samples=43/51, mean=1.3980e-07, std=1.9624e-07



[I 2020-07-21 04:28:15,452] Finished trial#22 with value: 1.3980111720464922e-07 with parameters: {'num_unit1': 90, 'num_unit2': 1}. Best is trial#10 with value: 4.271637743524792e-08.


#23: units=[90, 1]: samples=42/51, mean=6.0315e-08, std=6.7581e-08



[I 2020-07-21 04:36:26,193] Finished trial#23 with value: 6.031493651087196e-08 with parameters: {'num_unit1': 90, 'num_unit2': 1}. Best is trial#10 with value: 4.271637743524792e-08.


#24: units=[87, 2]: samples=47/51, mean=2.1406e-07, std=2.8199e-07



[I 2020-07-21 04:45:38,942] Finished trial#24 with value: 2.1406278666431946e-07 with parameters: {'num_unit1': 87, 'num_unit2': 2}. Best is trial#10 with value: 4.271637743524792e-08.


#25: units=[99, 1], sample 46/51, score:1.9295e-07