Skip to content

Commit

Permalink
out of memory caught in subprocess
Browse files Browse the repository at this point in the history
  • Loading branch information
haifeng-jin committed Nov 29, 2018
1 parent 754f64d commit c9410e1
Show file tree
Hide file tree
Showing 7 changed files with 63 additions and 90 deletions.
2 changes: 1 addition & 1 deletion autokeras/constant.py
Expand Up @@ -14,7 +14,7 @@ class Constant:
N_NEIGHBOURS = 8
MAX_MODEL_SIZE = (1 << 25)
MAX_LAYER_WIDTH = 4096
MAX_LAYERS = 500
MAX_LAYERS = 200

# Model Defaults

Expand Down
12 changes: 8 additions & 4 deletions autokeras/nn/graph.py
Expand Up @@ -256,7 +256,7 @@ def _get_pooling_layers(self, start_node_id, end_node_id):
"""Given two node IDs, return all the pooling layers between them."""
layer_list = []
node_list = [start_node_id]
self._depth_first_search(end_node_id, layer_list, node_list)
assert self._depth_first_search(end_node_id, layer_list, node_list)
ret = []
for layer_id in layer_list:
layer = self.layer_list[layer_id]
Expand All @@ -271,6 +271,7 @@ def _depth_first_search(self, target_id, layer_id_list, node_list):
A recursive function to search all the layers and nodes between the node in the node_list
and the node with target_id."""
assert len(node_list) <= self.n_nodes
u = node_list[-1]
if u == target_id:
return True
Expand Down Expand Up @@ -632,10 +633,13 @@ def get_main_chain(self):
for i in range(self.n_nodes):
if distance[i] > distance[temp_id]:
temp_id = i
ret = [temp_id]
while pre_node[temp_id] != temp_id:
temp_id = pre_node[temp_id]
ret = []
for i in range(self.n_nodes + 5):
ret.append(temp_id)
if pre_node[temp_id] == temp_id:
break
temp_id = pre_node[temp_id]
assert temp_id == pre_node[temp_id]
ret.reverse()
return ret

Expand Down
66 changes: 25 additions & 41 deletions autokeras/search.py
Expand Up @@ -36,7 +36,6 @@ class Searcher:
training_queue: A list of the generated architectures to be trained.
x_queue: A list of trained architectures not updated to the gpr.
y_queue: A list of trained architecture performances not updated to the gpr.
beta: A float. The beta in the UCB acquisition function.
t_min: A float. The minimum temperature during simulated annealing.
bo: An instance of BayesianOptimizer.
"""
Expand Down Expand Up @@ -212,20 +211,12 @@ def search(self, train_data, test_data, timeout=60 * 60 * 24):
if self.verbose and searched:
verbose_print(generated_other_info, generated_graph)

self.add_model(metric_value, loss, graph, model_id)
self.update(other_info, graph, metric_value, model_id)

self.export_json(os.path.join(self.path, 'history.json'))
if metric_value is not None:
self.add_model(metric_value, loss, graph, model_id)
self.update(other_info, graph, metric_value, model_id)

except (TimeoutError, queue.Empty) as e:
raise TimeoutError from e
except RuntimeError as e:
if not re.search('out of memory', str(e)):
raise e
if self.verbose:
print('\nCurrent model size is too big. Discontinuing training this model to search for other models.')
Constant.MAX_MODEL_SIZE = graph.size() - 1
return
finally:
# terminate and join the subprocess to prevent any resource leak
p.terminate()
Expand Down Expand Up @@ -265,35 +256,28 @@ def generate(self, remaining_time, multiprocessing_queue):

return new_father_id, generated_graph

def export_json(self, path):
    """Export a json file of the search process.

    Writes a JSON object with a single ``tree`` key holding the dictionary
    returned by ``self.bo.search_tree.get_dict()``.

    Args:
        path: Destination file path; the file is created or overwritten.
    """
    # Imported locally, as the original method did.
    import json

    # A per-model descriptor list used to be built here by loading every
    # trained model, but it was never written out (the ``data['networks']``
    # assignment was commented out), so that dead work has been removed.
    data = {'tree': self.bo.search_tree.get_dict()}
    with open(path, 'w') as fp:
        json.dump(data, fp)


def train(q, graph, train_data, test_data, trainer_args, metric, loss, verbose, path):
    """Train the neural architecture.

    Builds a model from ``graph``, trains it with ``ModelTrainer`` and reports the
    outcome both through ``q`` (when one is given) and as the return value.

    Args:
        q: Object with a ``put`` method used to hand the result back, or a falsy
            value to skip that step.
        graph: The architecture to train; must provide ``produce_model()`` and ``size()``.
        train_data: Passed to ``ModelTrainer`` as ``train_data``.
        test_data: Passed to ``ModelTrainer`` as ``test_data``.
        trainer_args: Keyword arguments forwarded to ``ModelTrainer.train_model``.
        metric: Passed to ``ModelTrainer`` as ``metric``.
        loss: Passed to ``ModelTrainer`` as ``loss_function``.
        verbose: Passed to ``ModelTrainer``; also enables the out-of-memory notice.
        path: Passed to ``ModelTrainer`` as ``path``.

    Returns:
        ``(metric_value, loss, graph)`` on success, or ``(None, None, None)`` when
        training was abandoned because of an out-of-memory ``RuntimeError``.

    Raises:
        RuntimeError: Any runtime error whose message does not contain
            ``'out of memory'`` is propagated unchanged.
    """
    try:
        model = graph.produce_model()
        loss, metric_value = ModelTrainer(model=model,
                                          path=path,
                                          train_data=train_data,
                                          test_data=test_data,
                                          metric=metric,
                                          loss_function=loss,
                                          verbose=verbose).train_model(**trainer_args)
        model.set_weight_to_graph()
        result = (metric_value, loss, model.graph)
    except RuntimeError as e:
        if 'out of memory' not in str(e):
            # Bare raise keeps the original traceback intact.
            raise
        if verbose:
            print('\nCurrent model size is too big. Discontinuing training this model to search for other models.')
        # Forbid architectures at least as large as the one that just failed.
        # NOTE(review): if this function runs in a separate process, this assignment
        # presumably only changes that process's copy of Constant - confirm that the
        # caller also lowers the limit when it receives the all-None result.
        Constant.MAX_MODEL_SIZE = graph.size() - 1
        result = (None, None, None)

    # Single reporting point for both the success and the out-of-memory outcome.
    if q:
        q.put(result)
    return result
10 changes: 5 additions & 5 deletions tests/common.py
Expand Up @@ -293,11 +293,6 @@ def close(self):
pass


class MockMemoryOutProcess(MockProcess):
    """A ``MockProcess`` whose ``start`` always fails with an out-of-memory error."""

    def start(self):
        """Raise the out-of-memory ``RuntimeError`` instead of starting anything."""
        raise RuntimeError('cuda: out of memory.')


def simple_transform(graph):
    """Widen ``graph`` in place and return a one-element list with a deep copy of it.

    Args:
        graph: Object providing ``to_wider_model``; it is modified by this call.

    Returns:
        A list containing a single deep copy of the widened ``graph``.
    """
    graph.to_wider_model(6, 64)
    return [deepcopy(graph)]
Expand All @@ -313,6 +308,11 @@ def mock_train(**kwargs):
return 1, 0


def mock_out_of_memory_train(**kwargs):
    """Stand-in for a training call that always fails with an out-of-memory error.

    Args:
        **kwargs: Accepted so the function can replace a call made with keyword
            arguments; the values are ignored.

    Raises:
        RuntimeError: Always, with the message ``'CUDA: out of memory.'``.
    """
    del kwargs  # intentionally unused
    raise RuntimeError('CUDA: out of memory.')


def mock_nvidia_smi_output(*arg, **kwargs):
return \
' Free : 1 MiB \n' \
Expand Down
4 changes: 2 additions & 2 deletions tests/image/test_image_supervised.py
Expand Up @@ -90,9 +90,9 @@ def test_timeout(_):


@patch('torch.multiprocessing.get_context', side_effect=MockProcess)
@patch('autokeras.bayesian.transform', side_effect=simple_transform)
# @patch('autokeras.bayesian.transform', side_effect=simple_transform)
@patch('autokeras.search.ModelTrainer.train_model', side_effect=mock_train)
def test_final_fit(_, _1, _2):
def test_final_fit(_, _2):
Constant.LIMIT_MEMORY = True
clean_dir(TEST_TEMP_DIR)
clf = ImageClassifier(path=TEST_TEMP_DIR, verbose=False)
Expand Down
10 changes: 10 additions & 0 deletions tests/nn/test_graph.py
@@ -1,3 +1,4 @@
from autokeras.net_transformer import transform
from autokeras.nn.generator import CnnGenerator, ResNetGenerator
from autokeras.nn.graph import *
from tests.common import get_conv_data, get_add_skip_model, get_conv_dense_model, get_pooling_model, \
Expand Down Expand Up @@ -204,6 +205,15 @@ def test_long_transform2():
model(torch.Tensor(np.random.random((10, 1, 28, 28))))


# def test_long_transform3():
# graph = CnnGenerator(10, (28, 28, 1)).generate()
# for i in range(200):
# graph = transform(graph)[3]
# print(graph.operation_history)
# model = graph.produce_model()
# model(torch.Tensor(np.random.random((10, 1, 28, 28))))
#

def test_long_transform4():
graph = ResNetGenerator(10, (28, 28, 1)).generate()
graph.to_concat_skip_model(57, 68)
Expand Down
49 changes: 12 additions & 37 deletions tests/test_search.py
Expand Up @@ -6,11 +6,7 @@
from autokeras.search import *
from autokeras.nn.generator import CnnGenerator, MlpGenerator, ResNetGenerator
from tests.common import clean_dir, MockProcess, get_classification_data_loaders, get_classification_data_loaders_mlp, \
simple_transform, MockMemoryOutProcess, TEST_TEMP_DIR, simple_transform_mlp


def mock_train(**_):
    """Stand-in for a training call: ignore all keyword arguments and return ``(1, 0)``."""
    return 1, 0
simple_transform, TEST_TEMP_DIR, simple_transform_mlp, mock_train, mock_out_of_memory_train


@patch('torch.multiprocessing.get_context', side_effect=MockProcess)
Expand Down Expand Up @@ -45,28 +41,6 @@ def test_bayesian_searcher_mlp(_, _1, _2):
assert len(generator.history) == 2


@patch('torch.multiprocessing.get_context', side_effect=MockProcess)
@patch('autokeras.bayesian.transform', side_effect=simple_transform)
@patch('autokeras.search.ModelTrainer.train_model', side_effect=mock_train)
def test_export_json(_, _1, _2):
    """Run three searches, export the search tree as JSON and check what was written."""
    train_data, test_data = get_classification_data_loaders()

    clean_dir(TEST_TEMP_DIR)
    generator = Searcher(3, (28, 28, 3), verbose=False, path=TEST_TEMP_DIR, metric=Accuracy,
                         loss=classification_loss, generators=[CnnGenerator])
    Constant.N_NEIGHBOURS = 1
    Constant.T_MIN = 0.8
    for _ in range(3):
        generator.search(train_data, test_data)
    file_path = os.path.join(TEST_TEMP_DIR, 'test.json')
    generator.export_json(file_path)
    import json
    # Read through a context manager so the handle is closed before the
    # directory is cleaned up below.
    with open(file_path, 'r') as fp:
        data = json.load(fp)
    assert len(data['tree']['children']) == 2
    clean_dir(TEST_TEMP_DIR)
    assert len(generator.history) == 3

@patch('torch.multiprocessing.get_context', side_effect=MockProcess)
@patch('autokeras.search.ModelTrainer.train_model', side_effect=mock_train)
def test_max_acq(_, _2):
Expand All @@ -87,17 +61,18 @@ def test_max_acq(_, _2):
clean_dir(TEST_TEMP_DIR)


@patch('torch.multiprocessing.get_context', side_effect=MockProcess)
@patch('autokeras.search.ModelTrainer.train_model', side_effect=mock_out_of_memory_train)
def test_out_of_memory(_, _2):
    """Check that no model is recorded when every training attempt runs out of memory.

    ``ModelTrainer.train_model`` is patched to raise an out-of-memory ``RuntimeError``
    on each call, so after three searches the history is expected to be empty.
    """
    train_data, test_data = get_classification_data_loaders()
    clean_dir(TEST_TEMP_DIR)
    Constant.N_NEIGHBOURS = 2
    Constant.SEARCH_MAX_ITER = 0
    Constant.T_MIN = 0.8
    Constant.BETA = 1
    searcher = Searcher(3, (28, 28, 3), verbose=True, path=TEST_TEMP_DIR, metric=Accuracy,
                        loss=classification_loss, generators=[CnnGenerator, ResNetGenerator])
    for _ in range(3):
        searcher.search(train_data, test_data)
    clean_dir(TEST_TEMP_DIR)
    assert len(searcher.history) == 0

0 comments on commit c9410e1

Please sign in to comment.