out of memory caught in subprocess

keras-team · Nov 29, 2018 · c9410e1 · c9410e1
1 parent 754f64d
commit c9410e1
Show file tree

Hide file tree

Showing 7 changed files with 63 additions and 90 deletions.
diff --git a/autokeras/constant.py b/autokeras/constant.py
@@ -14,7 +14,7 @@ class Constant:
     N_NEIGHBOURS = 8
     MAX_MODEL_SIZE = (1 << 25)
     MAX_LAYER_WIDTH = 4096
-    MAX_LAYERS = 500
+    MAX_LAYERS = 200
 
     # Model Defaults
 

diff --git a/autokeras/nn/graph.py b/autokeras/nn/graph.py
@@ -256,7 +256,7 @@ def _get_pooling_layers(self, start_node_id, end_node_id):
         """Given two node IDs, return all the pooling layers between them."""
         layer_list = []
         node_list = [start_node_id]
-        self._depth_first_search(end_node_id, layer_list, node_list)
+        assert self._depth_first_search(end_node_id, layer_list, node_list)
         ret = []
         for layer_id in layer_list:
             layer = self.layer_list[layer_id]
@@ -271,6 +271,7 @@ def _depth_first_search(self, target_id, layer_id_list, node_list):
 
         A recursive function to search all the layers and nodes between the node in the node_list
             and the node with target_id."""
+        assert len(node_list) <= self.n_nodes
         u = node_list[-1]
         if u == target_id:
             return True
@@ -632,10 +633,13 @@ def get_main_chain(self):
         for i in range(self.n_nodes):
             if distance[i] > distance[temp_id]:
                 temp_id = i
-        ret = [temp_id]
-        while pre_node[temp_id] != temp_id:
-            temp_id = pre_node[temp_id]
+        ret = []
+        for i in range(self.n_nodes + 5):
             ret.append(temp_id)
+            if pre_node[temp_id] == temp_id:
+                break
+            temp_id = pre_node[temp_id]
+        assert temp_id == pre_node[temp_id]
         ret.reverse()
         return ret
 

diff --git a/autokeras/search.py b/autokeras/search.py
@@ -36,7 +36,6 @@ class Searcher:
         training_queue: A list of the generated architectures to be trained.
         x_queue: A list of trained architectures not updated to the gpr.
         y_queue: A list of trained architecture performances not updated to the gpr.
-        beta: A float. The beta in the UCB acquisition function.
         t_min: A float. The minimum temperature during simulated annealing.
         bo: An instance of BayesianOptimizer.
     """
@@ -212,20 +211,12 @@ def search(self, train_data, test_data, timeout=60 * 60 * 24):
             if self.verbose and searched:
                 verbose_print(generated_other_info, generated_graph)
 
-            self.add_model(metric_value, loss, graph, model_id)
-            self.update(other_info, graph, metric_value, model_id)
-
-            self.export_json(os.path.join(self.path, 'history.json'))
+            if metric_value is not None:
+                self.add_model(metric_value, loss, graph, model_id)
+                self.update(other_info, graph, metric_value, model_id)
 
         except (TimeoutError, queue.Empty) as e:
             raise TimeoutError from e
-        except RuntimeError as e:
-            if not re.search('out of memory', str(e)):
-                raise e
-            if self.verbose:
-                print('\nCurrent model size is too big. Discontinuing training this model to search for other models.')
-            Constant.MAX_MODEL_SIZE = graph.size() - 1
-            return
         finally:
             # terminate and join the subprocess to prevent any resource leak
             p.terminate()
@@ -265,35 +256,28 @@ def generate(self, remaining_time, multiprocessing_queue):
 
         return new_father_id, generated_graph
 
-    def export_json(self, path):
-        """Export a json file of the search process."""
-        data = dict()
-
-        networks = []
-        for model_id in range(self.model_count - len(self.training_queue)):
-            networks.append(self.load_model_by_id(model_id).extract_descriptor().to_json())
-
-        tree = self.bo.search_tree.get_dict()
-
-        # Saving the data to file.
-        # data['networks'] = networks
-        data['tree'] = tree
-        import json
-        with open(path, 'w') as fp:
-            json.dump(data, fp)
-
 
 def train(q, graph, train_data, test_data, trainer_args, metric, loss, verbose, path):
     """Train the neural architecture."""
-    model = graph.produce_model()
-    loss, metric_value = ModelTrainer(model=model,
-                                      path=path,
-                                      train_data=train_data,
-                                      test_data=test_data,
-                                      metric=metric,
-                                      loss_function=loss,
-                                      verbose=verbose).train_model(**trainer_args)
-    model.set_weight_to_graph()
-    if q:
-        q.put((metric_value, loss, model.graph))
-    return metric_value, loss, model.graph
+    try:
+        model = graph.produce_model()
+        loss, metric_value = ModelTrainer(model=model,
+                                          path=path,
+                                          train_data=train_data,
+                                          test_data=test_data,
+                                          metric=metric,
+                                          loss_function=loss,
+                                          verbose=verbose).train_model(**trainer_args)
+        model.set_weight_to_graph()
+        if q:
+            q.put((metric_value, loss, model.graph))
+        return metric_value, loss, model.graph
+    except RuntimeError as e:
+        if not re.search('out of memory', str(e)):
+            raise e
+        if verbose:
+            print('\nCurrent model size is too big. Discontinuing training this model to search for other models.')
+        Constant.MAX_MODEL_SIZE = graph.size() - 1
+        if q:
+            q.put((None, None, None))
+        return None, None, None
diff --git a/tests/common.py b/tests/common.py
@@ -293,11 +293,6 @@ def close(self):
         pass
 
 
-class MockMemoryOutProcess(MockProcess):
-    def start(self):
-        raise RuntimeError('cuda: out of memory.')
-
-
 def simple_transform(graph):
     graph.to_wider_model(6, 64)
     return [deepcopy(graph)]
@@ -313,6 +308,11 @@ def mock_train(**kwargs):
     return 1, 0
 
 
+def mock_out_of_memory_train(**kwargs):
+    str(kwargs)
+    raise RuntimeError('CUDA: out of memory.')
+
+
 def mock_nvidia_smi_output(*arg, **kwargs):
     return \
         '    Free                        : 1 MiB \n' \

diff --git a/tests/image/test_image_supervised.py b/tests/image/test_image_supervised.py
@@ -90,9 +90,9 @@ def test_timeout(_):
 
 
 @patch('torch.multiprocessing.get_context', side_effect=MockProcess)
-@patch('autokeras.bayesian.transform', side_effect=simple_transform)
+# @patch('autokeras.bayesian.transform', side_effect=simple_transform)
 @patch('autokeras.search.ModelTrainer.train_model', side_effect=mock_train)
-def test_final_fit(_, _1, _2):
+def test_final_fit(_, _2):
     Constant.LIMIT_MEMORY = True
     clean_dir(TEST_TEMP_DIR)
     clf = ImageClassifier(path=TEST_TEMP_DIR, verbose=False)

diff --git a/tests/nn/test_graph.py b/tests/nn/test_graph.py
@@ -1,3 +1,4 @@
+from autokeras.net_transformer import transform
 from autokeras.nn.generator import CnnGenerator, ResNetGenerator
 from autokeras.nn.graph import *
 from tests.common import get_conv_data, get_add_skip_model, get_conv_dense_model, get_pooling_model, \
@@ -204,6 +205,15 @@ def test_long_transform2():
     model(torch.Tensor(np.random.random((10, 1, 28, 28))))
 
 
+# def test_long_transform3():
+#     graph = CnnGenerator(10, (28, 28, 1)).generate()
+#     for i in range(200):
+#         graph = transform(graph)[3]
+#     print(graph.operation_history)
+#     model = graph.produce_model()
+#     model(torch.Tensor(np.random.random((10, 1, 28, 28))))
+#
+
 def test_long_transform4():
     graph = ResNetGenerator(10, (28, 28, 1)).generate()
     graph.to_concat_skip_model(57, 68)

diff --git a/tests/test_search.py b/tests/test_search.py
@@ -6,11 +6,7 @@
 from autokeras.search import *
 from autokeras.nn.generator import CnnGenerator, MlpGenerator, ResNetGenerator
 from tests.common import clean_dir, MockProcess, get_classification_data_loaders, get_classification_data_loaders_mlp, \
-    simple_transform, MockMemoryOutProcess, TEST_TEMP_DIR, simple_transform_mlp
-
-
-def mock_train(**_):
-    return 1, 0
+    simple_transform, TEST_TEMP_DIR, simple_transform_mlp, mock_train, mock_out_of_memory_train
 
 
 @patch('torch.multiprocessing.get_context', side_effect=MockProcess)
@@ -45,28 +41,6 @@ def test_bayesian_searcher_mlp(_, _1, _2):
     assert len(generator.history) == 2
 
 
-@patch('torch.multiprocessing.get_context', side_effect=MockProcess)
-@patch('autokeras.bayesian.transform', side_effect=simple_transform)
-@patch('autokeras.search.ModelTrainer.train_model', side_effect=mock_train)
-def test_export_json(_, _1, _2):
-    train_data, test_data = get_classification_data_loaders()
-
-    clean_dir(TEST_TEMP_DIR)
-    generator = Searcher(3, (28, 28, 3), verbose=False, path=TEST_TEMP_DIR, metric=Accuracy,
-                         loss=classification_loss, generators=[CnnGenerator])
-    Constant.N_NEIGHBOURS = 1
-    Constant.T_MIN = 0.8
-    for _ in range(3):
-        generator.search(train_data, test_data)
-    file_path = os.path.join(TEST_TEMP_DIR, 'test.json')
-    generator.export_json(file_path)
-    import json
-    data = json.load(open(file_path, 'r'))
-    assert len(data['tree']['children']) == 2
-    clean_dir(TEST_TEMP_DIR)
-    assert len(generator.history) == 3
-
-
 @patch('torch.multiprocessing.get_context', side_effect=MockProcess)
 @patch('autokeras.search.ModelTrainer.train_model', side_effect=mock_train)
 def test_max_acq(_, _2):
@@ -87,17 +61,18 @@ def test_max_acq(_, _2):
     clean_dir(TEST_TEMP_DIR)
 
 
-@patch('torch.multiprocessing.get_context', side_effect=MockMemoryOutProcess)
-@patch('autokeras.bayesian.transform', side_effect=simple_transform)
-@patch('autokeras.search.ModelTrainer.train_model', side_effect=mock_train)
-def test_out_of_memory(_, _1, _2):
+@patch('torch.multiprocessing.get_context', side_effect=MockProcess)
+@patch('autokeras.search.ModelTrainer.train_model', side_effect=mock_out_of_memory_train)
+def test_out_of_memory(_, _2):
     train_data, test_data = get_classification_data_loaders()
     clean_dir(TEST_TEMP_DIR)
-    searcher = Searcher(3, (28, 28, 3), verbose=False, path=TEST_TEMP_DIR, metric=Accuracy,
-                        loss=classification_loss, generators=[CnnGenerator])
-    Constant.N_NEIGHBOURS = 1
+    Constant.N_NEIGHBOURS = 2
+    Constant.SEARCH_MAX_ITER = 0
     Constant.T_MIN = 0.8
-    for _ in range(4):
-        searcher.search(train_data, test_data)
+    Constant.BETA = 1
+    generator = Searcher(3, (28, 28, 3), verbose=True, path=TEST_TEMP_DIR, metric=Accuracy,
+                         loss=classification_loss, generators=[CnnGenerator, ResNetGenerator])
+    for _ in range(3):
+        generator.search(train_data, test_data)
     clean_dir(TEST_TEMP_DIR)
-    assert len(searcher.history) == 0
+    assert len(generator.history) == 0