Change text data to 2D data, change pre train file path (#329)

* Change Image class document
* change text documentation
* change parameters to attributes
* change doc to google format
* change image supervised doc Attributes
* change text supervised doc with attributes
* fix download pre train doc
* fix some small issues
* fix the line break
* Add random string to the temp folder generator
* change preprocessed data to 2D and fix pretrain store path
* resolve utils conflict
keras-team · Nov 16, 2018 · 7114051 · 7114051
1 parent 407687e
commit 7114051
Show file tree

Hide file tree

Showing 10 changed files with 66 additions and 45 deletions.
diff --git a/autokeras/constant.py b/autokeras/constant.py
@@ -45,6 +45,7 @@ class Constant:
     MAX_SEQUENCE_LENGTH = 400
     MAX_NB_WORDS = 5000
     EXTRACT_PATH = "glove/"
+    STORE_PATH = ''
 
     # Download file name
 

diff --git a/autokeras/image/image_supervised.py b/autokeras/image/image_supervised.py
@@ -12,7 +12,7 @@
 from autokeras.nn.metric import Accuracy, MSE
 from autokeras.preprocessor import OneHotEncoder, ImageDataTransformer
 from autokeras.supervised import Supervised, PortableClass
-from autokeras.utils import has_file, pickle_from_file, pickle_to_file, temp_folder_generator, validate_xy, \
+from autokeras.utils import has_file, pickle_from_file, pickle_to_file, rand_temp_folder_generator, validate_xy, \
     read_csv_file, read_image, compute_image_resize_params, resize_image_data
 
 
@@ -99,7 +99,7 @@ def __init__(self, verbose=False, path=None, resume=False, searcher_args=None, a
             searcher_args = {}
 
         if path is None:
-            path = temp_folder_generator()
+            path = rand_temp_folder_generator()
 
         if augment is None:
             augment = Constant.DATA_AUGMENTATION

diff --git a/autokeras/preprocessor.py b/autokeras/preprocessor.py
@@ -156,7 +156,7 @@ def transform_test(self, data, targets=None, batch_size=None):
         return DataLoader(dataset, batch_size=batch_size, shuffle=True)
 
     def _transform(self, compose_list, data, targets):
-        data = torch.Tensor(data.transpose(0, 3, 1, 2))
+        data = torch.Tensor(data.transpose(0, 2, 1))
         data_transforms = Compose(compose_list)
         return MultiTransformDataset(data, targets, data_transforms)
 

diff --git a/autokeras/text/text_preprocessor.py b/autokeras/text/text_preprocessor.py
@@ -11,7 +11,7 @@
 from keras_preprocessing.text import Tokenizer
 
 from autokeras.constant import Constant
-from autokeras.utils import download_file_with_extract
+from autokeras.utils import download_file_with_extract, temp_path_generator, ensure_dir
 
 
 def download_pre_train(file_path, extract_path):
@@ -166,21 +166,28 @@ def processing(path, word_index, input_length, x_train):
         print("converting text to vector...")
         x_train = model.predict(x_train)
         del model
-    x_train = np.expand_dims(x_train, -1)
+
     return x_train
 
 
-def text_preprocess(x_train, path):
+def text_preprocess(x_train):
     """This is the text preprocess main method.
 
     It takes an raw string, clean it and processing it into tokenlized numpy array.
     """
+    if Constant.STORE_PATH == '':
+        temp_path = temp_path_generator()
+        path = temp_path + '_store'
+    else:
+        path = Constant.STORE_PATH
+
+    ensure_dir(path)
+
     x_train = [clean_str(x) for x in x_train]
     x_train, word_index = tokenlize_text(max_seq_length=Constant.MAX_SEQUENCE_LENGTH,
                                          max_num_words=Constant.MAX_NB_WORDS,
                                          x_train=x_train)
 
     print("generating preprocessing model...")
     x_train = processing(path=path, word_index=word_index, input_length=Constant.MAX_SEQUENCE_LENGTH, x_train=x_train)
-
     return x_train
diff --git a/autokeras/text/text_supervised.py b/autokeras/text/text_supervised.py
@@ -13,7 +13,7 @@
 from autokeras.preprocessor import OneHotEncoder, TextDataTransformer
 from autokeras.supervised import Supervised
 from autokeras.text.text_preprocessor import text_preprocess
-from autokeras.utils import pickle_to_file, validate_xy, temp_folder_generator, has_file, pickle_from_file
+from autokeras.utils import pickle_to_file, validate_xy, rand_temp_folder_generator, has_file, pickle_from_file
 
 
 class TextClassifier(Supervised):
@@ -46,7 +46,7 @@ def __init__(self, verbose=False, path=None, resume=False, searcher_args=None):
             searcher_args = {}
 
         if path is None:
-            path = temp_folder_generator()
+            path = rand_temp_folder_generator()
 
         self.cnn = CnnModule(self.loss, self.metric, searcher_args, path, verbose)
 
@@ -74,7 +74,7 @@ def fit(self, x, y, x_test=None, y_test=None, batch_size=None, time_limit=None):
             batch_size: int, define the batch size.
             time_limit: The time limit for the search in seconds.
         """
-        x = text_preprocess(x, path=self.path)
+        x = text_preprocess(x)
 
         x = np.array(x)
         y = np.array(y)
@@ -106,7 +106,7 @@ def fit(self, x, y, x_test=None, y_test=None, batch_size=None, time_limit=None):
 
         if time_limit is None:
             time_limit = 24 * 60 * 60
-
+        print(x_train.shape)
         self.cnn.fit(self.get_n_output_node(), x_train.shape, train_data, test_data, time_limit)
 
     def final_fit(self, x_train=None, y_train=None, x_test=None, y_test=None, trainer_args=None, retrain=False):
@@ -129,8 +129,8 @@ def final_fit(self, x_train=None, y_train=None, x_test=None, y_test=None, traine
                                                                               int(len(y_train) * 0.2)),
                                                                 random_state=42)
 
-        x_train = text_preprocess(x_train, path=self.path)
-        x_test = text_preprocess(x_test, path=self.path)
+        x_train = text_preprocess(x_train)
+        x_test = text_preprocess(x_test)
 
         y_train = self.transform_y(y_train)
         y_test = self.transform_y(y_test)
@@ -163,7 +163,7 @@ def predict(self, x_test):
         return self.inverse_transform_y(output)
 
     def evaluate(self, x_test, y_test):
-        x_test = text_preprocess(x_test, path=self.path)
+        x_test = text_preprocess(x_test)
         """Return the accuracy score between predict value and `y_test`."""
         y_predict = self.predict(x_test)
         return self.metric().evaluate(y_test, y_predict)

diff --git a/autokeras/utils.py b/autokeras/utils.py
@@ -85,13 +85,19 @@ def get_device():
     return device
 
 
-def temp_folder_generator():
+def temp_path_generator():
+    sys_temp = tempfile.gettempdir()
+    path = os.path.join(sys_temp, 'autokeras')
+    return path
+
+
+def rand_temp_folder_generator():
     """Create and return a temporary directory with the path name '/temp_dir_name/autokeras' (E:g:- /tmp/autokeras)."""
     chars = string.ascii_uppercase + string.digits
     size = 6
-    sys_temp = tempfile.gettempdir()
     random_suffix = ''.join(random.choice(chars) for _ in range(size))
-    path = os.path.join(sys_temp, 'autokeras_' + random_suffix)
+    sys_temp = temp_path_generator()
+    path = sys_temp + '_' + random_suffix
     ensure_dir(path)
     return path
 

diff --git a/tests/common.py b/tests/common.py
@@ -11,6 +11,7 @@
 from autokeras.nn.layers import StubPooling2d
 from autokeras.preprocessor import ImageDataTransformer
 
+TEST_TEMP_KERAS_DIR =  'tests/resources/temp/autokeras'
 TEST_TEMP_DIR = 'tests/resources/temp'
 
 

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -3,9 +3,10 @@
 from unittest.mock import patch
 
 from autokeras.constant import Constant
-from autokeras.utils import temp_folder_generator, download_file, get_system, get_device, compute_image_resize_params, \
-    resize_image_data
-from tests.common import clean_dir, TEST_TEMP_DIR, mock_nvidia_smi_output
+from autokeras.utils import rand_temp_folder_generator, download_file, get_system, get_device, \
+    compute_image_resize_params, \
+    resize_image_data, temp_path_generator
+from tests.common import clean_dir, TEST_TEMP_DIR, mock_nvidia_smi_output, TEST_TEMP_KERAS_DIR
 
 
 # This method will be used by the mock to replace requests.get
@@ -34,11 +35,16 @@ def get(self, key):
 
 
 @patch('tempfile.gettempdir', return_value=TEST_TEMP_DIR)
-def test_temp_folder_generator(_):
-    clean_dir(TEST_TEMP_DIR)
-    path = temp_folder_generator()
-    assert path.find("tests/resources/temp/autokeras") != -1
-    clean_dir(TEST_TEMP_DIR)
+def test_temp_path_generator(_):
+    path = temp_path_generator()
+    assert path == TEST_TEMP_DIR + "/autokeras"
+
+
+@patch('autokeras.utils.temp_path_generator', return_value=TEST_TEMP_KERAS_DIR)
+def test_rand_temp_folder_generator(_):
+    path = rand_temp_folder_generator()
+    assert path.find("tests/resources/temp/autokeras_") != -1
+    clean_dir(path)
 
 
 @patch('requests.get', side_effect=mocked_requests_get)
@@ -64,8 +70,8 @@ def test_compute_image_resize_params():
         assert image.shape == (25, 25, 3)
 
     # Case-2: Resize to max size for larger images.
-    data = numpy.array([numpy.random.randint(256, size=(int(numpy.sqrt(Constant.MAX_IMAGE_SIZE)+1),
-                                                        int(numpy.sqrt(Constant.MAX_IMAGE_SIZE)+1),
+    data = numpy.array([numpy.random.randint(256, size=(int(numpy.sqrt(Constant.MAX_IMAGE_SIZE) + 1),
+                                                        int(numpy.sqrt(Constant.MAX_IMAGE_SIZE) + 1),
                                                         3))])
     resize_height, resize_width = compute_image_resize_params(data)
     assert resize_height == int(numpy.sqrt(Constant.MAX_IMAGE_SIZE))

diff --git a/tests/text/test_text_preprocessor.py b/tests/text/test_text_preprocessor.py
@@ -38,8 +38,8 @@ def mock_load_pretrain(path, word_index):
 @patch('autokeras.text.text_preprocessor.tokenlize_text', side_effect=mock_tokenlize_text)
 @patch('autokeras.text.text_preprocessor.clean_str', side_effect=mock_clean_str)
 def test_text_preprocess_class(_, _1, _2):
-    train_x = np.random.rand(100, 25, 25, 1)
-    train_x = text_preprocess(train_x, TEST_TEMP_DIR)
+    train_x = np.random.rand(100, 25, 25)
+    train_x = text_preprocess(train_x)
 
 
 def test_clean_str():
@@ -60,7 +60,6 @@ def test_load_pretrain(_, _1):
 def test_processing(_, _1):
     train_x = np.full((1, 2), 1)
     train_x = processing(TEST_TEMP_DIR, word_index, 2, train_x)
-    train_x = np.squeeze(train_x, axis=-1)
     assert np.allclose(train_x[0][0], embedding_matrix[1])
 
 

diff --git a/tests/text/test_text_supervised.py b/tests/text/test_text_supervised.py
@@ -11,7 +11,7 @@ def mock_train(**kwargs):
     return 1, 0
 
 
-def mock_text_preprocess(x_train, path="dummy_path"):
+def mock_text_preprocess(x_train):
     return x_train
 
 
@@ -25,7 +25,7 @@ def test_fit_predict(_, _1, _2):
     Constant.T_MIN = 0.8
     clean_dir(TEST_TEMP_DIR)
     clf = TextClassifier(path=TEST_TEMP_DIR, verbose=True)
-    train_x = np.random.rand(100, 25, 25, 1)
+    train_x = np.random.rand(100, 25, 25)
     train_y = np.random.randint(0, 5, 100)
     clf.fit(train_x, train_y, )
     results = clf.predict(train_x)
@@ -42,7 +42,7 @@ def test_timeout(_, _1):
     Constant.DATA_AUGMENTATION = False
     clean_dir(TEST_TEMP_DIR)
     clf = TextClassifier(path=TEST_TEMP_DIR, verbose=False)
-    train_x = np.random.rand(100, 25, 25, 1)
+    train_x = np.random.rand(100, 25, 25)
     train_y = np.random.randint(0, 5, 100)
     with pytest.raises(TimeoutError):
         clf.fit(train_x, train_y, time_limit=0)
@@ -62,9 +62,9 @@ def test_final_fit(_, _1, _2, _3):
     Constant.SEARCH_MAX_ITER = 1
     Constant.N_NEIGHBOURS = 1
     Constant.T_MIN = 0.8
-    train_x = np.random.rand(100, 25, 25, 1)
+    train_x = np.random.rand(100, 25, 25)
     train_y = np.random.randint(0, 5, 100)
-    test_x = np.random.rand(100, 25, 25, 1)
+    test_x = np.random.rand(100, 25, 25)
     test_y = np.random.randint(0, 5, 100)
     clf.fit(train_x, train_y)
     clf.final_fit(train_x, train_y, test_x, test_y)
@@ -81,9 +81,9 @@ def test_save_continue(_, _1, _2):
     Constant.MAX_MODEL_NUM = 1
     Constant.SEARCH_MAX_ITER = 1
     Constant.T_MIN = 0.8
-    train_x = np.random.rand(100, 25, 25, 1)
+    train_x = np.random.rand(100, 25, 25)
     train_y = np.random.randint(0, 5, 100)
-    test_x = np.random.rand(100, 25, 25, 1)
+    test_x = np.random.rand(100, 25, 25)
     clean_dir(TEST_TEMP_DIR)
     clf = TextClassifier(path=TEST_TEMP_DIR, verbose=False, resume=False)
     clf.n_epochs = 100
@@ -106,7 +106,7 @@ def test_save_continue(_, _1, _2):
     clean_dir(TEST_TEMP_DIR)
 
 
-@patch('autokeras.text.text_supervised.temp_folder_generator', return_value=TEST_TEMP_DIR)
+@patch('autokeras.text.text_supervised.rand_temp_folder_generator', return_value=TEST_TEMP_DIR)
 def test_init_image_classifier_with_none_path(_):
     clf = TextClassifier()
     assert clf.path == TEST_TEMP_DIR
@@ -120,7 +120,7 @@ def test_evaluate(_, _1, _2):
     Constant.MAX_MODEL_NUM = 1
     Constant.SEARCH_MAX_ITER = 1
     Constant.T_MIN = 0.8
-    train_x = np.random.rand(100, 25, 25, 1)
+    train_x = np.random.rand(100, 25, 25)
     train_y = np.random.randint(0, 5, 100)
     clean_dir(TEST_TEMP_DIR)
     clf = TextClassifier(path=TEST_TEMP_DIR, verbose=False, resume=False)
@@ -131,7 +131,7 @@ def test_evaluate(_, _1, _2):
     assert score <= 1.0
 
 
-@patch('torch.multiprocessing.Pool', new=MockProcess)
+@patch('torch.multiprocessing.get_context', new=MockProcess)
 @patch('autokeras.search.ModelTrainer.train_model', side_effect=mock_train)
 @patch('autokeras.text.text_supervised.text_preprocess', side_effect=mock_text_preprocess)
 def test_fit_predict_regression(_, _1):
@@ -140,14 +140,15 @@ def test_fit_predict_regression(_, _1):
     Constant.SEARCH_MAX_ITER = 1
     Constant.T_MIN = 0.8
     Constant.DATA_AUGMENTATION = False
-    path = 'tests/resources/temp'
-    for f in os.listdir(path):
-        print(f)
+    path = TEST_TEMP_DIR
+    print(os.getcwd())
+    # for f in os.listdir(path):
+    #     print(f)
     clean_dir(path)
     clf = TextRegressor(path=path, verbose=False)
-    train_x = np.random.rand(100, 25, 25, 1)
+    train_x = np.random.rand(100, 25, 25)
     train_y = np.random.randint(0, 5, 100)
     clf.fit(train_x, train_y)
     results = clf.predict(train_x)
     assert len(results) == len(train_x)
-    clean_dir(path)
+    clean_dir(path)