Skip to content

Commit

Permalink
Change text data to 2D data, change pre train file path (#329)
Browse files Browse the repository at this point in the history
* Change Image class document

* change text documentation

* change parameters to attributes

* change doc to google format

* change image supervised doc Attributes

* change text supervised doc with attributes

* fix download pre train doc

* fix some small issues

* fix the line break

* Add random string to the temp folder generator

* change preprocessed data to 2D and fix pretrain store path

* resolve utils conflict
  • Loading branch information
boyuangong authored and haifeng-jin committed Nov 16, 2018
1 parent 407687e commit 7114051
Show file tree
Hide file tree
Showing 10 changed files with 66 additions and 45 deletions.
1 change: 1 addition & 0 deletions autokeras/constant.py
Expand Up @@ -45,6 +45,7 @@ class Constant:
MAX_SEQUENCE_LENGTH = 400
MAX_NB_WORDS = 5000
EXTRACT_PATH = "glove/"
STORE_PATH = ''

# Download file name

Expand Down
4 changes: 2 additions & 2 deletions autokeras/image/image_supervised.py
Expand Up @@ -12,7 +12,7 @@
from autokeras.nn.metric import Accuracy, MSE
from autokeras.preprocessor import OneHotEncoder, ImageDataTransformer
from autokeras.supervised import Supervised, PortableClass
from autokeras.utils import has_file, pickle_from_file, pickle_to_file, temp_folder_generator, validate_xy, \
from autokeras.utils import has_file, pickle_from_file, pickle_to_file, rand_temp_folder_generator, validate_xy, \
read_csv_file, read_image, compute_image_resize_params, resize_image_data


Expand Down Expand Up @@ -99,7 +99,7 @@ def __init__(self, verbose=False, path=None, resume=False, searcher_args=None, a
searcher_args = {}

if path is None:
path = temp_folder_generator()
path = rand_temp_folder_generator()

if augment is None:
augment = Constant.DATA_AUGMENTATION
Expand Down
2 changes: 1 addition & 1 deletion autokeras/preprocessor.py
Expand Up @@ -156,7 +156,7 @@ def transform_test(self, data, targets=None, batch_size=None):
return DataLoader(dataset, batch_size=batch_size, shuffle=True)

def _transform(self, compose_list, data, targets):
    """Wrap ``data`` and ``targets`` in a dataset that applies ``compose_list``.

    Before the tensor is built, the last axis of ``data`` is moved to
    position 1 and the other axes keep their relative order: rank-3 input
    is transposed with ``(0, 2, 1)`` and rank-4 input with ``(0, 3, 1, 2)``.
    Computing the permutation from ``data.ndim`` keeps the method usable
    for both ranks instead of hard-coding one of them.

    Args:
        compose_list: Transforms handed to ``Compose``.
        data: Array exposing ``ndim`` and a ``transpose`` that accepts a
            full axis permutation (presumably a numpy array -- confirm
            against callers).
        targets: Passed through unchanged to ``MultiTransformDataset``.

    Returns:
        The ``MultiTransformDataset`` built from the tensor, ``targets``
        and the composed transforms.
    """
    # Axis order: first axis, last axis, then the remaining middle axes.
    axes = (0, data.ndim - 1) + tuple(range(1, data.ndim - 1))
    data = torch.Tensor(data.transpose(axes))
    data_transforms = Compose(compose_list)
    return MultiTransformDataset(data, targets, data_transforms)

Expand Down
15 changes: 11 additions & 4 deletions autokeras/text/text_preprocessor.py
Expand Up @@ -11,7 +11,7 @@
from keras_preprocessing.text import Tokenizer

from autokeras.constant import Constant
from autokeras.utils import download_file_with_extract
from autokeras.utils import download_file_with_extract, temp_path_generator, ensure_dir


def download_pre_train(file_path, extract_path):
Expand Down Expand Up @@ -166,21 +166,28 @@ def processing(path, word_index, input_length, x_train):
print("converting text to vector...")
x_train = model.predict(x_train)
del model
x_train = np.expand_dims(x_train, -1)

return x_train


def text_preprocess(x_train):
    """Clean raw text samples and run them through the preprocessing model.

    Each element of ``x_train`` is passed through ``clean_str``. The cleaned
    samples are tokenized by ``tokenlize_text`` (bounded by
    ``Constant.MAX_SEQUENCE_LENGTH`` and ``Constant.MAX_NB_WORDS``) and then
    handed to ``processing`` together with the store path.

    The store path is ``Constant.STORE_PATH`` when it is set; otherwise it
    falls back to ``temp_path_generator() + '_store'``. ``ensure_dir`` is
    called on the chosen path before it is used.

    Args:
        x_train: Iterable of raw text samples.

    Returns:
        Whatever ``processing`` returns for the tokenized input.
    """
    # Any falsy value ('' or None) means "no store path configured".
    path = Constant.STORE_PATH or (temp_path_generator() + '_store')
    ensure_dir(path)

    x_train = [clean_str(x) for x in x_train]
    x_train, word_index = tokenlize_text(max_seq_length=Constant.MAX_SEQUENCE_LENGTH,
                                         max_num_words=Constant.MAX_NB_WORDS,
                                         x_train=x_train)

    print("generating preprocessing model...")
    x_train = processing(path=path,
                         word_index=word_index,
                         input_length=Constant.MAX_SEQUENCE_LENGTH,
                         x_train=x_train)

    return x_train
14 changes: 7 additions & 7 deletions autokeras/text/text_supervised.py
Expand Up @@ -13,7 +13,7 @@
from autokeras.preprocessor import OneHotEncoder, TextDataTransformer
from autokeras.supervised import Supervised
from autokeras.text.text_preprocessor import text_preprocess
from autokeras.utils import pickle_to_file, validate_xy, temp_folder_generator, has_file, pickle_from_file
from autokeras.utils import pickle_to_file, validate_xy, rand_temp_folder_generator, has_file, pickle_from_file


class TextClassifier(Supervised):
Expand Down Expand Up @@ -46,7 +46,7 @@ def __init__(self, verbose=False, path=None, resume=False, searcher_args=None):
searcher_args = {}

if path is None:
path = temp_folder_generator()
path = rand_temp_folder_generator()

self.cnn = CnnModule(self.loss, self.metric, searcher_args, path, verbose)

Expand Down Expand Up @@ -74,7 +74,7 @@ def fit(self, x, y, x_test=None, y_test=None, batch_size=None, time_limit=None):
batch_size: int, define the batch size.
time_limit: The time limit for the search in seconds.
"""
x = text_preprocess(x, path=self.path)
x = text_preprocess(x)

x = np.array(x)
y = np.array(y)
Expand Down Expand Up @@ -106,7 +106,7 @@ def fit(self, x, y, x_test=None, y_test=None, batch_size=None, time_limit=None):

if time_limit is None:
time_limit = 24 * 60 * 60

print(x_train.shape)
self.cnn.fit(self.get_n_output_node(), x_train.shape, train_data, test_data, time_limit)

def final_fit(self, x_train=None, y_train=None, x_test=None, y_test=None, trainer_args=None, retrain=False):
Expand All @@ -129,8 +129,8 @@ def final_fit(self, x_train=None, y_train=None, x_test=None, y_test=None, traine
int(len(y_train) * 0.2)),
random_state=42)

x_train = text_preprocess(x_train, path=self.path)
x_test = text_preprocess(x_test, path=self.path)
x_train = text_preprocess(x_train)
x_test = text_preprocess(x_test)

y_train = self.transform_y(y_train)
y_test = self.transform_y(y_test)
Expand Down Expand Up @@ -163,7 +163,7 @@ def predict(self, x_test):
return self.inverse_transform_y(output)

def evaluate(self, x_test, y_test):
    """Return the score of ``self.metric`` for predictions on ``x_test``.

    ``x_test`` is first run through ``text_preprocess``; the result is passed
    to ``self.predict`` and the predictions are compared with ``y_test`` by
    ``self.metric().evaluate``.

    Args:
        x_test: Raw text samples to evaluate on.
        y_test: Expected targets for ``x_test``.

    Returns:
        The value returned by ``self.metric().evaluate(y_test, y_predict)``.
    """
    x_test = text_preprocess(x_test)
    y_predict = self.predict(x_test)
    return self.metric().evaluate(y_test, y_predict)
Expand Down
12 changes: 9 additions & 3 deletions autokeras/utils.py
Expand Up @@ -85,13 +85,19 @@ def get_device():
return device


def temp_path_generator():
    """Return the path ``<system temp dir>/autokeras``.

    Only the path string is computed (via ``tempfile.gettempdir`` and
    ``os.path.join``); nothing is created on disk by this function.
    """
    sys_temp = tempfile.gettempdir()
    path = os.path.join(sys_temp, 'autokeras')
    return path


def rand_temp_folder_generator():
    """Return a randomly suffixed temporary directory path.

    The path is the value of ``temp_path_generator()`` followed by ``_`` and
    six random characters drawn from uppercase ASCII letters and digits
    (for example ``/tmp/autokeras_A1B2C3``). ``ensure_dir`` is called on the
    path before it is returned (presumably creating the directory --
    confirm against ``ensure_dir``).
    """
    chars = string.ascii_uppercase + string.digits
    size = 6
    random_suffix = ''.join(random.choice(chars) for _ in range(size))
    sys_temp = temp_path_generator()
    path = sys_temp + '_' + random_suffix
    ensure_dir(path)
    return path

Expand Down
1 change: 1 addition & 0 deletions tests/common.py
Expand Up @@ -11,6 +11,7 @@
from autokeras.nn.layers import StubPooling2d
from autokeras.preprocessor import ImageDataTransformer

TEST_TEMP_KERAS_DIR = 'tests/resources/temp/autokeras'
TEST_TEMP_DIR = 'tests/resources/temp'


Expand Down
26 changes: 16 additions & 10 deletions tests/test_utils.py
Expand Up @@ -3,9 +3,10 @@
from unittest.mock import patch

from autokeras.constant import Constant
from autokeras.utils import temp_folder_generator, download_file, get_system, get_device, compute_image_resize_params, \
resize_image_data
from tests.common import clean_dir, TEST_TEMP_DIR, mock_nvidia_smi_output
from autokeras.utils import rand_temp_folder_generator, download_file, get_system, get_device, \
compute_image_resize_params, \
resize_image_data, temp_path_generator
from tests.common import clean_dir, TEST_TEMP_DIR, mock_nvidia_smi_output, TEST_TEMP_KERAS_DIR


# This method will be used by the mock to replace requests.get
Expand Down Expand Up @@ -34,11 +35,16 @@ def get(self, key):


@patch('tempfile.gettempdir', return_value=TEST_TEMP_DIR)
def test_temp_path_generator(_):
    """``temp_path_generator`` joins the system temp dir with ``autokeras``."""
    import os  # local import keeps the test self-contained

    path = temp_path_generator()
    # Build the expectation with os.path.join, as the code under test does,
    # so the comparison does not depend on the platform's path separator.
    assert path == os.path.join(TEST_TEMP_DIR, "autokeras")


@patch('autokeras.utils.temp_path_generator', return_value=TEST_TEMP_KERAS_DIR)
def test_rand_temp_folder_generator(_):
    """The generated folder path carries the ``autokeras_`` prefix."""
    path = rand_temp_folder_generator()
    try:
        assert "tests/resources/temp/autokeras_" in path
    finally:
        # Clean up the generated folder even when the assertion fails.
        clean_dir(path)


@patch('requests.get', side_effect=mocked_requests_get)
Expand All @@ -64,8 +70,8 @@ def test_compute_image_resize_params():
assert image.shape == (25, 25, 3)

# Case-2: Resize to max size for larger images.
data = numpy.array([numpy.random.randint(256, size=(int(numpy.sqrt(Constant.MAX_IMAGE_SIZE)+1),
int(numpy.sqrt(Constant.MAX_IMAGE_SIZE)+1),
data = numpy.array([numpy.random.randint(256, size=(int(numpy.sqrt(Constant.MAX_IMAGE_SIZE) + 1),
int(numpy.sqrt(Constant.MAX_IMAGE_SIZE) + 1),
3))])
resize_height, resize_width = compute_image_resize_params(data)
assert resize_height == int(numpy.sqrt(Constant.MAX_IMAGE_SIZE))
Expand Down
5 changes: 2 additions & 3 deletions tests/text/test_text_preprocessor.py
Expand Up @@ -38,8 +38,8 @@ def mock_load_pretrain(path, word_index):
@patch('autokeras.text.text_preprocessor.tokenlize_text', side_effect=mock_tokenlize_text)
@patch('autokeras.text.text_preprocessor.clean_str', side_effect=mock_clean_str)
def test_text_preprocess_class(_, _1, _2):
train_x = np.random.rand(100, 25, 25, 1)
train_x = text_preprocess(train_x, TEST_TEMP_DIR)
train_x = np.random.rand(100, 25, 25)
train_x = text_preprocess(train_x)


def test_clean_str():
Expand All @@ -60,7 +60,6 @@ def test_load_pretrain(_, _1):
def test_processing(_, _1):
train_x = np.full((1, 2), 1)
train_x = processing(TEST_TEMP_DIR, word_index, 2, train_x)
train_x = np.squeeze(train_x, axis=-1)
assert np.allclose(train_x[0][0], embedding_matrix[1])


Expand Down
31 changes: 16 additions & 15 deletions tests/text/test_text_supervised.py
Expand Up @@ -11,7 +11,7 @@ def mock_train(**kwargs):
return 1, 0


def mock_text_preprocess(x_train):
    """Stand-in for ``text_preprocess`` that returns ``x_train`` unchanged."""
    return x_train


Expand All @@ -25,7 +25,7 @@ def test_fit_predict(_, _1, _2):
Constant.T_MIN = 0.8
clean_dir(TEST_TEMP_DIR)
clf = TextClassifier(path=TEST_TEMP_DIR, verbose=True)
train_x = np.random.rand(100, 25, 25, 1)
train_x = np.random.rand(100, 25, 25)
train_y = np.random.randint(0, 5, 100)
clf.fit(train_x, train_y, )
results = clf.predict(train_x)
Expand All @@ -42,7 +42,7 @@ def test_timeout(_, _1):
Constant.DATA_AUGMENTATION = False
clean_dir(TEST_TEMP_DIR)
clf = TextClassifier(path=TEST_TEMP_DIR, verbose=False)
train_x = np.random.rand(100, 25, 25, 1)
train_x = np.random.rand(100, 25, 25)
train_y = np.random.randint(0, 5, 100)
with pytest.raises(TimeoutError):
clf.fit(train_x, train_y, time_limit=0)
Expand All @@ -62,9 +62,9 @@ def test_final_fit(_, _1, _2, _3):
Constant.SEARCH_MAX_ITER = 1
Constant.N_NEIGHBOURS = 1
Constant.T_MIN = 0.8
train_x = np.random.rand(100, 25, 25, 1)
train_x = np.random.rand(100, 25, 25)
train_y = np.random.randint(0, 5, 100)
test_x = np.random.rand(100, 25, 25, 1)
test_x = np.random.rand(100, 25, 25)
test_y = np.random.randint(0, 5, 100)
clf.fit(train_x, train_y)
clf.final_fit(train_x, train_y, test_x, test_y)
Expand All @@ -81,9 +81,9 @@ def test_save_continue(_, _1, _2):
Constant.MAX_MODEL_NUM = 1
Constant.SEARCH_MAX_ITER = 1
Constant.T_MIN = 0.8
train_x = np.random.rand(100, 25, 25, 1)
train_x = np.random.rand(100, 25, 25)
train_y = np.random.randint(0, 5, 100)
test_x = np.random.rand(100, 25, 25, 1)
test_x = np.random.rand(100, 25, 25)
clean_dir(TEST_TEMP_DIR)
clf = TextClassifier(path=TEST_TEMP_DIR, verbose=False, resume=False)
clf.n_epochs = 100
Expand All @@ -106,7 +106,7 @@ def test_save_continue(_, _1, _2):
clean_dir(TEST_TEMP_DIR)


@patch('autokeras.text.text_supervised.temp_folder_generator', return_value=TEST_TEMP_DIR)
@patch('autokeras.text.text_supervised.rand_temp_folder_generator', return_value=TEST_TEMP_DIR)
def test_init_image_classifier_with_none_path(_):
clf = TextClassifier()
assert clf.path == TEST_TEMP_DIR
Expand All @@ -120,7 +120,7 @@ def test_evaluate(_, _1, _2):
Constant.MAX_MODEL_NUM = 1
Constant.SEARCH_MAX_ITER = 1
Constant.T_MIN = 0.8
train_x = np.random.rand(100, 25, 25, 1)
train_x = np.random.rand(100, 25, 25)
train_y = np.random.randint(0, 5, 100)
clean_dir(TEST_TEMP_DIR)
clf = TextClassifier(path=TEST_TEMP_DIR, verbose=False, resume=False)
Expand All @@ -131,7 +131,7 @@ def test_evaluate(_, _1, _2):
assert score <= 1.0


@patch('torch.multiprocessing.Pool', new=MockProcess)
@patch('torch.multiprocessing.get_context', new=MockProcess)
@patch('autokeras.search.ModelTrainer.train_model', side_effect=mock_train)
@patch('autokeras.text.text_supervised.text_preprocess', side_effect=mock_text_preprocess)
def test_fit_predict_regression(_, _1):
Expand All @@ -140,14 +140,15 @@ def test_fit_predict_regression(_, _1):
Constant.SEARCH_MAX_ITER = 1
Constant.T_MIN = 0.8
Constant.DATA_AUGMENTATION = False
path = 'tests/resources/temp'
for f in os.listdir(path):
print(f)
path = TEST_TEMP_DIR
print(os.getcwd())
# for f in os.listdir(path):
# print(f)
clean_dir(path)
clf = TextRegressor(path=path, verbose=False)
train_x = np.random.rand(100, 25, 25, 1)
train_x = np.random.rand(100, 25, 25)
train_y = np.random.randint(0, 5, 100)
clf.fit(train_x, train_y)
results = clf.predict(train_x)
assert len(results) == len(train_x)
clean_dir(path)
clean_dir(path)

0 comments on commit 7114051

Please sign in to comment.