-
Notifications
You must be signed in to change notification settings - Fork 1.4k
/
text_supervised.py
190 lines (148 loc) · 6.93 KB
/
text_supervised.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import os
import pickle
from functools import reduce
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from autokeras.net_module import CnnModule
from autokeras.constant import Constant
from autokeras.nn.loss_function import classification_loss, regression_loss
from autokeras.nn.metric import Accuracy, MSE
from autokeras.preprocessor import OneHotEncoder, TextDataTransformer
from autokeras.supervised import Supervised
from autokeras.text.text_preprocessor import text_preprocess
from autokeras.utils import pickle_to_file, validate_xy, temp_folder_generator, has_file, pickle_from_file
class TextClassifier(Supervised):
    """A text classifier driven by neural architecture search.

    Attributes:
        cnn: A CnnModule instance that performs the architecture search and training.
        path: A string path to the directory holding intermediate/search results.
        y_encoder: A OneHotEncoder for the target labels (None until `fit` is called).
        data_transformer: A TextDataTransformer wrapping data into DataLoaders
            (None until `fit` is called).
        verbose: A boolean of whether the search process is printed to stdout.
    """

    def __init__(self, verbose=False, path=None, resume=False, searcher_args=None):
        """Initialize the classifier, optionally resuming a saved search.

        Args:
            verbose: A boolean of whether the search process is printed to stdout.
            path: A string path to a directory for intermediate results.
                A temporary directory is generated when None.
            resume: A boolean. When True and `path` contains a previously saved
                classifier, its state is reloaded; otherwise a fresh search starts.
            searcher_args: A dictionary of parameters passed to the searcher.
        """
        super().__init__(verbose)
        if searcher_args is None:
            searcher_args = {}
        if path is None:
            path = temp_folder_generator()
        self.cnn = CnnModule(self.loss, self.metric, searcher_args, path, verbose)
        self.path = path
        if has_file(os.path.join(self.path, 'text_classifier')) and resume:
            # Resume: adopt the saved classifier's entire state.
            classifier = pickle_from_file(os.path.join(self.path, 'text_classifier'))
            self.__dict__ = classifier.__dict__
        else:
            self.y_encoder = None
            self.data_transformer = None
            self.verbose = verbose

    def fit(self, x, y, x_test=None, y_test=None, batch_size=None, time_limit=None):
        """Find the best neural architecture and train it.

        Based on the given dataset, the function will find the best neural
        architecture for it. The dataset is in numpy.ndarray format.
        So the training data should be passed through `x`, `y`.

        Args:
            x: A numpy.ndarray instance containing the training data.
            y: A numpy.ndarray instance containing the labels of the training data.
            x_test: A numpy.ndarray instance containing validation data, or None
                to split it off from `x` automatically.
            y_test: A numpy.ndarray instance containing validation labels, or None.
            batch_size: An integer batch size; defaults to Constant.MAX_BATCH_SIZE.
            time_limit: The time limit for the search in seconds; defaults to 24h.
        """
        x = text_preprocess(x, path=self.path)
        x = np.array(x)
        y = np.array(y)
        validate_xy(x, y)
        y = self.transform_y(y)
        if batch_size is None:
            batch_size = Constant.MAX_BATCH_SIZE
        # Divide training data into training and validation data when the
        # caller did not supply a validation set.
        if x_test is None or y_test is None:
            x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                                test_size=min(Constant.VALIDATION_SET_SIZE,
                                                                              int(len(y) * 0.2)),
                                                                random_state=42)
        else:
            x_train = x
            y_train = y
        # Wrap the data into DataLoaders.
        if self.data_transformer is None:
            self.data_transformer = TextDataTransformer()
        train_data = self.data_transformer.transform_train(x_train, y_train, batch_size=batch_size)
        test_data = self.data_transformer.transform_test(x_test, y_test)
        # Save the classifier. Note: the previous extra
        # `pickle.dump(self, open(...))` leaked an unclosed file handle and
        # duplicated this call, so it was removed; pickle_to_file manages the
        # file handle itself.
        pickle_to_file(self, os.path.join(self.path, 'text_classifier'))
        if time_limit is None:
            time_limit = 24 * 60 * 60  # default search budget: one day
        self.cnn.fit(self.get_n_output_node(), x_train.shape, train_data, test_data, time_limit)

    def final_fit(self, x_train=None, y_train=None, x_test=None, y_test=None, trainer_args=None, retrain=False):
        """Final training after the best architecture has been found.

        Args:
            x_train: A numpy.ndarray of training data.
            y_train: A numpy.ndarray of training targets.
            x_test: A numpy.ndarray of testing data.
            y_test: A numpy.ndarray of testing targets.
            trainer_args: A dictionary containing the parameters of the ModelTrainer constructor.
            retrain: A boolean of whether to reinitialize the weights of the model.
        """
        if trainer_args is None:
            trainer_args = {'max_no_improvement_num': 30}
        # Split off a validation set when none is supplied.
        if x_test is None:
            x_train, x_test, y_train, y_test = train_test_split(x_train, y_train,
                                                                test_size=min(Constant.VALIDATION_SET_SIZE,
                                                                              int(len(y_train) * 0.2)),
                                                                random_state=42)
        x_train = text_preprocess(x_train, path=self.path)
        x_test = text_preprocess(x_test, path=self.path)
        y_train = self.transform_y(y_train)
        y_test = self.transform_y(y_test)
        train_data = self.data_transformer.transform_train(x_train, y_train, batch_size=Constant.MAX_BATCH_SIZE)
        test_data = self.data_transformer.transform_test(x_test, y_test, batch_size=Constant.MAX_BATCH_SIZE)
        self.cnn.final_fit(train_data, test_data, trainer_args, retrain)

    def predict(self, x_test):
        """Return prediction results for the testing data.

        Args:
            x_test: An instance of numpy.ndarray containing the testing data.

        Returns:
            A numpy.ndarray containing the results.
        """
        if Constant.LIMIT_MEMORY:
            # NOTE(review): placeholder branch in the original code — no
            # memory-limiting behavior is implemented yet.
            pass
        test_loader = self.data_transformer.transform_test(x_test)
        model = self.cnn.best_model.produce_model()
        model.eval()
        outputs = []
        with torch.no_grad():
            for index, inputs in enumerate(test_loader):
                outputs.append(model(inputs).numpy())
        # Concatenate the per-batch outputs into one array.
        output = reduce(lambda x, y: np.concatenate((x, y)), outputs)
        return self.inverse_transform_y(output)

    def evaluate(self, x_test, y_test):
        """Return the accuracy score between predict value and `y_test`."""
        # The docstring above was previously (incorrectly) placed after the
        # first statement, where it was a no-op string expression.
        x_test = text_preprocess(x_test, path=self.path)
        y_predict = self.predict(x_test)
        return self.metric().evaluate(y_test, y_predict)

    @property
    def metric(self):
        # Classification quality is measured with accuracy.
        return Accuracy

    @property
    def loss(self):
        return classification_loss

    def transform_y(self, y_train):
        """One-hot encode the labels, fitting the encoder on first use."""
        if self.y_encoder is None:
            self.y_encoder = OneHotEncoder()
            self.y_encoder.fit(y_train)
        y_train = self.y_encoder.transform(y_train)
        return y_train

    def inverse_transform_y(self, output):
        """Map one-hot network outputs back to the original label space."""
        return self.y_encoder.inverse_transform(output)

    def load_searcher(self):
        """Load the saved searcher object from `self.path`."""
        return pickle_from_file(os.path.join(self.path, 'searcher'))

    def get_n_output_node(self):
        """Return the number of output nodes (one per class)."""
        return self.y_encoder.n_classes
class TextRegressor(TextClassifier):
    """A text regressor.

    Reuses the TextClassifier search pipeline but swaps in a regression loss
    and metric, emits a single continuous output, and passes targets through
    without encoding.
    """

    @property
    def loss(self):
        # Continuous targets are optimized with the regression loss.
        return regression_loss

    @property
    def metric(self):
        # Regression quality is measured with mean squared error.
        return MSE

    def get_n_output_node(self):
        # A regressor predicts exactly one continuous value.
        return 1

    def transform_y(self, y_train):
        """Reshape targets into an (n, 1) column vector; no encoding needed."""
        flat_targets = y_train.flatten()
        return flat_targets.reshape(len(y_train), 1)

    def inverse_transform_y(self, output):
        """Collapse the (n, 1) network output back to a flat 1-D array."""
        flattened = output.flatten()
        return flattened