Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
2 contributors

Users who have contributed to this file

@haifeng-jin @yufei-12
256 lines (217 sloc) 10.1 KB
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.util import nest
from autokeras import utils
from autokeras.hypermodel import base
class TextNode(base.Node):
    """Node type used as a base class for text nodes such as TextInput.

    It defines no attributes or methods of its own; everything is inherited
    from base.Node.
    """
class Input(base.Node):
    """Input node for tensor data.

    Accepts either a numpy.ndarray with a numerical dtype or a
    tf.data.Dataset and turns it into a tf.data.Dataset.
    """

    def _check(self, x):
        """Validate the type (and, for arrays, the dtype) of the input data.

        # Arguments
            x: The data passed to the node.

        # Raises
            TypeError: If x is neither numpy.ndarray nor tf.data.Dataset, or
                if it is a numpy.ndarray whose dtype is not numerical.
        """
        is_array = isinstance(x, np.ndarray)
        if not (is_array or isinstance(x, tf.data.Dataset)):
            raise TypeError('Expect the data to Input to be numpy.ndarray or '
                            'tf.data.Dataset, but got {type}.'.format(type=type(x)))
        if is_array and not np.issubdtype(x.dtype, np.number):
            raise TypeError('Expect the data to Input to be numerical, but got '
                            '{type}.'.format(type=x.dtype))

    def _convert_to_dataset(self, x):
        """Return x as a tf.data.Dataset.

        A Dataset is passed through untouched. A numpy.ndarray is cast to
        float32 and sliced along its first axis. Any other type yields None.
        """
        if isinstance(x, tf.data.Dataset):
            return x
        if not isinstance(x, np.ndarray):
            return None
        return tf.data.Dataset.from_tensor_slices(x.astype(np.float32))

    def _record_dataset_shape(self, dataset):
        """Store the shape reported by utils.dataset_shape in self.shape."""
        self.shape = utils.dataset_shape(dataset)

    def fit_transform(self, x):
        """Transform x and remember the shape of the resulting dataset."""
        result = self.transform(x)
        self._record_dataset_shape(result)
        return result

    def transform(self, x):
        """Transform x into a compatible type (tf.data.Dataset)."""
        self._check(x)
        return self._convert_to_dataset(x)
class ImageInput(Input):
    """Input node for image data.

    The input data should be numpy.ndarray or tf.data.Dataset. Arrays must
    have 3 or 4 dimensions, the last of which is the channel dimension;
    3-dimensional arrays get a trailing channel axis added on conversion.
    """

    def _check(self, x):
        """Validate the type, rank and dtype of the image data.

        # Arguments
            x: The data passed to the node.

        # Raises
            TypeError: If x is neither numpy.ndarray nor tf.data.Dataset, or
                if it is a numpy.ndarray whose dtype is not numerical.
            ValueError: If x is a numpy.ndarray without 3 or 4 dimensions.
        """
        is_array = isinstance(x, np.ndarray)
        if not (is_array or isinstance(x, tf.data.Dataset)):
            raise TypeError('Expect the data to ImageInput to be numpy.ndarray or '
                            'tf.data.Dataset, but got {type}.'.format(type=type(x)))
        if not is_array:
            # Datasets are not inspected any further.
            return
        if x.ndim not in (3, 4):
            raise ValueError('Expect the data to ImageInput to have 3 or 4 '
                             'dimensions, but got input shape {shape} with {ndim} '
                             'dimensions'.format(shape=x.shape, ndim=x.ndim))
        if not np.issubdtype(x.dtype, np.number):
            raise TypeError('Expect the data to ImageInput to be numerical, but got '
                            '{type}.'.format(type=x.dtype))

    def _convert_to_dataset(self, x):
        """Append a channel axis to 3-dimensional arrays, then convert."""
        if isinstance(x, np.ndarray) and x.ndim == 3:
            # For a 3-dimensional array the last position is axis 3.
            x = np.expand_dims(x, axis=-1)
        return super()._convert_to_dataset(x)
class TextInput(Input, TextNode):
    """Input node for text data.

    The input data should be numpy.ndarray or tf.data.Dataset. Arrays must be
    one-dimensional with a string dtype; each element is expected to be a
    full sentence.
    """

    def _check(self, x):
        """Validate the type, rank and dtype of the text data.

        # Arguments
            x: The data passed to the node.

        # Raises
            TypeError: If x is neither numpy.ndarray nor tf.data.Dataset, or
                if it is a numpy.ndarray whose dtype is not a string type.
            ValueError: If x is a numpy.ndarray that is not one-dimensional.
        """
        is_array = isinstance(x, np.ndarray)
        if not (is_array or isinstance(x, tf.data.Dataset)):
            raise TypeError('Expect the data to TextInput to be numpy.ndarray or '
                            'tf.data.Dataset, but got {type}.'.format(type=type(x)))
        if not is_array:
            # Datasets are not inspected any further.
            return
        if x.ndim != 1:
            raise ValueError('Expect the data to TextInput to have 1 dimension, but '
                             'got input shape {shape} with {ndim} dimensions'.format(
                                 shape=x.shape,
                                 ndim=x.ndim))
        if not np.issubdtype(x.dtype, np.character):
            raise TypeError('Expect the data to TextInput to be strings, but got '
                            '{type}.'.format(type=x.dtype))

    def _convert_to_dataset(self, x):
        """Slice arrays into a tf.data.Dataset; return anything else as is."""
        if not isinstance(x, np.ndarray):
            return x
        return tf.data.Dataset.from_tensor_slices(x)
class StructuredDataInput(Input):
    """Input node for structured data.

    The input data should be numpy.ndarray or pandas.DataFrame (the only
    types accepted by `_check`). The data should be two-dimensional with
    numerical or categorical values.

    # Arguments
        column_names: A list of strings specifying the names of the columns. The
            length of the list should be equal to the number of columns of the
            data. Defaults to None. If None, it is taken from the columns of the
            pandas.DataFrame, or generated as integer positions for a
            numpy.ndarray.
        column_types: Dict. The keys are the column names. The values should either
            be 'numerical' or 'categorical', indicating the type of that column.
            Defaults to None. If not None, the column_names need to be specified
            (or be available from the pandas.DataFrame). Columns missing from
            the dict are inferred from the data: a column is judged categorical
            if it contains any non-numerical value, or if the number of distinct
            numerical values is less than 5% of the number of numerical values.
    """

    def __init__(self, column_names=None, column_types=None, **kwargs):
        super().__init__(**kwargs)
        self.column_names = column_names
        self.column_types = column_types
        # Variables for inferring column types.
        self.count_nan = None
        self.count_numerical = None
        self.count_categorical = None
        self.count_unique_numerical = []
        self.num_col = None

    def get_config(self):
        """Return the parent config extended with the column settings."""
        config = super().get_config()
        config.update({
            'column_names': self.column_names,
            'column_types': self.column_types,
        })
        return config

    def get_state(self):
        """Return the parent state extended with the column statistics."""
        state = super().get_state()
        state.update({
            'count_nan': self.count_nan,
            'count_numerical': self.count_numerical,
            'count_categorical': self.count_categorical,
            'count_unique_numerical': self.count_unique_numerical,
            'num_col': self.num_col
        })
        return state

    def set_state(self, state):
        """Restore the parent state and the column statistics from state."""
        super().set_state(state)
        self.count_nan = state['count_nan']
        self.count_numerical = state['count_numerical']
        self.count_categorical = state['count_categorical']
        self.count_unique_numerical = state['count_unique_numerical']
        self.num_col = state['num_col']

    def _check(self, x):
        """Validate the data and resolve self.column_names.

        # Arguments
            x: The data passed to the node.

        # Raises
            TypeError: If x is neither pandas.DataFrame nor numpy.ndarray.
            ValueError: If x is not two-dimensional, if column_types is given
                without any way to obtain column_names, if column_types refers
                to an unknown column, or if the number of column_names does
                not match the number of columns in x.
        """
        if not isinstance(x, (pd.DataFrame, np.ndarray)):
            raise TypeError('Unsupported type {type} for '
                            '{name}.'.format(type=type(x),
                                             name=self.__class__.__name__))

        # x.shape[1] is used below, so reject other ranks with a clear message
        # instead of an IndexError.
        if x.ndim != 2:
            raise ValueError('Expect the data to {name} to have 2 dimensions, '
                             'but got input shape {shape} with {ndim} '
                             'dimensions'.format(name=self.__class__.__name__,
                                                 shape=x.shape,
                                                 ndim=x.ndim))

        # Extract column_names from pd.DataFrame.
        if isinstance(x, pd.DataFrame) and self.column_names is None:
            self.column_names = list(x.columns)

        # Generate column_names. This must happen before column_types is
        # validated, otherwise the membership test below runs against None.
        if self.column_names is None:
            if self.column_types:
                raise ValueError('Column names must be specified.')
            self.column_names = list(range(x.shape[1]))

        # column_types is provided by user
        if self.column_types:
            for column_name in self.column_types:
                if column_name not in self.column_names:
                    raise ValueError('Column_names and column_types are '
                                     'mismatched. Cannot find column name '
                                     '{name} in the data.'.format(
                                         name=column_name))

        # Check if column_names has the correct length.
        if len(self.column_names) != x.shape[1]:
            raise ValueError('Expect column_names to have length {expect} '
                             'but got {actual}.'.format(
                                 expect=x.shape[1],
                                 actual=len(self.column_names)))

    def _convert_to_dataset(self, x):
        """Convert x to a dataset of strings and update the column statistics.

        Every value is converted to its string form so that numerical and
        categorical columns can share one tensor. Each row of the resulting
        dataset is then fed to `update`, and `infer_column_types` is called.
        """
        if isinstance(x, pd.DataFrame):
            x = x.values
        if isinstance(x, np.ndarray):
            # np.str_ replaces the np.unicode alias, which newer NumPy
            # releases no longer provide.
            x = tf.data.Dataset.from_tensor_slices(x.astype(np.str_))
        dataset = super()._convert_to_dataset(x)
        for row in dataset:
            self.update(row)
        self.infer_column_types()
        return dataset

    def update(self, x):
        """Accumulate per-column statistics from a single row.

        # Arguments
            x: A (possibly nested) structure whose first flattened element is
                a tensor of utf-8 encoded byte strings, one per column.
        """
        # Calculate the statistics.
        x = nest.flatten(x)[0].numpy()
        if self.num_col is None:
            # First row seen: size all counters by the number of columns.
            self.num_col = len(x)
            self.count_nan = np.zeros(self.num_col)
            self.count_numerical = np.zeros(self.num_col)
            self.count_categorical = np.zeros(self.num_col)
            self.count_unique_numerical = [{} for _ in range(self.num_col)]
        for i in range(self.num_col):
            value = x[i].decode('utf-8')
            if value == 'nan':
                self.count_nan[i] += 1
                continue
            if value in ('True', 'False'):
                # Booleans count as categorical values.
                self.count_categorical[i] += 1
                continue
            try:
                number = float(value)
            except ValueError:
                # Anything that does not parse as a float is categorical.
                self.count_categorical[i] += 1
            else:
                self.count_numerical[i] += 1
                occurrences = self.count_unique_numerical[i]
                occurrences[number] = occurrences.get(number, 0) + 1

    def infer_column_types(self):
        """Infer the type of every column not already in self.column_types.

        A column is 'categorical' if any categorical value was counted for
        it, or if fewer than 5% of its numerical values are distinct.
        Otherwise it is 'numerical'; this includes columns for which no
        numerical value was counted at all. Types supplied by the user are
        never overwritten.
        """
        inferred = {}
        for i in range(self.num_col):
            name = self.column_names[i]
            num_numerical = self.count_numerical[i]
            if self.count_categorical[i] > 0:
                inferred[name] = 'categorical'
            elif (num_numerical > 0 and
                    len(self.count_unique_numerical[i]) / num_numerical < 0.05):
                # The num_numerical > 0 guard avoids a 0/0 division for
                # columns without any numerical value.
                inferred[name] = 'categorical'
            else:
                inferred[name] = 'numerical'
        # Partial column_types is provided.
        if self.column_types is None:
            self.column_types = {}
        for key, value in inferred.items():
            self.column_types.setdefault(key, value)
class TimeSeriesInput(Input):
    """Subclass of Input that defines no behavior of its own.

    All checking and conversion logic is inherited unchanged from Input.
    """
You can’t perform that action at this time.