Skip to content

Commit

Permalink
Merge pull request #31 from albalu/master
Browse files Browse the repository at this point in the history
expand PreProcess a bit
  • Loading branch information
albalu committed Jul 17, 2018
2 parents 5d8791a + 7626441 commit a2f33ab
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 12 deletions.
20 changes: 19 additions & 1 deletion matbench/automl/tpot_utils.py
@@ -1,3 +1,7 @@
from matbench.data.load import load_double_perovskites_gap
from matbench.featurize import Featurize
from matbench.preprocess import PreProcess
from matminer.featurizers.composition import ElementProperty, TMetalFraction
from tpot import TPOTClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
Expand Down Expand Up @@ -31,4 +35,18 @@
# NOTE(review): presumably writes the pipeline selected by TPOT to this file
# as Python code -- confirm against the TPOT documentation.
tpot.export('tpot_iris_pipeline.py')

# Very useful attribute for us: it keeps track of the score of the different
# algorithms.
print(tpot.evaluated_individuals_)
# NOTE(review): the same value is printed a second time here; this looks like
# an unintentional duplicate -- confirm and remove if so.
print(tpot.evaluated_individuals_)


# limit = 200
# df_init = load_double_perovskites_gap(return_lumo=False)[:limit]
#
# featzer = Featurize(df_init, ignore_cols=['a_1', 'b_1', 'a_2', 'b_2'])
# df_feats = featzer.featurize_formula(featurizers=[
# ElementProperty.from_preset(preset_name='matminer'), TMetalFraction()])
#
# prep = PreProcess(df_feats, target_col='gap gllbsc')
# df = prep.preprocess()
#
#
# print(df)
72 changes: 61 additions & 11 deletions matbench/preprocess.py
@@ -1,28 +1,78 @@
from matbench.utils.utils import MatbenchError
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from pandas.api.types import is_numeric_dtype


class PreProcess(object):
    """
    PreProcess has several methods to clean and prepare the data
    for visualization and training.

    Args:
        df (pandas.DataFrame): input data
        target_col (str): if set, the target column is checked to be numeric
            when preprocess() is called
        max_colnull (float): after generating features, drop the columns that
            have null/na rows with more than this ratio.
    """

    def __init__(self, df, target_col=None, max_colnull=0.1):
        self.df = df
        self.target_col = target_col
        self.max_colnull = max_colnull

    def preprocess(self, df=None, scale=False, pca=False, **kwargs):
        """
        A sequence of data pre-processing steps either through this class or
        sklearn.

        Args:
            df (pandas.DataFrame): data to process; if None, a deep copy of
                the dataframe given at construction is used
            scale (bool): whether to scale/normalize the data
            pca (bool): whether to use principal component analysis (PCA) to
                reduce the dimensions of the data.
            kwargs (dict): the keyword arguments that are specific to some
                of the preprocessing methods: "na_method" is forwarded to
                handle_nulls and "n_components" to PCA

        Returns (pandas.DataFrame): the cleaned data when neither scale nor
            pca is set. Otherwise the output of the sklearn estimator's
            fit_transform is returned as-is.
            NOTE(review): that output is typically a numpy.ndarray, not a
            DataFrame -- confirm whether callers rely on this.

        Raises:
            MatbenchError: if target_col is set and that column is not numeric
        """
        df = self._prescreen_df(df)
        df = self.handle_nulls(df, na_method=kwargs.pop('na_method', 'drop'))

        # Validate the target BEFORE scaling/PCA: those steps replace df with
        # the estimator's output, which can no longer be indexed by the
        # column name.
        if self.target_col:
            if not is_numeric_dtype(df[self.target_col]):
                raise MatbenchError('Target column "{}" must be numeric'.format(
                    self.target_col))

        # NOTE: scaling and PCA are applied to every column of df, including
        # target_col when it is present.
        if scale:
            self.scaler = MinMaxScaler()
            df = self.scaler.fit_transform(df)
        if pca:
            # separate name so the boolean `pca` argument is not shadowed
            reducer = PCA(n_components=kwargs.pop('n_components', None))
            df = reducer.fit_transform(df)
        return df

    def _prescreen_df(self, df):
        """
        Return the dataframe to work on.

        Args:
            df (pandas.DataFrame or None): explicit input; returned unchanged
                (not copied) when given

        Returns (pandas.DataFrame): df itself, or a deep copy of self.df when
            df is None so that the stored data is never modified.
        """
        if df is None:
            df = self.df.copy(deep=True)
        return df

    def handle_nulls(self, df=None, max_colnull=None, na_method='drop'):
        """
        Drop columns with too many nulls, then drop or fill the remaining
        null rows.

        Args:
            df (pandas.DataFrame): data to clean; if None, a deep copy of
                the dataframe given at construction is used
            max_colnull (float): after generating features, drop the columns
                that have null/na rows with more than this ratio. If None,
                the value given at construction is used; an explicit 0 is
                honored (any column containing a null is dropped).
            na_method (str): method of handling null rows.
                Options: "drop" (remove every row containing a null),
                "ffill"/"pad" or "bfill"/"backfill" (propagate the
                previous/next valid value, see pandas ffill/bfill)

        Returns (pandas.DataFrame): the cleaned data

        Raises:
            ValueError: if na_method is not one of the supported options
        """
        df = self._prescreen_df(df)
        # `is None` rather than `or`: 0 is a legitimate threshold and must
        # not fall back to the instance default.
        if max_colnull is None:
            max_colnull = self.max_colnull
        # keep only the columns with at least this many non-null rows
        df = df.dropna(axis=1, thresh=int((1 - max_colnull) * len(df)))

        if na_method == "drop":  # drop all rows that contain any null
            return df.dropna(axis=0)
        # ffill()/bfill() are the non-deprecated equivalents of
        # fillna(method=...)
        if na_method in ("ffill", "pad"):
            return df.ffill()
        if na_method in ("bfill", "backfill"):
            return df.bfill()
        raise ValueError(
            'Unsupported na_method "{}"; expected "drop", "ffill"/"pad" or '
            '"bfill"/"backfill"'.format(na_method))

0 comments on commit a2f33ab

Please sign in to comment.