class PreProcess(object):
    """
    PreProcess has several methods to clean and prepare the data
    for visualization and training.

    Args:
        df (pandas.DataFrame): input data
        target_col (str): if set, the target column is checked to be numeric
            when ``preprocess`` is called
        max_colnull (float): after generating features, drop the columns
            whose ratio of null/na rows is more than this value.

    Example (taken from the usage sketch in matbench/automl/tpot_utils.py):
        df_init = load_double_perovskites_gap(return_lumo=False)[:200]
        featzer = Featurize(df_init, ignore_cols=['a_1', 'b_1', 'a_2', 'b_2'])
        df_feats = featzer.featurize_formula(featurizers=[
            ElementProperty.from_preset(preset_name='matminer'),
            TMetalFraction()])
        prep = PreProcess(df_feats, target_col='gap gllbsc')
        df = prep.preprocess()
    """

    def __init__(self, df, target_col=None, max_colnull=0.1):
        self.df = df
        self.target_col = target_col
        self.max_colnull = max_colnull

    def preprocess(self, df=None, scale=False, pca=False, **kwargs):
        """
        A sequence of data pre-processing steps either through this class or
        sklearn.

        Args:
            df (pandas.DataFrame): data to process; if None, a deep copy of
                the dataframe given at construction is used
            scale (bool): whether to scale/normalize the data. The fitted
                scaler is kept as ``self.scaler``.
            pca (bool): whether to use principal component analysis (PCA) to
                reduce the dimensions of the data. The fitted model is kept
                as ``self.pca``.
            kwargs (dict): the keyword arguments that are specific to some
                of the preprocessing methods: ``na_method`` (forwarded to
                ``handle_nulls``) and ``n_components`` (forwarded to PCA)

        Returns (pandas.DataFrame): the processed data

        Raises:
            MatbenchError: if ``target_col`` is set and is not numeric
        """
        from pandas import DataFrame
        from pandas.api.types import is_numeric_dtype

        df = self._prescreen_df(df)
        df = self.handle_nulls(df, na_method=kwargs.pop('na_method', 'drop'))

        # Validate the target BEFORE any sklearn transform: fit_transform
        # returns a bare array, so a by-name column lookup afterwards fails.
        if self.target_col:
            if not is_numeric_dtype(df[self.target_col]):
                from matbench.utils.utils import MatbenchError
                raise MatbenchError('Target column "{}" must be numeric'.format(
                    self.target_col))

        # NOTE(review): as in the original, the transforms below are applied
        # to every column, target included -- confirm whether the target
        # should be excluded from scaling/PCA.
        if scale:
            # imported lazily so sklearn is only required when actually used
            from sklearn.preprocessing import MinMaxScaler
            self.scaler = MinMaxScaler()
            df = DataFrame(self.scaler.fit_transform(df),
                           columns=df.columns, index=df.index)
        if pca:
            from sklearn.decomposition import PCA
            self.pca = PCA(n_components=kwargs.pop('n_components', None))
            # the components have no meaningful names; only the index is kept
            df = DataFrame(self.pca.fit_transform(df), index=df.index)
        return df

    def _prescreen_df(self, df):
        """
        Return ``df`` as is, or a deep copy of ``self.df`` if ``df`` is None.
        """
        if df is None:
            df = self.df.copy(deep=True)
        return df

    def handle_nulls(self, df=None, max_colnull=None, na_method='drop'):
        """
        Drop the columns with too many nulls, then drop or fill the rest.

        Args:
            df (pandas.DataFrame): data to clean; if None, a deep copy of
                the dataframe given at construction is used
            max_colnull (float): after generating features, drop the columns
                that have null/na rows with more than this ratio. If None,
                the value given at construction is used; 0 is honored and
                means "drop every column that has any null".
            na_method (str): method of handling the remaining null rows.
                Options: "drop" (remove rows containing any null), "mode"
                (fill with the most frequent value of each column),
                "ffill"/"pad" and "bfill"/"backfill" (propagate the
                previous/next valid value).

        Returns (pandas.DataFrame): the cleaned data

        Raises:
            ValueError: if ``na_method`` is not one of the supported options
        """
        df = self._prescreen_df(df)
        # "is None" rather than "or": 0 is a legitimate threshold
        if max_colnull is None:
            max_colnull = self.max_colnull

        n_rows = len(df)
        if n_rows:
            # Compare the actual null ratio instead of a truncated row-count
            # threshold, which let columns slightly above the ratio through.
            null_ratio = df.isnull().sum() / float(n_rows)
            df = df.loc[:, null_ratio <= max_colnull]

        if na_method == "drop":  # drop all rows that contain any null
            return df.dropna(axis=0)
        if na_method == "mode":
            modes = df.mode()
            # mode() is empty when there is nothing to compute a mode from
            return df if modes.empty else df.fillna(modes.iloc[0])
        if na_method in ("ffill", "pad"):
            return df.ffill()
        if na_method in ("bfill", "backfill"):
            return df.bfill()
        raise ValueError('Unsupported na_method "{}"; expected one of "drop", '
                         '"mode", "ffill", "pad", "bfill", "backfill"'.format(
                             na_method))