Merge pull request #31 from albalu/master

expand PreProcess a bit
hackingmaterials · Jul 17, 2018 · a2f33ab · a2f33ab
2 parents 5d8791a + 7626441
commit a2f33ab
Show file tree

Hide file tree

Showing 2 changed files with 80 additions and 12 deletions.
diff --git a/matbench/automl/tpot_utils.py b/matbench/automl/tpot_utils.py
@@ -1,3 +1,7 @@
+from matbench.data.load import load_double_perovskites_gap
+from matbench.featurize import Featurize
+from matbench.preprocess import PreProcess
+from matminer.featurizers.composition import ElementProperty, TMetalFraction
 from tpot import TPOTClassifier
 from sklearn.datasets import load_iris
 from sklearn.model_selection import train_test_split
@@ -31,4 +35,18 @@
 tpot.export('tpot_iris_pipeline.py')
 
 # very good method for us; keeps track of the score of different algorithms:
-print(tpot.evaluated_individuals_)
+print(tpot.evaluated_individuals_)
+
+
+# limit = 200
+# df_init = load_double_perovskites_gap(return_lumo=False)[:limit]
+#
+# featzer = Featurize(df_init, ignore_cols=['a_1', 'b_1', 'a_2', 'b_2'])
+# df_feats = featzer.featurize_formula(featurizers=[
+#     ElementProperty.from_preset(preset_name='matminer'), TMetalFraction()])
+#
+# prep = PreProcess(df_feats, target_col='gap gllbsc')
+# df = prep.preprocess()
+#
+#
+# print(df)
diff --git a/matbench/preprocess.py b/matbench/preprocess.py
@@ -1,28 +1,78 @@
+from matbench.utils.utils import MatbenchError
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import MinMaxScaler
+from pandas.api.types import is_numeric_dtype
 
 
 class PreProcess(object):
     """
     PreProcess has several methods to clean and prepare the data
     for visualization and training.
+
+    Args:
+        df (pandas.DataFrame): input data
+        target_col (str): if set, this column is checked during
+            preprocessing and must be numeric
+        max_colnull (float): after generating features, drop every column
+            whose fraction of null/na rows exceeds this ratio.
     """
-    def __init__(self):
-        pass
+    def __init__(self, df, target_col=None, max_colnull=0.1):
+        self.df = df
+        self.target_col = target_col
+        self.max_colnull = max_colnull
+
+
+    def preprocess(self, df=None, scale=False, pca=False, **kwargs):
+        """
+        A sequence of data pre-processing steps either through this class or
+        sklearn.
+
+        Args:
+            scale (bool): whether to scale/normalize the data
+            pca (bool): whether to use principal component analysis (PCA) to
+                reduce the dimensions of the data.
+            kwargs (dict): the keyword arguments that are specific to some
+                of the preprocessing methods such as PCA
+
+        Returns (pandas.DataFrame):
+        """
+        df = self._prescreen_df(df)
+        df = self.handle_nulls(df, na_method=kwargs.pop('na_method', 'drop'))
+        if scale:
+            self.scaler = MinMaxScaler()
+            df = self.scaler.fit_transform(df)
+        if pca:
+            pca = PCA(n_components=kwargs.pop('n_components', None))
+            df = pca.fit_transform(df)
+        if self.target_col:
+            if not is_numeric_dtype(df[self.target_col]):
+                raise MatbenchError('Target column "{}" must be numeric'.format(
+                    self.target_col))
+        return df
+
+
+    def _prescreen_df(self, df):
+        if df is None:
+            df = self.df.copy(deep=True)
+        return df
 
 
-    def handle_nulls(self, max_null_ratio=0.05, method='drop'):
+    def handle_nulls(self, df=None, max_colnull=None, na_method='drop'):
         """
 
         Args:
-            max_null_ratio ([str]): after generating features, drop the columns
-                that have null/na rows with more than this ratio. Default 0.05
-            method (str): method of handling null rows.
+            max_colnull (float): after generating features, drop every column
+                whose fraction of null/na rows exceeds this ratio.
+            na_method (str): method of handling null rows.
                 Options: "drop", "mode", ... (see pandas fillna method options)
         Returns:
 
         """
-        self.df = self.df.dropna(
-                        axis=1, thresh=int((1-max_null_ratio)*len(self.df)))
-        if method == "drop": # drop all rows that contain any null
-            self.df = self.df.dropna(axis=0)
+        df = self._prescreen_df(df)
+        max_colnull = max_colnull or self.max_colnull
+        df = df.dropna(axis=1, thresh=int((1-max_colnull)*len(df)))
+        if na_method == "drop": # drop all rows that contain any null
+            df = df.dropna(axis=0)
         else:
-            self.df = self.df.fillna(method=method)
+            df = df.fillna(method=na_method)
+        return df