Merge pull request #69 from ardunn/master

Featurize changes
hackingmaterials · Sep 15, 2018 · e4c81a8
2 parents 27a2dc2 + 71356b8
commit e4c81a8
Show file tree

Hide file tree

Showing 6 changed files with 38 additions and 46 deletions.
diff --git a/matbench/core/featurize.py b/matbench/core/featurize.py
@@ -1,4 +1,5 @@
 from warnings import warn
+import logging
 
 import matminer.featurizers.composition as cf
 import matminer.featurizers.structure as sf
@@ -7,7 +8,7 @@
 from matminer.featurizers.base import MultipleFeaturizer
 from matminer.utils.conversions import composition_to_oxidcomposition, \
     structure_to_oxidstructure
-from matbench.utils.utils import MatbenchError
+from matbench.utils.utils import MatbenchError, setup_custom_logger
 from pymatgen import Composition, Structure
 from pymatgen.electronic_structure.bandstructure import BandStructure
 from pymatgen.electronic_structure.dos import CompleteDos
@@ -195,7 +196,7 @@ def fast(self):
     def many_features(self):
         featzers = [sf.BagofBonds(),
                     sf.PartialRadialDistributionFunction(),
-                    sf.BondFractions]
+                    sf.BondFractions()]
         return [i for i in featzers if i.__class__.__name__ not in self.exclude]
 
     @property
@@ -229,8 +230,7 @@ def all(self):
 
     @property
     def best(self):
-        featzers = self.fast + [sf.BondFractions()] + self.slow
-        return [i for i in featzers if i.__class__.__name__ not in self.exclude]
+        return self.fast + self.slow
 
 
 class DOSFeaturizers(FeaturizerSet):
@@ -283,11 +283,11 @@ class Featurize(object):
     """
     Takes in a dataframe and generate features from preset columns such as
     "formula", "structure", "bandstructure", "dos", etc. One may use
-    the featurize_columns method to featurize via all available featurizers
+    the auto_featurize method to featurize via all available featurizers
     with default setting or selectively call featurizer methods.
     Usage examples:
         featurizer = Featurize()
-            df = featurizer.featurize_columns(df) # all features of all types
+            df = featurizer.auto_featurize(df) # all features of all types
         or:
             df = featurizer.featurize_formula(df) # all formula-related feature
         or:
@@ -313,8 +313,9 @@ class Featurize(object):
 
     def __init__(self, ignore_cols=None, ignore_errors=True,
                  drop_featurized_col=True, exclude=None, multiindex=False,
-                 n_jobs=None):
+                 n_jobs=None, loglevel=logging.INFO, logpath='.'):
 
+        self.logger = setup_custom_logger(filepath=logpath, level=loglevel)
         self.ignore_cols = ignore_cols or []
         self.cfset = CompositionFeaturizers(exclude=exclude)
         self.sfset = StructureFeaturizers(exclude=exclude)
@@ -350,7 +351,8 @@ def _featurize_sequentially(self, df, fset, col_id, **kwargs):
             df = f.fit_featurize_dataframe(df, col_id, **kwargs)
         return df
 
-    def featurize_columns(self, df, input_cols=None, **kwargs):
+    def auto_featurize(self, df, input_cols=("formula", "structure"),
+                       **kwargs):
         """
         Featurizes the dataframe based on input_columns.
 
@@ -367,7 +369,6 @@ def featurize_columns(self, df, input_cols=None, **kwargs):
             self.df w/ new features added via featurizering input_cols
         """
         df = self._prescreen_df(df)
-        input_cols = input_cols or ["formula", "structure"]
         for idx, column in enumerate(input_cols):
             featurizer = getattr(self, "featurize_{}".format(column), None)
             if featurizer is not None:
@@ -377,9 +378,11 @@ def featurize_columns(self, df, input_cols=None, **kwargs):
                     col_id = column
                 df = featurizer(df, col_id=col_id, **kwargs)
             elif column not in df:
-                raise MatbenchError('no "{}" in the data!')
+                self.logger.warn(
+                    "{} not found in the dataframe! Skipping...".format(column))
             else:
-                warn('No method available to featurize "{}"'.format(column))
+                self.logger.warn(
+                    'No method available to featurize "{}"'.format(column))
         return df
 
     def featurize_formula(self, df, featurizers="best", col_id="formula",
@@ -416,12 +419,10 @@ def featurize_formula(self, df, featurizers="best", col_id="formula",
         if isinstance(featurizers, str):
             featurizers = getattr(self.cfset, featurizers)
 
-        featzer = MultipleFeaturizer(featurizers)
-        if self.n_jobs:
-            featzer.set_n_jobs(n_jobs=self.n_jobs)
-        df = featzer.fit_featurize_dataframe(df, compcol,
-                                             ignore_errors=self.ignore_errors,
-                                             multiindex=self.multiindex)
+        # Multiple featurizer has issues, just use this bc we get pbar!
+        df = self._featurize_sequentially(df, featurizers, compcol,
+                                          ignore_errors=self.ignore_errors,
+                                          multiindex=self.multiindex)
         if asindex:
             df = df.set_index(self._pre_screen_col(col_id))
         if self.drop_featurized_col:
@@ -464,14 +465,7 @@ def featurize_structure(self, df, featurizers="best",
         if isinstance(featurizers, str):
             featurizers = getattr(self.sfset, featurizers)
 
-
-        # Todo: revert to MultipleFeaturizer once it is fixed
-        # featzer = MultipleFeaturizer(featurizers)
-        # if self.n_jobs:
-        #     featzer.set_n_jobs(n_jobs=self.n_jobs)
-        # df = featzer.fit_featurize_dataframe(df, compcol,
-        #                                      ignore_errors=self.ignore_errors,
-        #                                      multiindex=self.multiindex)
+        # Multiple featurizer has issues, just use this bc we get pbar!
         df = self._featurize_sequentially(df, featurizers, col_id,
                                           ignore_errors=self.ignore_errors,
                                           multiindex=self.multiindex)
@@ -498,7 +492,7 @@ def featurize_dos(self, df, featurizers="best", col_id="dos",
             col_id (str): actual column name to be used as dos
             inplace (bool): whether to modify the input df
             kwargs: keyword arguments that may be accepted by other featurize_*
-                methods passed through featurize_columns
+                methods passed through auto_featurize
 
         Returns (pandas.DataFrame):
             Dataframe with dos features added.
@@ -509,12 +503,10 @@ def featurize_dos(self, df, featurizers="best", col_id="dos",
         if isinstance(featurizers, str):
             featurizers = getattr(self.dosfset, featurizers)
 
-        featzer = MultipleFeaturizer(featurizers)
-        if self.n_jobs:
-            featzer.set_n_jobs(n_jobs=self.n_jobs)
-        df = featzer.fit_featurize_dataframe(df, col_id,
-                                             ignore_errors=self.ignore_errors,
-                                             multiindex=self.multiindex)
+        # Multiple featurizer has issues, just use this bc we get pbar!
+        df = self._featurize_sequentially(df, featurizers, col_id,
+                                          ignore_errors=self.ignore_errors,
+                                          multiindex=self.multiindex)
         if self.drop_featurized_col:
             return df.drop([self._pre_screen_col(col_id)], axis=1)
         else:
@@ -534,7 +526,7 @@ def featurize_bandstructure(self, df, featurizers="all",
             col_id (str): actual column name containing the bandstructure data
             inplace (bool): whether to modify the input df
             kwargs: keyword arguments that may be accepted by other featurize_*
-                methods passed through featurize_columns
+                methods passed through auto_featurize
 
         Returns (pandas.DataFrame):
             Dataframe with bandstructure features added.
@@ -544,12 +536,10 @@ def featurize_bandstructure(self, df, featurizers="all",
             df[col_id] = df[col_id].apply(BandStructure.from_dict)
         if isinstance(featurizers, str):
             featurizers = getattr(self.bsfset, featurizers)
-        featzer = MultipleFeaturizer(featurizers)
-        if self.n_jobs:
-            featzer.set_n_jobs(n_jobs=self.n_jobs)
-        df = featzer.fit_featurize_dataframe(df, col_id,
-                                             ignore_errors=self.ignore_errors,
-                                             multiindex=self.multiindex)
+        # Multiple featurizer has issues, just use this bc we get pbar!
+        df = self._featurize_sequentially(df, featurizers, col_id,
+                                          ignore_errors=self.ignore_errors,
+                                          multiindex=self.multiindex)
         if self.drop_featurized_col:
             return df.drop([self._pre_screen_col(col_id)], axis=1)
         else:

diff --git a/matbench/core/preprocess.py b/matbench/core/preprocess.py
@@ -83,6 +83,7 @@ def preprocess(self, df, target_key, scale=False, n_pca_features=None,
         targets = df[target_key].copy(deep=True)
 
         # Todo: StandardScaler might be better
+        # Todo: Data *must* be standardized for PCA...
         if scale:
             number_df[number_cols] = MinMaxScaler().fit_transform(number_df)
 

diff --git a/matbench/core/tests/test_featurize.py b/matbench/core/tests/test_featurize.py
@@ -29,6 +29,7 @@ def test_featurize_formula(self, limit=5):
 
         df = featurizer.featurize_formula(df_init,
                                           featurizers="all",
+                                          compcol=None,
                                           asindex=False,
                                           guess_oxidstates=True)
 

diff --git a/matbench/core/visualize.py b/matbench/core/visualize.py
@@ -31,7 +31,7 @@ def targetted_visualize(self, target, ncols=10):
     prep = PrepareData(df_init,
                        targets=['gap gllbsc'],
                        ignore_cols=['A1', 'A2', 'B1', 'B2'])
-    prep.featurize_columns()
+    prep.auto_featurize()
     # prep.handle_na()
 
     vis = VisualizeData(prep.get_train_target())

diff --git a/matbench/examples/expt_gap.py b/matbench/examples/expt_gap.py
@@ -55,9 +55,9 @@
                         drop_featurized_col=True,
                         n_jobs=N_JOBS)
 
-    df = featzer.featurize_columns(df_init,
-                                   input_cols=FEATUREIZE_THESE_COLUMNS,
-                                   guess_oxidstates=False)
+    df = featzer.auto_featurize(df_init,
+                                input_cols=FEATUREIZE_THESE_COLUMNS,
+                                guess_oxidstates=False)
     df.to_pickle(os.path.join(CALC_DIR, '{}_data.pickle'.format(fname_base)))
 else:
     df = pd.read_pickle(os.path.join(CALC_DIR, '{}_data.pickle'.format(fname_base)))

diff --git a/matbench/examples/tricky_target.py b/matbench/examples/tricky_target.py
@@ -42,9 +42,9 @@
                     multiindex=MULTIINDEX,
                     drop_featurized_col=True)
 
-df = featzer.featurize_columns(df_init,
-                               input_cols=FEATUREIZE_THESE_COLUMNS,
-                               guess_oxidstates=True)
+df = featzer.auto_featurize(df_init,
+                            input_cols=FEATUREIZE_THESE_COLUMNS,
+                            guess_oxidstates=True)
 
 prep = PreProcess(target=TARGET)
 df = prep.preprocess(df)