Skip to content

Commit

Permalink
Merge pull request #69 from ardunn/master
Browse files Browse the repository at this point in the history
Featurize changes
  • Loading branch information
ardunn committed Sep 15, 2018
2 parents 27a2dc2 + 71356b8 commit e4c81a8
Show file tree
Hide file tree
Showing 6 changed files with 38 additions and 46 deletions.
68 changes: 29 additions & 39 deletions matbench/core/featurize.py
@@ -1,4 +1,5 @@
from warnings import warn
import logging

import matminer.featurizers.composition as cf
import matminer.featurizers.structure as sf
Expand All @@ -7,7 +8,7 @@
from matminer.featurizers.base import MultipleFeaturizer
from matminer.utils.conversions import composition_to_oxidcomposition, \
structure_to_oxidstructure
from matbench.utils.utils import MatbenchError
from matbench.utils.utils import MatbenchError, setup_custom_logger
from pymatgen import Composition, Structure
from pymatgen.electronic_structure.bandstructure import BandStructure
from pymatgen.electronic_structure.dos import CompleteDos
Expand Down Expand Up @@ -195,7 +196,7 @@ def fast(self):
def many_features(self):
featzers = [sf.BagofBonds(),
sf.PartialRadialDistributionFunction(),
sf.BondFractions]
sf.BondFractions()]
return [i for i in featzers if i.__class__.__name__ not in self.exclude]

@property
Expand Down Expand Up @@ -229,8 +230,7 @@ def all(self):

@property
def best(self):
featzers = self.fast + [sf.BondFractions()] + self.slow
return [i for i in featzers if i.__class__.__name__ not in self.exclude]
return self.fast + self.slow


class DOSFeaturizers(FeaturizerSet):
Expand Down Expand Up @@ -283,11 +283,11 @@ class Featurize(object):
"""
Takes in a dataframe and generate features from preset columns such as
"formula", "structure", "bandstructure", "dos", etc. One may use
the featurize_columns method to featurize via all available featurizers
the auto_featurize method to featurize via all available featurizers
with default setting or selectively call featurizer methods.
Usage examples:
featurizer = Featurize()
df = featurizer.featurize_columns(df) # all features of all types
df = featurizer.auto_featurize(df) # all features of all types
or:
df = featurizer.featurize_formula(df) # all formula-related features
or:
Expand All @@ -313,8 +313,9 @@ class Featurize(object):

def __init__(self, ignore_cols=None, ignore_errors=True,
drop_featurized_col=True, exclude=None, multiindex=False,
n_jobs=None):
n_jobs=None, loglevel=logging.INFO, logpath='.'):

self.logger = setup_custom_logger(filepath=logpath, level=loglevel)
self.ignore_cols = ignore_cols or []
self.cfset = CompositionFeaturizers(exclude=exclude)
self.sfset = StructureFeaturizers(exclude=exclude)
Expand Down Expand Up @@ -350,7 +351,8 @@ def _featurize_sequentially(self, df, fset, col_id, **kwargs):
df = f.fit_featurize_dataframe(df, col_id, **kwargs)
return df

def featurize_columns(self, df, input_cols=None, **kwargs):
def auto_featurize(self, df, input_cols=("formula", "structure"),
**kwargs):
"""
Featurizes the dataframe based on input_cols.
Expand All @@ -367,7 +369,6 @@ def featurize_columns(self, df, input_cols=None, **kwargs):
self.df with new features added by featurizing input_cols
"""
df = self._prescreen_df(df)
input_cols = input_cols or ["formula", "structure"]
for idx, column in enumerate(input_cols):
featurizer = getattr(self, "featurize_{}".format(column), None)
if featurizer is not None:
Expand All @@ -377,9 +378,11 @@ def featurize_columns(self, df, input_cols=None, **kwargs):
col_id = column
df = featurizer(df, col_id=col_id, **kwargs)
elif column not in df:
raise MatbenchError('no "{}" in the data!')
self.logger.warn(
"{} not found in the dataframe! Skipping...".format(column))
else:
warn('No method available to featurize "{}"'.format(column))
self.logger.warn(
'No method available to featurize "{}"'.format(column))
return df

def featurize_formula(self, df, featurizers="best", col_id="formula",
Expand Down Expand Up @@ -416,12 +419,10 @@ def featurize_formula(self, df, featurizers="best", col_id="formula",
if isinstance(featurizers, str):
featurizers = getattr(self.cfset, featurizers)

featzer = MultipleFeaturizer(featurizers)
if self.n_jobs:
featzer.set_n_jobs(n_jobs=self.n_jobs)
df = featzer.fit_featurize_dataframe(df, compcol,
ignore_errors=self.ignore_errors,
multiindex=self.multiindex)
# MultipleFeaturizer has issues; featurize sequentially instead, since that also gives us a progress bar.
df = self._featurize_sequentially(df, featurizers, compcol,
ignore_errors=self.ignore_errors,
multiindex=self.multiindex)
if asindex:
df = df.set_index(self._pre_screen_col(col_id))
if self.drop_featurized_col:
Expand Down Expand Up @@ -464,14 +465,7 @@ def featurize_structure(self, df, featurizers="best",
if isinstance(featurizers, str):
featurizers = getattr(self.sfset, featurizers)


# Todo: revert to MultipleFeaturizer once it is fixed
# featzer = MultipleFeaturizer(featurizers)
# if self.n_jobs:
# featzer.set_n_jobs(n_jobs=self.n_jobs)
# df = featzer.fit_featurize_dataframe(df, compcol,
# ignore_errors=self.ignore_errors,
# multiindex=self.multiindex)
# MultipleFeaturizer has issues; featurize sequentially instead, since that also gives us a progress bar.
df = self._featurize_sequentially(df, featurizers, col_id,
ignore_errors=self.ignore_errors,
multiindex=self.multiindex)
Expand All @@ -498,7 +492,7 @@ def featurize_dos(self, df, featurizers="best", col_id="dos",
col_id (str): actual column name to be used as dos
inplace (bool): whether to modify the input df
kwargs: keyword arguments that may be accepted by other featurize_*
methods passed through featurize_columns
methods passed through auto_featurize
Returns (pandas.DataFrame):
Dataframe with dos features added.
Expand All @@ -509,12 +503,10 @@ def featurize_dos(self, df, featurizers="best", col_id="dos",
if isinstance(featurizers, str):
featurizers = getattr(self.dosfset, featurizers)

featzer = MultipleFeaturizer(featurizers)
if self.n_jobs:
featzer.set_n_jobs(n_jobs=self.n_jobs)
df = featzer.fit_featurize_dataframe(df, col_id,
ignore_errors=self.ignore_errors,
multiindex=self.multiindex)
# MultipleFeaturizer has issues; featurize sequentially instead, since that also gives us a progress bar.
df = self._featurize_sequentially(df, featurizers, col_id,
ignore_errors=self.ignore_errors,
multiindex=self.multiindex)
if self.drop_featurized_col:
return df.drop([self._pre_screen_col(col_id)], axis=1)
else:
Expand All @@ -534,7 +526,7 @@ def featurize_bandstructure(self, df, featurizers="all",
col_id (str): actual column name containing the bandstructure data
inplace (bool): whether to modify the input df
kwargs: keyword arguments that may be accepted by other featurize_*
methods passed through featurize_columns
methods passed through auto_featurize
Returns (pandas.DataFrame):
Dataframe with bandstructure features added.
Expand All @@ -544,12 +536,10 @@ def featurize_bandstructure(self, df, featurizers="all",
df[col_id] = df[col_id].apply(BandStructure.from_dict)
if isinstance(featurizers, str):
featurizers = getattr(self.bsfset, featurizers)
featzer = MultipleFeaturizer(featurizers)
if self.n_jobs:
featzer.set_n_jobs(n_jobs=self.n_jobs)
df = featzer.fit_featurize_dataframe(df, col_id,
ignore_errors=self.ignore_errors,
multiindex=self.multiindex)
# MultipleFeaturizer has issues; featurize sequentially instead, since that also gives us a progress bar.
df = self._featurize_sequentially(df, featurizers, col_id,
ignore_errors=self.ignore_errors,
multiindex=self.multiindex)
if self.drop_featurized_col:
return df.drop([self._pre_screen_col(col_id)], axis=1)
else:
Expand Down
1 change: 1 addition & 0 deletions matbench/core/preprocess.py
Expand Up @@ -83,6 +83,7 @@ def preprocess(self, df, target_key, scale=False, n_pca_features=None,
targets = df[target_key].copy(deep=True)

# Todo: StandardScaler might be better
# Todo: Data *must* be standardized for PCA...
if scale:
number_df[number_cols] = MinMaxScaler().fit_transform(number_df)

Expand Down
1 change: 1 addition & 0 deletions matbench/core/tests/test_featurize.py
Expand Up @@ -29,6 +29,7 @@ def test_featurize_formula(self, limit=5):

df = featurizer.featurize_formula(df_init,
featurizers="all",
compcol=None,
asindex=False,
guess_oxidstates=True)

Expand Down
2 changes: 1 addition & 1 deletion matbench/core/visualize.py
Expand Up @@ -31,7 +31,7 @@ def targetted_visualize(self, target, ncols=10):
prep = PrepareData(df_init,
targets=['gap gllbsc'],
ignore_cols=['A1', 'A2', 'B1', 'B2'])
prep.featurize_columns()
prep.auto_featurize()
# prep.handle_na()

vis = VisualizeData(prep.get_train_target())
Expand Down
6 changes: 3 additions & 3 deletions matbench/examples/expt_gap.py
Expand Up @@ -55,9 +55,9 @@
drop_featurized_col=True,
n_jobs=N_JOBS)

df = featzer.featurize_columns(df_init,
input_cols=FEATUREIZE_THESE_COLUMNS,
guess_oxidstates=False)
df = featzer.auto_featurize(df_init,
input_cols=FEATUREIZE_THESE_COLUMNS,
guess_oxidstates=False)
df.to_pickle(os.path.join(CALC_DIR, '{}_data.pickle'.format(fname_base)))
else:
df = pd.read_pickle(os.path.join(CALC_DIR, '{}_data.pickle'.format(fname_base)))
Expand Down
6 changes: 3 additions & 3 deletions matbench/examples/tricky_target.py
Expand Up @@ -42,9 +42,9 @@
multiindex=MULTIINDEX,
drop_featurized_col=True)

df = featzer.featurize_columns(df_init,
input_cols=FEATUREIZE_THESE_COLUMNS,
guess_oxidstates=True)
df = featzer.auto_featurize(df_init,
input_cols=FEATUREIZE_THESE_COLUMNS,
guess_oxidstates=True)

prep = PreProcess(target=TARGET)
df = prep.preprocess(df)
Expand Down

0 comments on commit e4c81a8

Please sign in to comment.