# Basic featurization

In [8]:
from mofdscribe.datasets import CoREDataset

from mofdscribe.featurizers.chemistry import RACS, APRDF, AMD
from mofdscribe.featurizers.topology import PHStats, PHHist

from matminer.featurizers.base import MultipleFeaturizer

import pandas as pd

In [4]:
# Instantiate the CoRE dataset. Construction emits the DEBUG log lines shown
# below this cell, which report duplicate basenames and duplicate graphs being
# dropped and the resulting dataset length.
ds = CoREDataset()

2022-08-02 11:13:06.784 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:118 - Dropped 639 duplicate basenames. New length 8182
2022-08-02 11:13:06.794 | DEBUG    | mofdscribe.datasets.core_dataset:__init__:124 - Dropped 1312 duplicate graphs. New length 6870


We can access structures from a dataset with `get_structures`, passing the indices of the entries we want.

In [5]:
# Request the structures at indices 0-9. The result is a generator (see the
# type check in the next cell), so it can be iterated only once.
structures = ds.get_structures(range(10))

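This will give us a generator, so each structure is produced only when we ask for it (for example with `next`), and the generator can be consumed only once.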

In [6]:
# Inspect the return type: a generator must be advanced with next() or
# wrapped in list() before its items can be reused.
type(structures)

generator

We can instantiate a featurizer class and featurize a single structure.

In [11]:
# Build the RACS featurizer with its default settings.
racs_featurizer = RACS()

# Pull the next structure off the generator. Note that this consumes it, so
# re-running this cell featurizes a different structure each time.
structure = next(structures)
racs = racs_featurizer.featurize(structure)

2022-08-02 11:14:54.603 | DEBUG    | mofdscribe.featurizers.chemistry.racs:_compute_racs:38 - No start indices for linker_functional
2022-08-02 11:14:54.603 | DEBUG    | mofdscribe.featurizers.chemistry.racs:_compute_racs:38 - No start indices for linker_functional
2022-08-02 11:14:54.604 | DEBUG    | mofdscribe.featurizers.chemistry.racs:_compute_racs:38 - No start indices for linker_functional


In [14]:
# Pair every feature label with its computed value, then show the result as a
# one-row table so the columns carry the feature names.
racs_by_label = dict(zip(racs_featurizer.feature_labels(), racs))
pd.DataFrame([racs_by_label])

Unnamed: 0,racs_bb-linker_all_prop-I_scope-1_propagg-diff_corragg-avg_bbagg-avg,racs_bb-linker_all_prop-I_scope-1_propagg-diff_corragg-avg_bbagg-sum,racs_bb-linker_all_prop-I_scope-1_propagg-diff_corragg-sum_bbagg-avg,racs_bb-linker_all_prop-I_scope-1_propagg-diff_corragg-sum_bbagg-sum,racs_bb-linker_all_prop-I_scope-1_propagg-product_corragg-avg_bbagg-avg,racs_bb-linker_all_prop-I_scope-1_propagg-product_corragg-avg_bbagg-sum,racs_bb-linker_all_prop-I_scope-1_propagg-product_corragg-sum_bbagg-avg,racs_bb-linker_all_prop-I_scope-1_propagg-product_corragg-sum_bbagg-sum,racs_bb-linker_all_prop-I_scope-2_propagg-diff_corragg-avg_bbagg-avg,racs_bb-linker_all_prop-I_scope-2_propagg-diff_corragg-avg_bbagg-sum,...,racs_bb-nodes_prop-mod_pettifor_scope-2_propagg-product_corragg-sum_bbagg-avg,racs_bb-nodes_prop-mod_pettifor_scope-2_propagg-product_corragg-sum_bbagg-sum,racs_bb-nodes_prop-mod_pettifor_scope-3_propagg-diff_corragg-avg_bbagg-avg,racs_bb-nodes_prop-mod_pettifor_scope-3_propagg-diff_corragg-avg_bbagg-sum,racs_bb-nodes_prop-mod_pettifor_scope-3_propagg-diff_corragg-sum_bbagg-avg,racs_bb-nodes_prop-mod_pettifor_scope-3_propagg-diff_corragg-sum_bbagg-sum,racs_bb-nodes_prop-mod_pettifor_scope-3_propagg-product_corragg-avg_bbagg-avg,racs_bb-nodes_prop-mod_pettifor_scope-3_propagg-product_corragg-avg_bbagg-sum,racs_bb-nodes_prop-mod_pettifor_scope-3_propagg-product_corragg-sum_bbagg-avg,racs_bb-nodes_prop-mod_pettifor_scope-3_propagg-product_corragg-sum_bbagg-sum
0,0.0,0.0,0.0,0.0,1.0,4.0,29.0,116.0,0.0,0.0,...,47472.0,189888.0,23.636364,94.545455,520.0,2080.0,6391.909091,25567.636364,140622.0,562488.0


We can also combine multiple featurizers into a single featurizer using matminer's `MultipleFeaturizer`.

In [16]:
# Chain several mofdscribe featurizers behind one matminer interface.
component_featurizers = [RACS(), APRDF(), AMD(), PHStats(), PHHist()]
featurizer = MultipleFeaturizer(component_featurizers)

# Consume the next structure from the generator and compute all features for it.
structure = next(structures)
features = featurizer.featurize(structure)

# Label the feature values and display them as a one-row table.
features_by_label = dict(zip(featurizer.feature_labels(), features))
pd.DataFrame([features_by_label])

2022-08-02 11:17:08.861 | DEBUG    | mofdscribe.featurizers.chemistry.racs:_compute_racs:38 - No start indices for linker_functional
2022-08-02 11:17:08.861 | DEBUG    | mofdscribe.featurizers.chemistry.racs:_compute_racs:38 - No start indices for linker_functional
2022-08-02 11:17:08.862 | DEBUG    | mofdscribe.featurizers.chemistry.racs:_compute_racs:38 - No start indices for linker_functional


Unnamed: 0,racs_bb-linker_all_prop-I_scope-1_propagg-diff_corragg-avg_bbagg-avg,racs_bb-linker_all_prop-I_scope-1_propagg-diff_corragg-avg_bbagg-sum,racs_bb-linker_all_prop-I_scope-1_propagg-diff_corragg-sum_bbagg-avg,racs_bb-linker_all_prop-I_scope-1_propagg-diff_corragg-sum_bbagg-sum,racs_bb-linker_all_prop-I_scope-1_propagg-product_corragg-avg_bbagg-avg,racs_bb-linker_all_prop-I_scope-1_propagg-product_corragg-avg_bbagg-sum,racs_bb-linker_all_prop-I_scope-1_propagg-product_corragg-sum_bbagg-avg,racs_bb-linker_all_prop-I_scope-1_propagg-product_corragg-sum_bbagg-sum,racs_bb-linker_all_prop-I_scope-2_propagg-diff_corragg-avg_bbagg-avg,racs_bb-linker_all_prop-I_scope-2_propagg-diff_corragg-avg_bbagg-sum,...,phhist_all_dim2_nx9_ny0_persistence,phhist_all_dim2_nx9_ny1_persistence,phhist_all_dim2_nx9_ny2_persistence,phhist_all_dim2_nx9_ny3_persistence,phhist_all_dim2_nx9_ny4_persistence,phhist_all_dim2_nx9_ny5_persistence,phhist_all_dim2_nx9_ny6_persistence,phhist_all_dim2_nx9_ny7_persistence,phhist_all_dim2_nx9_ny8_persistence,phhist_all_dim2_nx9_ny9_persistence
0,0.0,0.0,0.0,0.0,1.0,104.0,13.615385,1416.0,0.0,0.0,...,0.980811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.326937


We can also run the combined featurizer over multiple structures with a single call to `featurize_many`.

In [21]:
# Request the structures at indices 0-9 again and materialise them into a list,
# since the earlier `structures` generator has already been partly consumed.
structure_list = list(ds.get_structures(range(10)))

# NOTE(review): ignore_errors=True presumably makes failed structures yield
# placeholder values instead of raising; confirm against the installed
# matminer version and check the result for missing values before modelling.
feats_all = featurizer.featurize_many(structure_list, ignore_errors=True)

MultipleFeaturizer:   0%|          | 0/10 [00:00<?, ?it/s]

