# Example of use of featurizer and featurecollector

In [2]:
import pandas as pd
from glob import glob
import os 
from pathlib import Path

# this package
from oximachine_featurizer.featurize import GetFeatures, FeatureCollector

In [3]:
example_structures = glob('structures/*.cif') 

In [4]:
example_structures

['structures/KAJZIH_freeONLY.cif',
 'structures/SnO_mp-2097_computed.cif',
 'structures/BaO_mp-1342_computed.cif',
 'structures/ACODAA.cif',
 'structures/BaO2_mp-1105_computed.cif',
 'structures/SnO2_mp-856_computed.cif']

Define the features we are interested in.

In [5]:
METAL_CENTER_FEATURES = [
    "column",
    "row",
    "valenceelectrons",
    "diffto18electrons",
    "sunfilled",
    "punfilled",
    "dunfilled",
]
GEOMETRY_FEATURES = ["crystal_nn_fingerprint", "behler_parinello"]
CHEMISTRY_FEATURES = ["local_property_stats"]

In [6]:
features_dict = {}
# Get the structures for which we also have features in the output folder
already_featurized = [Path(s).stem for s in glob("features/*.pkl")]

# Iterate over all structures
for s in example_structures:
    name = Path(s).stem
    # check if they are already in the output folder
    # if (name not in already_featurized):
    #     print(name)
    # If they are not, then we will run the featurization for them
    # the features are written as pickle files to the 'features' folder
    gf = GetFeatures.from_file(s, 'features')
    gf.run_featurization()

featurize.py: iterating over 16 metal sites
featurize.py: iterating over 2 metal sites
featurize.py: iterating over 4 metal sites
featurize.py: iterating over 2 metal sites
featurize.py: iterating over 2 metal sites
featurize.py: iterating over 2 metal sites


Now, we can collect the features from this folder into a matrix.

In [8]:
import numpy as np 
features_dict = { }

# get all output files with features
features = glob('features/*.pkl')

for feature in features:
    try:
        rl = FeatureCollector.create_dict_for_feature_table(feature)
        print(rl)
        features = []
        for d in rl:
            features.append(d['feature'])
        features = np.vstack(features)
        features = FeatureCollector._select_features(CHEMISTRY_FEATURES + METAL_CENTER_FEATURES + ["crystal_nn_no_steinhardt"], features)
        # note, that this is a simplification for this example.
        features_dict[Path(feature).stem] = features
    except Exception as e:
        print(e)

.0, 5.0, 0.0, 1.54, 1.0, 4.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 3.0, 4.496, 0.0, 0.0, -2.0, 0.0, 0.0, 0.0, -10.0, 0.0, -7.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 17.435763635419583, 5.374581696800158, 0.4061483581552421, 0.0028417785966257983, 51.2414469709615, 15.522830035691467, 31.68691754629936, 2.496070854261599, 29, 4, 11, 11.0, 7.0, 1.0, 0.0, 0.0, 9], 'name': 'KAJZIH_freeONLY'}, {'metal': 'Cu', 'coordinate_x': 15, 'coordinate_y': 3, 'coordinate_z': 3, 'feature': [0, 0, 0.8526361170230569, 1.0017993606954692e-08, 4.3774938980631365e-05, 0.007498845592509273, 0.6111369862632474, 0.5807478372602987, 0, 0, 0, 0, 0.14736388297694308, 0.054042050882633336, 0.042474899118055215, 0.05968076080103379, 0.07867041227033911, 0.07921223858604357, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13.595129825674428, 2.9966507397818676, 1.3881238732349128, 0.8580771526199736, 0.6940619366174564, 2.302588803164412, 6.9406

Let's look at the output. The keys of the dictionary are the names of the structures.

In [9]:
features_dict.keys()

dict_keys(['ACODAA', 'BaO2_mp-1105_computed', 'SnO2_mp-856_computed', 'KAJZIH_freeONLY', 'SnO_mp-2097_computed', 'BaO_mp-1342_computed'])

And the values are the feature values.

In [11]:
len(features_dict['BaO2_mp-1105_computed'][0])

116