# Example of use of featurizer and featurecollector

In [1]:
import pandas as pd
from glob import glob
import os 
from pathlib import Path

# this package
from oximachine_featurizer.featurize import GetFeatures, FeatureCollector

In [2]:
example_structures = glob('structures/*.cif') 

In [3]:
example_structures

['structures/KAJZIH_freeONLY.cif',
 'structures/SnO_mp-2097_computed.cif',
 'structures/BaO_mp-1342_computed.cif',
 'structures/UiO66_GC1.cif',
 'structures/ACODAA.cif',
 'structures/BaO2_mp-1105_computed.cif',
 'structures/SnO2_mp-856_computed.cif']

Define the features we are interested in.

In [4]:
METAL_CENTER_FEATURES = [
    "column",
    "row",
    "valenceelectrons",
    "diffto18electrons",
    "sunfilled",
    "punfilled",
    "dunfilled",
]
GEOMETRY_FEATURES = ["crystal_nn_fingerprint", "behler_parinello"]
CHEMISTRY_FEATURES = ["local_property_stats"]

In [5]:
features_dict = {}
# Get the structures for which we also have features in the output folder
already_featurized = [Path(s).stem for s in glob("features/*.pkl")]

# Iterate over all structures
for s in example_structures:
    name = Path(s).stem
    # check if they are already in the output folder
    # if (name not in already_featurized):
    #     print(name)
    # If they are not, then we will run the featurization for them
    # the features are written as pickle files to the 'features' folder
    gf = GetFeatures.from_file(s, 'features')
    gf.run_featurization()

featurize.py: iterating over 16 metal sites
featurize.py: iterating over 2 metal sites
featurize.py: iterating over 4 metal sites
featurize.py: iterating over 24 metal sites
featurize.py: iterating over 2 metal sites
featurize.py: iterating over 2 metal sites
featurize.py: iterating over 2 metal sites


Now, we can collect the features from this folder into a matrix.

In [6]:
import numpy as np 
features_dict = { }

# get all output files with features
features = glob('features/*.pkl')

for feature in features:
    try:
        rl = FeatureCollector.create_dict_for_feature_table(feature)
        print(rl)
        features = []
        for d in rl:
            features.append(d['feature'])
        features = np.vstack(features)
        features = FeatureCollector._select_features(CHEMISTRY_FEATURES + METAL_CENTER_FEATURES + ["crystal_nn_no_steinhardt"], features)
        # note, that this is a simplification for this example.
        features_dict[Path(feature).stem] = features
    except Exception as e:
        print(e)

[{'metal': 'Zr', 'coordinate_x': 2, 'coordinate_y': 0, 'coordinate_z': 0, 'feature': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.012144928568813614, 0.002363518542180815, 0.0016663896982716338, 0.000562834740986501, 0.003969949437084093, 0.0040569855620313814, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.9878550714311863, -0.010659354165259273, 0.20279424084613343, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42.99999999999999, 12.0, 3.0, 2.11, 0.0, 4.0, 2.0, 0.0, 2.0, 0.0, 2.0, 8.0, 0.0, 6.0, 0.0, 42.99999999999999, 12.0, -3.0, 2.11, 0.0, 4.0, -2.0, 0.0, 2.0, 0.0, 2.0, -8.0, 0.0, -6.0, 0.0, 43.0, 12.0, -3.0, 2.11, 0.0, 4.0, -2.0, 0.0, 2.0, 0.0, 2.0, -8.0, 0.0, -6.0, 0.0, 43.0, 12.0, -3.0, 2.11, 0.0, 4.0, -2.0, 0.0, 2.0, 0.0, 2.0, -8.0, 0.0, -6.0, 0.0, 19.466111011895165, 7.336235248969159, 0.6282169399285839, 0.00044807085817113074, 72.05159299006988, 28.151654318756815, 40.40330378392711, 5.589917849572598, 40, 5, 4, 4.0, 4.0, 0.0, 0.0, 8.0, 16], 'name': 'U

Let's look at the output. The keys of the dictionary are the names of the structures.

In [9]:
features_dict.keys()

dict_keys(['ACODAA', 'BaO2_mp-1105_computed', 'SnO2_mp-856_computed', 'KAJZIH_freeONLY', 'SnO_mp-2097_computed', 'BaO_mp-1342_computed'])

And the values are the feature values.

In [11]:
len(features_dict['BaO2_mp-1105_computed'][0])

116