# Example of use of featurizer and featurecollector

In [1]:
import pandas as pd
from glob import glob
import os 
from pathlib import Path

# this package
from mine_mof_oxstate.featurize import GetFeatures, FeatureCollector

# plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

pyplot.py: Loaded backend module://ipykernel.pylab.backend_inline version unknown.
pyplot.py: Loaded backend module://ipykernel.pylab.backend_inline version unknown.
pyplot.py: Loaded backend module://ipykernel.pylab.backend_inline version unknown.


In [2]:
example_structures = glob('structures/*.cif') 

In [3]:
example_structures

['structures/KAJZIH_freeONLY.cif',
 'structures/SnO_mp-2097_computed.cif',
 'structures/BaO_mp-1342_computed.cif',
 'structures/ACODAA.cif',
 'structures/BaO2_mp-1105_computed.cif',
 'structures/SnO2_mp-856_computed.cif']

Define the features we are interested in.

In [4]:
METAL_CENTER_FEATURES = [
    "column",
    "row",
    "valenceelectrons",
    "diffto18electrons",
    "sunfilled",
    "punfilled",
    "dunfilled",
]
GEOMETRY_FEATURES = ["crystal_nn_fingerprint", "behler_parinello"]
CHEMISTRY_FEATURES = ["local_property_stats"]

In [5]:
features_dict = {}
# Get the structures for which we also have features in the output folder
already_featurized = [Path(s).stem for s in glob("features/*.pkl")]

# Iterate over all structures
for s in example_structures:
    name = Path(s).stem
    # check if they are already in the output folder
    if (name not in already_featurized):
        print(name)
        # If they are not, then we will run the featurization for them
        # the features are written as pickle files to the 'features' folder
        gf = GetFeatures.from_file(s, 'features')
        gf.run_featurization()

Now, we can collect the features from this folder into a matrix.

In [7]:
import numpy as np 
features_dict = { }

# get all output files with features
features = glob('features/*.pkl')

for feature in features:
    try:
        rl = FeatureCollector.create_dict_for_feature_table(feature)
        print(rl)
        features = []
        for d in rl:
            features.append(d['feature'])
        features = np.vstack(features)
        features = FeatureCollector._select_features(CHEMISTRY_FEATURES + METAL_CENTER_FEATURES + ["crystal_nn_no_steinhardt"], features)
        # note, that this is a simplification for this example.
        features_dict[Path(feature).stem] = features
    except Exception as e:
        print(e)

[{'metal': 'Fe', 'coordinate_x': 4, 'coordinate_y': 4, 'coordinate_z': 1, 'feature': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.9987267606674715, 0.09778406061116889, 0.9705995190864324, 0.011267358265849929, 0.24224889739165628, 0.252896078678295, 0, 0, 0, 0, 0.00127323933252854, 0.0007034873666326401, 0.0002582258332394504, 0.000166449673726952, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27.98333628977027, 6.957058050830358, 2.1026278238939913, 1.1060637374861522, 0.1148969522281746, 2.636905450814201, 5.96319261499745, 0.0, 3.441184116411423, 0.1148969522281746, 2.636905450814201, 3.9754617433316324, 0.0, 1.2236593402892575, 6.560205894811526, 27.98333628977027, 5.348500719635913, -2.1026278238939913, 1.1060637374861522, -0.1148969522281746, 2.636905450814201, -5.96319261499745, 0.0, -3.441184116411423, 0.1148969522281746, 2.636905450814201, -3.9754617433316324, 0.0, -1.2236593402892575, 6.560205894811526, 37.0, 7.0, 0.0, 1.2

[{'metal': 'Sn', 'coordinate_x': 0, 'coordinate_y': 1, 'coordinate_z': 1, 'feature': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0.14389414868120215, 0.22635258630106428, 0.02587115377392368, 0.2672201906655181, 0.2749444608096665, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4.413551999021881, 1.2610148568633945, 1.891522285295092, 0.933150994078912, 0.0, 1.2610148568633945, 6.305074284316972, 0.0, 5.044059427453578, 0.0, 1.2610148568633945, 0.0, 0.0, 1.2610148568633945, 0.0, 4.413551999021881, 1.2610148568633945, -1.891522285295092, 0.933150994078912, 0.0, 1.2610148568633945, -6.305074284316972, 0.0, -5.044059427453578, 0.0, -1.2610148568633945, 0.0, 0.0, -1.2610148568633945, 0.0, 7.0, 2.0, 0.0, 1.48, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -3.0, 0.0, 0.0, 0.0, -10.0, 0.0, -8.0, 0.0, -2.0, 0.0, 0.0, -2.0, 0.0, 10.821801004372293, 3.476347233131916, 0.26844789561852944, 0.0001885

Let's look at the output. The keys of the dictionary are the names of the structures.

In [8]:
features_dict.keys()

dict_keys(['ACODAA', 'BaO2_mp-1105_computed', 'SnO2_mp-856_computed', 'KAJZIH_freeONLY', 'SnO_mp-2097_computed', 'BaO_mp-1342_computed'])

And the values are the feature values.

In [19]:
features_dict['BaO2_mp-1105_computed']

array([[ 7.71107094e+01,  1.38403837e+01,  3.95439535e+00,
         2.52092704e+00,  0.00000000e+00,  3.95439535e+00,
         0.00000000e+00,  0.00000000e+00,  3.95439535e+00,
         0.00000000e+00,  1.97719768e+00,  0.00000000e+00,
         0.00000000e+00,  1.97719768e+00,  0.00000000e+00,
         7.71107094e+01,  1.38403837e+01, -3.95439535e+00,
         2.52092704e+00,  0.00000000e+00,  3.95439535e+00,
         0.00000000e+00,  0.00000000e+00,  3.95439535e+00,
         0.00000000e+00,  1.97719768e+00,  0.00000000e+00,
         0.00000000e+00,  1.97719768e+00,  0.00000000e+00,
         7.80000000e+01,  1.40000000e+01,  0.00000000e+00,
         2.55000000e+00,  0.00000000e+00,  4.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  4.00000000e+00,
         0.00000000e+00,  2.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  2.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00, -4.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+0