# Example of use of featurizer and featurecollector

In [1]:
import pandas as pd
from glob import glob
import os 
from pathlib import Path

# this package
from mine_mof_oxstate.featurize import GetFeatures, FeatureCollector

# plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

lities.misc', 'sympy.utilities.lambdify', 'sympy.utilities.decorator', 'sympy.core.decorators', 'sympy.utilities.runtests', 'doctest', 'sympy.utilities.exceptions', 'sympy.utilities.source', 'sympy.utilities.timeutils', 'sympy.core.mul', 'sympy.core.operations', 'sympy.core.numbers', 'sympy.core.containers', 'sympy.core.power', 'sympy.core.function', 'sympy.core.add', 'sympy.core.rules', 'sympy.core.symbol', 'sympy.logic', 'sympy.logic.boolalg', 'sympy.logic.inference', 'sympy.core.mod', 'sympy.core.exprtools', 'sympy.core.coreerrors', 'sympy.core.relational', 'sympy.core.multidimensional', 'sympy.assumptions', 'sympy.assumptions.assume', 'sympy.assumptions.ask', 'sympy.assumptions.cnf', 'sympy.assumptions.ask_generated', 'sympy.assumptions.refine', 'sympy.polys', 'sympy.polys.polytools', 'sympy.polys.polyoptions', 'sympy.polys.polyerrors', 'sympy.polys.constructor', 'sympy.polys.domains', 'sympy.polys.domains.domain', 'sympy.polys.domains.domainelement', 'sympy.polys.orderings', 'symp

In [2]:
example_structures = glob('structures/*.cif') 

In [3]:
example_structures

['structures/KAJZIH_freeONLY.cif',
 'structures/SnO_mp-2097_computed.cif',
 'structures/BaO_mp-1342_computed.cif',
 'structures/ACODAA.cif',
 'structures/BaO2_mp-1105_computed.cif',
 'structures/SnO2_mp-856_computed.cif']

Define the features we are interested in.

In [4]:
METAL_CENTER_FEATURES = [
    "column",
    "row",
    "valenceelectrons",
    "diffto18electrons",
    "sunfilled",
    "punfilled",
    "dunfilled",
]
GEOMETRY_FEATURES = ["crystal_nn_fingerprint", "behler_parinello"]
CHEMISTRY_FEATURES = ["local_property_stats"]

In [6]:
features_dict = {}
# Get the structures for which we also have features in the output folder
already_featurized = [Path(s).stem for s in glob("features/*.pkl")]

# Iterate over all structures
for s in example_structures:
    name = Path(s).stem
    # check if they are already in the output folder
    if (name not in already_featurized):
        print(name)
        # If they are not, then we will run the featurization for them
        # the features are written as pickle files to the 'features' folder
        gf = GetFeatures.from_file(s, 'features')
        gf.run_featurization()

Now, we can collect the features from this folder into a matrix.

In [7]:
import numpy as np 
features_dict = { }

# get all output files with features
features = glob('features/*.pkl')

for feature in features:
    try:
        rl = FeatureCollector.create_dict_for_feature_table(feature)
        print(rl)
        features = []
        for d in rl:
            features.append(d['feature'])
        features = np.vstack(features)
        features = FeatureCollector._select_features(CHEMISTRY_FEATURES + METAL_CENTER_FEATURES + ["crystal_nn_no_steinhardt"], features)
        # note, that this is a simplification for this example.
        features_dict[Path(feature).stem] = features
    except Exception as e:
        print(e)

74565, 0.0, -3.943968626392696, -0.6940619366174564, 1.861782816540326, 0.0, 0.0, 1.16772087992287, 1.0647853045505702, 23.0, 5.0, 0.0, 1.54, 1.0, 4.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 3.0, 4.496, 0.0, 0.0, -2.0, 0.0, 0.0, 0.0, -10.0, 0.0, -7.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 17.435763635419587, 5.374581696800158, 0.40614835815524203, 0.002841778596625798, 51.2414469709615, 15.522830035691467, 31.686917546299355, 2.4960708542616, 29, 4, 11, 11.0, 7.0, 1.0, 0.0, 0.0, 17], 'name': 'KAJZIH_freeONLY'}, {'metal': 'Cu', 'coordinate_x': 15, 'coordinate_y': 3, 'coordinate_z': 3, 'feature': [0, 0, 0.8526361170230569, 1.0017993606954586e-08, 4.377493898063098e-05, 0.007498845592509233, 0.6111369862632468, 0.5807478372602991, 0, 0, 0, 0, 0.14736388297694308, 0.054042050882633336, 0.042474899118055215, 0.05968076080103379, 0.07867041227033907, 0.07921223858604352, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13.5

Let's look at the output. The keys of the dictionary are the names of the structures.

In [8]:
features_dict.keys()

dict_keys(['ACODAA', 'BaO2_mp-1105_computed', 'SnO2_mp-856_computed', 'KAJZIH_freeONLY', 'SnO_mp-2097_computed', 'BaO_mp-1342_computed'])

And the values are the feature values.

In [9]:
features_dict['BaO2_mp-1105_computed']

array([[ 7.71107094e+01,  1.38403837e+01,  3.95439535e+00,
         2.52092704e+00,  0.00000000e+00,  3.95439535e+00,
         0.00000000e+00,  0.00000000e+00,  3.95439535e+00,
         0.00000000e+00,  1.97719768e+00,  0.00000000e+00,
         0.00000000e+00,  1.97719768e+00,  0.00000000e+00,
         7.71107094e+01,  1.38403837e+01, -3.95439535e+00,
         2.52092704e+00,  0.00000000e+00,  3.95439535e+00,
         0.00000000e+00,  0.00000000e+00,  3.95439535e+00,
         0.00000000e+00,  1.97719768e+00,  0.00000000e+00,
         0.00000000e+00,  1.97719768e+00,  0.00000000e+00,
         7.80000000e+01,  1.40000000e+01,  0.00000000e+00,
         2.55000000e+00,  0.00000000e+00,  4.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  4.00000000e+00,
         0.00000000e+00,  2.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  2.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00, -4.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+0