In [1]:
import lmdb
import pickle
import glob
import os
import pandas as pd
from torch_geometric.data import Data

In [3]:
train_dir = "oc22/is2re-total/train"
train_files = sorted(glob.glob(os.path.join(train_dir, "data.*.lmdb")))

# all the properties available in the metadata pickle file
columns = [
    "sid",
    "y_relaxed",
    "bulk_id",
    "bulk_symbols",
    "miller_index",
    "traj_id",
    "slab_sid",
    "ads_symbols",
    "nads"
]

# read metadata file for bulk symbol
# SECURITY NOTE: unpickling can execute arbitrary code. Only load the metadata
# file and the LMDB shards below when they come from a trusted source.
with open("oc22_metadata.pkl", "rb") as file:
    metadata = pickle.load(file)

all_data = []
skipped = 0          # entries that could not be turned into a row
first_error = None   # remembered so the summary can show what went wrong

for lmdb_file in train_files:
    env = lmdb.open(lmdb_file, readonly=True, subdir=False, lock=False)
    try:
        with env.begin() as txn:
            num_entries = txn.stat()['entries']
            print(f"Processing {lmdb_file}: {num_entries} entries")

            # the LMDB key is not needed; it is bound to `_lmdb_key` so the loop
            # does not overwrite the notebook-level `key` variable that a later
            # cell uses for the Materials Project API key
            for _lmdb_key, value in txn.cursor():
                try:
                    # load pytorch data obj manually due to the incompatible PyG version of OC22 dataset
                    data = Data.from_dict(pickle.loads(value).__dict__)

                    # get target value and sid from training dataset
                    sid = data.sid
                    y_relaxed = data.y_relaxed

                    row = {
                        "sid": sid,
                        "y_relaxed": y_relaxed
                    }

                    # grab rest of the data from metadata
                    if sid in metadata:
                        row.update(metadata[sid])

                    all_data.append(row)
                except Exception as exc:
                    # best-effort: keep going, but count the failure instead of
                    # hiding it (a bare `except:` would also swallow Ctrl-C)
                    skipped += 1
                    if first_error is None:
                        first_error = repr(exc)
    finally:
        # release the LMDB handle even if reading this shard fails
        env.close()

if skipped:
    print(f"Skipped {skipped} entries that could not be parsed (first error: {first_error})")

df = pd.DataFrame(all_data, columns=columns)
df.to_csv("training_FULL.csv", index=False)

Processing oc22/is2re-total/train/data.0000.lmdb: 1121 entries
Processing oc22/is2re-total/train/data.0001.lmdb: 1121 entries
Processing oc22/is2re-total/train/data.0002.lmdb: 1121 entries
Processing oc22/is2re-total/train/data.0003.lmdb: 1121 entries
Processing oc22/is2re-total/train/data.0004.lmdb: 1121 entries
Processing oc22/is2re-total/train/data.0005.lmdb: 1121 entries
Processing oc22/is2re-total/train/data.0006.lmdb: 1121 entries
Processing oc22/is2re-total/train/data.0007.lmdb: 1121 entries
Processing oc22/is2re-total/train/data.0008.lmdb: 1121 entries
Processing oc22/is2re-total/train/data.0009.lmdb: 1121 entries
Processing oc22/is2re-total/train/data.0010.lmdb: 1121 entries
Processing oc22/is2re-total/train/data.0011.lmdb: 1120 entries
Processing oc22/is2re-total/train/data.0012.lmdb: 1120 entries
Processing oc22/is2re-total/train/data.0013.lmdb: 1120 entries
Processing oc22/is2re-total/train/data.0014.lmdb: 1120 entries
Processing oc22/is2re-total/train/data.0015.lmdb: 1120 

In [4]:
# select the samples whose adsorbate is H
# boolean mask: True where ads_symbols equals 'H' and nads equals 1
contains_h_adsorbate = df['ads_symbols'].eq('H') & df['nads'].eq(1)

# robust mask for rows with no adsorbates: handles None/NaN, empty lists, and stringified empty lists like '[]'
def no_ads(x):
    """Return True when ``x`` stands for "no adsorbate".

    Treated as "no adsorbate": None / NaN, an empty list, tuple or set, and
    the strings '', '[]' and 'None' (case-insensitive, surrounding whitespace
    ignored). Everything else returns False.
    """
    # actual list/tuple/set: decide on emptiness alone.
    # This check must come before pd.isna(): for a list pd.isna() returns an
    # element-wise array, and using that array in an `if` raises ValueError.
    if isinstance(x, (list, tuple, set)):
        return len(x) == 0

    # None or NaN (only scalars are tested, so array-likes cannot reach the `if`)
    if pd.api.types.is_scalar(x) and pd.isna(x):
        return True

    # string forms: '', '[]', 'None'
    s = str(x).strip()
    return s == '' or s == '[]' or s.lower() == 'none'

# creates a mask that is True where the row has no adsorbate
contains_no_ads = df['ads_symbols'].apply(no_ads)

df_only_h = df[contains_h_adsorbate]
df_only_h = df_only_h.reset_index(drop=True)

df_no_ads = df[contains_no_ads]
df_no_ads = df_no_ads.reset_index(drop=True)

df_only_h.to_csv("training_FULL_h.csv", index=False)
df_no_ads.to_csv("training_FULL_no_ads.csv", index=False)

# --------------------------------------------------------------
# randomly sample 100 data points from each dataset
# for the purpose of testing the featurizer. 
# remember to remove this section later

SAMPLE_SIZE = 100   # rows drawn from each dataset
SAMPLE_SEED = 42    # fixed seed so a re-run draws the same rows

# cap n at the frame length: DataFrame.sample raises when asked for more rows
# than exist (sampling without replacement)
df_only_h_sample = df_only_h.sample(
    n=min(SAMPLE_SIZE, len(df_only_h)), random_state=SAMPLE_SEED
)
df_no_ads_sample = df_no_ads.sample(
    n=min(SAMPLE_SIZE, len(df_no_ads)), random_state=SAMPLE_SEED
)
df_only_h_sample.to_csv("training_SAMPLE_h.csv", index=False)
df_no_ads_sample.to_csv("training_SAMPLE_no_ads.csv", index=False)

print(f"size of entire dataset: {len(df)}")
print(f"size of H-ads dataset: {len(df_only_h)}")
print(f"size of no-ads dataset: {len(df_no_ads)}")
print(f"sample size: n = {len(df_only_h_sample)}")

size of entire dataset: 45890
size of H-ads dataset: 1605
size of no-ads dataset: 14646
sample size: n = 100


In [5]:
# featurization
# the sample datasets are used here to test the featurizers;
# bind df_only_h / df_no_ads instead to process the full datasets
h, no_ads = df_only_h_sample, df_no_ads_sample

# keep only the bulk identifier, the bulk formula and the target value
h_clean, no_ads_clean = (
    frame[['bulk_id', 'bulk_symbols', 'y_relaxed']] for frame in (h, no_ads)
)

print(h_clean.head(), no_ads_clean.head(), sep="\n")

         bulk_id bulk_symbols    y_relaxed
302     mp-27130     Li4W4O14  -249.670162
693   mp-1095546        Re4O8  -378.716662
65      mp-29799   K12Bi12O36  -529.147894
904    mvc-14111  Al20Fe12O48 -1137.260388
1341     mp-4359     Sr4Pd2O6  -404.913352
             bulk_id bulk_symbols   y_relaxed
1333        mp-17853     Y6Ga6O18 -880.212070
12800     mp-1176491    Mg6Sn6O18 -624.031903
12056      mvc-15159     Mg4Co4O8 -565.615959
2968        mp-23195       Bi8O12 -624.859030
5324   TlBiO4-rutile     Tl1Bi1O4 -333.348905


In [6]:
# convert bulk symbols to Composition object
from pymatgen.core import Composition

# copy-on-write lets the column assignments below modify the *_clean frames
# without touching the frames they were sliced from
pd.options.mode.copy_on_write = True

# replace each formula string with a pymatgen Composition
h_clean['bulk_symbols'] = h_clean['bulk_symbols'].apply(Composition)
no_ads_clean['bulk_symbols'] = no_ads_clean['bulk_symbols'].apply(Composition)

# show the first few compositions of each dataset as a sanity check
for banner, frame in (
    ("--- H-ads ---", h_clean),
    ("\n--- no-ads ---", no_ads_clean),
):
    print(banner)
    for c in frame['bulk_symbols'].head():
        print(f"formula: {c.formula}; num of atoms: {c.num_atoms}")

--- H-ads ---
formula: Li4 W4 O14; num of atoms: 22.0
formula: Re4 O8; num of atoms: 12.0
formula: K12 Bi12 O36; num of atoms: 60.0
formula: Al20 Fe12 O48; num of atoms: 80.0
formula: Sr4 Pd2 O6; num of atoms: 12.0

--- no-ads ---
formula: Y6 Ga6 O18; num of atoms: 30.0
formula: Mg6 Sn6 O18; num of atoms: 30.0
formula: Mg4 Co4 O8; num of atoms: 16.0
formula: Bi8 O12; num of atoms: 20.0
formula: Tl1 Bi1 O4; num of atoms: 6.0


We should give a clear justification for each featurizer we use. 

I removed the elemental-fraction features because they are too sparse and inflate the dimensionality of the feature space.

In [None]:
from matminer.featurizers.composition import ElementProperty
from matminer.featurizers.composition import Stoichiometry
from matminer.featurizers.composition import ValenceOrbital
from matminer.featurizers.composition import TMetalFraction
from matminer.featurizers.composition import BandCenter
from matminer.featurizers.composition import CohesiveEnergy
from matminer.featurizers.composition import AtomicOrbitals

from dotenv import load_dotenv


# make sure to have your materials project api in the .env file
load_dotenv()
key = os.getenv('API_KEY')
# Report only whether a key was found. Printing the key itself would store the
# secret in the notebook's saved output, where it is shared with the file.
print(f"API KEY loaded: {bool(key)}")

def featurize_composition(df):
    """Return a copy of ``df`` extended with matminer composition features.

    Every featurizer below is run single-process (``set_n_jobs(1)``) on the
    'bulk_symbols' column, and each call's output frame is fed to the next
    featurizer. The input frame is not modified; its shape is printed once
    before featurization starts.
    """
    result = df.copy()
    print("Initial df shape:", result.shape)

    pipeline = (
        ElementProperty.from_preset("magpie"),
        Stoichiometry(),
        ValenceOrbital(),
        TMetalFraction(),
        BandCenter(),
        # CohesiveEnergy(mapi_key=key),   (currently disabled)
        AtomicOrbitals(),
    )

    for step in pipeline:
        step.set_n_jobs(1)
        result = step.featurize_dataframe(result, 'bulk_symbols')

    return result

API KEY: [REDACTED]


In [70]:
import warnings


# suppress annoying warnings
# (only warnings whose message matches the regex '.*impute_nan.*' are ignored)
warnings.filterwarnings('ignore', message='.*impute_nan.*')

# warning: this step can take a while!
# run the composition featurizers on the cleaned H-ads and no-ads frames
h_comp_featurized = featurize_composition(h_clean)
no_ads_comp_featurized = featurize_composition(no_ads_clean)

Initial df shape: (100, 3)


ElementProperty:   0%|          | 0/100 [00:00<?, ?it/s]

Stoichiometry:   0%|          | 0/100 [00:00<?, ?it/s]

ValenceOrbital:   0%|          | 0/100 [00:00<?, ?it/s]

TMetalFraction:   0%|          | 0/100 [00:00<?, ?it/s]

BandCenter:   0%|          | 0/100 [00:00<?, ?it/s]

AtomicOrbitals:   0%|          | 0/100 [00:00<?, ?it/s]

Initial df shape: (100, 3)


ElementProperty:   0%|          | 0/100 [00:00<?, ?it/s]

Stoichiometry:   0%|          | 0/100 [00:00<?, ?it/s]

ValenceOrbital:   0%|          | 0/100 [00:00<?, ?it/s]

TMetalFraction:   0%|          | 0/100 [00:00<?, ?it/s]

BandCenter:   0%|          | 0/100 [00:00<?, ?it/s]

AtomicOrbitals:   0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# preview the first rows of the composition-featurized H-ads frame
h_comp_featurized.head()

Unnamed: 0,bulk_id,bulk_symbols,y_relaxed,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,...,frac f valence electrons,transition metal fraction,band center,HOMO_character,HOMO_element,HOMO_energy,LUMO_character,LUMO_element,LUMO_energy,gap_AO
302,mp-27130,"(Li, W, O)",-249.670162,3.0,74.0,71.0,19.090909,19.966942,8.0,1.0,...,0.333333,0.181818,5.782963,p,O,-0.338381,d,W,-0.220603,0.117778
693,mp-1095546,"(Re, O)",-378.716662,8.0,75.0,67.0,30.333333,29.777778,8.0,54.0,...,0.424242,0.333333,6.110761,d,Re,-0.258639,d,Re,-0.258639,0.0
65,mp-29799,"(K, Bi, O)",-529.147894,8.0,83.0,75.0,25.2,23.12,8.0,3.0,...,0.291667,0.0,5.322543,p,O,-0.338381,p,O,-0.338381,0.0
904,mvc-14111,"(Al, Fe, O)",-1137.260388,8.0,26.0,18.0,11.95,4.74,8.0,55.0,...,0.0,0.15,5.546183,d,Fe,-0.295049,d,Fe,-0.295049,0.0
1341,mp-4359,"(Sr, Pd, O)",-404.913352,8.0,46.0,38.0,24.333333,16.333333,8.0,8.0,...,0.0,0.166667,5.006266,d,Pd,-0.160771,d,Pd,-0.160771,0.0


In [None]:
# preview the first rows of the composition-featurized no-ads frame
no_ads_comp_featurized.head()

Unnamed: 0,bulk_id,bulk_symbols,y_relaxed,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,...,frac f valence electrons,transition metal fraction,band center,HOMO_character,HOMO_element,HOMO_energy,LUMO_character,LUMO_element,LUMO_energy,gap_AO
1333,mp-17853,"(Y, Ga, O)",-880.21207,8.0,39.0,31.0,18.8,12.96,8.0,12.0,...,0.0,0.0,5.355295,p,O,-0.338381,s,Ga,-0.328019,0.010362
12800,mp-1176491,"(Mg, Sn, O)",-624.031903,8.0,50.0,42.0,17.2,13.12,8.0,68.0,...,0.0,0.0,5.863217,p,O,-0.338381,p,O,-0.338381,0.0
12056,mvc-15159,"(Mg, Co, O)",-565.615959,8.0,27.0,19.0,13.75,6.625,8.0,58.0,...,0.0,0.25,5.519684,d,Co,-0.322368,d,Co,-0.322368,0.0
2968,mp-23195,"(Bi, O)",-624.85903,8.0,83.0,75.0,38.0,36.0,8.0,86.0,...,0.368421,0.0,5.918403,p,O,-0.338381,p,Bi,-0.180198,0.158183
5324,TlBiO4-rutile,"(Tl, Bi, O)",-333.348905,8.0,83.0,75.0,32.666667,32.888889,8.0,76.0,...,0.35,0.0,5.894746,p,O,-0.338381,p,O,-0.338381,0.0


In [None]:
# write each composition-featurized frame to its own CSV (row index omitted)
for csv_name, featurized in (
    ("training_SAMPLE_h_comp_featurized.csv", h_comp_featurized),
    ("training_SAMPLE_no_ads_comp_featurized.csv", no_ads_comp_featurized),
):
    featurized.to_csv(csv_name, index=False)

possible MP fields that can be requested by API:

['builder_meta',
 'nsites',
 'elements',
 'nelements',
 'composition',
 'composition_reduced',
 'formula_pretty',
 'formula_anonymous',
 'chemsys',
 'volume',
 'density',
 'density_atomic',
 'symmetry',
 'deprecated',
 'deprecation_reasons',
 'last_updated',
 'origins',
 'warnings',
 'property_name',
 'task_ids',
 'uncorrected_energy_per_atom',
 'formation_energy_per_atom',
 'energy_above_hull',
 'is_stable',
 'equilibrium_reaction_energy_per_atom',
 'decomposes_to',
 'xas',
 'grain_boundaries',
 'band_gap',
 'cbm',
 'vbm',
 'efermi',
 'is_gap_direct',
 'is_metal',
 'es_source_calc_id',
 'bandstructure',
 'dos',
 'dos_energy_up',
 'dos_energy_down',
 'is_magnetic',
 'ordering',
 'total_magnetization',
 'total_magnetization_normalized_vol',
 'total_magnetization_normalized_formula_units',
 'num_magnetic_sites',
 'num_unique_magnetic_sites',
 'types_of_magnetic_species',
 'bulk_modulus',
 'shear_modulus',
 'universal_anisotropy',
 'homogeneous_poisson',
 'e_total',
 'e_ionic',
 'e_electronic',
 'n',
 'e_ij_max',
 'weighted_surface_energy_EV_PER_ANG2',
 'weighted_surface_energy',
 'weighted_work_function',
 'surface_anisotropy',
 'shape_factor',
 'has_reconstructed',
 'possible_species',
 'has_props',
 'theoretical',
 'database_IDs']

DensityFeatures:
- density
- vpa
- packing fraction

GlobalSymmetryFeatures:
- spacegroup_num
- crystal_system
- crystal_system_int
- is_centrosymmetric
- n_symmetry_ops

In [61]:
# get structure properties
from mp_api.client import MPRester


def fetch_structure(df):
    """Attach a Materials Project structure to every row of ``df``.

    For each entry of ``df['bulk_symbols']`` the reduced formula is searched
    through the MP summary endpoint, and the structure of the returned
    document with the lowest ``energy_per_atom`` is kept.

    Each distinct formula is queried only once; rows sharing a formula share
    the same structure object. Failed requests are not cached, so a later row
    with the same formula triggers a new attempt.

    NOTE(review): the lookup is by formula, not by the 'bulk_id' column, so
    the structure chosen may not be the bulk the row was generated from;
    confirm this is intended.

    Parameters
    ----------
    df : DataFrame with columns 'bulk_symbols', 'bulk_id' and 'y_relaxed'.

    Returns
    -------
    A copy of ``df`` without 'bulk_id' and 'y_relaxed' and with a new
    'structure' column (None where no structure could be obtained).
    """
    structures = []
    found = {}  # reduced formula -> structure (or None when MP has no match)

    def _energy_rank(doc):
        # documents without an energy sort last instead of breaking min()
        energy = doc.energy_per_atom
        return (energy is None, 0.0 if energy is None else energy)

    with MPRester(key) as mpr:
        for composition in df['bulk_symbols']:
            formula = composition.reduced_formula

            if formula not in found:
                try:
                    # retrieve SummaryDoc
                    docs = mpr.materials.summary.search(
                        formula=formula,
                        fields=['structure', 'energy_per_atom']
                    )
                except Exception as exc:
                    # report the real cause; do not cache so the request can be retried
                    structures.append(None)
                    print(f"unable to communicate with MP REST API for {formula}: {exc}")
                    continue

                if docs:
                    found[formula] = min(docs, key=_energy_rank).structure
                else:
                    found[formula] = None
                    print(f"unable to get structure for {formula}")

            structures.append(found[formula])

    result_df = df.copy().drop(columns=['bulk_id', 'y_relaxed'])
    result_df['structure'] = structures
    return result_df

In [63]:
# fetch one structure per row for both cleaned frames (queries the MP API)
h_structures = fetch_structure(h_clean)
no_ads_structures = fetch_structure(no_ads_clean)

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/4 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/4 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/12 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents: 0it [00:00, ?it/s]

unable to get structure for BiRhO4


Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/4 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/5 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/5 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/9 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/4 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/4 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/4 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/5 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/10 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents: 0it [00:00, ?it/s]

unable to get structure for GaRhO4


Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/5 [00:00<?, ?it/s]

Retrieving SummaryDoc documents: 0it [00:00, ?it/s]

unable to get structure for InPdO4


Retrieving SummaryDoc documents:   0%|          | 0/6 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/21 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents: 0it [00:00, ?it/s]

unable to get structure for GeSbO4


Retrieving SummaryDoc documents: 0it [00:00, ?it/s]

unable to get structure for IrPtO4


Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/4 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/5 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/8 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/8 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/9 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/9 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents: 0it [00:00, ?it/s]

unable to get structure for RuRhO4


Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/5 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/29 [00:00<?, ?it/s]

Retrieving SummaryDoc documents: 0it [00:00, ?it/s]

unable to get structure for TlBiO4


Retrieving SummaryDoc documents:   0%|          | 0/23 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents: 0it [00:00, ?it/s]

unable to get structure for AgTeO4


Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/18 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents: 0it [00:00, ?it/s]

unable to get structure for PtAuO4


Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/7 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/4 [00:00<?, ?it/s]

Retrieving SummaryDoc documents: 0it [00:00, ?it/s]

unable to get structure for CoAuO4


Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/15 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents: 0it [00:00, ?it/s]

unable to get structure for SnRuO4


Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/3 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/7 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/6 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/16 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/4 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/5 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/12 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/7 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/17 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents: 0it [00:00, ?it/s]

unable to get structure for CuAgO4


Retrieving SummaryDoc documents:   0%|          | 0/5 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/4 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/4 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieving SummaryDoc documents:   0%|          | 0/1 [00:00<?, ?it/s]

In [64]:
# preview the first rows of the fetched H-ads structures
h_structures.head()

Unnamed: 0,bulk_symbols,structure
302,"(Li, W, O)","[[6.20487132 9.06384169 5.58873846] Li, [1.194..."
693,"(Re, O)","[[1.14626867 2.42691622 3.43674322] Re, [3.438..."
65,"(K, Bi, O)","[[7.54990346 7.54990346 7.54990346] K, [2.5166..."
904,"(Al, Fe, O)",[[-1.57311829e-17 3.36348733e+00 2.30274786e...
1341,"(Sr, Pd, O)","[[2.01840774 2.47061523 0.7154461 ] Sr, [1.098..."


In [None]:
from matminer.featurizers.structure import DensityFeatures 
from matminer.featurizers.structure import GlobalSymmetryFeatures
from matminer.featurizers.structure import SiteStatsFingerprint
from matminer.featurizers.structure import StructuralHeterogeneity
from matminer.featurizers.structure import MaximumPackingEfficiency
from matminer.featurizers.structure import ChemicalOrdering
from matminer.featurizers.structure import CoulombMatrix


def featurize_structure(df):
    """Return a copy of ``df`` extended with matminer structure features.

    All featurizers read the 'structure' column, run single-process
    (``set_n_jobs(1)``) and are called with ``ignore_errors=True``. The
    Coulomb matrix featurizer runs last and is first fitted on the non-null
    structures of the frame being featurized.
    """
    result = df.copy()

    def _run(featurizer, frame):
        # shared call convention for every featurizer in this function
        featurizer.set_n_jobs(1)
        return featurizer.featurize_dataframe(
            frame,
            'structure',
            ignore_errors=True
        )

    for step in (
        DensityFeatures(),
        GlobalSymmetryFeatures(),
        SiteStatsFingerprint.from_preset("LocalPropertyDifference_ward-prb-2017"),
        StructuralHeterogeneity(),
        MaximumPackingEfficiency(),
        ChemicalOrdering(),
    ):
        result = _run(step, result)

    # the Coulomb matrix needs a fit on the available structures before use
    cm_featurizer = CoulombMatrix(flatten=True)
    available_structures = result['structure'].dropna().tolist()
    cm_featurizer.fit(available_structures)

    return _run(cm_featurizer, result)

In [83]:
# run the structure featurizers on the H-ads structures
print("Featurizing h-ads dataset: ")
h_structure_featurized = featurize_structure(h_structures)

# run the structure featurizers on the no-ads structures
print("Featurizing no-ads dataset:")
no_ads_structure_featurized = featurize_structure(no_ads_structures)

Featurizing h-ads dataset: 


DensityFeatures:   0%|          | 0/100 [00:00<?, ?it/s]

GlobalSymmetryFeatures:   0%|          | 0/100 [00:00<?, ?it/s]

SiteStatsFingerprint:   0%|          | 0/100 [00:00<?, ?it/s]

StructuralHeterogeneity:   0%|          | 0/100 [00:00<?, ?it/s]

MaximumPackingEfficiency:   0%|          | 0/100 [00:00<?, ?it/s]

ChemicalOrdering:   0%|          | 0/100 [00:00<?, ?it/s]

CoulombMatrix:   0%|          | 0/100 [00:00<?, ?it/s]

Featurizing no-ads dataset:


DensityFeatures:   0%|          | 0/100 [00:00<?, ?it/s]

GlobalSymmetryFeatures:   0%|          | 0/100 [00:00<?, ?it/s]

SiteStatsFingerprint:   0%|          | 0/100 [00:00<?, ?it/s]

StructuralHeterogeneity:   0%|          | 0/100 [00:00<?, ?it/s]

MaximumPackingEfficiency:   0%|          | 0/100 [00:00<?, ?it/s]

ChemicalOrdering:   0%|          | 0/100 [00:00<?, ?it/s]

CoulombMatrix:   0%|          | 0/100 [00:00<?, ?it/s]

In [84]:
# preview the first rows of the structure-featurized H-ads frame
h_structure_featurized.head()

Unnamed: 0,bulk_symbols,structure,density,vpa,packing fraction,spacegroup_num,crystal_system,crystal_system_int,is_centrosymmetric,n_symmetry_ops,...,coulomb matrix eig 80,coulomb matrix eig 81,coulomb matrix eig 82,coulomb matrix eig 83,coulomb matrix eig 84,coulomb matrix eig 85,coulomb matrix eig 86,coulomb matrix eig 87,coulomb matrix eig 88,coulomb matrix eig 89
302,"(Li, W, O)","[[6.20487132 9.06384169 5.58873846] Li, [1.194...",6.036841,12.341977,0.3866,2.0,triclinic,7.0,True,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
693,"(Re, O)","[[1.14626867 2.42691622 3.43674322] Re, [3.438...",11.504196,10.498757,0.384666,60.0,orthorhombic,5.0,True,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65,"(K, Bi, O)","[[7.54990346 7.54990346 7.54990346] K, [2.5166...",5.78355,17.001575,0.758446,201.0,cubic,1.0,True,48.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
904,"(Al, Fe, O)",[[-1.57311829e-17 3.36348733e+00 2.30274786e...,4.033613,10.177346,0.423714,12.0,monoclinic,6.0,True,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1341,"(Sr, Pd, O)","[[2.01840774 2.47061523 0.7154461 ] Sr, [1.098...",6.027966,15.135297,0.894477,71.0,orthorhombic,5.0,True,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [85]:
# combine composition and structure features
# The composition and structure frames carry the same row index (that of the
# sampled frame they were both built from), so they are aligned on the index.
# Merging on 'bulk_symbols' would pair every row with every other row of the
# same composition and multiply rows whenever a bulk occurs more than once.
# validate='one_to_one' makes pandas raise if either index has duplicates.
h_final = pd.merge(
    h_comp_featurized,
    h_structure_featurized.drop(columns=['bulk_symbols']),
    left_index=True,
    right_index=True,
    how='inner',
    validate='one_to_one',
).reset_index(drop=True)
no_ads_final = pd.merge(
    no_ads_comp_featurized,
    no_ads_structure_featurized.drop(columns=['bulk_symbols']),
    left_index=True,
    right_index=True,
    how='inner',
    validate='one_to_one',
).reset_index(drop=True)

# here save the final fully featurized dataset.
h_final.to_csv("training_SAMPLE_h_all_featurized.csv", index=False)
no_ads_final.to_csv("training_SAMPLE_no_ads_all_featurized.csv", index=False)