In [1]:
# Reload edited modules automatically before each cell runs, so changes to
# imported packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [9]:
from collections import defaultdict
import itertools
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Markdown
from matminer.datasets import load_dataset
from pymatgen.core import Composition

from modnet.preprocessing import MODData

# from modnet.models import MODNetModel
from modnet.featurizers import MODFeaturizer
from modnet.models import MODNetModel
from modnet.featurizers.presets import DeBreuck2020Featurizer

In [3]:
from gptchem.data import get_matbench_glass

In [4]:
# Load the glass dataset through the gptchem helper; later cells read its
# "composition" and "gfa" columns.
df = get_matbench_glass()

In [5]:
# Record the installed MODNet version for provenance; the bare expression
# below is displayed as the cell output.
import modnet

modnet.__version__

'0.1.12'

In [6]:
class CompositionOnlyFeaturizer(DeBreuck2020Featurizer):
    """DeBreuck2020Featurizer preset with three featurizer groups emptied.

    After the parent initialiser has run, the oxidation-state composition,
    structure and site featurizer groups are replaced by empty tuples.
    Everything else the parent configures is left untouched.
    """

    def __init__(self):
        super().__init__()
        # Assigned after super().__init__() so these values are the ones that
        # remain on the instance.
        no_featurizers = ()
        self.oxid_composition_featurizers = no_featurizers
        self.structure_featurizers = no_featurizers
        self.site_featurizers = no_featurizers

In [7]:
# Location of the cached, already-featurized dataset. Featurization plus
# feature selection is slow, so the result is persisted and reused on later runs.
PRECOMPUTED_MODDATA = "./precomputed/glass_benchmark_moddata.pkl.gz"

if os.path.isfile(PRECOMPUTED_MODDATA):
    # NOTE(security): judging by its .pkl.gz name the cache is a pickle archive;
    # only load a file that this notebook produced itself.
    data = MODData.load(PRECOMPUTED_MODDATA)
else:
    # Make sure the cache directory exists *before* the expensive work starts:
    # otherwise a fresh checkout would featurize for many minutes and only then
    # fail at the final save because the target directory is missing.
    cache_dir = os.path.dirname(PRECOMPUTED_MODDATA)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    # The materials handed to MODData are Composition objects built from the
    # "composition" column and stored in a "structure" column.
    df["structure"] = df["composition"].map(Composition)

    data = MODData(
        materials=df["structure"].tolist(),
        targets=df["gfa"].tolist(),
        target_names=["gfa"],
        featurizer=CompositionOnlyFeaturizer(),
    )
    data.featurize()
    # As this is a small data/feature set, order all features
    data.feature_selection(n=-1)
    data.save(PRECOMPUTED_MODDATA)

2023-02-07 14:13:16,179 - modnet - INFO - Loaded CompositionOnlyFeaturizer featurizer.
2023-02-07 14:13:16,197 - modnet - INFO - Computing features, this can take time...
2023-02-07 14:13:16,199 - modnet - INFO - Applying composition featurizers...
2023-02-07 14:13:16,225 - modnet - INFO - Applying featurizers (AtomicOrbitals(), AtomicPackingEfficiency(), BandCenter(), ElementFraction(), ElementProperty(data_source=<matminer.utils.data.MagpieData object at 0x2a17f25b0>,
                features=['Number', 'MendeleevNumber', 'AtomicWeight',
                          'MeltingT', 'Column', 'Row', 'CovalentRadius',
                          'Electronegativity', 'NsValence', 'NpValence',
                          'NdValence', 'NfValence', 'NValence', 'NsUnfilled',
                          'NpUnfilled', 'NdUnfilled', 'NfUnfilled', 'NUnfilled',
                          'GSvolume_pa', 'GSbandgap', 'GSmagmom',
                          'SpaceGroupNumber'],
                stats=['minimum', 'm

MultipleFeaturizer:   0%|          | 0/5680 [00:00<?, ?it/s]

  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multip

2023-02-07 14:29:47,467 - modnet - INFO - Data has successfully been featurized!
2023-02-07 14:29:47,493 - modnet - INFO - Multiprocessing on 1 workers.
2023-02-07 14:29:47,495 - modnet - INFO - Computing "self" MI (i.e. information entropy) of features


100%|██████████| 270/270 [00:06<00:00, 44.81it/s]

2023-02-07 14:29:53,552 - modnet - INFO - Computing cross NMI between all features...



100%|██████████| 16110/16110 [04:00<00:00, 66.95it/s] 


2023-02-07 14:33:54,716 - modnet - INFO - Starting target 1/1: gfa ...
2023-02-07 14:33:54,716 - modnet - INFO - Computing mutual information between features and target...


  mutual_info.loc[:, target_name] = _mifun(df_feat, df_target[target_name], **kwargs)


2023-02-07 14:34:00,210 - modnet - INFO - Computing optimal features...
2023-02-07 14:34:01,538 - modnet - INFO - Selected 50/180 features...
2023-02-07 14:34:02,532 - modnet - INFO - Selected 100/180 features...
2023-02-07 14:34:03,079 - modnet - INFO - Selected 150/180 features...
2023-02-07 14:34:03,202 - modnet - INFO - Done with target 1/1: gfa.
2023-02-07 14:34:03,204 - modnet - INFO - Merging all features...
2023-02-07 14:34:03,205 - modnet - INFO - Done.
2023-02-07 14:34:04,483 - modnet - INFO - Data successfully saved as ./precomputed/glass_benchmark_moddata.pkl.gz!


In [11]:
# Hyperparameters shared by the model-construction and training cells.
best_settings = dict(
    increase_bs=True,
    # Network shape and input size.
    num_neurons=[[128], [64], [16], []],
    n_feat=150,
    # Optimisation settings.
    lr=0.002,
    epochs=200,
    verbose=0,
    act="elu",
    batch_size=64,
    # "gfa" is treated as a two-class classification target.
    num_classes={"gfa": 2},
    loss="categorical_crossentropy",
    # xscale="standard",
)

In [14]:
# Build a MODNet model with "gfa" as its only target (weight 1); the
# architecture options are read from best_settings.
model = MODNetModel(
    targets=[[["gfa"]]],
    weights={"gfa": 1},
    num_neurons=best_settings["num_neurons"],
    num_classes=best_settings.get("num_classes"),
    act=best_settings.get("act"),
    # Falls back to "linear" when best_settings has no "out_act" key.
    out_act=best_settings.get("out_act", "linear"),
    n_feat=best_settings["n_feat"],
)

In [15]:
# Train on the featurized dataset; every training option is read from
# best_settings so the run is configured in one place.
model.fit(
    data,
    epochs=best_settings["epochs"],
    batch_size=best_settings["batch_size"],
    verbose=best_settings["verbose"],
    lr=best_settings["lr"],
    loss=best_settings["loss"],
)

  super().__init__(name, **kwargs)


In [16]:
# NOTE: `data` is the same MODData object that was passed to model.fit, so
# these are in-sample predictions rather than a held-out evaluation.
predictions = model.predict(data)



In [17]:
# Show the predicted "gfa" value for each material id.
predictions

Unnamed: 0,gfa
id0,0
id1,1
id2,1
id3,1
id4,1
...,...
id5675,0
id5676,1
id5677,1
id5678,1
