In [1]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [9]:
from collections import defaultdict
import itertools
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Markdown
from matminer.datasets import load_dataset
from pymatgen.core import Composition

from modnet.preprocessing import MODData

# from modnet.models import MODNetModel
from modnet.featurizers import MODFeaturizer
from modnet.models import MODNetModel
from modnet.featurizers.presets import DeBreuck2020Featurizer

In [3]:
from gptchem.data import get_matbench_glass

In [4]:
# Load the glass dataset through the gptchem helper; the cells below read its
# "composition" and "gfa" columns.
df = get_matbench_glass()

In [20]:
# Display the loaded frame for a quick visual check of its columns.
df

Unnamed: 0,composition,gfa,structure
0,Al,False,(Al)
1,Al(NiB)2,True,"(Al, Ni, B)"
2,Al10Co21B19,True,"(Al, Co, B)"
3,Al10Co23B17,True,"(Al, Co, B)"
4,Al10Co27B13,True,"(Al, Co, B)"
...,...,...,...
5675,ZrTi9,False,"(Zr, Ti)"
5676,ZrTiSi2,True,"(Zr, Ti, Si)"
5677,ZrTiSi3,True,"(Zr, Ti, Si)"
5678,ZrVCo8,True,"(Zr, V, Co)"


In [5]:
import modnet

# Show the installed modnet version so it is recorded alongside the results.
modnet.__version__

'0.1.12'

In [6]:
class CompositionOnlyFeaturizer(DeBreuck2020Featurizer):
    """DeBreuck2020Featurizer preset with three featurizer groups emptied.

    The parent initialiser runs unchanged; afterwards the attributes
    ``oxid_composition_featurizers``, ``structure_featurizers`` and
    ``site_featurizers`` are each replaced by an empty tuple, leaving the
    remaining featurizer groups of the preset as the parent set them up.
    """

    def __init__(self):
        super().__init__()
        # Blank out the groups in the same order as three direct assignments.
        for group_name in (
            "oxid_composition_featurizers",
            "structure_featurizers",
            "site_featurizers",
        ):
            setattr(self, group_name, ())

In [7]:
# Location of the cached, already featurized MODData object.
PRECOMPUTED_MODDATA = "./precomputed/glass_benchmark_moddata.pkl.gz"

if os.path.isfile(PRECOMPUTED_MODDATA):
    # Reuse the cached result instead of repeating the slow featurization.
    data = MODData.load(PRECOMPUTED_MODDATA)
else:
    # Create the cache directory *before* the expensive steps below, so that a
    # missing directory cannot make the final save fail after the work is done.
    os.makedirs(os.path.dirname(PRECOMPUTED_MODDATA) or ".", exist_ok=True)

    # MODData is handed Composition objects as its materials; they are stored
    # in the "structure" column of the frame.
    df["structure"] = df["composition"].map(Composition)

    data = MODData(
        materials=df["structure"].tolist(),
        targets=df["gfa"].tolist(),
        target_names=["gfa"],
        featurizer=CompositionOnlyFeaturizer(),
    )
    data.featurize()
    # As this is a small data/feature set, order all features
    data.feature_selection(n=-1)
    data.save(PRECOMPUTED_MODDATA)

2023-02-07 14:13:16,179 - modnet - INFO - Loaded CompositionOnlyFeaturizer featurizer.
2023-02-07 14:13:16,197 - modnet - INFO - Computing features, this can take time...
2023-02-07 14:13:16,199 - modnet - INFO - Applying composition featurizers...
2023-02-07 14:13:16,225 - modnet - INFO - Applying featurizers (AtomicOrbitals(), AtomicPackingEfficiency(), BandCenter(), ElementFraction(), ElementProperty(data_source=<matminer.utils.data.MagpieData object at 0x2a17f25b0>,
                features=['Number', 'MendeleevNumber', 'AtomicWeight',
                          'MeltingT', 'Column', 'Row', 'CovalentRadius',
                          'Electronegativity', 'NsValence', 'NpValence',
                          'NdValence', 'NfValence', 'NValence', 'NsUnfilled',
                          'NpUnfilled', 'NdUnfilled', 'NfUnfilled', 'NUnfilled',
                          'GSvolume_pa', 'GSbandgap', 'GSmagmom',
                          'SpaceGroupNumber'],
                stats=['minimum', 'm

MultipleFeaturizer:   0%|          | 0/5680 [00:00<?, ?it/s]

  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
  alp_a = np.multip

2023-02-07 14:29:47,467 - modnet - INFO - Data has successfully been featurized!
2023-02-07 14:29:47,493 - modnet - INFO - Multiprocessing on 1 workers.
2023-02-07 14:29:47,495 - modnet - INFO - Computing "self" MI (i.e. information entropy) of features


100%|██████████| 270/270 [00:06<00:00, 44.81it/s]

2023-02-07 14:29:53,552 - modnet - INFO - Computing cross NMI between all features...



100%|██████████| 16110/16110 [04:00<00:00, 66.95it/s] 


2023-02-07 14:33:54,716 - modnet - INFO - Starting target 1/1: gfa ...
2023-02-07 14:33:54,716 - modnet - INFO - Computing mutual information between features and target...


  mutual_info.loc[:, target_name] = _mifun(df_feat, df_target[target_name], **kwargs)


2023-02-07 14:34:00,210 - modnet - INFO - Computing optimal features...
2023-02-07 14:34:01,538 - modnet - INFO - Selected 50/180 features...
2023-02-07 14:34:02,532 - modnet - INFO - Selected 100/180 features...
2023-02-07 14:34:03,079 - modnet - INFO - Selected 150/180 features...
2023-02-07 14:34:03,202 - modnet - INFO - Done with target 1/1: gfa.
2023-02-07 14:34:03,204 - modnet - INFO - Merging all features...
2023-02-07 14:34:03,205 - modnet - INFO - Done.
2023-02-07 14:34:04,483 - modnet - INFO - Data successfully saved as ./precomputed/glass_benchmark_moddata.pkl.gz!


In [11]:
# Hyperparameters for the MODNet model used in the cells below.
# num_neurons, num_classes, act and n_feat are passed to the MODNetModel
# constructor; epochs, batch_size, verbose, lr and loss are passed to
# model.fit. "increase_bs" is not read by any cell visible in this notebook.
best_settings = dict(
    increase_bs=True,
    num_neurons=[[128], [64], [16], []],
    n_feat=150,
    lr=0.002,
    epochs=200,
    verbose=0,
    act="elu",
    batch_size=64,
    num_classes={"gfa": 2},
    loss="categorical_crossentropy",
)

In [14]:
# Build the MODNet model for the single "gfa" target from best_settings.
model = MODNetModel(
    [[["gfa"]]],  # nested target grouping containing only "gfa"
    {"gfa": 1},  # per-target value of 1 — presumably a loss weight; TODO confirm
    num_neurons=best_settings["num_neurons"],
    num_classes=best_settings.get("num_classes"),
    act=best_settings.get("act"),
    # best_settings has no "out_act" key, so the "linear" fallback is passed.
    out_act=best_settings.get("out_act", "linear"),
    n_feat=best_settings["n_feat"],
)

In [15]:
# Fit on the complete featurized dataset; every training option is looked up
# in best_settings under the same name that model.fit receives it as.
model.fit(
    data,
    **{
        option: best_settings[option]
        for option in ("epochs", "batch_size", "verbose", "lr", "loss")
    },
)

  super().__init__(name, **kwargs)


In [16]:
# Predict on the same MODData object the model was fitted on, so these are
# training-set predictions (a sanity check, not a held-out evaluation).
predictions = model.predict(data)



In [17]:
# Display the predicted "gfa" labels.
predictions

Unnamed: 0,gfa
id0,0
id1,1
id2,1
id3,1
id4,1
...,...
id5675,0
id5676,1
id5677,1
id5678,1


In [19]:
# Display the reference "gfa" targets stored in the MODData object, for
# comparison with the predictions.
data.df_targets

Unnamed: 0,gfa
id0,False
id1,True
id2,True
id3,True
id4,True
...,...
id5675,False
id5676,True
id5677,True
id5678,True


In [30]:
# Training-set sizes to evaluate; each one is passed to train_test as train_size.
num_train_points = [10, 50, 100, 200, 500, 1000]

# Path of the cached, featurized MODData pickle.
PRECOMPUTED_MODDATA = "./precomputed/glass_benchmark_moddata.pkl.gz"

# Featurized dataset shared by every run; train_test splits it by row index.
DATA = MODData.load(PRECOMPUTED_MODDATA)

# Directory that receives one metrics pickle per run.
OUTDIR = 'out_baseline'

# Number of repetitions of the whole size sweep; the repetition index is used
# as the seed of the train/test split.
NUM_REPEATS = 10 

# Hyperparameters shared by every benchmark run in train_test.
# num_neurons, num_classes, act and n_feat are passed to the MODNetModel
# constructor; epochs, batch_size, verbose, lr and loss are passed to
# model.fit. "increase_bs" is not read by train_test.
BEST_SETTINGS = dict(
    increase_bs=True,
    num_neurons=[[128], [64], [16], []],
    n_feat=150,
    lr=0.002,
    epochs=200,
    verbose=0,
    act="elu",
    batch_size=64,
    num_classes={"gfa": 2},
    loss="categorical_crossentropy",
)

from sklearn.model_selection import train_test_split
from gptchem.evaluator import evaluate_classification
import time 
from pathlib import Path 
from fastcore.xtras import save_pickle

def train_test(train_size, seed: int = 42):
    """Train a MODNet classifier on a stratified subset and score it on the rest.

    The dataset is split with ``train_test_split`` (stratified on ``gfa``), a
    fresh ``MODNetModel`` is built from ``BEST_SETTINGS`` and fitted on the
    training part, and its predictions for the remaining rows are scored with
    ``evaluate_classification``. The accuracy is printed, and the metrics are
    pickled into ``OUTDIR`` and returned.

    Args:
        train_size: Forwarded to ``train_test_split`` as ``train_size``; the
            rows not selected for training form the test set.
        seed: ``random_state`` of the split. Only the split is seeded here;
            this function does not seed the model training itself.

    Returns:
        The metrics object produced by ``evaluate_classification``, extended
        with the keys ``"train_size"``, ``"seed"``, ``"pred"`` and ``"true"``.

    Raises:
        AssertionError: If one of the size sanity checks fails.
    """
    # Make sure the output directory exists before any training happens, so a
    # missing directory cannot cost a finished run its results.
    out_dir = Path(OUTDIR)
    out_dir.mkdir(parents=True, exist_ok=True)

    df = get_matbench_glass()
    # The positional indices drawn from `df` are applied to DATA below, so the
    # two must describe the same number of rows.
    assert len(df) == len(DATA.df_targets)

    train_idx, test_idx = train_test_split(
        np.arange(len(df)), train_size=train_size, random_state=seed, stratify=df["gfa"]
    )
    train_data, test_data = DATA.split((train_idx, test_idx))
    assert len(train_data.df_targets) == len(train_idx)
    assert len(test_data.df_targets) == len(test_idx)

    model = MODNetModel(
        [[["gfa"]]],
        {"gfa": 1},
        num_neurons=BEST_SETTINGS["num_neurons"],
        num_classes=BEST_SETTINGS.get("num_classes"),
        act=BEST_SETTINGS.get("act"),
        # BEST_SETTINGS has no "out_act" key, so the "linear" fallback is passed.
        out_act=BEST_SETTINGS.get("out_act", "linear"),
        n_feat=BEST_SETTINGS["n_feat"],
    )

    model.fit(
        train_data,
        epochs=BEST_SETTINGS["epochs"],
        batch_size=BEST_SETTINGS["batch_size"],
        verbose=BEST_SETTINGS["verbose"],
        lr=BEST_SETTINGS["lr"],
        loss=BEST_SETTINGS["loss"],
    )

    predictions = model.predict(test_data)
    # Cast both label vectors to int so they are compared in the same encoding.
    true = test_data.df_targets["gfa"].values.astype(int)
    pred = predictions["gfa"].values.astype(int)
    assert len(predictions) == len(test_data.df_targets) == len(pred)

    metrics = evaluate_classification(true, pred)
    print(f"Train size {train_size} - {metrics['accuracy']}")

    # Store the run parameters and raw labels with the metrics so each pickle
    # is self-describing.
    metrics["train_size"] = train_size
    metrics["seed"] = seed
    metrics["pred"] = pred
    metrics["true"] = true

    # NOTE(review): the timestamp has one-second resolution, so two results
    # written within the same second would share a file name.
    timestr = time.strftime("%Y%m%d-%H%M%S")
    save_pickle(out_dir / f"metrics_{timestr}.pkl", metrics)
    return metrics



# Run the full sweep: for every repetition (used as the split seed), evaluate
# each training-set size in turn. product() varies the size fastest, which is
# the same order as a seed loop wrapped around a size loop.
for seed, train_size in itertools.product(range(NUM_REPEATS), num_train_points):
    train_test(train_size, seed=seed)

2023-02-07 17:25:39,711 - modnet - INFO - Loaded <modnet.preprocessing.MODData object at 0x2c2e29910> object, created with modnet version 0.1.12


  super().__init__(name, **kwargs)


Train size 10 - 0.6164021164021164


  super().__init__(name, **kwargs)


Train size 50 - 0.6955595026642984


  super().__init__(name, **kwargs)


Train size 100 - 0.6912186379928316


  super().__init__(name, **kwargs)


Train size 200 - 0.7324817518248176


  super().__init__(name, **kwargs)


Train size 500 - 0.7787644787644787


  super().__init__(name, **kwargs)


Train size 1000 - 0.8055555555555556


  super().__init__(name, **kwargs)


Train size 10 - 0.7391534391534391


  super().__init__(name, **kwargs)


Train size 50 - 0.7420959147424512


  super().__init__(name, **kwargs)


Train size 100 - 0.7555555555555555


  super().__init__(name, **kwargs)


Train size 200 - 0.7576642335766424


  super().__init__(name, **kwargs)


Train size 500 - 0.7916988416988417


  super().__init__(name, **kwargs)


Train size 1000 - 0.8002136752136753


  super().__init__(name, **kwargs)


Train size 10 - 0.6793650793650794


  super().__init__(name, **kwargs)


Train size 50 - 0.6959147424511545


  super().__init__(name, **kwargs)


Train size 100 - 0.7216845878136201


  super().__init__(name, **kwargs)


Train size 200 - 0.7571167883211679


  super().__init__(name, **kwargs)


Train size 500 - 0.7818532818532818


  super().__init__(name, **kwargs)


Train size 1000 - 0.8200854700854701


  super().__init__(name, **kwargs)


Train size 10 - 0.6788359788359788


  super().__init__(name, **kwargs)


Train size 50 - 0.6634103019538188


  super().__init__(name, **kwargs)


Train size 100 - 0.7105734767025089


  super().__init__(name, **kwargs)


Train size 200 - 0.7304744525547445


  super().__init__(name, **kwargs)


Train size 500 - 0.7706563706563706


  super().__init__(name, **kwargs)


Train size 1000 - 0.7933760683760683


  super().__init__(name, **kwargs)


Train size 10 - 0.46084656084656084


  super().__init__(name, **kwargs)


Train size 50 - 0.6912966252220248


  super().__init__(name, **kwargs)


Train size 100 - 0.7148745519713262


  super().__init__(name, **kwargs)


Train size 200 - 0.7492700729927008


  super().__init__(name, **kwargs)


Train size 500 - 0.7953667953667953


  super().__init__(name, **kwargs)


Train size 1000 - 0.8083333333333333


  super().__init__(name, **kwargs)


Train size 10 - 0.6724867724867725


  super().__init__(name, **kwargs)


Train size 50 - 0.7156305506216696


  super().__init__(name, **kwargs)


Train size 100 - 0.7191756272401434


  super().__init__(name, **kwargs)


Train size 200 - 0.7463503649635036


  super().__init__(name, **kwargs)


Train size 500 - 0.7791505791505792


  super().__init__(name, **kwargs)


Train size 1000 - 0.8179487179487179


  super().__init__(name, **kwargs)


Train size 10 - 0.5571428571428572


  super().__init__(name, **kwargs)


Train size 50 - 0.6548845470692718


  super().__init__(name, **kwargs)


Train size 100 - 0.6865591397849462


  super().__init__(name, **kwargs)


Train size 200 - 0.7390510948905109


  super().__init__(name, **kwargs)


Train size 500 - 0.800965250965251


  super().__init__(name, **kwargs)


Train size 1000 - 0.805982905982906


  super().__init__(name, **kwargs)


Train size 10 - 0.5793650793650794


  super().__init__(name, **kwargs)


Train size 50 - 0.6850799289520426


  super().__init__(name, **kwargs)


Train size 100 - 0.7302867383512545


  super().__init__(name, **kwargs)


Train size 200 - 0.7678832116788321


  super().__init__(name, **kwargs)


Train size 500 - 0.7988416988416989


  super().__init__(name, **kwargs)


Train size 1000 - 0.8126068376068376


  super().__init__(name, **kwargs)


Train size 10 - 0.6


  super().__init__(name, **kwargs)


Train size 50 - 0.7003552397868561


  super().__init__(name, **kwargs)


Train size 100 - 0.7362007168458782


  super().__init__(name, **kwargs)


Train size 200 - 0.7496350364963503


  super().__init__(name, **kwargs)


Train size 500 - 0.794015444015444


  super().__init__(name, **kwargs)


Train size 1000 - 0.802991452991453


  super().__init__(name, **kwargs)


Train size 10 - 0.682010582010582


  super().__init__(name, **kwargs)


Train size 50 - 0.7460035523978685


  super().__init__(name, **kwargs)


Train size 100 - 0.7014336917562723


  super().__init__(name, **kwargs)


Train size 200 - 0.7534671532846715


  super().__init__(name, **kwargs)


Train size 500 - 0.766023166023166


  super().__init__(name, **kwargs)


Train size 1000 - 0.802991452991453
