In [61]:
%cd /content/drive/MyDrive/ML_Data/IDAO-2022
!pip install -r IDAO-2022/requirements.txt

In [1]:
import warnings

warnings.filterwarnings('ignore')

# import yaml
import json

import pandas as pd
import numpy as np
import tensorflow as tf

from pathlib import Path
from pymatgen.core import Structure
from sklearn.model_selection import train_test_split, KFold
from megnet.models import MEGNetModel
from megnet.data.crystal import CrystalGraph
import tarfile

# One-time setup, kept for reference and intentionally disabled: unpack the
# public and private dataset archives into ./IDAO-2022/data. In the visible
# notebook, `tarfile` is referenced only by this snippet.
# tar_pub = tarfile.open('./IDAO-2022/data/dichalcogenides_public.tar.gz')
# tar_pri = tarfile.open('./IDAO-2022/data/dichalcogenides_private.tar.gz')
# tar_pub.extractall('./IDAO-2022/data')
# tar_pri.extractall('./IDAO-2022/data')
# tar_pub.close()
# tar_pri.close()

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(42)

In [2]:
def read_pymatgen_dict(file):
    """Build a pymatgen ``Structure`` from a JSON file.

    Args:
        file: Path of a JSON file containing the dictionary form of a
            structure.

    Returns:
        Whatever ``Structure.from_dict`` produces for the parsed JSON content.
    """
    with open(file, "r") as handle:
        structure_dict = json.loads(handle.read())
    return Structure.from_dict(structure_dict)


def energy_within_threshold(prediction, target):
    """Fraction of entries whose absolute error is below 0.02.

    The absolute difference between ``target`` and ``prediction`` is computed
    element-wise; the number of entries with an error strictly below the
    threshold is divided by the total number of entries in ``target``.

    Args:
        prediction: Tensor of predicted values.
        target: Tensor of reference values.

    Returns:
        The ratio (entries within threshold) / (total entries in ``target``).
    """
    tolerance = 0.02
    abs_error = tf.math.abs(target - prediction)
    within_tolerance = abs_error < tolerance

    n_within = tf.math.count_nonzero(within_tolerance)
    # Cast the element count to int64 before dividing by the hit count.
    n_total = tf.cast(tf.size(target), tf.int64)
    return n_within / n_total

In [3]:
# Run configuration with paths relative to a working directory that contains
# the repository under ./IDAO-2022.
# NOTE: the following cell redefines `config` and `dataset_path` with ./data/...
# paths; whichever of the two cells runs last wins.
config = {
    'datapath': './IDAO-2022/data/dichalcogenides_public',
    'test_datapath': './IDAO-2022/data/dichalcogenides_private',
    'checkpoint_path': './IDAO-2022/callback/val_mae_00779_0.040351.hdf5',
    'epochs': 1000,
    'batch_size': 16,
    'learning_rate': 2e-4,
    'cutoff': 4,
}
# Training-data location; same value as config['datapath'].
dataset_path = config['datapath']

In [4]:
# Run configuration with paths relative to the repository root (./data, ./callback).
# NOTE: this cell overrides the `config` and `dataset_path` defined by the
# preceding cell (which uses ./IDAO-2022/... paths).
config = {
    'datapath': './data/dichalcogenides_public',
    'test_datapath': './data/dichalcogenides_private',
    'checkpoint_path': './callback/val_mae_00779_0.040351.hdf5',
    'epochs': 1000,
    'batch_size': 16,
    'learning_rate': 2e-4,
    'cutoff': 4,
}
# Training-data location; same value as config['datapath'].
dataset_path = config['datapath']

In [5]:
# Load a previously saved MEGNet model (a band-gap regression model, judging by
# the file name); a later cell reads its embedding weights for transfer learning.
pretrained_model_file = '../megnet/mvl_models/mp-2018.6.1/band_gap_regression.hdf5'
model_form = MEGNetModel.from_file(pretrained_model_file)

2022-02-23 00:42:16.684477: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-23 00:42:16.820899: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib64:
2022-02-23 00:42:16.820931: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-02-23 00:42:16.821829: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow bina

In [6]:
# Pick the first layer of the loaded model whose name starts with 'embedding'
# and take its first weight array as the embedding matrix.
embedding_layers = [layer for layer in model_form.layers if layer.name.startswith('embedding')]
embedding_layer = embedding_layers[0]
embedding = embedding_layer.get_weights()[0]
print('Embedding matrix dimension is ', embedding.shape)

Embedding matrix dimension is  (95, 16)


In [7]:
# Settings for the Gaussian centers/width handed to MEGNetModel further down.
r_cutoff = config['cutoff']
nfeat_bond = 100
gaussian_width = 0.8
# nfeat_bond evenly spaced centers from 0 up to one unit beyond the cutoff.
gaussian_centers = np.linspace(start=0, stop=r_cutoff + 1, num=nfeat_bond)

In [8]:
# Load the public structures together with their targets.
# The location is read from `config` instead of the mutable `dataset_path`
# global, which the next cell rebinds to the private test directory; this keeps
# the cell re-runnable in any order.
dataset_path = Path(config['datapath'])
targets = pd.read_csv(dataset_path / "targets.csv", index_col=0)

# Key each structure by its file name without the ".json" extension.
# Path.stem is used rather than str.strip(".json"): strip() removes any of the
# characters '.', 'j', 's', 'o', 'n' from BOTH ends of the name, so identifiers
# starting or ending with one of those characters would be silently mangled.
# Entries that are not .json files (e.g. stray checkpoint folders) are skipped.
struct = {
    item.stem: read_pymatgen_dict(item)
    for item in (dataset_path / "structures").iterdir()
    if item.suffix == ".json"
}

# One row per structure id; `targets` is aligned on the index by pandas.
data = pd.DataFrame(columns=["structures"], index=struct.keys())
data = data.assign(structures=struct.values(), targets=targets)

In [9]:
# Load the private (unlabelled) test structures.
# A dedicated variable is used so the training `dataset_path` is not clobbered,
# which would break re-running the training-data cell afterwards.
test_dataset_path = Path(config['test_datapath'])

# Key each structure by its file name without the ".json" extension.
# Path.stem replaces str.strip('.json'), which strips the characters
# '.', 'j', 's', 'o', 'n' from both ends instead of removing the suffix.
# Entries that are not .json files are skipped.
test_struct = {
    item.stem: read_pymatgen_dict(item)
    for item in (test_dataset_path / 'structures').iterdir()
    if item.suffix == '.json'
}

# The 'id' column is created empty here; the ids themselves live in the index.
private_test = pd.DataFrame(columns=['id', 'structures'], index=test_struct.keys())
private_test = private_test.assign(structures=test_struct.values())

In [10]:
# K-fold training with transfer of the pre-trained atom embedding.
# For every fold a fresh MEGNet model is trained on the training split,
# validated on the held-out split, used to predict the private test set, and
# saved to disk. The test predictions are averaged over all folds.
#
# NOTE (from the notebook author): the local megnet source was modified so the
# value logged as "val_mae" is actually (1 - EwT), although the log message
# still says "val_mae".
n_folds = 5  # single source of truth for both the splitter and the averaging

preds = pd.DataFrame(np.zeros(len(private_test)), index=private_test.index, columns=['predictions'])
kf = KFold(n_folds, shuffle=True, random_state=42)

for fold, (train_index, test_index) in enumerate(kf.split(data), start=1):
    model = MEGNetModel(
        nblocks=3,
        graph_converter=CrystalGraph(cutoff=r_cutoff),
        centers=gaussian_centers,
        width=gaussian_width,
        loss=["MAE"],
        npass=2,
        lr=config['learning_rate'],
        metrics=energy_within_threshold,
        # The embedding table must have the same shape as the transferred
        # weights, so its size is taken from the pre-trained matrix.
        nvocal=embedding.shape[0],
        embedding_dim=embedding.shape[1],
        random_state=42,
    )

    # Find the index of the atom-embedding layer among the model layers.
    embedding_layer_index = [i for i, j in enumerate(model.layers) if j.name.startswith('atom_embedding')][0]

    # Initialise it with the embedding taken from the pre-trained model.
    model.layers[embedding_layer_index].set_weights([embedding])

    # The transferred embedding is deliberately left trainable (fine-tuned),
    # not frozen. To freeze it, set this to False -- presumably before the
    # model is compiled for the setting to take effect; verify with Keras docs.
    model.layers[embedding_layer_index].trainable = True

    train, test = data.iloc[train_index], data.iloc[test_index]
    model.train(
        train.structures,
        train.targets,
        validation_structures=test.structures,
        validation_targets=test.targets,
        epochs=int(config['epochs']),
        batch_size=int(config['batch_size']),
    )

    # Accumulate this fold's predictions for the private test set.
    preds['predictions'] += np.squeeze(model.predict_structures(private_test.structures))
    model.save_model('fold_transfer_' + str(fold) + '.hdf5')

# Turn the accumulated sum into the mean over folds.
preds /= n_folds

Epoch 1/1000

INFO:megnet.callbacks:
Epoch 00001: val_mae improved from inf to 0.00000, saving model to callback/val_mae_00001_0.000000.hdf5


HELLLOOO MAE KODU CAGIRILDI AMA BIZ ICERI SIZDIK
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000

In [None]:
# Attach the fold-averaged predictions to the test frame (aligned on the index)
# and write them out, with the index written under the column label 'id'.
private_test['predictions'] = preds['predictions']
submission = private_test[['predictions']]
submission.to_csv('./submission.csv', index_label='id')