# [ALIGNN](https://github.com/usnistgov/alignn) example

In [None]:
# Install the ALIGNN package with pip; -qqq keeps pip's output to a minimum.
!pip install -qqq alignn

[?25l[K     |████                            | 10 kB 23.8 MB/s eta 0:00:01[K     |████████▏                       | 20 kB 24.8 MB/s eta 0:00:01[K     |████████████▎                   | 30 kB 11.1 MB/s eta 0:00:01[K     |████████████████▍               | 40 kB 4.6 MB/s eta 0:00:01[K     |████████████████████▌           | 51 kB 4.3 MB/s eta 0:00:01[K     |████████████████████████▋       | 61 kB 4.8 MB/s eta 0:00:01[K     |████████████████████████████▊   | 71 kB 5.4 MB/s eta 0:00:01[K     |████████████████████████████████| 79 kB 3.2 MB/s 
[K     |████████████████████████████████| 42 kB 785 kB/s 
[K     |████████████████████████████████| 251 kB 13.4 MB/s 
[K     |████████████████████████████████| 10.9 MB 7.8 MB/s 
[K     |████████████████████████████████| 954 kB 37.7 MB/s 
[K     |████████████████████████████████| 67 kB 5.2 MB/s 
[K     |████████████████████████████████| 4.4 MB 32.7 MB/s 
[K     |████████████████████████████████| 38.1 MB 5.2 MB/s 
[K     |█████████

In [None]:
import os
# Clone ALIGNN repo to get example folder
# (the clone is skipped when an 'alignn' path already exists in the working directory)
if not os.path.exists('alignn'):
  !git clone https://github.com/usnistgov/alignn.git

# Every relative path used after this point is resolved from inside 'alignn'.
# NOTE(review): re-running this cell calls chdir('alignn') again from the new
# working directory — confirm that is harmless before re-executing.
os.chdir('alignn')
# Install using setup.py in case pip didn't work
# !python setup.py develop

# Install the DGL build that matches the CUDA version noted on the line below.
!pip install dgl-cu111 # Colab has cuda 11.1

Cloning into 'alignn'...
remote: Enumerating objects: 2158, done.[K
remote: Counting objects: 100% (2158/2158), done.[K
remote: Compressing objects: 100% (1128/1128), done.[K
remote: Total 2158 (delta 1376), reused 1642 (delta 960), pack-reused 0[K
Receiving objects: 100% (2158/2158), 1.25 MiB | 9.43 MiB/s, done.
Resolving deltas: 100% (1376/1376), done.
Collecting dgl-cu111
  Downloading dgl_cu111-0.6.1-cp37-cp37m-manylinux1_x86_64.whl (41.0 MB)
[K     |████████████████████████████████| 41.0 MB 14.7 MB/s 
Installing collected packages: dgl-cu111
Successfully installed dgl-cu111-0.6.1


## Prepare IDAO data

In [None]:
# Fetch the IDAO-2022 repository, which ships the dataset archives.
!git clone https://github.com/HSE-LAMBDA/IDAO-2022.git
# Unpack the public dataset next to its archive.
!cd IDAO-2022/data && tar -xf dichalcogenides_public.tar.gz
# pymatgen is imported by the conversion cell below; wandb is installed alongside it.
%pip install -qqq pymatgen wandb

Cloning into 'IDAO-2022'...
remote: Enumerating objects: 22, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 22 (delta 2), reused 19 (delta 0), pack-reused 0[K
Unpacking objects: 100% (22/22), done.
[K     |████████████████████████████████| 40.6 MB 2.4 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 1.7 MB 37.4 MB/s 
[K     |████████████████████████████████| 98 kB 7.2 MB/s 
[K     |████████████████████████████████| 109 kB 47.7 MB/s 
[K     |████████████████████████████████| 65 kB 3.4 MB/s 
[K     |████████████████████████████████| 546 kB 52.6 MB/s 
[K     |████████████████████████████████| 181 kB 35.1 MB/s 
[K     |████████████████████████████████| 144 kB 53.5 MB/s 
[K     |████████

In [None]:
import yaml
import json

import pandas as pd
import numpy as np
from pathlib import Path
from pymatgen.core import Structure
import os

def convert_to_POSCAR(file):
    """Convert one JSON-serialized structure into a POSCAR file.

    The output location is derived from ``file``: the name of its parent
    directory is replaced by ``POSCAR`` and the file name becomes
    ``<stem>.vasp``. The target directory is expected to exist already.

    Args:
        file: ``pathlib.Path`` of the JSON file (its ``.parts`` and ``.stem``
            attributes are used to build the output path).
    """
    with open(file, "r") as handle:
        payload = json.load(handle)
    structure = Structure.from_dict(payload)

    # <root>/<dir>/<name>.json  ->  <root>/POSCAR/<name>.vasp
    segments = list(file.parts)
    segments[-1] = file.stem + ".vasp"
    segments[-2] = "POSCAR"
    target_path = Path(*segments)

    structure.to(fmt="poscar", filename=target_path)

def convert_dataset_to_POSCAR(dataset_path):
    """Build a fresh ``POSCAR`` folder for the dataset at ``dataset_path``.

    Steps:
      1. Delete any existing ``<dataset_path>/POSCAR`` and recreate it empty.
      2. If ``<dataset_path>/targets.csv`` exists and has an ``_id`` column,
         append ``.vasp`` to every id and write the table, without header or
         index, to ``POSCAR/id_prop.csv``. Otherwise no ``id_prop.csv`` is
         written (best-effort, as before).
      3. Call ``convert_to_POSCAR`` on every entry of
         ``<dataset_path>/structures``.

    Args:
        dataset_path: Dataset root as ``str`` or ``pathlib.Path``.

    Raises:
        FileNotFoundError: If ``<dataset_path>/structures`` does not exist.
    """
    import shutil  # local import: only this function needs it

    dataset_path = Path(dataset_path)
    poscar_dir = dataset_path / "POSCAR"

    # The previous shell line `! rm -r dataset_path/"POSCAR"` did not expand the
    # Python variable, so it targeted a literal 'dataset_path/POSCAR' and stale
    # output was never cleaned. Remove the real directory instead.
    if poscar_dir.exists():
        shutil.rmtree(poscar_dir)
    os.makedirs(poscar_dir, exist_ok=True)

    # Targets are optional: skip explicitly when the file is missing, empty, or
    # lacks the `_id` column, rather than hiding every error behind a bare except.
    targets_file = dataset_path / "targets.csv"
    if targets_file.is_file():
        try:
            targets = pd.read_csv(targets_file)
        except pd.errors.EmptyDataError:
            targets = None
        if targets is not None and "_id" in targets.columns:
            # File names in id_prop.csv must match the <stem>.vasp files written below.
            targets["_id"] = targets["_id"].astype(str) + ".vasp"
            targets.to_csv(poscar_dir / "id_prop.csv", index=False, header=False)

    for item in (dataset_path / "structures").iterdir():
        convert_to_POSCAR(item)


# Build the POSCAR/ folder for the public dataset; the training command below reads from it.
convert_dataset_to_POSCAR('./IDAO-2022/data/dichalcogenides_public/')

rm: cannot remove 'dataset_path/POSCAR': No such file or directory


# Train a model. Parameters are provided in the `config.json` file written below.

The `train_folder.py` command-line script is used below.

In [None]:
!rm -r temp

rm: cannot remove 'temp': No such file or directory


In [None]:
%%writefile config.json
{
    "version": "112bbedebdaecf59fb18e11c929080fb2f358246",
    "dataset": "user_data",
    "target": "target",
    "atom_features": "cgcnn",
    "neighbor_strategy": "k-nearest",
    "id_tag": "jid",
    "random_seed": 123,
    "classification_threshold": null,
    "n_val": null,
    "n_test": null,
    "n_train": null,
    "train_ratio": 0.8,
    "val_ratio": 0.1,
    "test_ratio": 0.1,
    "target_multiplication_factor": null,
    "epochs": 200,
    "batch_size": 8,
    "weight_decay": 1e-05,
    "learning_rate": 0.001,
    "filename": "sample",
    "warmup_steps": 2000,
    "criterion": "mse",
    "optimizer": "adamw",
    "scheduler": "onecycle",
    "pin_memory": false,
    "save_dataloader": false,
    "write_checkpoint": true,
    "write_predictions": false,
    "store_outputs": true,
    "progress": true,
    "log_tensorboard": false,
    "standard_scalar_and_pca": false,
    "use_canonize": true,
    "num_workers": 0,
    "cutoff": 8.0,
    "max_neighbors": 12,
    "keep_data_order": false,
    "model": {
        "name": "alignn",
        "alignn_layers": 4,
        "gcn_layers": 4,
        "atom_input_features": 92,
        "edge_input_features": 80,
        "triplet_input_features": 40,
        "embedding_features": 64,
        "hidden_features": 128,
        "output_features": 1,
        "link": "identity",
        "zero_inflated": false,
        "classification": false
    }
}

Overwriting config.json


In [None]:
# Train on the converted POSCAR folder using the settings in config.json;
# the run's output directory is `temp`.
!train_folder.py --root_dir "IDAO-2022/data/dichalcogenides_public/POSCAR/" --config config.json --output_dir=temp

Using backend: pytorch
MAX val: 1.8106
MIN val: 0.0938999999999999
MAD: 0.4596230686826515
Baseline MAE: 0.45039721269541044
data range 1.8106 0.1720999999999999
  r = torch.tensor(r).type(torch.get_default_dtype())
100% 2372/2372 [06:55<00:00,  5.70it/s]
Setting it to max atomic number available here, 103
Setting it to max atomic number available here, 103
Setting it to max atomic number available here, 103
building line graphs
100% 2372/2372 [00:47<00:00, 50.29it/s]
data range 1.8103 0.2542
100% 296/296 [00:50<00:00,  5.83it/s]
building line graphs
100% 296/296 [00:05<00:00, 50.40it/s]
data range 1.8103 0.0938999999999999
100% 296/296 [00:50<00:00,  5.83it/s]
building line graphs
100% 296/296 [00:05<00:00, 50.18it/s]
n_train: 2372
n_val: 296
n_test: 296
version='112bbedebdaecf59fb18e11c929080fb2f358246' dataset='user_data' target='target' atom_features='cgcnn' neighbor_strategy='k-nearest' id_tag='jid' random_seed=123 classification_threshold=None n_val=None n_test=None n_train=None 

In [None]:
import glob
import os
import re


def find_latest_checkpoint(directory="temp"):
    """Locate the ``*.pt`` file with the highest epoch number in ``directory``.

    Only files whose name ends in ``_<digits>.pt`` are considered. Any other
    ``.pt`` file is skipped, where the previous code raised ``ValueError`` on
    ``int(...)``.

    Args:
        directory: Folder to scan (not recursive).

    Returns:
        ``(path, epoch)`` of the newest checkpoint, or ``(None, 0)`` when no
        file with a positive epoch number is found.
    """
    best_path, best_epoch = None, 0
    for path in glob.glob(os.path.join(directory, "*.pt")):
        match = re.search(r"_(\d+)\.pt$", os.path.basename(path))
        if match is None:
            continue
        epoch = int(match.group(1))
        if epoch > best_epoch:
            best_path, best_epoch = path, epoch
    return best_path, best_epoch


# `weight_` is consumed by later cells; it stays None when nothing was found.
weight_, latest = find_latest_checkpoint("temp")

In [None]:
from google.colab import files
# Download the checkpoint selected in the previous cell (requires the Colab runtime).
files.download(weight_)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import json
import pandas as pd
# id_prop.csv has no header row, so columns are addressed by position:
# column 0 is used below as the POSCAR file name, column 1 as the target value.
true = pd.read_csv("IDAO-2022/data/dichalcogenides_public/POSCAR/id_prop.csv", header=None)

In [None]:
from alignn.models.alignn import ALIGNN, ALIGNNConfig
from alignn.pretrained import get_multiple_predictions
import torch
from jarvis.db.jsonutils import loadjson
from alignn.config import TrainingConfig
from jarvis.core.atoms import Atoms

# Fail early with a readable message instead of an opaque error inside torch.load.
if weight_ is None:
    raise FileNotFoundError("No checkpoint found in 'temp'; run the training cell first.")

# Use the GPU when one is present and fall back to CPU otherwise; a hard-coded
# 'cuda:0' makes loading the weights fail on a CPU-only runtime.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Rebuild the network and restore the trained weights stored under the "model" key.
model = ALIGNN(ALIGNNConfig(name="alignn", output_features=1))
model.load_state_dict(torch.load(weight_, map_location=device)["model"])
model.to(device)
model.eval()

import glob
# One Atoms object per row of id_prop.csv, in row order, so that predictions
# line up with the targets in `true`.
poscar_dir = "IDAO-2022/data/dichalcogenides_public/POSCAR"
atoms_array = [Atoms.from_poscar(f"{poscar_dir}/{name}") for name in true[0]]
get_multiple_predictions(model=model, atoms_array=atoms_array)

# Read the predictions back from pred_data.json
# (presumably written by get_multiple_predictions — TODO confirm the file name).
with open("pred_data.json") as f:
    data = json.load(f)
pred = pd.DataFrame.from_dict(data)
pred = pred.drop(columns="atoms")

Using backend: pytorch


RuntimeError: ignored

In [None]:
# Download the predictions file read in the previous cell (uses `files` from google.colab).
files.download("pred_data.json")

In [None]:
import numpy as np
def energy_within_threshold(prediction, target, e_thresh=0.02):
    """Return the fraction of systems whose absolute error is below ``e_thresh``.

    Args:
        prediction: Predicted values (NumPy array or pandas Series).
        target: Reference values; must support ``target - prediction`` and
            expose ``.shape``.
        e_thresh: Strict upper bound on the absolute error for a system to
            count as a success. Defaults to 0.02, the value that was
            previously hard-coded.

    Returns:
        ``success / total``, where ``total`` is ``target.shape[0]``.

    Raises:
        ZeroDivisionError: If ``target`` is empty.
    """
    # Absolute error per system.
    error_energy = np.abs(target - prediction)

    # Count the systems strictly inside the threshold.
    success = np.count_nonzero(error_energy < e_thresh)
    total = target.shape[0]
    return success / total
# Displays a pair: (fraction of structures with absolute error below 0.02, mean absolute error).
energy_within_threshold(pred.pred, true[1]), np.mean(np.abs(pred.pred - true[1]))

# Prepare test data and predict

In [None]:
# Unpack the private (test) dataset next to its archive.
!cd IDAO-2022/data && tar -xf dichalcogenides_private.tar.gz

In [None]:
# Reuse the submission template as targets.csv for the private set.
# NOTE(review): convert_dataset_to_POSCAR looks for an `_id` column in targets.csv,
# while later cells read submission["id"] — confirm the column name; if it is `id`,
# no id_prop.csv is produced for this dataset.
!cp IDAO-2022/submission.csv IDAO-2022/data/dichalcogenides_private/targets.csv

In [None]:
# Convert the private structures to POSCAR files, same procedure as for the public set.
convert_dataset_to_POSCAR('./IDAO-2022/data/dichalcogenides_private/')

In [None]:
import json
import pandas as pd
# Submission template; its `id` column is used below to locate the converted structure files.
submission = pd.read_csv("IDAO-2022/submission.csv")

In [None]:
from alignn.models.alignn import ALIGNN, ALIGNNConfig
from alignn.pretrained import get_multiple_predictions
import torch
from jarvis.db.jsonutils import loadjson
from alignn.config import TrainingConfig
from jarvis.core.atoms import Atoms

# NOTE(review): this hard-coded path replaces whatever `weight_` held before and
# is resolved against the current working directory — confirm the file is there.
weight_ = "./checkpoint_200.pt"

# Use the GPU when one is present and fall back to CPU otherwise; a hard-coded
# 'cuda:0' makes loading the weights fail on a CPU-only runtime.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Rebuild the network and restore the trained weights stored under the "model" key.
model = ALIGNN(ALIGNNConfig(name="alignn", output_features=1))
model.load_state_dict(torch.load(weight_, map_location=device)["model"])
model.to(device)
model.eval()

import glob
# One Atoms object per submission row, in row order, so that predictions can be
# assigned back to `submission` by position.
poscar_dir = "IDAO-2022/data/dichalcogenides_private/POSCAR"
atoms_array = [Atoms.from_poscar(f"{poscar_dir}/{name}.vasp") for name in submission["id"]]
get_multiple_predictions(model=model, atoms_array=atoms_array)

# Read the predictions back from pred_data.json
# (presumably written by get_multiple_predictions — TODO confirm the file name).
with open("pred_data.json") as f:
    data = json.load(f)
pred = pd.DataFrame.from_dict(data)
pred = pred.drop(columns="atoms")

submission["predictions"] = pred["pred"]

In [None]:
# Copy the model outputs into the submission table.
# NOTE(review): the preceding cell already ends with this same assignment.
submission["predictions"] = pred["pred"]

In [None]:
# Display the submission table for a visual check before exporting it.
submission

In [None]:
from google.colab import files
# Write the table without the index column, then download it (requires the Colab runtime).
submission.to_csv("submission.csv", index=False)
files.download("submission.csv")