<a href="https://colab.research.google.com/github/kostyayatsok/IDAO-2022/blob/main/IDAO_2022_ALIGNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [IDAO-2022](https://idao.world/)
[IDAO 2022](https://idao.world/) solution by team182. Based on [ALIGNN](https://github.com/usnistgov/alignn).

In [None]:
!pip install -qqq alignn

In [None]:
import os
# Clone ALIGNN repo to get example folder
if not os.path.exists('alignn'):
  !git clone https://github.com/usnistgov/alignn.git

os.chdir('alignn')
# Install using setup.py in case pip didn't work
# !python setup.py develop

!pip install dgl-cu111 # Colab has cuda 11.1

# Prepare data

In [None]:
# Fetch the IDAO-2022 repository, which contains the competition data archives.
!git clone https://github.com/HSE-LAMBDA/IDAO-2022.git
# Unpack the public dichalcogenides archive in place (inside IDAO-2022/data).
!cd IDAO-2022/data && tar -xf dichalcogenides_public.tar.gz
# Extra Python dependencies (pymatgen is imported by the conversion cell).
%pip install -qqq pymatgen wandb

In [None]:
import yaml
import json

import pandas as pd
import numpy as np
from pathlib import Path
from pymatgen.core import Structure
import os

def convert_to_POSCAR(file):
    """Write the structure stored in a JSON file out as a POSCAR file.

    The JSON content is turned into a pymatgen ``Structure`` via
    ``Structure.from_dict``. The output path is derived from the input path by
    replacing the parent directory name with ``POSCAR`` and the file name with
    ``<stem>.vasp`` (e.g. ``root/structures/abc.json`` -> ``root/POSCAR/abc.vasp``).

    Args:
        file: ``pathlib.Path`` of the JSON file; its ``.parts`` and ``.stem``
            attributes are used, so a plain string is not accepted.
    """
    with open(file, "r") as handle:
        structure = Structure.from_dict(json.load(handle))

    # Rewrite the last two path components: directory -> "POSCAR",
    # file name -> "<stem>.vasp".
    segments = list(file.parts)
    segments[-1] = file.stem + ".vasp"
    segments[-2] = "POSCAR"
    destination = Path(*segments)

    structure.to(fmt="poscar", filename=destination)

def convert_dataset_to_POSCAR(dataset_path):
    """Convert every structure of a dataset folder into POSCAR files.

    Recreates ``<dataset_path>/POSCAR`` from scratch, then:

    * if ``<dataset_path>/targets.csv`` can be read and has an ``_id`` column,
      writes ``POSCAR/id_prop.csv`` (no header, no index) with ``.vasp``
      appended to every id;
    * converts each entry of ``<dataset_path>/structures`` with
      ``convert_to_POSCAR``.

    Args:
        dataset_path: Dataset root directory (``str`` or ``pathlib.Path``).
    """
    import shutil  # local import so this cell does not depend on another cell's imports

    dataset_path = Path(dataset_path)
    poscar_dir = dataset_path / "POSCAR"

    # Start from a clean output directory. A shell `rm -r dataset_path/"POSCAR"`
    # would receive the literal text "dataset_path" (no {}/$ interpolation), so
    # stale files from a previous run would survive; remove the real path instead.
    shutil.rmtree(poscar_dir, ignore_errors=True)
    os.makedirs(poscar_dir, exist_ok=True)

    try:
        targets = pd.read_csv(dataset_path / "targets.csv")
        targets["_id"] = targets["_id"] + ".vasp"
        targets.to_csv(poscar_dir / "id_prop.csv", index=False, header=False)
    except Exception as exc:
        # The label file is best-effort (targets.csv may be absent or may lack
        # an `_id` column), but say so instead of failing silently.
        print(f"Skipping id_prop.csv for {dataset_path}: {exc!r}")

    for item in (dataset_path / "structures").iterdir():
        convert_to_POSCAR(item)


# Convert the public dichalcogenides dataset; the output lands in its POSCAR/ subfolder.
convert_dataset_to_POSCAR('./IDAO-2022/data/dichalcogenides_public/')

# Train a model

The `train_folder.py` command-line script is used below to train the model.

In [None]:
!rm -r temp

In [None]:
%%writefile config.json
{
    "version": "112bbedebdaecf59fb18e11c929080fb2f358246",
    "dataset": "user_data",
    "target": "target",
    "atom_features": "cgcnn",
    "neighbor_strategy": "k-nearest",
    "id_tag": "jid",
    "random_seed": 123,
    "classification_threshold": null,
    "n_val": null,
    "n_test": null,
    "n_train": null,
    "train_ratio": 0.8,
    "val_ratio": 0.1,
    "test_ratio": 0.1,
    "target_multiplication_factor": null,
    "epochs": 200,
    "batch_size": 8,
    "weight_decay": 1e-05,
    "learning_rate": 0.001,
    "filename": "sample",
    "warmup_steps": 2000,
    "criterion": "mse",
    "optimizer": "adamw",
    "scheduler": "onecycle",
    "pin_memory": false,
    "save_dataloader": false,
    "write_checkpoint": true,
    "write_predictions": false,
    "store_outputs": true,
    "progress": true,
    "log_tensorboard": false,
    "standard_scalar_and_pca": false,
    "use_canonize": true,
    "num_workers": 0,
    "cutoff": 8.0,
    "max_neighbors": 12,
    "keep_data_order": false,
    "model": {
        "name": "alignn",
        "alignn_layers": 4,
        "gcn_layers": 4,
        "atom_input_features": 92,
        "edge_input_features": 80,
        "triplet_input_features": 40,
        "embedding_features": 64,
        "hidden_features": 128,
        "output_features": 1,
        "link": "identity",
        "zero_inflated": false,
        "classification": false
    }
}

In [None]:
# Train on the converted public dataset with the settings from config.json;
# everything the trainer writes goes to temp/.
!train_folder.py --root_dir "IDAO-2022/data/dichalcogenides_public/POSCAR/" --config config.json --output_dir=temp

In [None]:
import glob
import os
import re


def find_latest_checkpoint(pattern="temp/*.pt"):
    """Return ``(epoch, path)`` of the highest-numbered checkpoint matching ``pattern``.

    Only files whose name ends in ``_<integer>.pt`` are considered; any other
    ``.pt`` file matched by the glob is skipped rather than crashing the
    integer parse.

    Args:
        pattern: Glob pattern of candidate checkpoint files.

    Returns:
        Tuple ``(epoch, path)``; ``(0, None)`` when no numbered checkpoint exists.
    """
    best_epoch, best_path = 0, None
    for path in glob.glob(pattern):
        match = re.search(r"_(\d+)\.pt$", os.path.basename(path))
        if match is None:
            continue
        epoch = int(match.group(1))
        if best_path is None or epoch > best_epoch:
            best_epoch, best_path = epoch, path
    return best_epoch, best_path


latest, weight_ = find_latest_checkpoint("temp/*.pt")
if weight_ is None:
    # Later cells need a real path; make the problem visible right here.
    print("No checkpoint named temp/*_<epoch>.pt was found - did training finish?")

In [None]:
# Offer the selected checkpoint file as a download (Colab-only helper).
from google.colab import files
files.download(weight_)

In [None]:
import json
import pandas as pd
# Header-less label table written by convert_dataset_to_POSCAR: column 0 holds
# "<id>.vasp" file names, column 1 is used as the target value further below.
true = pd.read_csv("IDAO-2022/data/dichalcogenides_public/POSCAR/id_prop.csv", header=None)

In [None]:
from alignn.models.alignn import ALIGNN, ALIGNNConfig
from alignn.pretrained import get_multiple_predictions
import torch
from jarvis.db.jsonutils import loadjson
from alignn.config import TrainingConfig
from jarvis.core.atoms import Atoms

# Prefer the first GPU but fall back to CPU so the cell also runs on a
# runtime without CUDA.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Build the network from the same "model" section that was used for training,
# so the layer sizes are guaranteed to match the checkpoint.
# NOTE(review): library defaults of ALIGNNConfig may differ from config.json
# (e.g. hidden_features) - confirm against the installed alignn version.
with open("config.json") as f:
    model_settings = json.load(f)["model"]

model = ALIGNN(ALIGNNConfig(**model_settings))
model.load_state_dict(torch.load(weight_, map_location=device)["model"])
model.to(device)
model.eval()

import glob
# Load the structures in the same order as the label table.
atoms_array = []
for name in true[0]:
    i = f"IDAO-2022/data/dichalcogenides_public/POSCAR/{name}"
    atoms = Atoms.from_poscar(i)
    atoms_array.append(atoms)
get_multiple_predictions(model=model, atoms_array=atoms_array)

# The predictions are read back from pred_data.json.
with open("pred_data.json") as f:
    data = json.load(f)
pred = pd.DataFrame.from_dict(data)
pred = pred.drop(columns="atoms")

# The metric cell compares pred and true row by row; fail loudly on a mismatch.
assert len(pred) == len(true), f"{len(pred)} predictions for {len(true)} labelled structures"

In [None]:
# Offer the raw prediction file as a download (uses `files` imported from google.colab above).
files.download("pred_data.json")

In [None]:
import numpy as np
def energy_within_threshold(prediction, target, e_thresh=0.02):
    """Fraction of systems whose absolute energy error is below a threshold.

    Args:
        prediction: Predicted energies (array-like: list, ndarray, Series, ...).
        target: Reference energies, same layout as ``prediction``.
        e_thresh: Maximum absolute error (exclusive) that counts as a success.
            Defaults to 0.02.

    Returns:
        ``success / total`` as a float, where ``total`` is the length of the
        first axis of ``target``.

    Raises:
        ZeroDivisionError: If ``target`` is empty.
    """
    # Convert up front so plain Python lists work and the comparison is
    # positional rather than dependent on pandas index alignment.
    prediction = np.asarray(prediction)
    target = np.asarray(target)

    # Absolute error on energy per system.
    error_energy = np.abs(target - prediction)

    # Count the systems whose error is strictly below the threshold.
    success = np.count_nonzero(error_energy < e_thresh)
    total = target.shape[0]
    return success / total
# Displayed pair: (fraction of predictions within the energy threshold, mean absolute error).
energy_within_threshold(pred.pred, true[1]), np.mean(np.abs(pred.pred - true[1]))

# Prepare test data and predict

In [None]:
# Download the checkpoint file checkpoint_200.pt into the current directory.
!wget https://github.com/kostyayatsok/IDAO-2022/raw/main/checkpoint_200.pt

In [None]:
# Unpack the private dichalcogenides archive in place (inside IDAO-2022/data).
!cd IDAO-2022/data && tar -xf dichalcogenides_private.tar.gz

In [None]:
# Put the sample submission in place of targets.csv for the private set.
# NOTE(review): convert_dataset_to_POSCAR reads an `_id` column from targets.csv;
# confirm whether submission.csv has one, otherwise no id_prop.csv is produced.
!cp IDAO-2022/submission.csv IDAO-2022/data/dichalcogenides_private/targets.csv

In [None]:
# Convert the private dichalcogenides dataset; the output lands in its POSCAR/ subfolder.
convert_dataset_to_POSCAR('./IDAO-2022/data/dichalcogenides_private/')

In [None]:
import json
import pandas as pd
# Submission template; its "id" column drives the order of the predictions below.
submission = pd.read_csv("IDAO-2022/submission.csv")

In [None]:
from alignn.models.alignn import ALIGNN, ALIGNNConfig
from alignn.pretrained import get_multiple_predictions
import torch
from jarvis.db.jsonutils import loadjson
from alignn.config import TrainingConfig
from jarvis.core.atoms import Atoms

weight_ = "./checkpoint_200.pt"

# Prefer the first GPU but fall back to CPU so the cell also runs on a
# runtime without CUDA.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# NOTE(review): the architecture built here must match the one the downloaded
# checkpoint was trained with - confirm that the ALIGNNConfig defaults do.
model = ALIGNN(ALIGNNConfig(name="alignn", output_features=1))
model.load_state_dict(torch.load(weight_, map_location=device)["model"])
model.to(device)
model.eval()

import glob
# Load the structures in the order of the submission template.
atoms_array = []
for name in submission["id"]:
    i = f"IDAO-2022/data/dichalcogenides_private/POSCAR/{name}.vasp"
    atoms = Atoms.from_poscar(i)
    atoms_array.append(atoms)
get_multiple_predictions(model=model, atoms_array=atoms_array)

# The predictions are read back from pred_data.json.
with open("pred_data.json") as f:
    data = json.load(f)
pred = pd.DataFrame.from_dict(data)
pred = pred.drop(columns="atoms")

# A length mismatch would otherwise be hidden by pandas index alignment
# (missing rows silently become NaN in the submission).
assert len(pred) == len(submission), f"{len(pred)} predictions for {len(submission)} submission rows"

# Assign by position: row k of pred belongs to row k of submission.
submission["predictions"] = pred["pred"].to_numpy()

In [None]:
# NOTE(review): repeats the assignment already performed at the end of the previous cell.
submission["predictions"] = pred["pred"]

In [None]:
# Display the filled-in submission table for a visual sanity check.
submission

In [None]:
from google.colab import files
# Write the submission without the index column, then offer it as a download.
submission.to_csv("submission.csv", index=False)
files.download("submission.csv")