## Install necessary packages

In [None]:
# Step 1: pip-install the base packages (pandas is pinned to 1.5.3; the others are unpinned).
!pip install rdkit
!pip install condacolab
!pip install pandas==1.5.3
!pip install molsets
# exit() ends the current kernel session. Presumably this forces a runtime restart so the
# freshly installed packages are picked up -- TODO confirm. Nothing after an exit() runs in
# the same execution, so each exit() is presumably meant to be the end of its own cell.
exit()
# Step 2: run condacolab's installer; conda is needed by the `conda install` line in step 3.
import condacolab
condacolab.install()
exit()
# Step 3: replace the preinstalled torch with the CUDA 11.7 build of torch 1.13.1 and add the
# PyTorch Geometric stack built against it. pymatgen and openbabel come from conda-forge.
# NOTE(review): `import torch` is not used by any line in this step -- confirm whether it is needed.
import torch
!pip uninstall torch -y
!conda install -c conda-forge pymatgen openbabel -y
!pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
# The PyG extension wheels are taken from the torch-1.13.0+cu117 wheel index.
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-1.13.0+cu117.html
!pip install torch_geometric
exit()
# Step 4: remaining Python dependencies (all unpinned), plus tensorflow/docs installed from GitHub.
!pip install torchmetrics tqdm transformers wandb apex curtsies fairscale imageio joblib Pillow pytorch_lightning PyYAML ray scikit-learn ipykernel docopt seaborn
!pip install git+https://github.com/tensorflow/docs
exit()

## Clone the GHP-MOFAssembly GitHub repository

In [None]:
!git clone https://github.com/hyunp2/ghp_mof.git

## Generate new MOF structures
In this example, we generate Cu paddlewheel MOFs at four catenation levels (cat0/cat1/cat2/cat3).

In [None]:
# Update the local checkout, then run the repository's generate_mofs.sh from inside ghp_mof/.
# The steps are chained with `&&`, so a failing `cd` or `git pull` prevents the script from running.
!cd ghp_mof && git pull && bash generate_mofs.sh

## Predict the CO2 capacities of the generated MOF structures using a modified version of the CGCNN model

In [None]:
# Run the repository's `main` module in inference mode (`--which_mode infer`) with the CGCNN
# backbone on the structures under MOFs/MOFs_all, on GPU, with batch size 2.
# Checkpoints are loaded from `models`; three checkpoint names are passed via --ensemble_names.
# The results are presumably written to publication_figures/ensemble_property_prediction.csv,
# which the next cell reads -- TODO confirm against main.py.
!cd ghp_mof && git pull && python -m main \
--which_mode infer \
--dataset cifdata \
--backbone cgcnn \
--batch_size 2 \
--load_ckpt_path models \
--name cgcnn_pub_hmof_0.1 \
--gpu \
--train_frac 1 \
--data_dir_crystal MOFs/MOFs_all \
--ensemble_names cgcnn_pub_hmof_0.1 cgcnn_pub_hmof_0.1_dgx cgcnn_pub_hmof_0.1_v2

In [None]:
# Remove the dummy 'real' column from the ensemble prediction table, in place on disk.
#
# The first CSV column is loaded as the index and the table is written back with
# index=False, so that first column is dropped from the file together with 'real'.
# NOTE(review): confirm the first column is a disposable row index and not an identifier.
#
# The rewrite is guarded so the cell is safe to re-run: once 'real' is gone the file is
# left untouched, instead of raising KeyError after consuming another column as the index.
import pandas as pd

predictions_csv = "ghp_mof/publication_figures/ensemble_property_prediction.csv"
predictions = pd.read_csv(predictions_csv, index_col=0)
if 'real' in predictions.columns:
    # First run on a fresh inference output: strip the column and overwrite the file.
    predictions = predictions.drop(['real'], axis=1)
    predictions.to_csv(predictions_csv, index=False)
else:
    # Already cleaned by an earlier run: reload as-is so no data column is used as the index.
    predictions = pd.read_csv(predictions_csv)

## Metrics

In [None]:
# Calculate SAscore and SCscore
# The output is saved at output_for_assembly/n_atoms_5/sc_sa_score/CuCu_linker.csv
# (path presumably relative to ghp_mof/, where the script is run -- confirm)
!cd ghp_mof && git pull && python calc_SAscore_SCscore.py

In [None]:
# Calculate Moses metrics, including validity, uniqueness and internal diversity
# The output is saved at output_for_assembly/n_atoms_5/metrics/CuCu.txt
# (path presumably relative to ghp_mof/, where the script is run -- confirm)
!cd ghp_mof && git pull && python calc_moses_metrics.py

### Generate figures in the paper

In [None]:
# Plot the empirical cumulative distribution functions of the hMOF dataset
# Plots located at ./publication_figures/hMOF
# (path presumably relative to ghp_mof/, where the script is run -- confirm)
!cd ghp_mof && git pull && python analyze_hMOF.py

In [None]:
# Plot the confusion matrix of the pre-trained model's predictions on the 10% test set, and the
# empirical cumulative distribution functions of its predictions on the generated structures
# Plots located at ./publication_figures/regression_model_inference_MOF and ./publication_figures/regression_model_inference_generated
# (paths presumably relative to ghp_mof/, where the script is run -- confirm)
!cd ghp_mof && git pull && python analyze_regression_model.py

In [None]:
# Plot the distributions of the SAscore and SCscore of the generated linkers
# Plots located at ./publication_figures/generated_linkers
# (path presumably relative to ghp_mof/, where the script is run -- confirm)
!cd ghp_mof && git pull && python analyze_linkers.py

### Optional: Visualization of linker generation process using DiffLinker

In [None]:
# Sample linkers with DiffLinker: run difflinker_sample_and_analyze.py from inside
# ghp_mof/DiffLinker on the GEOM multi-fragment test fragments, with Python warnings
# suppressed (-W ignore). The geom/samples and geom/output directories passed here are
# the ones read by the visualization cell below.
!cd ghp_mof/DiffLinker && git pull && python -W ignore difflinker_sample_and_analyze.py \
--fragments geom/datasets/geom_multifrag_test_frag.sdf \
--model checkpoints/geom_difflinker.ckpt \
--linker_size checkpoints/geom_size_gnn.ckpt \
--output geom/output \
--samples_dir geom/samples

In [None]:
# Show, for every sampled molecule, the generation GIF followed by the final structure.
#
# For each directory geom/samples/mol_<id> this cell embeds output.gif inline, then reads
# geom/output/output_<id>_geom_multifrag_test_frag_.xyz with Open Babel, takes the last
# structure in that file, converts it to SDF and displays it as an RDKit molecule.
import os, base64, re, tempfile
from openbabel.pybel import readfile
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from IPython.display import HTML, display
IPythonConsole.ipython_useSVG=True

workdir = "ghp_mof/DiffLinker"
samples_dir = os.path.join(workdir, "geom/samples")
output_dir = os.path.join(workdir, "geom/output")

# Map numeric id -> actual directory name. Only directories named exactly mol_<digits>
# are accepted, so stray folders such as .ipynb_checkpoints no longer crash int(), and
# the real directory name is reused instead of being rebuilt from the integer.
sample_dirs = {}
for entry in os.listdir(samples_dir):
    match = re.fullmatch(r"mol_(\d+)", entry)
    if match and os.path.isdir(os.path.join(samples_dir, entry)):
        sample_dirs[int(match.group(1))] = entry
mol_ids = sorted(sample_dirs)

for mol_id in mol_ids:
    gif_path = os.path.join(samples_dir, sample_dirs[mol_id], "output.gif")
    print(gif_path)
    # Read through a context manager so the file handle is closed promptly.
    with open(gif_path, 'rb') as gif_file:
        b64 = base64.b64encode(gif_file.read()).decode('ascii')
    display(HTML(f'<img src="data:image/gif;base64,{b64}" />'))

    xyz = "output_" + str(mol_id) + "_geom_multifrag_test_frag_.xyz"
    xyz_path = os.path.join(output_dir, xyz)
    print(xyz_path)
    frames = list(readfile("xyz", xyz_path))
    if not frames:
        # An empty xyz file would otherwise raise IndexError on [-1]; report it and move on.
        print("no structures found in " + xyz_path + "\n")
        continue
    mol = frames[-1]

    # Convert via a private temporary directory: nothing is written to the working
    # directory and the file is removed even if the conversion raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        sdf_path = os.path.join(tmp_dir, "tmp.sdf")
        mol.write("sdf", sdf_path, overwrite=True)
        rdmol = next(iter(Chem.SDMolSupplier(sdf_path)))
    display(rdmol)
    print("\n")