# Catalysis-hub database query

This notebook performs a query on the [Catalysis-hub](https://catalysis-hub.org) database to acquire a dataset of heterogeneous catalysis reactions. The data comprises keyword features, such as the names of the reactants and products, the chemical composition, reaction and activation energies, as well as the complete atomic structures of the reaction configurations.

In [1]:
# Imports
import numpy as np
import requests
import json
import io
import os
import ase.io
from ase.io import read

# Project root: the directory the notebook kernel is currently running in
ROOT_DIR = os.path.abspath(os.curdir)
# Endpoint of the Catalysis-hub GraphQL API that all queries are posted to
GRAPHQL = "http://api.catalysis-hub.org/graphql"

In [2]:
# Reaction-level fields requested for every reaction node in the query.
# The order below is the order in which the fields appear in the query string.
KEY_VALUES = [
    # Surface description
    "chemicalComposition", "surfaceComposition", "facet", "sites", "coverages",
    # Reaction description
    "reactants", "products", "Equation",
    # Energetics
    "reactionEnergy", "activationEnergy",
    # Calculation settings
    "dftCode", "dftFunctional",
    # Provenance and identifiers
    "username", "pubId", "id",
]

In [3]:
def query_reactions(endcursor):
    """
    The function performs a batch query on the database. The database is
    queried in batches of 50 reactions, and all reactions with an
    activation energy under 100 eV are selected.
    
    Parameters:
      endcursor:  Cursor to indicate the batch on the query. It is inserted
                  verbatim as the 'after' argument of the query.
    Returns:
      data:       The acquired data on each batch, i.e. the "data" member
                  of the JSON response.
    Raises:
      RuntimeError:  If the response is not valid JSON or carries no "data"
                     member (for example when the server rejects the query).
    """
    
    # Define the query string
    query_string = "{"
    query_string += f'reactions(first: 50, after: "{endcursor}"'
    query_string += ', activationEnergy: 100, op: "<"'
    query_string += """) {
  totalCount
  pageInfo {
    endCursor
  }
  edges {
    node {"""
  
    # Add the keywords into the query string, one indented field per line
    query_string += "".join("\n" + " " * 6 + key_value for key_value in KEY_VALUES)
        
    query_string += """
      systems {
        id
        Trajdata
        energy
        InputFile(format: "xyz")
        keyValuePairs
      }
    }
  }
}}"""
    
    response = requests.post(GRAPHQL, {"query": query_string})
    try:
        # Decode the response body; invalid JSON surfaces as a ValueError subclass
        payload = response.json()
    except ValueError as e:
        # Fail loudly here instead of handing the raw Response object to the
        # caller, which would only break later with an unrelated-looking error
        raise RuntimeError(
            f"Error: Something went wrong (HTTP {response.status_code}, "
            "response is not valid JSON). Please check your query string."
        ) from e
    
    # Only a missing/null "data" member is fatal; partial data is still returned
    data = payload.get("data") if isinstance(payload, dict) else None
    if data is None:
        errors = payload.get("errors") if isinstance(payload, dict) else payload
        raise RuntimeError(
            f"Error: Something went wrong ({errors}). "
            "Please check your query string."
        )
    
    return data

In [4]:
def parse_reaction(reaction):
    """
    The function parses a single reaction. All the missing keyvalues are
    labeled as 'None', and the structural data is saved separately.
    
    Parameters:
      reaction:       A dictionary containing the data for a single reaction.
                      It must contain a "systems" entry holding the list of
                      structures, each with "energy", "InputFile" and
                      "keyValuePairs".
    Returns:
      reaction_dict:  A parsed data dictionary with two entries:
                      "key_value_pairs" (one value per name in KEY_VALUES)
                      and "structures" (a list of per-structure dictionaries).
    """
    
    reaction_dict = {}
    
    # Go through the keyvalues. A key that is absent from the reaction is
    # labeled with the string "None" (a dict lookup raises KeyError, so a
    # default-valued lookup is used rather than catching an exception).
    key_value_pairs = {
        key_value: reaction.get(key_value, "None") for key_value in KEY_VALUES
    }
    
    # Null values of these two fields are replaced by the "None" label as
    # well; null values of all other fields are kept as they are.
    for key in ("coverages", "sites"):
        if key in key_value_pairs and key_value_pairs[key] is None:
            key_value_pairs[key] = "None"
    reaction_dict["key_value_pairs"] = key_value_pairs
    
    # Go through the structural data, keeping only the fields that are saved
    reaction_dict["structures"] = [
        {
            "energy": structure["energy"],
            "InputFile": structure["InputFile"],
            "keyValuePairs": structure["keyValuePairs"],
        }
        for structure in reaction["systems"]
    ]
    
    return reaction_dict

In [5]:
# Run queries and save results to file
BATCH_SIZE = 50  # page size; must match the 'first: 50' used inside query_reactions

reaction_list = {}   # maps each reaction id to its parsed reaction dictionary
N_fetched = 0        # running number of reactions received so far
endcursor = ""       # an empty cursor requests the first page
n = 0                # number of completed batches
totalcount = None    # not known until the first response has arrived

# Keep paging until every reaction reported by the server has been requested.
# (n * BATCH_SIZE reactions have been requested after n batches, so the loop
# must continue while that number is still below the total.)
while totalcount is None or n * BATCH_SIZE < totalcount:
    data = query_reactions(endcursor)
    edges = data["reactions"]["edges"]
    totalcount = data["reactions"]["totalCount"]
    if not edges:
        # An empty page means there is nothing left to read; stop here rather
        # than requesting the same cursor over and over
        break
    for reaction in edges:
        reaction = reaction["node"]
        reaction_list[reaction["id"]] = parse_reaction(reaction)
    endcursor = data["reactions"]["pageInfo"]["endCursor"]
    N_fetched += len(edges)
    # Upper index of this batch, capped at the total on the last page
    count = min(BATCH_SIZE * (n + 1), totalcount)
    print(f"Fetched reactions {BATCH_SIZE*n+1}-{count}/{totalcount}")
    n += 1
print("Done!")

# Make sure the output directory exists so the downloaded data is not lost
output_path = f"{ROOT_DIR}/data/reactions_cathub.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w") as outfile:
    json.dump(reaction_list, outfile)

Fetched reactions 1-50/1431
Fetched reactions 51-100/1431
Fetched reactions 101-150/1431
Fetched reactions 151-200/1431
Fetched reactions 201-250/1431
Fetched reactions 251-300/1431
Fetched reactions 301-350/1431
Fetched reactions 351-400/1431
Fetched reactions 401-450/1431
Fetched reactions 451-500/1431
Fetched reactions 501-550/1431
Fetched reactions 551-600/1431
Fetched reactions 601-650/1431
Fetched reactions 651-700/1431
Fetched reactions 701-750/1431
Fetched reactions 751-800/1431
Fetched reactions 801-850/1431
Fetched reactions 851-900/1431
Fetched reactions 901-950/1431
Fetched reactions 951-1000/1431
Fetched reactions 1001-1050/1431
Fetched reactions 1051-1100/1431
Fetched reactions 1101-1150/1431
Fetched reactions 1151-1200/1431
Fetched reactions 1201-1250/1431
Fetched reactions 1251-1300/1431
Fetched reactions 1301-1350/1431
Fetched reactions 1351-1400/1431
Fetched reactions 1401-1431/1431
Done!
