# Dataset Construction for Accelerated Chemical Space Exploration  

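This notebook uses the Materials Project API to build a dataset of ABC3 compounds in space group 221, together with their computed band gaps, lattice parameters, and calculation run types. The resulting table is intended to speed up exploration of chemical space and make experimentation in bandgap prediction more efficient.  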


In [10]:
import os
from collections import Counter

import numpy as np
import pandas as pd
from mp_api.client import MPRester


In [None]:
# Read the Materials Project API key from the MP_API_KEY environment variable
# so the real key never has to be written into (or committed with) the notebook.
# The original placeholder is kept as a fallback so the cell behaves as before
# when the variable is not set.
API_Key = os.environ.get('MP_API_KEY', 'yourAPIkey')
mpr = MPRester(API_Key)

In [12]:
# Element symbols that can be excluded from queries, listed in the same order
# as before (lead through oganesson). The string is split on whitespace, so the
# result is a plain list of symbol strings.
elements_to_discard = (
    # lead, bismuth, polonium, astatine, radon
    "Pb Bi Po At Rn "
    # francium, radium
    "Fr Ra "
    # actinium, thorium, protactinium, uranium, neptunium, plutonium,
    # americium, curium, berkelium, californium, einsteinium, fermium,
    # mendelevium, nobelium, lawrencium
    "Ac Th Pa U Np Pu Am Cm Bk Cf Es Fm Md No Lr "
    # rutherfordium, dubnium, seaborgium, bohrium, hassium, meitnerium,
    # darmstadtium, roentgenium, copernicium
    "Rf Db Sg Bh Hs Mt Ds Rg Cn "
    # nihonium, flerovium, moscovium, livermorium, tennessine, oganesson
    "Nh Fl Mc Lv Ts Og"
).split()


In [36]:
# Query the Materials Project summary endpoint for compounds with the
# anonymous formula ABC3 in space group 221, restricted to the band-gap range
# (0.1, None) -- presumably a lower bound with no upper bound.
# NOTE(review): only the first five entries of elements_to_discard are passed
# as exclusions; the remaining entries are not applied to this query. Confirm
# whether that is intentional (e.g. a limit on the API side).
docs = mpr.materials.summary.search(
    fields=[
        "formula_pretty",
        "material_id",
        "band_gap",
        "structure",
        "is_gap_direct",
        "theoretical",
    ],
    formula="ABC3",
    spacegroup_number=221,
    band_gap=(0.1, None),
    exclude_elements=elements_to_discard[:5],
)


Retrieving SummaryDoc documents: 100%|██████████| 194/194 [00:00<00:00, 1331743.00it/s]


In [14]:
def generate_formula(sites):
    """Build a reduced formula string with a fixed element ordering.

    Each element's sites are counted, the counts are divided by their greatest
    common divisor, and the formula tokens are emitted in this order:

    1. elements that have at least one site at the cell origin,
    2. all other elements whose reduced count is not 3,
    3. elements whose reduced count is exactly 3.

    Within each group, elements keep the order in which they first appear in
    ``sites``. A count of 1 is omitted from the token (``"Sr"``, not ``"Sr1"``).

    Parameters
    ----------
    sites : iterable
        Objects exposing ``species_string`` (element label) and
        ``frac_coords`` (fractional coordinates, array-like).

    Returns
    -------
    str
        The ordered, reduced formula, or ``''`` when ``sites`` is empty.
    """
    # Tolerance for deciding that a fractional coordinate is "at the origin".
    origin_atol = 1e-6

    def _at_origin(frac_coords):
        """Return True if the coordinates equal the origin up to lattice translations."""
        coords = np.asarray(frac_coords, dtype=float)
        # Subtracting the nearest integer folds periodic images (1.0, -1e-17,
        # 0.9999999...) onto 0 before the tolerance comparison.
        return bool(np.allclose(coords - np.round(coords), 0.0, atol=origin_atol))

    element_counts = Counter()
    origin_elements = set()
    for site in sites:
        element = site.species_string
        element_counts[element] += 1
        if _at_origin(site.frac_coords):
            origin_elements.add(element)

    # Nothing to describe; avoids reducing over an empty sequence below.
    if not element_counts:
        return ''

    # Reduce by the greatest common divisor of all counts (not just powers of
    # two), so e.g. a 3/3/9 cell becomes 1/1/3.
    divisor = int(np.gcd.reduce(list(element_counts.values())))
    reduced_counts = {elem: count // divisor for elem, count in element_counts.items()}

    origin_parts = []
    other_parts = []
    count_three_parts = []
    # Iterating the counter (insertion-ordered) instead of the set keeps the
    # output deterministic when several elements sit at the origin.
    for elem, count in reduced_counts.items():
        token = elem + (str(count) if count > 1 else '')
        if elem in origin_elements:
            origin_parts.append(token)
        elif count == 3:
            count_three_parts.append(token)
        else:
            other_parts.append(token)

    return ''.join(origin_parts + other_parts + count_three_parts)


In [97]:
# Collect one plain record per summary document and build the frame in a single
# call, instead of creating a one-row DataFrame per document and concatenating.
# Only the first lattice length (abc[0]) is stored as the lattice parameter.
rows = [
    {
        'formula': generate_formula(doc.structure.sites),
        'mp_id': str(doc.material_id),
        'lattice_parameter': float(doc.structure.lattice.abc[0]),
        'mp_bandgap': float(doc.band_gap),
        'direc_bandgap': bool(doc.is_gap_direct),
    }
    for doc in docs
]

# Passing the columns explicitly keeps the schema intact even when `docs` is
# empty (pd.concat on an empty list would raise instead).
band_gap_df = pd.DataFrame(
    rows,
    columns=['formula', 'mp_id', 'lattice_parameter', 'mp_bandgap', 'direc_bandgap'],
)

# Look up the task documents for the collected ids. "task_id" is requested in
# addition to "calcs_reversed" so each result can be matched back to its row.
mp_ids = list(band_gap_df['mp_id'])
tasks_doc = (
    mpr.materials.tasks.search(mp_ids, fields=["task_id", "calcs_reversed"])
    if mp_ids
    else []
)

# Join run types by id rather than by position: the original list assignment
# assumed the API returns exactly one task per id, in request order.
# Ids with no matching task end up as NaN instead of shifting every row.
run_type_by_id = {str(task.task_id): str(task.run_type) for task in tasks_doc}
band_gap_df['functional'] = band_gap_df['mp_id'].map(run_type_by_id)

# The index is written as the first CSV column (pandas default), as before.
band_gap_df.to_csv('Perovskite_221_bg_mp.csv')
display(band_gap_df)

Retrieving TaskDoc documents: 100%|██████████| 194/194 [00:00<00:00, 3376327.70it/s]


Unnamed: 0,formula,mp_id,lattice_parameter,mp_bandgap,direc_bandgap,functional
0,AcAlO3,mp-1183115,3.858634,4.1024,True,GGA
1,AcBO3,mp-1183052,3.721668,0.8071,False,GGA
2,AcCrO3,mp-866101,3.944287,2.0031,False,GGA
3,AcFeO3,mp-861502,3.953570,0.9888,False,GGA
4,AcGaO3,mp-1183053,3.946262,2.8959,False,GGA
...,...,...,...,...,...,...
189,YCoO3,mp-1435972,3.785831,0.8330,False,GGA
190,YCrO3,mp-18770,3.853879,1.7679,False,GGA
191,YFeO3,mp-1416093,3.883222,0.6373,False,GGA
192,AgZnF3,mp-14099,4.073079,1.5852,False,GGA


In [None]:
# Spot check: query the task documents for every id stored in band_gap_df and
# print the run type reported by the first returned task.
tasks_doc = mpr.materials.tasks.search(
    band_gap_df['mp_id'].tolist(),  # all ids collected in the dataset
    fields=["calcs_reversed"],
)
run_type = str(tasks_doc[0].run_type)
print(run_type)

Retrieving TaskDoc documents: 100%|██████████| 194/194 [00:00<00:00, 2650472.23it/s]

GGA



