In [1]:
# Download cif files of materials from materialsproject.org

from mp_api.client import MPRester
import pandas as pd

import os.path as osp
import kogger    # a tool designed for logging
from tqdm import tqdm

In [2]:
def read_material_ids(data_path):
    df = pd.read_csv(data_path)
    kogger.info('Read {} samples from {}'.format(len(df), data_path))
    material_ids = df['ID'].tolist()                # access the column named 'ID'
    return material_ids

In [3]:
def download_materials(material_ids, save_dir, label):
    kogger.info('Download...') 
    my_api_key = ' '
    with MPRester(my_api_key) as mpr:
        ids = []

        # Fetch data in batches
        docs = []
        batch_size=10000
        batches = [material_ids[i:i + batch_size] for i in range(0, len(material_ids), batch_size)]
        for batch in batches:
            results = mpr.materials.bonds.search(material_ids=batch, fields=['material_id', 'structure_graph'])
            docs.extend(results)
  
        length = len(docs)
        for idx in tqdm(range(length)):
            struc = docs[idx].structure_graph.structure
            id = docs[idx].material_id
            ids.append(id)
            struc.to(filename=osp.join(save_dir, '{}.cif'.format(id)))  # save cif files, if uncommenting, no cif files

        # append to 'data/id_prop_{}.csv'
        labels = [label] * len(ids)                # create a list of labels
        df = pd.DataFrame(list(zip(ids, labels)))  # zipped list, then DataFrame for csv files
        path = osp.join(save_dir, 'id_prop_{}.csv'.format(label))
        kogger.info('Appending to {}'.format(path))
        df.to_csv(path, header=None, index=None, mode='a')  # save csv files

In [5]:
def main():
    # Define configurations for each type of dataset
    configs = [
    {"label": 1, "data_path": 'positive.csv', "save_dir": r'D:\data_big\P4_20240624\positive'},    
    {"label": 0, "data_path": 'negative.csv', "save_dir": r'D:\data_big\P4_20240624\negative'},      
    {"label": -1, "data_path": 'candidate.csv', "save_dir": r'D:\data_big\P4_20240624\candidate'},    
    {"label": 1, "data_path": 'test_positive.csv', "save_dir": r'D:\data_big\P4_20240624\test_positive'}, 
    {"label": 0, "data_path": 'test_negative.csv', "save_dir": r'D:\data_big\P4_20240624\test_negative'}
    ]
    
    for config in configs:
        data_path = config["data_path"]
        save_dir = config["save_dir"]
        label = config["label"]
    
        material_ids = read_material_ids(data_path)
        
        download_materials(material_ids, save_dir, label)
        
main()

[2024-06-24 22:15:55] [INFO] Read 90 samples from positive.csv
[2024-06-24 22:15:55] [INFO] Download...


Retrieving BondingDoc documents:   0%|          | 0/90 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 407.35it/s]


[2024-06-24 22:15:56] [INFO] Appending to D:\data_big\P4_20240624\positive\id_prop_1.csv
[2024-06-24 22:15:56] [INFO] Read 21601 samples from negative.csv
[2024-06-24 22:15:56] [INFO] Download...


Retrieving BondingDoc documents:   0%|          | 0/10000 [00:00<?, ?it/s]

Retrieving BondingDoc documents:   0%|          | 0/9999 [00:00<?, ?it/s]

Retrieving BondingDoc documents:   0%|          | 0/1601 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████| 21600/21600 [00:51<00:00, 421.60it/s]


[2024-06-24 22:17:42] [INFO] Appending to D:\data_big\P4_20240624\negative\id_prop_0.csv
[2024-06-24 22:17:43] [INFO] Read 45217 samples from candidate.csv
[2024-06-24 22:17:43] [INFO] Download...


Retrieving BondingDoc documents:   0%|          | 0/9998 [00:00<?, ?it/s]

Retrieving BondingDoc documents:   0%|          | 0/9998 [00:00<?, ?it/s]

Retrieving BondingDoc documents:   0%|          | 0/10000 [00:00<?, ?it/s]

Retrieving BondingDoc documents:   0%|          | 0/10000 [00:00<?, ?it/s]

Retrieving BondingDoc documents:   0%|          | 0/5216 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████| 45212/45212 [02:30<00:00, 300.21it/s]


[2024-06-24 22:22:59] [INFO] Appending to D:\data_big\P4_20240624\candidate\id_prop_-1.csv
[2024-06-24 22:23:00] [INFO] Read 14 samples from test_positive.csv
[2024-06-24 22:23:00] [INFO] Download...


Retrieving BondingDoc documents:   0%|          | 0/14 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 447.41it/s]


[2024-06-24 22:23:01] [INFO] Appending to D:\data_big\P4_20240624\test_positive\id_prop_1.csv
[2024-06-24 22:23:01] [INFO] Read 83 samples from test_negative.csv
[2024-06-24 22:23:01] [INFO] Download...


Retrieving BondingDoc documents:   0%|          | 0/83 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████████| 83/83 [00:00<00:00, 384.47it/s]

[2024-06-24 22:23:02] [INFO] Appending to D:\data_big\P4_20240624\test_negative\id_prop_0.csv





# Find which materials are missing when fetching from the website
#### Note: Below we find 1 in negative is missing, 5 in candidate is missing.
#### These 6 materials can be fetched using mpr.material.summary.search in data_process.ipynb, but they cannot be fetched by using mpr.materials.bonds.search. The possible reason is that Materials Project website lacks bonding information for these materials. Also, just use the website interface, we also cannot download the cif files. And, bonding information is important for us. 
### So, I decided to just let go of these 6 materials.

### mpr.material.summary.search  ------ primitive unit cell by default

### mpr.materials.bonds.search  ------ conventional unit cell by default

## (1) negative dataset

In [6]:
negative = pd.read_csv('negative.csv')
len(negative)

21601

In [8]:
negative_fetch = pd.read_csv(r'D:\data_big\P4_20240624\negative\id_prop_0.csv', header=None)
negative_fetch = negative_fetch.rename(columns={0: 'ID'})
len(negative_fetch)

21600

In [9]:
# Find IDs in df1 that are not in df2
df1 = negative
df2 = negative_fetch

not_in_df2 = df1[~df1['ID'].isin(df2['ID'])]

print(not_in_df2)

               ID
19449  mp-2276800


## (2) candidate dataset

In [10]:
candidate = pd.read_csv('candidate.csv')
len(candidate)

45217

In [11]:
candidate_fetch = pd.read_csv(r'D:\data_big\P4_20240624\candidate\id_prop_-1.csv', header=None)
candidate_fetch = candidate_fetch.rename(columns={0: 'ID'})
len(candidate_fetch)

45212

In [12]:
# Find IDs in df1 that are not in df2
df1 = candidate
df2 = candidate_fetch

not_in_df2 = df1[~df1['ID'].isin(df2['ID'])]

print(not_in_df2)

               ID
4929   mp-2277947
7790   mp-2278159
11003  mp-2276572
19718  mp-2277564
42180  mp-2287684


# Below is just for exploring more.

## An example to successfully fetch information for one material 
#### mpr.materials.bonds.search    ---- conventional unit cell

In [None]:
with MPRester('uKaYoq6fc2leUGXshzG9tB4Csa6q2WIT') as mpr:
    ids=[]
    docs=[]
    results = mpr.materials.bonds.search(material_ids='mp-1226395', fields=['material_id', 'structure_graph'])
    docs.extend(results)
    length = len(docs)
    struc = docs[0].structure_graph.structure
    id = docs[0].material_id
    ids.append(id)

    # download and save cif file
    struc.to(filename='save_files/{}.cif'.format(id)) 
   
    # Mark label and save as csv file
    label = 666
    labels = [label] * len(ids)                # create a list of labels
    df = pd.DataFrame(list(zip(ids, labels)))  # zipped list, then DataFrame for csv files
    path = 'save_files/id_prop_{}.csv'.format(label)
    kogger.info('Appending to {}'.format(path))
    df.to_csv(path, header=None, index=None, mode='a')  # save csv files

## A second method to get cif file.

#### mpr.get_structure_by_material_id   ----- primitive unit cell

In [None]:
from mp_api.client import MPRester
from pymatgen.core.structure import Structure

# Replace with your actual Materials Project API key
api_key = "uKaYoq6fc2leUGXshzG9tB4Csa6q2WIT"

# Initialize MPRester with your API key
with MPRester(api_key) as mpr:
    # Fetch the structure for the material with ID 'mp-2276800'
    structure = mpr.get_structure_by_material_id("mp-2276800")

    # Convert the structure to CIF format
    cif_string = structure.to(fmt="cif")

    # Save the CIF file to disk
    file_name = "mp-2276800.cif"
    with open(file_name, 'w') as file:
        file.write(cif_string)

    print(f"CIF file for mp-2276800 saved as {file_name}")


## The method to get conventional cif file  (use SpacegroupAnalyzer to convert)

In [None]:
from mp_api.client import MPRester
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer

# Replace with your actual Materials Project API key
api_key = "uKaYoq6fc2leUGXshzG9tB4Csa6q2WIT"

# Initialize MPRester with your API key
with MPRester(api_key) as mpr:
    # Fetch the structure for the material with ID 'mp-1226395'
    structure = mpr.get_structure_by_material_id("mp-1226395")

    # Convert the structure to the conventional unit cell
    sga = SpacegroupAnalyzer(structure)
    conventional_structure = sga.get_conventional_standard_structure()

    # Convert the conventional structure to CIF format
    cif_string = conventional_structure.to(fmt="cif")

    # Save the CIF file to disk
    file_name = "mp-1226395_conventional.cif"
    with open(file_name, 'w') as file:
        file.write(cif_string)

    print(f"Conventional CIF file for mp-1226395 saved as {file_name}")
