### Imports

In [31]:
import os
import pandas as pd
# pd.options.display.max_rows = 2000

from mp_api.client import MPRester
from matminer.featurizers.structure.composite import JarvisCFID
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.composition import ElementProperty

from dotenv import load_dotenv

In [32]:
# Initialize Materials Project API key

load_dotenv() 
mapi_key = os.environ.get('MAPI_KEY')

### Paths Setup

In [33]:
# Set input and output file paths according to your directory structure

input_file_path = "../data/raw/exp_dataset.xlsx"
output_file_path = "../data/processed"

### Load Input Data

In [34]:
if os.path.exists(input_file_path):
    exp_data = pd.read_excel(input_file_path)
    print(f"Input data of shape {exp_data.shape}, loaded from: {input_file_path}")
else:
    print("Invalid Path, directory doesn't exists.")

Input data of shape (6354, 2), loaded from: ../data/raw/exp_dataset.xlsx


### Prepare Data

#### Step 1: 
Renaming band gap column for readability. [Optional, If you dont want to change the column name update the next occurences accordingly.]

In [35]:
exp_data.rename(columns={'Eg (eV)': 'band_gap'}, inplace=True)
exp_data.head(3)

Unnamed: 0,composition,band_gap
0,Hg0.7Cd0.3Te,0.35
1,CuBr,3.077
2,LuP,1.3


#### Step 2: 
Check the materials with multi-valued compositions, and calculate mean of the compositions with more than one value.

In [36]:
composition_counts = exp_data['composition'].value_counts()
compositions_with_multiple_records = composition_counts[composition_counts > 1]

print(compositions_with_multiple_records)

CdS                         27
GaSe                        26
CdSe                        25
GaAs                        25
As2S3                       24
                            ..
SrTe                         2
Tl2TeI6                      2
PbSnS3                       2
AgInTe2                      2
Pb0.97Se0.97Sn0.03Se0.03     2
Name: composition, Length: 451, dtype: int64


In [37]:
exp_data = exp_data.groupby('composition')['band_gap'].mean().reset_index()
exp_data.shape

(4941, 2)

#### Step 3: 
Filtering materials, only taking materials with the bandgap between 0 and 6.

In [38]:
exp_data = exp_data[(exp_data['band_gap'] > 0) & (exp_data['band_gap'] < 6)]
exp_data.reset_index(drop=True, inplace=True)
exp_data.shape

(2422, 2)

#### Step 4: 
Retrieve some additional data for the filtered materials from the Materials Project.


Note that our goal is to the retrieve the structure and some other major properties of filtered materials only for the stable materails, hence specifying the threshold for the energy_above_hull (0).

In [39]:
def clean_column_names(df):
    for col in df.columns:
        # Check if the column name is not a string or a proper name
        if not isinstance(col, str) or not col.isidentifier():
            # Extract the first element of the first row as a potential column name
            new_col_name = df[col].iloc[0][0] if isinstance(df[col].iloc[0], tuple) else str(df[col].iloc[0])
            
            # Assign the new column name
            df = df.rename(columns={col: new_col_name})
    
    return df

In [40]:
exp_data = exp_data.sample(10)

In [None]:
dataframes_to_concat = []
unstable_materials = []
others = []

mpr = MPRester(api_key=mapi_key)

for index, row in exp_data.iterrows():
    composition = str(row['composition'])
    band_gap = row['band_gap']
    is_processed = False

    print(f"Prcoessing material number : {index}/{len(exp_data)}")

    docs = mpr.summary.search(
        formula=composition,
        energy_above_hull=(0,0),
        all_fields=False,
        fields=[
            'nelements', 'formula_pretty', 'volume', 'density', 'density_atomic',
            'structure', 'uncorrected_energy_per_atom', 'energy_per_atom',
            'formation_energy_per_atom', 'efermi', 'is_gap_direct',
            'total_magnetization', 'total_magnetization_normalized_vol',
            'total_magnetization_normalized_formula_units'
        ]
    )

    if len(docs) != 0:
        sdf = pd.DataFrame([docs[0]])
        sdf = clean_column_names(sdf)
        sdf['composition'] = composition
        sdf['band_gap'] = band_gap
        dataframes_to_concat.append(sdf)
        is_processed = True
    else:
        unstable_materials.append(composition)
        is_processed = True

    if not is_processed:
        others.append(composition)

In [42]:
print(f"Total number of stable materials: {len(dataframes_to_concat)}")
print(f"Total number of unstable materials: {len(unstable_materials)}")
print(f"Total number of corrupted materials: {len(others)}")

Total number of stable materials: 4
Total number of unstable materials: 6
Total number of corrupted materials: 0


In [43]:
# Creating a dataframe of stable materials

result_df = pd.concat(dataframes_to_concat, ignore_index=True)
result_df.shape

(4, 70)

In [44]:
result_df

Unnamed: 0,builder_meta,nsites,elements,nelements,composition,composition_reduced,formula_pretty,formula_anonymous,chemsys,volume,...,weighted_surface_energy,weighted_work_function,surface_anisotropy,shape_factor,has_reconstructed,possible_species,has_props,theoretical,database_IDs,fields_not_requested
0,"(builder_meta, None)","(nsites, None)","(elements, None)","(nelements, 3)",ZnSnP2,"(composition_reduced, None)","(formula_pretty, ZnSnP2)","(formula_anonymous, None)","(chemsys, None)","(volume, 180.59078792277623)",...,"(weighted_surface_energy, None)","(weighted_work_function, None)","(surface_anisotropy, None)","(shape_factor, None)","(has_reconstructed, None)","(possible_species, None)","(has_props, None)","(theoretical, None)","(database_IDs, None)","(fields_not_requested, [builder_meta, nsites, ..."
1,"(builder_meta, None)","(nsites, None)","(elements, None)","(nelements, 4)",Ba3In2P4O16,"(composition_reduced, None)","(formula_pretty, Ba3In2(PO4)4)","(formula_anonymous, None)","(chemsys, None)","(volume, 748.7657292020361)",...,"(weighted_surface_energy, None)","(weighted_work_function, None)","(surface_anisotropy, None)","(shape_factor, None)","(has_reconstructed, None)","(possible_species, None)","(has_props, None)","(theoretical, None)","(database_IDs, None)","(fields_not_requested, [builder_meta, nsites, ..."
2,"(builder_meta, None)","(nsites, None)","(elements, None)","(nelements, 3)",Cu3NbSe4,"(composition_reduced, None)","(formula_pretty, NbCu3Se4)","(formula_anonymous, None)","(chemsys, None)","(volume, 181.82844767762958)",...,"(weighted_surface_energy, None)","(weighted_work_function, None)","(surface_anisotropy, None)","(shape_factor, None)","(has_reconstructed, None)","(possible_species, None)","(has_props, None)","(theoretical, None)","(database_IDs, None)","(fields_not_requested, [builder_meta, nsites, ..."
3,"(builder_meta, None)","(nsites, None)","(elements, None)","(nelements, 2)",SnBr2,"(composition_reduced, None)","(formula_pretty, SnBr2)","(formula_anonymous, None)","(chemsys, None)","(volume, 106.50399856128574)",...,"(weighted_surface_energy, None)","(weighted_work_function, None)","(surface_anisotropy, None)","(shape_factor, None)","(has_reconstructed, None)","(possible_species, None)","(has_props, None)","(theoretical, None)","(database_IDs, None)","(fields_not_requested, [builder_meta, nsites, ..."


#### Step 5:

Transform Data

In [45]:
for column_name in result_df.columns:
    # Update values only if they are tuples
    result_df[column_name] = result_df[column_name].apply(lambda x: x[1] if isinstance(x, tuple) else x)
    
result_df

Unnamed: 0,builder_meta,nsites,elements,nelements,composition,composition_reduced,formula_pretty,formula_anonymous,chemsys,volume,...,weighted_surface_energy,weighted_work_function,surface_anisotropy,shape_factor,has_reconstructed,possible_species,has_props,theoretical,database_IDs,fields_not_requested
0,,,,3,ZnSnP2,,ZnSnP2,,,180.590788,...,,,,,,,,,,"[builder_meta, nsites, elements, composition, ..."
1,,,,4,Ba3In2P4O16,,Ba3In2(PO4)4,,,748.765729,...,,,,,,,,,,"[builder_meta, nsites, elements, composition, ..."
2,,,,3,Cu3NbSe4,,NbCu3Se4,,,181.828448,...,,,,,,,,,,"[builder_meta, nsites, elements, composition, ..."
3,,,,2,SnBr2,,SnBr2,,,106.503999,...,,,,,,,,,,"[builder_meta, nsites, elements, composition, ..."


#### Step 6:
Check for Nan values and drop extra columns

In [46]:
result_df.drop('fields_not_requested', axis=1 , inplace=True)

In [47]:
result_df.isna().sum()

builder_meta         4
nsites               4
elements             4
nelements            0
composition          0
                    ..
has_reconstructed    4
possible_species     4
has_props            4
theoretical          4
database_IDs         4
Length: 69, dtype: int64

#### Step 7: 
Create Descriptors

In [48]:
# Creating Jarvis descriptors from structures

jf = JarvisCFID()

In [52]:
jarvis_df = jf.featurize_dataframe(result_df, col_id="structure", ignore_errors=True)

JarvisCFID:   0%|          | 0/4 [00:00<?, ?it/s]

In [53]:
# Check rows with the Nan enteries

rows_with_nan = jarvis_df[jarvis_df.isna().any(axis=1)]
num_rows_with_nan = len(rows_with_nan)
num_rows_with_nan

4

In [54]:
# Dropping Nan rows

jarvis_df.dropna(inplace=True)
jarvis_df.shape

(4, 1626)

In [55]:
# Creating Magpie descriptors from compositions

comp_df = jarvis_df.copy()
str_to_comp = StrToComposition(target_col_id='composition_pmg')
comp_df = str_to_comp.featurize_dataframe(comp_df, col_id='composition')

featurizer = ElementProperty.from_preset('magpie')
comp_df = featurizer.featurize_dataframe(comp_df, col_id='composition_pmg', ignore_errors=True)
comp_df.shape

StrToComposition:   0%|          | 0/4 [00:00<?, ?it/s]

ElementProperty:   0%|          | 0/4 [00:00<?, ?it/s]

(4, 1759)

In [56]:
# Check for the Nan values, and drop if any.

rows_with_nan = comp_df[comp_df.isna().any(axis=1)]
num_rows_with_nan = len(rows_with_nan)
num_rows_with_nan

4

In [57]:
comp_df.shape

(4, 1759)

In [None]:
# Save processed data

comp_df.to_csv('../data/processed/exp_materials_with_jarvis_and_magpie.csv', index=False)

### Create Train and Test sets

In [None]:
with open('../data/amp_test_materials.txt', 'r') as file:
    amp_test_compositions = file.read().splitlines()
    
len(amp_test_compositions)

In [None]:
with open('../data/exp_test_materials.txt', 'r') as file:
    exp_test_compositions = file.read().splitlines()
    
len(exp_test_compositions)

In [None]:
benchmark_materials = amp_test_compositions + exp_test_compositions
len(benchmark_materials)

In [None]:
test_df = comp_df[comp_df['composition'].isin(benchmark_materials)]
test_df.shape

In [None]:
indices_to_remove = test_df.index
indices_to_remove

train_df = comp_df.drop(indices_to_remove)
train_df.shape

In [None]:
test_distribution = train_df.sample(frac=0.1, random_state=42)
train_distribution = train_df.drop(test_samples.index)

train_distribution.shape, test_distribution.shape

In [None]:
test_set = pd.concat([test_distribution, test_df], ignore_index=True)
train_set = train_distribution

train_set.reset_index(drop=True, inplace=True)
test_set.reset_index(drop=True, inplace=True)

train_set.shape, test_set.shape

In [None]:
train_set.to_csv('../data/processed/train_set.csv', index=False)
test_set.to_csv('../data/processed/test_set.csv', index=False)