### Imports

In [1]:
import os
import pandas as pd

from mp_api.client import MPRester
from matminer.featurizers.structure.composite import JarvisCFID
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.composition import ElementProperty

from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file, then read the Materials Project API key
# from the environment (None when MAPI_KEY is not set).

load_dotenv()
mapi_key = os.getenv('MAPI_KEY')

### Paths Setup

In [12]:
# Set input and output file paths according to your directory structure.
# input_file_path: Excel workbook with the experimental records (read with pd.read_excel).
# output_file_path: directory that receives the finalized train/test CSV files.

input_file_path = "../data/raw/exp_dataset.xlsx"
output_file_path = "../data/processed"

### Load Input Data

In [None]:
# Fail fast when the input workbook is missing: every later cell needs `exp_data`,
# so continuing after a printed warning would only surface as a NameError downstream.
if not os.path.exists(input_file_path):
    raise FileNotFoundError(f"Input file not found: {input_file_path}")

exp_data = pd.read_excel(input_file_path)
print(f"Input data of shape {exp_data.shape}, loaded from: {input_file_path}")

### Prepare Data

#### Step 1: 
Rename the band gap column for readability. [Optional: if you don't want to change the column name, update the later occurrences of `band_gap` accordingly.]

In [None]:
# Map the raw spreadsheet header onto the short name used by the rest of the notebook.
column_renames = {'Eg (eV)': 'band_gap'}
exp_data.rename(columns=column_renames, inplace=True)
exp_data.head(3)

#### Step 2: 
Check for compositions that appear in more than one record, and replace each such group with a single row holding the mean of its band gap values.

In [None]:
# Count records per composition and keep only the compositions seen more than once.
composition_counts = exp_data['composition'].value_counts()
appears_more_than_once = composition_counts.gt(1)
compositions_with_multiple_records = composition_counts.loc[appears_more_than_once]

print(compositions_with_multiple_records)

In [None]:
# Collapse duplicate compositions into one row each, holding the mean band gap.
mean_gap_by_composition = exp_data.groupby('composition')['band_gap'].mean()
exp_data = mean_gap_by_composition.reset_index()
exp_data.shape

#### Step 3: 
Filter the materials, keeping only those with a band gap strictly between 0 and 6 eV.

In [None]:
# Keep rows whose band gap lies strictly inside (0, 6), then renumber the rows.
gap_in_range = exp_data['band_gap'].gt(0) & exp_data['band_gap'].lt(6)
exp_data = exp_data.loc[gap_in_range].reset_index(drop=True)
exp_data.shape

#### Step 4: 
Retrieve some additional data for the filtered materials from the Materials Project.


Note that our goal is to retrieve the structure and some other major properties of the filtered materials only for the stable materials, hence the `energy_above_hull` range is restricted to (0, 0).

In [None]:
# Query the Materials Project summary endpoint once per composition and sort each
# composition into one of three buckets:
#   dataframes_to_concat - one single-row frame per composition with a stable entry
#   unstable_materials   - compositions for which the query returned no entry on the hull
#   others               - compositions whose query raised an error
dataframes_to_concat = []
unstable_materials = []
others = []

summary_fields = [
    'nelements', 'formula_pretty', 'volume', 'density', 'density_atomic',
    'structure', 'uncorrected_energy_per_atom', 'energy_per_atom',
    'formation_energy_per_atom', 'efermi', 'is_gap_direct',
    'total_magnetization', 'total_magnetization_normalized_vol',
    'total_magnetization_normalized_formula_units'
]
total_materials = len(exp_data)

# The context manager closes the client's HTTP session once the loop is done.
with MPRester(api_key=mapi_key) as mpr:
    for position, (_, row) in enumerate(exp_data.iterrows(), start=1):
        composition = str(row['composition'])
        band_gap = row['band_gap']

        print(f"Processing material number : {position}/{total_materials}")

        try:
            docs = mpr.summary.search(
                formula=composition,
                energy_above_hull=(0, 0),
                all_fields=False,
                fields=summary_fields
            )
        except Exception as exc:
            # One bad formula or transient API failure must not abort the whole run;
            # record the composition so it shows up in the "corrupted" count.
            print(f"Could not process '{composition}': {exc}")
            others.append(composition)
            continue

        if len(docs) != 0:
            # Only the first matching entry is kept. The frame is built from the raw
            # document on purpose: the later transform step unpacks its cells.
            sdf = pd.DataFrame([docs[0]])
            sdf['composition'] = composition
            sdf['band_gap'] = band_gap
            dataframes_to_concat.append(sdf)
        else:
            unstable_materials.append(composition)

In [None]:
# Report how many compositions ended up in each bucket filled by the retrieval loop.
print(f"Total number of stable materials: {len(dataframes_to_concat)}")
print(f"Total number of unstable materials: {len(unstable_materials)}")
print(f"Total number of corrupted materials: {len(others)}")

In [None]:
# Create a single dataframe of stable materials by stacking the per-material
# single-row frames; ignore_index=True renumbers the rows from 0.

result_df = pd.concat(dataframes_to_concat, ignore_index=True)
result_df.shape

#### Step 5:
Transform Data 

In [None]:
def transform_dataframe(df):
    """Turn a frame of ``(name, value)`` tuples into named columns of plain values.

    Every column except the last two is expected to hold ``(name, value)`` tuples.
    The names found in the first row become the column labels, the last two
    columns are labelled ``composition`` and ``band_gap``, and each tuple cell is
    replaced by its second element.

    Args:
        df: DataFrame whose last two columns are the composition and the band gap
            and whose remaining cells are ``(name, value)`` tuples. It must have at
            least one row. The frame is modified in place, so pass a copy if the
            original must be preserved.

    Returns:
        The same DataFrame object, relabelled and with tuple cells unpacked.
    """
    # Column labels come from the first element of each tuple in the first row.
    column_names = [str(cell[0]) for cell in df.iloc[0].values[:-2]]
    column_names.extend(['composition', 'band_gap'])

    print(f"Number of Columns: {len(column_names)}")

    df.columns = column_names

    def extract_value(tuple_value):
        """Return the value part of a ``(name, value)`` tuple; pass anything else through."""
        if isinstance(tuple_value, tuple):
            return tuple_value[1]
        print(f"Warning: Value '{tuple_value}' is not a tuple. It's a {type(tuple_value)}.")
        return tuple_value

    columns_to_exclude = ('composition', 'band_gap')
    columns_to_apply = [col for col in df.columns if col not in columns_to_exclude]

    # Series.map is used column by column instead of DataFrame.applymap, which is
    # deprecated in recent pandas releases.
    for col in columns_to_apply:
        df[col] = df[col].map(extract_value)

    return df

In [None]:
# transform_dataframe relabels and rewrites the frame it receives, so hand it a
# copy and keep result_df untouched.
trans_df = transform_dataframe(result_df.copy())
trans_df.head(3)

#### Step 6:
Check for NaN values and drop extra columns

In [None]:
# errors='ignore' keeps this cell re-runnable: a second run (or a frame that never
# had the column) no longer raises KeyError.
trans_df.drop(columns='fields_not_requested', errors='ignore', inplace=True)

In [None]:
# Number of missing values in each column.
trans_df.isnull().sum()

#### Step 7: 
Create Descriptors

In [None]:
# Create the matminer JarvisCFID featurizer used to build Jarvis descriptors
# from the structures.

jf = JarvisCFID()

In [None]:
# Featurize the 'structure' column. With ignore_errors=True a structure that fails
# does not stop the run (presumably its features are left as NaN - the next cells
# count and drop such rows).
jarvis_df = jf.featurize_dataframe(trans_df, col_id="structure", ignore_errors=True)

In [None]:
# Count the rows that contain at least one NaN entry

nan_row_mask = jarvis_df.isna().any(axis=1)
rows_with_nan = jarvis_df.loc[nan_row_mask]
num_rows_with_nan = rows_with_nan.shape[0]
num_rows_with_nan

In [None]:
# Remove, in place, every row that has a NaN in any column

jarvis_df.dropna(axis=0, how='any', inplace=True)
jarvis_df.shape

In [None]:
# Build Magpie descriptors from the compositions, working on a copy of the
# Jarvis-featurized frame.

# Featurizers: formula string -> composition object, then composition -> Magpie features
str_to_comp = StrToComposition(target_col_id='composition_pmg')
featurizer = ElementProperty.from_preset('magpie')

comp_df = str_to_comp.featurize_dataframe(jarvis_df.copy(), col_id='composition')
comp_df = featurizer.featurize_dataframe(comp_df, col_id='composition_pmg', ignore_errors=True)
comp_df.shape

In [None]:
# Check for NaN values, and drop the affected rows if there are any.

rows_with_nan = comp_df[comp_df.isna().any(axis=1)]
num_rows_with_nan = len(rows_with_nan)

if num_rows_with_nan:
    comp_df = comp_df.dropna()

# Persist the fully featurized frame: the train/test split step reloads it from this
# CSV, so without this write a fresh "Restart & Run All" depends on a file that
# nothing in the notebook produces.
os.makedirs(output_file_path, exist_ok=True)
comp_df.to_csv(
    os.path.join(output_file_path, "exp_materials_with_jarvis_and_magpie.csv"),
    index=False,
)

num_rows_with_nan

#### Step 8: 

Separate train and test materials

In [41]:
# Reload the featurized materials table from disk.
featurized_csv_path = '../data/processed/exp_materials_with_jarvis_and_magpie.csv'
comp_df = pd.read_csv(featurized_csv_path)
comp_df.shape

(1023, 1706)

In [42]:
# Read the held-out test compositions, one formula per line.
amp_test_path = '../data/amp_test_materials.txt'
with open(amp_test_path, mode='r') as test_file:
    raw_text = test_file.read()
amp_test_compositions = raw_text.splitlines()

len(amp_test_compositions)

124

In [43]:
# Rows whose composition is listed in the held-out test file form the test set.
is_test_material = comp_df['composition'].isin(amp_test_compositions)
test_df = comp_df.loc[is_test_material]
test_df.shape

(93, 1706)

In [44]:
# Every row that is not part of the test set becomes training data.
indices_to_remove = test_df.index

train_df = comp_df.drop(index=indices_to_remove)
train_df.shape

(930, 1706)

#### Step 9:

Finalize Data

* Remove columns that contain only zeros. These may be different in your case, so please check for zero-valued columns first and then remove them from the data.
* Feature selection. We have already created a list for you after performing an EDA on the features, and have chosen the top 50 features.


In [45]:
# Feature columns found to contain only zeros in this dataset.
# The two "minimum Nf*" entries previously used the misspelled prefix "MagpieDtaa",
# which can never match a real Magpie column name.
zero_val_columns = [
    "jml_C-3",
    "jml_C-5",
    "jml_C-9",
    "jml_C-11",
    "jml_C-15",
    "jml_C-17",
    "jml_C-18",
    "jml_C-19",
    "jml_C-20",
    "jml_C-22",
    "jml_C-28",
    "jml_C-29",
    "jml_C-30",
    "jml_C-31",
    "jml_C-32",
    "jml_C-34",
    "jml_nn_1",
    "jml_nn_2",
    "jml_nn_8",
    "jml_nn_9",
    "jml_rdf_1",
    "jml_rdf_2",
    "jml_rdf_7",
    "jml_rdf_8",
    "MagpieData minimum GSmagmom",
    "MagpieData minimum NdUnfilled",
    "MagpieData minimum NfUnfilled",
    "MagpieData minimum NfValence",
    "MagpieData mode NfUnfilled",
]

In [46]:
# The 50 selected feature columns, followed by the regression target.
top_50_features = [
    "jml_mop_eg",
    "jml_op_eg",
    "efermi",
    "density",
    "MagpieData mode Electronegativity",
    "formation_energy_per_atom",
    "MagpieData mean CovalentRadius",
    "MagpieData mean MeltingT",
    "MagpieData range Electronegativity",
    "MagpieData mean Electronegativity",
    "MagpieData maximum Electronegativity",
    "MagpieData minimum MeltingT",
    "jml_first_ion_en_divi_hfus",
    "MagpieData mode MeltingT",
    "MagpieData avg_dev Electronegativity",
    "jml_atom_mass_divi_mol_vol",
    "MagpieData mean Row",
    "MagpieData avg_dev NpValence",
    "jml_oq_enp",
    "jml_hfus_mult_atom_mass",
    "jml_hfus_divi_polzbl",
    "jml_hfus_divi_therm_cond",
    "jml_mol_vol_divi_mp",
    "jml_first_ion_en_divi_therm_cond",
    "jml_mp_mult_atom_mass",
    "jml_first_ion_en_divi_bp",
    "jml_atom_rad_mult_X",
    "jml_first_ion_en_mult_atom_rad",
    "MagpieData avg_dev SpaceGroupNumber",
    "jml_first_ion_en_divi_mp",
    "jml_max_oxid_s",
    "jml_mp_divi_polzbl",
    "jml_mol_vol_divi_atom_mass",
    "jml_me3",
    "MagpieData mode GSvolume_pa",
    "jml_e3",
    "jml_atom_rad_add_X",
    "MagpieData mean NdUnfilled",
    "MagpieData avg_dev NpUnfilled",
    "density_atomic",
    "jml_mp_divi_therm_cond",
    "jml_nn_72",
    "jml_min_oxid_s",
    "jml_C-2",
    "jml_adf1_120",
    "jml_bp_divi_therm_cond",
    "jml_nn_35",
    "jml_elec_aff_mult_therm_cond",
    "MagpieData mean Number",
    "jml_rdf_30",
    "band_gap",  # target column
]

In [47]:
def finalize_data(comp_df, drop_columns=None, feature_columns=None):
    """Reduce a featurized frame to the numeric columns used for modelling.

    Steps, in order:
      1. optionally drop the columns named in ``drop_columns`` (e.g. all-zero columns);
      2. optionally keep only the columns named in ``feature_columns``;
      3. keep only ``int64`` / ``float64`` columns (strings, objects and booleans go);
      4. renumber the rows from 0.

    With both optional arguments left at ``None`` only steps 3 and 4 run.

    Args:
        comp_df: Featurized DataFrame. It is not modified.
        drop_columns: Optional iterable of column names to remove first. Names
            that are not present in the frame are ignored.
        feature_columns: Optional iterable of column names to keep (include the
            target column if it is needed). Every name must exist in the frame.

    Returns:
        A new DataFrame containing only the selected numeric columns, with a
        fresh 0-based index.
    """
    # Row filters that were sketched here but are not applied:
    #   band_gap <= 5
    #   total_magnetization_normalized_formula_units < 25, volume < 6000, density < 12
    finalized = comp_df

    # Remove known uninformative (e.g. zero-valued) columns
    if drop_columns is not None:
        finalized = finalized.drop(columns=list(drop_columns), errors='ignore')

    # Feature selection
    if feature_columns is not None:
        finalized = finalized[list(feature_columns)]

    # Remove non-numerical columns
    finalized = finalized.select_dtypes(include=['int64', 'float64'])

    # Reset index
    return finalized.reset_index(drop=True)

In [48]:
# Apply the same finalization to both splits.
train_data, test_data = (finalize_data(split) for split in (train_df, test_df))

(train_data.shape, test_data.shape)

((930, 1701), (93, 1701))

#### Step 10:
Save the finalized data

In [49]:
# Create the output directory if needed; to_csv does not create missing folders.
os.makedirs(output_file_path, exist_ok=True)

train_data.to_csv(os.path.join(output_file_path, "train_dataset.csv"), index=False)
test_data.to_csv(os.path.join(output_file_path, "test_dataset.csv"), index=False)