In [1]:
import json
from ase import Atoms, Atom
from ase.visualize import view
from ase.constraints import dict2constraint
from ase.calculators.singlepoint import SinglePointCalculator

from pymatgen.io.ase import AseAtomsAdaptor
from pymatgen.analysis.local_env import CrystalNN
from pymatgen.core.structure import IStructure
from pymatgen.core.periodic_table import Element
from mp_api.client import MPRester

from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.composition import (Miedema, 
                                                WenAlloys,
                                                BandCenter,
                                             )
import pandas as pd
import numpy as np

from tqdm import tqdm
import time

import warnings
warnings.filterwarnings("ignore")



  from tqdm.autonotebook import tqdm


In [2]:
# Load the raw adsorption records, then discard every row with a missing value.
df = pd.read_json('h_data.json')
df = df.dropna()
df.head(10)

Unnamed: 0,adsorbate,mpid,miller,shift,top,coordination,neighborcoord,energy,atoms,results,initial_configuration
0,H,mp-632250,"[1, 1, 0]",0.0,True,H,[H:H-H],-3.320858,"{'atoms': [{'symbol': 'H', 'position': [1.7595...","{'energy': -7.11874891, 'forces': [[-9e-08, -1...","{'atoms': {'atoms': [{'symbol': 'H', 'position..."
1,H,mp-865082,"[1, 0, 0]",0.125,False,Ca-Na,"[Ca:Au-Au-Au-Au-Na-Na-Na, Na:Au-Au-Au-Au-Ca-Ca...",0.017842,"{'atoms': [{'symbol': 'H', 'position': [3.8027...","{'energy': -22.29980084, 'forces': [[0.0128185...","{'atoms': {'atoms': [{'symbol': 'H', 'position..."
2,H,mp-7956,"[0, 0, 1]",0.166299,False,Na-Sb,"[Na:Na-Na-Na-Sb-Sb, Sb:Na-Na-Na-Na-Na-Na]",-0.172016,"{'atoms': [{'symbol': 'H', 'position': [-2.472...","{'energy': -20.0745408, 'forces': [[-0.0062643...","{'atoms': {'atoms': [{'symbol': 'H', 'position..."
3,H,mp-7956,"[0, 0, 1]",0.166299,False,Sb,[Sb:Na-Na-Na-Na-Na-Na-Na],-0.140213,"{'atoms': [{'symbol': 'H', 'position': [2.7181...","{'energy': -20.04273818, 'forces': [[-0.001605...","{'atoms': {'atoms': [{'symbol': 'H', 'position..."
4,H,mp-865729,"[1, 0, 0]",0.125,True,Ag,[Ag:Ca-Ca-Sn-Sn],0.870511,"{'atoms': [{'symbol': 'H', 'position': [1.4285...","{'energy': -22.73704972, 'forces': [[-0.000118...","{'atoms': {'atoms': [{'symbol': 'H', 'position..."
5,H,mp-11514,"[1, 1, 0]",0.25,True,Nb-Pt,"[Pt:Nb-Nb-Nb-Pt-Pt-Pt-Pt, Nb:Nb-Pt-Pt-Pt-Pt-Pt...",-0.185126,"{'atoms': [{'symbol': 'H', 'position': [2.8673...","{'energy': -89.4699673, 'forces': [[0.01010254...","{'atoms': {'atoms': [{'symbol': 'H', 'position..."
6,H,mp-11514,"[1, 1, 0]",0.25,True,Pt,[Pt:Nb-Nb-Nb-Pt-Pt-Pt-Pt-Pt],-0.236797,"{'atoms': [{'symbol': 'H', 'position': [4.5151...","{'energy': -89.52163914, 'forces': [[-0.008459...","{'atoms': {'atoms': [{'symbol': 'H', 'position..."
7,H,mp-11514,"[1, 1, 0]",0.25,True,Pt-Pt,"[Pt:Nb-Nb-Nb-Pt-Pt-Pt-Pt, Pt:Nb-Nb-Nb-Pt-Pt-Pt...",-0.304492,"{'atoms': [{'symbol': 'H', 'position': [3.8816...","{'energy': -89.58933363, 'forces': [[-0.009728...","{'atoms': {'atoms': [{'symbol': 'H', 'position..."
8,H,mp-11514,"[1, 1, 0]",0.25,True,Pt-Pt,"[Pt:Nb-Nb-Nb-Nb-Pt-Pt-Pt-Pt, Pt:Nb-Nb-Nb-Nb-Pt...",-0.319046,"{'atoms': [{'symbol': 'H', 'position': [0.3642...","{'energy': -89.60388742, 'forces': [[0.0022538...","{'atoms': {'atoms': [{'symbol': 'H', 'position..."
9,H,mp-11514,"[1, 1, 0]",0.25,True,Pt-Pt,"[Pt:Nb-Nb-Nb-Nb-Pt-Pt-Pt-Pt, Pt:Nb-Nb-Nb-Nb-Pt...",-0.318719,"{'atoms': [{'symbol': 'H', 'position': [0.3827...","{'energy': -89.60356036, 'forces': [[0.0033011...","{'atoms': {'atoms': [{'symbol': 'H', 'position..."


# Data Cleaning

##### Remove the DFT results of the relaxed structures; keep only the data of the original structures

In [3]:
# Drop the two columns holding the relaxed-structure DFT output.
df1 = df.drop(columns=['atoms', 'results'])

Filter out the materials that contain any element on the exclusion list, whose energy values are probably inaccurate,   
according to the GitHub page of the original work: https://github.com/ulissigroup/GASpy_manuscript/blob/master/figures/misc_info.ipynb

In [4]:
excluded_elements = ['Ca', 'Na', 'Nb', 'S', 'Se']
excluded_set = set(excluded_elements)

# Flag every row whose initial configuration contains an excluded element.
# The mask is positional, so it stays correct even when the index labels of
# `df1` are not 0..n-1 (for example after `dropna()` removed rows upstream).
drop_mask = np.array(
    [
        any(symbol in excluded_set for symbol in config['atoms']['chemical_symbols'])
        for config in df1['initial_configuration']
    ],
    dtype=bool,
)
if drop_mask.size:
    drop_mask[0] = True  # the first data point is just hydrogen, not an intermetallic material

# Positions of the rows that are removed, in descending order.
col_to_del = np.flatnonzero(drop_mask)[::-1].tolist()
print('# of materials to be delete', len(col_to_del))

# Select by mask (not by label) and renumber the surviving rows from 0.
df2 = df1.loc[~drop_mask].reset_index(drop=True)

# of materials to be delete 3450


Generate the formula of original material based on mp-id

In [5]:
# Never commit an API key to the notebook. With no argument, MPRester reads the key
# from the MP_API_KEY environment variable (or the PMG_MAPI_KEY pymatgen setting).
# NOTE: a key that was previously embedded here should be regenerated.
with MPRester() as mpr:
    mp_list = list(df2['mpid'].unique())
    prop_list = ['material_id', 'formula_pretty']
    docs = mpr.summary.search(material_ids=mp_list, fields=prop_list)

# Lookup table mp-id -> chemical formula; material ids are converted to plain str
# so that they can be compared with the 'mpid' column of the dataset.
mp_for = pd.DataFrame({
    'material_id': [str(doc.material_id) for doc in docs],
    'formula_pretty': [doc.formula_pretty for doc in docs],
})
print(len(mp_list), len(mp_for['material_id'].unique()))
# if the two printed numbers differ, some mp-ids were not recognized by the Materials Project

Retrieving SummaryDoc documents:   0%|          | 0/776 [00:00<?, ?it/s]

782 776


Remove rows with non-existing material-id

In [6]:
# Some mp-ids of the dataset were not returned by the Materials Project query;
# remove the corresponding rows from the dataset.
known_ids = set(mp_for['material_id'])
mp_to_del = [mpid for mpid in df2['mpid'].unique() if mpid not in known_ids]  # unrecognized mp-ids

df2['formula'] = ''  # placeholder column, filled from the mp-id lookup afterwards

# Vectorised membership test instead of a row-by-row Python loop.
unknown_rows = df2['mpid'].isin(mp_to_del).to_numpy()
col_to_del = np.flatnonzero(unknown_rows)[::-1].tolist()  # positions of the dropped rows, descending

# Select by mask (independent of the index labels) and renumber the rows from 0.
df2 = df2.loc[~unknown_rows].reset_index(drop=True)

tqdm() Progress Bar: 100%|██████████| 19225/19225 [00:01<00:00, 10395.55it/s]


In [40]:
# Build the mp-id -> formula lookup and attach the formula to every row.
dic = {
    material_id: formula
    for material_id, formula in zip(mp_for['material_id'], mp_for['formula_pretty'])
}
df2['formula'] = df2['mpid'].map(dic)

In [41]:
# Report the number of rows and the number of distinct mp-ids.
print('number of adsorption site:', df2.shape[0])
print('number of intermetallic structure:', len(pd.unique(df2['mpid'])))

number of adsorption site: 18880
number of intermetallic structure: 776


## Featurization

### Featurizing the data based on the Composition

In [13]:
# Work on a copy of df2 and parse the 'formula' strings with StrToComposition.
df3 = StrToComposition().featurize_dataframe(df2.copy(), "formula")

# Composition featurizers that are applied together.
comp_feat = [Miedema(), WenAlloys(), BandCenter()]
featurizer = MultipleFeaturizer(comp_feat)

# Featurize the 'composition' column; errors are ignored instead of raised.
df3 = featurizer.fit_featurize_dataframe(
    df=df3,
    col_id='composition',
    ignore_errors=True,
)

StrToComposition:   0%|          | 0/18880 [00:00<?, ?it/s]

MultipleFeaturizer:   0%|          | 0/18880 [00:00<?, ?it/s]

In [24]:
# Write the identifiers, the target and all df3 columns from position 11 onwards to disk.
fea = ['mpid', 'formula', 'energy']
fea.extend(df3.columns[11:])
df3[fea].to_csv('comp_fea.csv')

In [9]:
# Reload the composition features that were written to disk.
comp_fea = pd.read_csv(filepath_or_buffer='comp_fea.csv')

### Featurize the data based on structure information

Convert the structure to pymatgen structures based on the 'atoms' in initial_configuration

In [16]:
# Data reading function adapted from the GitHub page of the dataset:
# https://github.com/ulissigroup/GASpy_manuscript/blob/master/read_data.ipynb
def make_atoms_from_doc(doc):
    '''
    Rebuild an `ase.Atoms` object from its dictionary representation.

    Args:
        doc     Dictionary/json/Mongo document created by the
                `make_doc_from_atoms` function. The keys read here are
                `doc['atoms']` (with `atoms`, `cell`, `pbc`, `info` and
                `constraints`) and `doc['results']` (optional `energy`,
                `forces` and `stress`).
    Returns:
        atoms   ase.Atoms object with an ase.SinglePointCalculator attached
    '''
    atoms_doc = doc['atoms']
    atoms = Atoms([Atom(atom['symbol'],
                        atom['position'],
                        tag=atom['tag'],
                        momentum=atom['momentum'],
                        magmom=atom['magmom'],
                        charge=atom['charge'])
                   for atom in atoms_doc['atoms']],
                  cell=atoms_doc['cell'],
                  pbc=atoms_doc['pbc'],
                  info=atoms_doc['info'],
                  constraint=[dict2constraint(constraint_dict)
                              for constraint_dict in atoms_doc['constraints']])
    results = doc['results']
    # Missing result entries are passed on as None.
    calc = SinglePointCalculator(energy=results.get('energy', None),
                                 forces=results.get('forces', None),
                                 stress=results.get('stress', None),
                                 atoms=atoms)
    # `Atoms.set_calculator` is deprecated in ASE; assign the `calc` attribute instead.
    atoms.calc = calc
    return atoms

In [15]:
# Create a new column of pymatgen structures built from the initial configurations.
# The column is assigned in one step: element-wise chained assignment
# (`df2['pmg_struc'][i] = ...`) may write to a temporary copy and is not reliable.
df2['pmg_struc'] = [
    AseAtomsAdaptor.get_structure(make_atoms_from_doc(doc))
    for doc in tqdm(df2['initial_configuration'], desc='tqdm() Progress Bar')
]

tqdm() Progress Bar: 100%|██████████| 18880/18880 [00:35<00:00, 538.90it/s]


In [19]:
# Independent (deep) copy of df2 that receives the site features.
pdf = df2.copy(deep=True)

In [20]:
pdf = df2.copy()
num_of_nbr = 8

# Feature column names: N1_Z, N1_X, N1_CN, ..., N8_Z, N8_X, N8_CN
feature_list = [
    'N' + str(k + 1) + suffix
    for k in range(num_of_nbr)
    for suffix in ('_Z', '_X', '_CN')
]

# Collect all values in one float array (rows/slots without a neighbour stay 0) and
# attach it to the frame once, instead of chained per-cell assignment `pdf[col][i] = ...`.
site_features = np.zeros((len(pdf), len(feature_list)), dtype=float)
cnn = CrystalNN()  # one instance reused for every coordination-number query

### add neighbour features of site 0 of every structure
for row, struc in enumerate(tqdm(pdf['pmg_struc'], desc='tqdm() Progress Bar')):
    nbr = struc.get_all_neighbors(r=7, sites=[struc[0]])[0]
    if 0 < len(nbr) < num_of_nbr:
        print("Dummie Species Required")
    # NOTE(review): the slice keeps the first `num_of_nbr` neighbours in the order
    # returned by `get_all_neighbors`; confirm whether they should be sorted by
    # distance first.
    for j, idx in enumerate(site.index for site in nbr[:num_of_nbr]):
        elem = Element(struc[idx].species_string)
        # The distance is computed for every neighbour in both cases; previously the
        # "fewer neighbours than requested" branch reused a stale `d`.
        d = struc.get_distance(0, idx)
        site_features[row, 3 * j:3 * j + 3] = (
            elem.Z / d,
            elem.X / d,
            cnn.get_cn(structure=struc, n=idx) / d,
        )

pdf = pd.concat(
    [pdf, pd.DataFrame(site_features, columns=feature_list, index=pdf.index)],
    axis=1,
)
        

tqdm() Progress Bar: 100%|██████████| 18880/18880 [1:17:57<00:00,  4.04it/s]  


In [26]:
# Write the identifier, the target and all pdf columns from position 11 onwards to disk.
fea = ['mpid', 'energy', *pdf.columns[11:]]
pdf[fea].to_csv('site_feature.csv')

In [28]:
# Reload the site features that were written to disk and display them.
site_fea = pd.read_csv(filepath_or_buffer='site_feature.csv')
site_fea

Unnamed: 0.1,Unnamed: 0,mpid,energy,N1_Z,N1_X,N1_CN,N2_Z,N2_X,N2_CN,N3_Z,...,N5_CN,N6_Z,N6_X,N6_CN,N7_Z,N7_X,N7_CN,N8_Z,N8_X,N8_CN
0,0,mp-998949,-0.173716,10.930460,0.765132,0.496839,5.002512,0.350176,1.819095,10.930460,...,1.367727,10.930460,0.765132,0.496839,23.527288,0.687721,2.413055,13.335809,0.389816,1.367775
1,1,mp-998949,-0.439394,58.328820,1.704996,2.991222,22.175322,0.648202,2.274392,4.457550,...,2.991222,22.175322,0.648202,2.274392,58.328820,1.704996,2.991222,30.604906,0.894605,1.569482
2,2,mp-998949,-0.345462,39.334364,1.149774,2.521434,7.166661,0.501666,2.606059,18.371430,...,1.150355,39.334364,1.149774,2.521434,39.334364,1.149774,2.521434,39.337126,1.149854,2.521611
3,3,mp-998949,-0.431536,19.605172,0.573074,2.010787,11.479064,0.335542,1.177340,23.333286,...,1.598366,19.605172,0.573074,2.010787,3.728440,0.260991,1.355796,19.605172,0.573074,2.010787
4,4,mp-998949,-0.171099,9.373189,0.656123,3.834487,4.553383,0.318737,1.655776,31.625283,...,1.990126,12.071445,0.352858,1.238097,61.027017,1.783867,2.347193,9.373189,0.656123,3.834487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18875,18875,mp-640159,-0.117911,16.209077,0.635396,1.620908,4.133331,0.292927,1.976810,4.760510,...,2.587191,6.913939,0.403685,2.453333,8.151676,0.319546,0.815168,4.448758,0.315282,1.547394
18876,18876,mp-971754,-0.788132,10.806350,0.344635,0.292064,13.965494,0.445386,1.509783,4.529281,...,1.846858,24.196299,0.771666,2.615816,24.128971,0.769519,2.608537,4.528929,0.320963,1.575280
18877,18877,mp-1017556,0.397091,4.688870,0.319848,0.167460,10.484631,0.715202,0.748902,10.484631,...,0.432379,1.117665,0.485386,0.798332,2.140166,0.929444,1.834428,7.678531,0.503076,3.177323
18878,18878,mp-24289,0.329268,12.561563,0.600770,1.638465,0.193128,0.424881,0.193128,21.123649,...,1.276948,22.047303,1.054436,2.875735,0.331933,0.730253,1.991598,0.167887,0.369351,0.167887


# Finalizing the dataset

In [30]:
fdf = df2.copy()
# split the list of Miller indices into separate columns
fdf[['miller1', 'miller2', 'miller3']] = pd.DataFrame(fdf.miller.tolist(), index=fdf.index)
# convert "top" from boolean to 1.0 / 0.0
# (`np.float` was removed in NumPy 1.24, so the builtin float dtype is used instead)
fdf['top'] = fdf['top'].astype(float)

# Keep only the mpid and the numeric features
fdf = fdf.drop(columns=['adsorbate', 'miller', 'coordination', 'neighborcoord',
                        'initial_configuration', 'formula', 'pmg_struc'])
fdf

Unnamed: 0,mpid,shift,top,energy,miller1,miller2,miller3
0,mp-998949,0.25000,1.0,-0.173716,1,0,0
1,mp-998949,0.25000,1.0,-0.439394,1,0,0
2,mp-998949,0.25000,1.0,-0.345462,1,0,0
3,mp-998949,0.25000,1.0,-0.431536,1,0,0
4,mp-998949,0.25000,1.0,-0.171099,1,0,0
...,...,...,...,...,...,...,...
18875,mp-640159,0.06006,1.0,-0.117911,1,1,1
18876,mp-971754,0.00000,1.0,-0.788132,1,1,0
18877,mp-1017556,0.00000,1.0,0.397091,2,1,1
18878,mp-24289,0.00000,1.0,0.329268,1,0,0


In [66]:
# Pick the feature columns by name rather than by position: a CSV round trip can add
# one or more index columns ('Unnamed: 0', 'Unnamed: 0.1', ...), which shifts positional
# slices and would pull the repeated 'energy' column into the final table.
non_feature = {'mpid', 'formula', 'energy'}
fea = ['formula'] + [
    name for name in comp_fea.columns
    if name not in non_feature and not str(name).startswith('Unnamed')
]
site_cols = [
    name for name in site_fea.columns
    if name not in non_feature and not str(name).startswith('Unnamed')
]

# concat the features from the original data and the generated compositional & site features
final_df = pd.concat([fdf, comp_fea[fea], site_fea[site_cols]], axis=1)
final_df = final_df.drop(columns=['Weight Fraction', 'Atomic Fraction'])  # drop the non-numeric features
final_df.shape

(18880, 59)

Browse the feature columns

In [62]:
# First 25 columns of the final table.
final_df[final_df.columns[0:25].tolist()]

Unnamed: 0,mpid,shift,top,energy,miller1,miller2,miller3,formula,Miedema_deltaH_inter,Miedema_deltaH_amor,...,APE mean,Radii local mismatch,Radii gamma,Configuration entropy,Atomic weight mean,Total weight,Lambda entropy,Electronegativity delta,Electronegativity local mismatch,VEC mean
0,mp-998949,0.25000,1.0,-0.173716,1,0,0,TiPt,-1.148587,-0.932720,...,0.023883,0.750,1.012971,-0.005763,121.47550,242.9510,-50.559831,0.195767,0.4625,7.00
1,mp-998949,0.25000,1.0,-0.439394,1,0,0,TiPt,-1.148587,-0.932720,...,0.023883,0.750,1.012971,-0.005763,121.47550,242.9510,-50.559831,0.195767,0.4625,7.00
2,mp-998949,0.25000,1.0,-0.345462,1,0,0,TiPt,-1.148587,-0.932720,...,0.023883,0.750,1.012971,-0.005763,121.47550,242.9510,-50.559831,0.195767,0.4625,7.00
3,mp-998949,0.25000,1.0,-0.431536,1,0,0,TiPt,-1.148587,-0.932720,...,0.023883,0.750,1.012971,-0.005763,121.47550,242.9510,-50.559831,0.195767,0.4625,7.00
4,mp-998949,0.25000,1.0,-0.171099,1,0,0,TiPt,-1.148587,-0.932720,...,0.023883,0.750,1.012971,-0.005763,121.47550,242.9510,-50.559831,0.195767,0.4625,7.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18875,mp-640159,0.06006,1.0,-0.117911,1,1,1,V2GaSn2,-0.305312,-0.228571,...,-0.032543,5.040,1.092248,-0.008771,81.80520,409.0260,-1.680350,0.014354,0.0320,10.20
18876,mp-971754,0.00000,1.0,-0.788132,1,1,0,V2ReW,-0.165750,-0.042540,...,0.006344,0.625,1.013470,-0.008644,117.98250,471.9300,-105.027498,0.086655,0.2125,5.75
18877,mp-1017556,0.00000,1.0,0.397091,2,1,1,CuNi3N,0.188802,0.343087,...,-0.013794,8.640,1.370048,-0.007901,50.72658,253.6329,-0.224727,0.147536,0.3856,11.20
18878,mp-24289,0.00000,1.0,0.329268,1,0,0,HPd,0.546052,0.577257,...,,,,,,,,,,


In [63]:
# Columns 25 to 39 of the final table.
final_df[final_df.columns[25:40].tolist()]

Unnamed: 0,Mixing enthalpy,Mean cohesive energy,Interant electrons,Interant s electrons,Interant p electrons,Interant d electrons,Interant f electrons,Shear modulus mean,Shear modulus delta,Shear modulus local mismatch,Shear modulus strength model,band center,N1_Z,N1_X,N1_CN
0,74.00,5.3450,12.0,1.0,0.0,11.0,0.0,50.180,0.216022,5.42000,-0.021226,4.386974,10.930460,0.765132,0.496839
1,74.00,5.3450,12.0,1.0,0.0,11.0,0.0,50.180,0.216022,5.42000,-0.021226,4.386974,58.328820,1.704996,2.991222
2,74.00,5.3450,12.0,1.0,0.0,11.0,0.0,50.180,0.216022,5.42000,-0.021226,4.386974,39.334364,1.149774,2.521434
3,74.00,5.3450,12.0,1.0,0.0,11.0,0.0,50.180,0.216022,5.42000,-0.021226,4.386974,19.605172,0.573074,2.010787
4,74.00,5.3450,12.0,1.0,0.0,11.0,0.0,50.180,0.216022,5.42000,-0.021226,4.386974,9.373189,0.656123,3.834487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18875,2.88,3.9420,6.0,0.0,3.0,3.0,0.0,33.470,0.379659,6.73440,-0.062913,3.752636,16.209077,0.635396,1.620908
18876,8.00,6.8875,12.0,0.0,0.0,12.0,0.0,106.125,0.568224,31.40625,-0.120747,3.908208,10.806350,0.344635,0.292064
18877,44.64,4.3460,12.0,1.0,3.0,8.0,0.0,54.056,0.544017,14.40160,-0.190907,4.886539,4.688870,0.319848,0.167460
18878,,,,,,,,,,,,5.649307,12.561563,0.600770,1.638465


In [64]:
# Remaining columns (from position 40 onwards) of the final table.
final_df[final_df.columns[40:].tolist()]

Unnamed: 0,N2_Z,N2_X,N2_CN,N3_Z,N3_X,N3_CN,N4_Z,N4_X,N4_CN,N5_Z,...,N5_CN,N6_Z,N6_X,N6_CN,N7_Z,N7_X,N7_CN,N8_Z,N8_X,N8_CN
0,5.002512,0.350176,1.819095,10.930460,0.765132,0.496839,23.519619,0.687497,2.412269,13.335340,...,1.367727,10.930460,0.765132,0.496839,23.527288,0.687721,2.413055,13.335809,0.389816,1.367775
1,22.175322,0.648202,2.274392,4.457550,0.312028,1.620927,12.467258,0.364428,1.278693,58.328820,...,2.991222,22.175322,0.648202,2.274392,58.328820,1.704996,2.991222,30.604906,0.894605,1.569482
2,7.166661,0.501666,2.606059,18.371430,0.537011,1.884249,3.854615,0.269823,1.401678,11.215964,...,1.150355,39.334364,1.149774,2.521434,39.334364,1.149774,2.521434,39.337126,1.149854,2.521611
3,11.479064,0.335542,1.177340,23.333286,0.682050,1.196579,6.426969,0.449888,2.337080,15.584073,...,1.598366,19.605172,0.573074,2.010787,3.728440,0.260991,1.355796,19.605172,0.573074,2.010787
4,4.553383,0.318737,1.655776,31.625283,0.924431,1.621809,9.373189,0.656123,3.834487,19.403728,...,1.990126,12.071445,0.352858,1.238097,61.027017,1.783867,2.347193,9.373189,0.656123,3.834487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18875,4.133331,0.292927,1.976810,4.760510,0.337375,1.655830,7.890632,0.309313,0.157813,8.911435,...,2.587191,6.913939,0.403685,2.453333,8.151676,0.319546,0.815168,4.448758,0.315282,1.547394
18876,13.965494,0.445386,1.509783,4.529281,0.320988,1.575402,4.525440,0.320716,0.393516,17.314294,...,1.846858,24.196299,0.771666,2.615816,24.128971,0.769519,2.608537,4.528929,0.320963,1.575280
18877,10.484631,0.715202,0.748902,10.484631,0.715202,0.748902,6.053304,0.412922,0.432379,6.053305,...,0.432379,1.117665,0.485386,0.798332,2.140166,0.929444,1.834428,7.678531,0.503076,3.177323
18878,0.193128,0.424881,0.193128,21.123649,1.010261,2.755259,9.702625,0.464039,0.210927,9.789937,...,1.276948,22.047303,1.054436,2.875735,0.331933,0.730253,1.991598,0.167887,0.369351,0.167887


#### Final Cleaning

In [75]:
# Some of the materials are not intermetallic materials, which gives NaN as features.
# Perform the final cleaning: turn +/-inf into NaN, then drop incomplete and duplicated rows.
final_df.replace([np.inf, -np.inf], np.nan, inplace=True)
final_df = final_df.dropna(axis=0).drop_duplicates()
col = list(final_df.columns)

# Target column order: identifiers and target first, then the surface descriptors,
# the composition features and finally the N1..N8 site features.
col_new = [
    'mpid', 'formula', 'energy', 'shift', 'top', 'miller1', 'miller2', 'miller3',
    'Miedema_deltaH_inter', 'Miedema_deltaH_amor', 'Miedema_deltaH_ss_min',
    'Yang delta', 'Yang omega', 'APE mean',
    'Radii local mismatch', 'Radii gamma', 'Configuration entropy',
    'Atomic weight mean', 'Total weight', 'Lambda entropy',
    'Electronegativity delta', 'Electronegativity local mismatch', 'VEC mean',
    'Mixing enthalpy', 'Mean cohesive energy',
    'Interant electrons', 'Interant s electrons', 'Interant p electrons',
    'Interant d electrons', 'Interant f electrons',
    'Shear modulus mean', 'Shear modulus delta',
    'Shear modulus local mismatch', 'Shear modulus strength model',
    'band center',
] + [f'N{k}_{suffix}' for k in range(1, 9) for suffix in ('Z', 'X', 'CN')]

# Check that the reordered list names exactly the columns that are present.
print(set(col) == set(col_new))

final_df = (
    final_df.loc[:, col_new]
    .sort_values(by=['mpid'])
    .reset_index(drop=True)
)

final_df

True


Unnamed: 0,mpid,formula,energy,shift,top,miller1,miller2,miller3,Miedema_deltaH_inter,Miedema_deltaH_amor,...,N5_CN,N6_Z,N6_X,N6_CN,N7_Z,N7_X,N7_CN,N8_Z,N8_X,N8_CN
0,mp-10010,Al(CoSi)2,-0.124481,0.913035,1.0,2,2,1,-0.371987,-0.237271,...,3.119071,5.067373,0.352839,1.501444,2.157972,0.267256,0.995987,4.507944,0.611792,2.897964
1,mp-10010,Al(CoSi)2,-0.260432,0.913035,1.0,2,2,1,-0.371987,-0.237271,...,3.655748,3.246080,0.402015,2.996382,2.494917,0.338596,1.960292,5.081792,0.353843,1.505716
2,mp-10010,Al(CoSi)2,-0.423122,0.055556,1.0,2,1,0,-0.371987,-0.237271,...,0.832026,5.837791,0.406483,1.729716,3.290049,0.407460,0.506161,5.004016,0.679116,1.072289
3,mp-10010,Al(CoSi)2,-0.169797,0.000000,1.0,1,1,0,-0.371987,-0.237271,...,3.931352,11.231276,0.782030,3.327786,11.231276,0.782030,0.831946,3.289735,0.407421,1.518339
4,mp-10010,Al(CoSi)2,-0.016857,0.913035,1.0,2,2,1,-0.371987,-0.237271,...,2.069668,7.154088,0.498136,2.119730,5.424231,0.377687,0.803590,18.929266,1.318038,2.103252
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18077,mp-9998,MoAs,-0.858928,0.083333,1.0,1,0,2,-0.476905,-0.345852,...,3.224633,6.968280,0.358369,0.995469,6.996506,0.359820,1.166084,11.132160,0.572511,1.855360
18078,mp-9998,MoAs,0.142167,0.083333,1.0,1,0,2,-0.476905,-0.345852,...,1.371450,13.994674,0.924497,2.120405,8.658036,0.445270,1.236862,9.524598,0.489836,0.680328
18079,mp-9998,MoAs,-0.858649,0.083333,1.0,1,0,2,-0.476905,-0.345852,...,2.135051,7.186372,0.474736,1.088844,7.879999,0.520557,1.910303,14.307869,0.735833,2.043981
18080,mp-9998,MoAs,-0.132346,0.083333,1.0,1,0,2,-0.476905,-0.345852,...,1.378942,10.914110,0.561297,1.819018,6.602029,0.436134,1.400430,8.852576,0.455275,1.264654


In [72]:
# Write the cleaned dataset to disk.
final_df.to_csv(path_or_buf='cleaned_H_data.csv')