## Data Processing
This notebook processes the OQMD data (delta_e, volume_pa and bandgap), which contains 620K entries. 
After removing outliers [delta_e outside (-20, 5) or more than 5 standard deviations from the mean], keeping one entry per unique composition and dropping rows with missing values, we end up with 341K unique entries (307K of which form the training split). 
We compute the physical attributes and elemental fractions; the whole processed dataset is saved at oqmd_all.csv. 
The train and test sets are available in their respective CSV files, containing all features, only the physical attributes, or only the elemental fractions.

In [2]:
import re, numpy as np, os, sys, pandas
from pymatgen import Composition
import data_utils
import magpie
from pymatgen import Composition
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers import composition as cf
from matminer.utils.conversions import str_to_composition
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [5]:
# Location of the raw OQMD dump (whitespace-delimited text).
oqmd_data_path = '../training-data/oqmd_all-22Mar18.csv'

# Columns are split on runs of whitespace, the literal string 'None' is read
# as NaN, and malformed rows are skipped instead of aborting the load.
oqmd_data = pandas.read_csv(
    oqmd_data_path,
    sep=r"\s+",
    engine='python',
    na_values='None',
    error_bad_lines=False,
)
oqmd_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 620196 entries, 0 to 620195
Data columns (total 7 columns):
comp         620196 non-null object
energy_pa    620189 non-null float64
volume_pa    620189 non-null float64
magmom_pa    455663 non-null float64
bandgap      619819 non-null float64
delta_e      620196 non-null float64
stability    581531 non-null float64
dtypes: float64(6), object(1)
memory usage: 33.1+ MB


In [6]:
# Preview the first rows of the raw table.
oqmd_data.head()

Unnamed: 0,comp,energy_pa,volume_pa,magmom_pa,bandgap,delta_e,stability
0,Cs1Ho1S4Si1,-5.353489,27.1652,6.9e-05,3.024,-1.60894,-0.064029
1,Lu1,-4.511592,28.7838,0.046445,0.0,0.01259,0.01259
2,Tm1,-4.468631,29.537,0.0201,0.0,0.006394,0.006394
3,Ne1,-0.029181,21.7199,,11.91,0.000137,0.000137
4,La1,-4.804203,37.7862,0.582882,0.0,0.131232,0.131232


In [7]:
# Summary statistics of the numeric columns of the raw table.
oqmd_data.describe()

Unnamed: 0,energy_pa,volume_pa,magmom_pa,bandgap,delta_e,stability
count,620189.0,620189.0,455663.0,619819.0,620196.0,581531.0
mean,-5.434664,20.86548,0.406732,0.130374,0.007116,0.58325
std,2.615643,7.870184,0.556038,0.644729,1.996298,2.905439
min,-203.629754,2.72938,-4.248135,0.0,-198.69561,-198.683327
25%,-6.731937,15.3211,0.000346,0.0,-0.254396,0.207708
50%,-5.382519,19.0389,0.175463,0.0,0.110805,0.421494
75%,-4.076177,24.8237,0.717187,0.0,0.467793,0.713214
max,1122.552855,203.685,5.923581,18.437,1126.321181,1126.858389


In [8]:
# Give the formula column the more descriptive name 'composition'.
oqmd_data = oqmd_data.rename(columns={'comp': 'composition'})
oqmd_data.head(2)

Unnamed: 0,composition,energy_pa,volume_pa,magmom_pa,bandgap,delta_e,stability
0,Cs1Ho1S4Si1,-5.353489,27.1652,6.9e-05,3.024,-1.60894,-0.064029
1,Lu1,-4.511592,28.7838,0.046445,0.0,0.01259,0.01259


In [9]:
# Keep only rows whose formation energy lies strictly inside (-20, 5).
oqmd_data.query('delta_e > -20 and delta_e < 5', inplace=True)
# Defensive second pass: drop any row that still has no delta_e value.
has_delta_e = oqmd_data['delta_e'].notnull()
oqmd_data = oqmd_data[has_delta_e]
oqmd_data.shape

(619991, 7)

In [10]:
%%time
oqmd_data['comp_obj'] = oqmd_data['composition'].apply(lambda x: Composition(x))

CPU times: user 15.8 s, sys: 102 ms, total: 15.9 s
Wall time: 15.9 s


In [12]:
%%time
oqmd_data['pretty_comp'] = oqmd_data['comp_obj'].apply(lambda x: x.reduced_formula)

CPU times: user 35.4 s, sys: 119 ms, total: 35.5 s
Wall time: 35.7 s


In [13]:
%%time
oqmd_data.sort_values('delta_e', ascending=True, inplace=True)
oqmd_data.drop_duplicates('pretty_comp', keep='first', inplace=True)
print('Reduced dataset to %d entries'%len(oqmd_data))

Reduced dataset to 341708 entries
CPU times: user 944 ms, sys: 36.1 ms, total: 980 ms
Wall time: 982 ms


In [14]:
# Length of each Composition object, stored as the per-entry element count.
oqmd_data['nelems'] = oqmd_data['comp_obj'].apply(len)

In [15]:
# Remove entries whose delta_e lies more than 5 standard deviations from the mean.
delta_e_mean = oqmd_data.delta_e.mean()
delta_e_std = oqmd_data.delta_e.std()
within_5_sigma = np.abs(oqmd_data.delta_e - delta_e_mean) <= (5 * delta_e_std)
oqmd_data = oqmd_data[within_5_sigma]
oqmd_data.shape

(341688, 10)

In [12]:
# Non-null count of every column, grouped by the number of elements per entry.
oqmd_data.groupby('nelems').count()

Unnamed: 0_level_0,composition,energy_pa,volume_pa,magmom_pa,bandgap,delta_e,stability,comp_obj,pretty_comp
nelems,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,88,88,88,41,88,88,88,88,88
2,16250,16250,16250,9669,16219,16250,16240,16250,16250
3,263175,263174,263174,166354,263069,263175,249351,263175,263175
4,60519,60518,60518,54599,60507,60519,55179,60519,60519
5,1429,1429,1429,744,1428,1429,1428,1429,1429
6,202,202,202,79,202,202,202,202,202
7,25,25,25,10,25,25,25,25,25


In [13]:
# Discard entries with a single element, then re-check the per-group counts.
oqmd_data.query('nelems > 1', inplace=True)
counts_by_nelems = oqmd_data.groupby('nelems').count()
print (counts_by_nelems)
oqmd_data.shape

        composition  energy_pa  volume_pa  magmom_pa  bandgap  delta_e  \
nelems                                                                   
2             16250      16250      16250       9669    16219    16250   
3            263175     263174     263174     166354   263069   263175   
4             60519      60518      60518      54599    60507    60519   
5              1429       1429       1429        744     1428     1429   
6               202        202        202         79      202      202   
7                25         25         25         10       25       25   

        stability  comp_obj  pretty_comp  
nelems                                    
2           16240     16250        16250  
3          249351    263175       263175  
4           55179     60519        60519  
5            1428      1429         1429  
6             202       202          202  
7              25        25           25  


(341600, 10)

In [14]:
%%time
oqmd_data['comp_dict'] = oqmd_data['pretty_comp'].apply(lambda x: data_utils.parse_formula(x))

CPU times: user 4.13 s, sys: 16 ms, total: 4.14 s
Wall time: 4.14 s


In [15]:
%%time
elements_present = set()
for i, row in oqmd_data.iterrows():
    elements_present |= set(row['comp_dict'].keys())
print (elements_present)

{'Ta', 'Be', 'Rh', 'V', 'C', 'Br', 'I', 'Fe', 'Tm', 'Th', 'H', 'Mo', 'Rb', 'Gd', 'Y', 'Ce', 'Xe', 'Se', 'Zr', 'Ho', 'Eu', 'Dy', 'Pa', 'Kr', 'Os', 'K', 'W', 'Sb', 'Hg', 'Na', 'In', 'Ag', 'Cl', 'Re', 'Pm', 'Nd', 'Yb', 'Cu', 'O', 'Hf', 'Ba', 'Ru', 'Sm', 'U', 'Er', 'B', 'Pd', 'Ti', 'Li', 'Cr', 'Ga', 'P', 'Pu', 'Ca', 'Tb', 'Ir', 'Np', 'Si', 'Cd', 'As', 'N', 'Lu', 'La', 'Mg', 'Te', 'Pr', 'Cs', 'Sc', 'Tl', 'F', 'Au', 'Ac', 'Co', 'Pb', 'Al', 'Ge', 'S', 'Bi', 'Mn', 'Sn', 'Tc', 'Zn', 'Sr', 'Nb', 'Ni', 'Pt'}
CPU times: user 15.8 s, sys: 45.8 ms, total: 15.8 s
Wall time: 15.8 s


In [16]:
# Element symbols listed from H through Cn (112 entries).
elements = [
    'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K',
    'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb',
    'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs',
    'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta',
    'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn', 'Fr', 'Ra', 'Ac', 'Th', 'Pa',
    'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt',
    'Ds', 'Rg', 'Cn',
]

# The 86-symbol subset that names the per-element fraction columns.
elements_tl = [
    'H', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K',
    'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se',
    'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In',
    'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd',
    'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au',
    'Hg', 'Tl', 'Pb', 'Bi', 'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu',
]

# Map each symbol in `elements` to its zero-based position in that list.
elem_pos = {symbol: position for position, symbol in enumerate(elements)}

In [17]:
# Cross-check the symbols found in the data against elements_tl; the two
# lists printed last show the mismatches in each direction.
print (elements_present, len(elements_present))
missing_from_tl = [sym for sym in elements_present if sym not in elements_tl]
print (missing_from_tl)
absent_from_data = [sym for sym in elements_tl if sym not in elements_present]
print (absent_from_data)

{'Ta', 'Be', 'Rh', 'V', 'C', 'Br', 'I', 'Fe', 'Tm', 'Th', 'H', 'Mo', 'Rb', 'Gd', 'Y', 'Ce', 'Xe', 'Se', 'Zr', 'Ho', 'Eu', 'Dy', 'Pa', 'Kr', 'Os', 'K', 'W', 'Sb', 'Hg', 'Na', 'In', 'Ag', 'Cl', 'Re', 'Pm', 'Nd', 'Yb', 'Cu', 'O', 'Hf', 'Ba', 'Ru', 'Sm', 'U', 'Er', 'B', 'Pd', 'Ti', 'Li', 'Cr', 'Ga', 'P', 'Pu', 'Ca', 'Tb', 'Ir', 'Np', 'Si', 'Cd', 'As', 'N', 'Lu', 'La', 'Mg', 'Te', 'Pr', 'Cs', 'Sc', 'Tl', 'F', 'Au', 'Ac', 'Co', 'Pb', 'Al', 'Ge', 'S', 'Bi', 'Mn', 'Sn', 'Tc', 'Zn', 'Sr', 'Nb', 'Ni', 'Pt'} 86
[]
[]


In [18]:
# Run data_utils.get_fractions on every comp_dict and store the result.
oqmd_data['comp_fractions'] = oqmd_data['comp_dict'].apply(data_utils.get_fractions)

In [19]:
# Show one row, then drop rows with a null fraction vector and compare the
# shape before and after.
print (oqmd_data.head(1))
#oqmd_data['solution_term'] = oqmd_data['comp_obj'].apply(lambda x: compute_mixing_term(x))
print (oqmd_data.shape)
has_fractions = oqmd_data['comp_fractions'].notnull()
oqmd_data = oqmd_data[has_fractions]
print (oqmd_data.shape)

       composition  energy_pa  volume_pa  magmom_pa  bandgap   delta_e  \
251840   Cs1F10Lu3  -6.356016    15.3841  -0.000026    7.611 -4.284843   

        stability     comp_obj pretty_comp  nelems  \
251840  -0.411923  (Cs, F, Lu)    CsLu3F10       3   

                                                comp_dict  \
251840  {'Cs': 0.07142857142857142, 'Lu': 0.2142857142...   

                                           comp_fractions  
251840  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.71428573...  
(341600, 12)
(341600, 12)


In [20]:
# Add one column per symbol in elements_tl, filled with the entry at the same
# position of each row's comp_fractions vector.
# NOTE(review): assumes get_fractions orders its output like elements_tl - confirm.
for position, symbol in enumerate(elements_tl):
    oqmd_data[symbol] = [fractions[position] for fractions in oqmd_data['comp_fractions']]
oqmd_data.shape

(341600, 98)

In [21]:
# List the columns now present (metadata followed by per-element fractions).
oqmd_data.columns

Index(['composition', 'energy_pa', 'volume_pa', 'magmom_pa', 'bandgap',
       'delta_e', 'stability', 'comp_obj', 'pretty_comp', 'nelems',
       'comp_dict', 'comp_fractions', 'H', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F',
       'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K', 'Ca', 'Sc', 'Ti', 'V',
       'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br',
       'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag',
       'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr',
       'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu',
       'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi',
       'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu'],
      dtype='object')

In [22]:
# Bundle the four matminer composition featurizers into a single featurizer.
feature_calculators = MultipleFeaturizer([
    cf.Stoichiometry(),
    cf.ElementProperty.from_preset("magpie"),
    cf.ValenceOrbital(props=['avg']),
    cf.IonProperty(fast=True),
])

In [23]:
# Names of the columns the combined featurizer produces.
feature_labels = feature_calculators.feature_labels()

In [24]:
# Show the feature names and how many there are.
print (feature_labels, len(feature_labels))

['0-norm', '2-norm', '3-norm', '5-norm', '7-norm', '10-norm', 'MagpieData minimum Number', 'MagpieData maximum Number', 'MagpieData range Number', 'MagpieData mean Number', 'MagpieData avg_dev Number', 'MagpieData mode Number', 'MagpieData minimum MendeleevNumber', 'MagpieData maximum MendeleevNumber', 'MagpieData range MendeleevNumber', 'MagpieData mean MendeleevNumber', 'MagpieData avg_dev MendeleevNumber', 'MagpieData mode MendeleevNumber', 'MagpieData minimum AtomicWeight', 'MagpieData maximum AtomicWeight', 'MagpieData range AtomicWeight', 'MagpieData mean AtomicWeight', 'MagpieData avg_dev AtomicWeight', 'MagpieData mode AtomicWeight', 'MagpieData minimum MeltingT', 'MagpieData maximum MeltingT', 'MagpieData range MeltingT', 'MagpieData mean MeltingT', 'MagpieData avg_dev MeltingT', 'MagpieData mode MeltingT', 'MagpieData minimum Column', 'MagpieData maximum Column', 'MagpieData range Column', 'MagpieData mean Column', 'MagpieData avg_dev Column', 'MagpieData mode Column', 'Magpi

In [25]:
%%time
feature_calculators.featurize_dataframe(oqmd_data, col_id='comp_obj');

HBox(children=(FloatProgress(value=0.0, description='MultipleFeaturizer', max=341600.0, style=ProgressStyle(de…


CPU times: user 38.8 s, sys: 5.65 s, total: 44.5 s
Wall time: 6min 25s


Unnamed: 0,composition,energy_pa,volume_pa,magmom_pa,bandgap,delta_e,stability,comp_obj,pretty_comp,nelems,...,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber,avg s valence electrons,avg p valence electrons,avg d valence electrons,avg f valence electrons,compound possible,max ionic char,avg ionic char
251840,Cs1F10Lu3,-6.356016,15.3841,-0.000026,7.611,-4.284843,-0.411923,"(Cs, F, Lu)",CsLu3F10,3,...,68.642857,76.632653,15.0,1.928571,3.571429,0.214286,3.000000,True,0.921450,0.176525
240464,F3Tb1,-6.695208,11.7933,0.000322,8.317,-4.280576,-3.306395,"(F, Tb)",TbF3,2,...,59.750000,67.125000,15.0,2.000000,3.750000,0.000000,2.250000,True,0.874268,0.163925
270658,Er3F10K1,-6.367075,12.6559,0.000001,7.353,-4.270220,-0.027914,"(Er, F, K)",KEr3F10,3,...,68.642857,76.632653,15.0,1.928571,3.571429,0.000000,2.571429,True,0.917619,0.177111
249661,F3Sc1,-6.944619,15.7811,0.000013,6.597,-4.269834,-3.171397,"(F, Sc)",ScF3,2,...,59.750000,67.125000,15.0,2.000000,3.750000,0.250000,0.000000,True,0.820234,0.153794
527724,F10Lu3Rb1,-6.346997,14.1302,-0.000014,7.371,-4.268079,-0.391680,"(F, Lu, Rb)",RbLu3F10,3,...,68.642857,76.632653,15.0,1.928571,3.571429,0.214286,3.000000,True,0.917619,0.176228
483610,F9Rb1Th2,-6.670073,15.0591,-0.000028,6.783,-4.261621,-0.022216,"(F, Rb, Th)",RbTh2F9,3,...,67.833333,79.250000,15.0,1.916667,3.750000,0.333333,0.000000,True,0.917619,0.162375
250859,O2Th1,-9.367937,14.9368,-0.000087,4.867,-4.236914,-1.191793,"(O, Th)",ThO2,2,...,83.000000,94.666667,12.0,2.000000,2.666667,0.666667,0.000000,True,0.681744,0.151499
270266,F10K1Y3,-6.739374,13.2561,-0.000034,7.122,-4.235145,-0.177026,"(F, K, Y)",KY3F10,3,...,68.642857,76.632653,15.0,1.928571,3.571429,0.214286,0.000000,True,0.917619,0.177686
265231,F9Na1Th2,-6.656020,13.2539,0.000015,6.719,-4.227890,-0.005018,"(F, Na, Th)",NaTh2F9,3,...,67.833333,79.250000,15.0,1.916667,3.750000,0.333333,0.000000,True,0.902278,0.161106
1028,Ca1F2,-5.849962,12.9555,,7.800,-4.219407,-0.945717,"(Ca, F)",CaF2,2,...,85.000000,93.333333,15.0,2.000000,3.333333,0.000000,0.000000,True,0.891402,0.198089


In [27]:
# Preview the first three rows of the working frame.
oqmd_data.head(3)

Unnamed: 0,composition,energy_pa,volume_pa,magmom_pa,bandgap,delta_e,stability,comp_obj,pretty_comp,nelems,...,Hg,Tl,Pb,Bi,Ac,Th,Pa,U,Np,Pu
251840,Cs1F10Lu3,-6.356016,15.3841,-2.6e-05,7.611,-4.284843,-0.411923,"(Cs, F, Lu)",CsLu3F10,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
240464,F3Tb1,-6.695208,11.7933,0.000322,8.317,-4.280576,-3.306395,"(F, Tb)",TbF3,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
270658,Er3F10K1,-6.367075,12.6559,1e-06,7.353,-4.27022,-0.027914,"(Er, F, K)",KEr3F10,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# List the column labels currently held by the working frame.
oqmd_data.keys()

Index(['composition', 'energy_pa', 'volume_pa', 'magmom_pa', 'bandgap',
       'delta_e', 'stability', 'comp_obj', 'pretty_comp', 'nelems',
       'comp_dict', 'comp_fractions', 'H', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F',
       'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K', 'Ca', 'Sc', 'Ti', 'V',
       'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br',
       'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag',
       'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr',
       'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu',
       'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi',
       'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu'],
      dtype='object')

In [None]:
# Keep only rows in which every feature column has a value.
# Requires the feature_labels columns to be present in oqmd_data.
all_features_present = oqmd_data[feature_labels].notnull().all(axis=1)
oqmd_data = oqmd_data[all_features_present]
print (oqmd_data.shape)

In [30]:
# Summary statistics of the numeric columns after cleaning.
oqmd_data.describe()

Unnamed: 0,energy_pa,volume_pa,magmom_pa,bandgap,delta_e,stability,nelems,H,Li,Be,...,Hg,Tl,Pb,Bi,Ac,Th,Pa,U,Np,Pu
count,341598.0,341598.0,231455.0,341450.0,341600.0,322425.0,341600.0,341600.0,341600.0,341600.0,...,341600.0,341600.0,341600.0,341600.0,341600.0,341600.0,341600.0,341600.0,341600.0,341600.0
mean,-5.499689,22.038643,0.418654,0.140866,0.004932,0.485025,3.140026,0.003202,0.015583,0.013491,...,0.012559,0.013159,0.013118,0.013344,0.010544,0.009918,0.010274,0.009929,0.009561,0.009487
std,1.94036,7.951881,0.604427,0.676178,0.857872,1.110661,0.477863,0.038269,0.071415,0.067315,...,0.065216,0.066346,0.066228,0.06702,0.061459,0.059982,0.060812,0.058719,0.05899,0.058853
min,-13.575205,4.14911,-4.248135,0.0,-4.284843,-3.306395,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-6.824986,16.3004,0.000264,0.0,-0.262579,0.182297,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-5.421248,20.6516,0.161911,0.0,0.114121,0.39575,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,-4.091805,26.2049,0.711843,0.0,0.473441,0.670759,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.67297,203.685,5.035564,10.34,4.061899,198.836034,7.0,0.888889,0.833333,0.928571,...,0.916667,0.857143,0.833333,0.833333,0.833333,0.75,0.75,0.857143,0.75,0.857143


In [31]:
# Drop rows that have no delta_e value.
oqmd_data = oqmd_data.dropna(subset=['delta_e'])
oqmd_data.shape

(341600, 98)

In [32]:
# Drop rows that have no volume_pa value.
oqmd_data = oqmd_data.dropna(subset=['volume_pa'])
oqmd_data.shape

(341598, 98)

In [33]:
# Drop rows that have no bandgap value.
oqmd_data = oqmd_data.dropna(subset=['bandgap'])
oqmd_data.shape

(341450, 98)

In [34]:
# Columns that are neither computed features nor per-element fractions.
all_columns = set(oqmd_data.columns.tolist())
properties = all_columns - set(feature_labels) - set(elements_present)
print (properties)

{'magmom_pa', 'volume_pa', 'comp_dict', 'composition', 'stability', 'delta_e', 'nelems', 'energy_pa', 'bandgap', 'comp_obj', 'pretty_comp', 'comp_fractions'}


In [35]:
# Remove the intermediate helper columns in a single call.
helper_columns = ['composition', 'comp_fractions', 'comp_dict', 'nelems', 'comp_obj']
oqmd_data.drop(helper_columns, axis=1, inplace=True)
oqmd_data.shape

(341450, 93)

In [36]:
# Target properties: whatever remains after removing the features, the
# per-element fraction columns and the formula column.
non_property_columns = set(feature_labels) | set(elements_present) | {'pretty_comp'}
properties = set(oqmd_data.columns.tolist()) - non_property_columns
print (properties)

{'magmom_pa', 'volume_pa', 'stability', 'delta_e', 'energy_pa', 'bandgap'}


In [37]:
# Reorder the columns: formula, per-element fractions, computed features,
# then target properties. `properties` is a set, whose iteration order can
# differ between runs, so it is sorted to give a reproducible column order.
# Requires the feature_labels columns to be present in oqmd_data.
oqmd_data = oqmd_data[['pretty_comp']+elements_tl+list(feature_labels)+sorted(properties)]

KeyError: "['0-norm' '2-norm' '3-norm' '5-norm' '7-norm' '10-norm'\n 'MagpieData minimum Number' 'MagpieData maximum Number'\n 'MagpieData range Number' 'MagpieData mean Number'\n 'MagpieData avg_dev Number' 'MagpieData mode Number'\n 'MagpieData minimum MendeleevNumber' 'MagpieData maximum MendeleevNumber'\n 'MagpieData range MendeleevNumber' 'MagpieData mean MendeleevNumber'\n 'MagpieData avg_dev MendeleevNumber' 'MagpieData mode MendeleevNumber'\n 'MagpieData minimum AtomicWeight' 'MagpieData maximum AtomicWeight'\n 'MagpieData range AtomicWeight' 'MagpieData mean AtomicWeight'\n 'MagpieData avg_dev AtomicWeight' 'MagpieData mode AtomicWeight'\n 'MagpieData minimum MeltingT' 'MagpieData maximum MeltingT'\n 'MagpieData range MeltingT' 'MagpieData mean MeltingT'\n 'MagpieData avg_dev MeltingT' 'MagpieData mode MeltingT'\n 'MagpieData minimum Column' 'MagpieData maximum Column'\n 'MagpieData range Column' 'MagpieData mean Column'\n 'MagpieData avg_dev Column' 'MagpieData mode Column'\n 'MagpieData minimum Row' 'MagpieData maximum Row' 'MagpieData range Row'\n 'MagpieData mean Row' 'MagpieData avg_dev Row' 'MagpieData mode Row'\n 'MagpieData minimum CovalentRadius' 'MagpieData maximum CovalentRadius'\n 'MagpieData range CovalentRadius' 'MagpieData mean CovalentRadius'\n 'MagpieData avg_dev CovalentRadius' 'MagpieData mode CovalentRadius'\n 'MagpieData minimum Electronegativity'\n 'MagpieData maximum Electronegativity'\n 'MagpieData range Electronegativity' 'MagpieData mean Electronegativity'\n 'MagpieData avg_dev Electronegativity'\n 'MagpieData mode Electronegativity' 'MagpieData minimum NsValence'\n 'MagpieData maximum NsValence' 'MagpieData range NsValence'\n 'MagpieData mean NsValence' 'MagpieData avg_dev NsValence'\n 'MagpieData mode NsValence' 'MagpieData minimum NpValence'\n 'MagpieData maximum NpValence' 'MagpieData range NpValence'\n 'MagpieData mean NpValence' 'MagpieData avg_dev NpValence'\n 'MagpieData mode NpValence' 'MagpieData minimum NdValence'\n 
'MagpieData maximum NdValence' 'MagpieData range NdValence'\n 'MagpieData mean NdValence' 'MagpieData avg_dev NdValence'\n 'MagpieData mode NdValence' 'MagpieData minimum NfValence'\n 'MagpieData maximum NfValence' 'MagpieData range NfValence'\n 'MagpieData mean NfValence' 'MagpieData avg_dev NfValence'\n 'MagpieData mode NfValence' 'MagpieData minimum NValence'\n 'MagpieData maximum NValence' 'MagpieData range NValence'\n 'MagpieData mean NValence' 'MagpieData avg_dev NValence'\n 'MagpieData mode NValence' 'MagpieData minimum NsUnfilled'\n 'MagpieData maximum NsUnfilled' 'MagpieData range NsUnfilled'\n 'MagpieData mean NsUnfilled' 'MagpieData avg_dev NsUnfilled'\n 'MagpieData mode NsUnfilled' 'MagpieData minimum NpUnfilled'\n 'MagpieData maximum NpUnfilled' 'MagpieData range NpUnfilled'\n 'MagpieData mean NpUnfilled' 'MagpieData avg_dev NpUnfilled'\n 'MagpieData mode NpUnfilled' 'MagpieData minimum NdUnfilled'\n 'MagpieData maximum NdUnfilled' 'MagpieData range NdUnfilled'\n 'MagpieData mean NdUnfilled' 'MagpieData avg_dev NdUnfilled'\n 'MagpieData mode NdUnfilled' 'MagpieData minimum NfUnfilled'\n 'MagpieData maximum NfUnfilled' 'MagpieData range NfUnfilled'\n 'MagpieData mean NfUnfilled' 'MagpieData avg_dev NfUnfilled'\n 'MagpieData mode NfUnfilled' 'MagpieData minimum NUnfilled'\n 'MagpieData maximum NUnfilled' 'MagpieData range NUnfilled'\n 'MagpieData mean NUnfilled' 'MagpieData avg_dev NUnfilled'\n 'MagpieData mode NUnfilled' 'MagpieData minimum GSvolume_pa'\n 'MagpieData maximum GSvolume_pa' 'MagpieData range GSvolume_pa'\n 'MagpieData mean GSvolume_pa' 'MagpieData avg_dev GSvolume_pa'\n 'MagpieData mode GSvolume_pa' 'MagpieData minimum GSbandgap'\n 'MagpieData maximum GSbandgap' 'MagpieData range GSbandgap'\n 'MagpieData mean GSbandgap' 'MagpieData avg_dev GSbandgap'\n 'MagpieData mode GSbandgap' 'MagpieData minimum GSmagmom'\n 'MagpieData maximum GSmagmom' 'MagpieData range GSmagmom'\n 'MagpieData mean GSmagmom' 'MagpieData avg_dev GSmagmom'\n 'MagpieData 
mode GSmagmom' 'MagpieData minimum SpaceGroupNumber'\n 'MagpieData maximum SpaceGroupNumber' 'MagpieData range SpaceGroupNumber'\n 'MagpieData mean SpaceGroupNumber' 'MagpieData avg_dev SpaceGroupNumber'\n 'MagpieData mode SpaceGroupNumber' 'avg s valence electrons'\n 'avg p valence electrons' 'avg d valence electrons'\n 'avg f valence electrons' 'compound possible' 'max ionic char'\n 'avg ionic char'] not in index"

In [38]:
# Inspect the final column layout and the size of the processed table.
print (oqmd_data.columns)
print (oqmd_data.shape)

Index(['energy_pa', 'volume_pa', 'magmom_pa', 'bandgap', 'delta_e',
       'stability', 'pretty_comp', 'H', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F',
       'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K', 'Ca', 'Sc', 'Ti', 'V',
       'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br',
       'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag',
       'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr',
       'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu',
       'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi',
       'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu'],
      dtype='object')
(341450, 93)


In [39]:
# Hold out 10% of the rows as the test set; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(
    oqmd_data,
    test_size=0.1,
    random_state=1234567,
)

In [40]:
# Sizes of the training and test splits.
train_data.shape, test_data.shape

((307305, 93), (34145, 93))

In [42]:
# Fractions-only views of both splits: formula, per-element fractions and
# target properties. `properties` is a set, so it is sorted once here to give
# both splits the same column order, reproducible from run to run.
fract_columns = ['pretty_comp'] + elements_tl + sorted(properties)
train_data_fract = train_data[fract_columns]
test_data_fract = test_data[fract_columns]
#train_data_phys = train_data[['pretty_comp']+list(feature_labels)+list(properties)]
#test_data_phys = test_data[['pretty_comp']+list(feature_labels)+list(properties)]
#print (train_data_fract.shape, test_data_fract.shape, train_data_phys.shape, test_data_phys.shape)

In [43]:
# Write the full and the fractions-only versions of each split to CSV,
# without the index column. The test_set path previously held a stray
# double slash; it now matches the form of the other output paths.
train_data.to_csv('../training-data/train_set.csv', index=False)
train_data_fract.to_csv('../training-data/train_fract_set.csv', index=False)
#train_data_phys.to_csv('../training-data/train_phys_set.csv', index=False)
test_data.to_csv('../training-data/test_set.csv', index=False)
test_data_fract.to_csv('../training-data/test_fract_set.csv', index=False)
#test_data_phys.to_csv('../training-data/test_phys_set.csv', index=False)