In [1]:
import numpy as np

## Build up elemental property dictionaries
- function buildDict creates a dictionary mapping each element name to one type of attribute
- get a list of dictionaries, one per elemental property

In [2]:
def buildDict (attribute_index, refsheet, data_type):
    """Map each element name to the value stored in one column of a sheet.

    Row 0 of ``refsheet`` is skipped; for every other row, column 0 supplies
    the dictionary key and column ``attribute_index`` supplies the value.

    Parameters
    ----------
    attribute_index : int
        Column of ``refsheet`` to read the values from.
    refsheet : object
        Anything exposing ``nrows`` and ``cell(row, col).value``
        (for example an xlrd sheet).
    data_type : str
        ``'d'`` converts every value with ``float``; any other value stores
        the raw cell content unchanged.

    Returns
    -------
    dict
        ``{element_name: value}``. In ``'d'`` mode a cell equal to ``''`` or
        ``' '`` is stored as the sentinel ``-1.111``. A cell that ``float``
        rejects is reported with ``print`` and left out of the dictionary, so
        a later lookup of that element raises ``KeyError``.
    """
    numeric = (data_type == 'd')
    attribute = dict()
    for row_index in range(1, refsheet.nrows):
        element_name = refsheet.cell(row_index, 0).value
        data = refsheet.cell(row_index, attribute_index).value
        if not numeric:
            attribute[element_name] = data
        elif data == '' or data == ' ':
            # Blank cell: keep the element but mark the value as missing.
            attribute[element_name] = -1.111
        else:
            try:
                attribute[element_name] = float(data)
            except (TypeError, ValueError):
                # Only conversion failures are reported and skipped; anything
                # else (KeyboardInterrupt, programming errors) must surface.
                print (data, row_index, attribute_index)
    return attribute

## Get continuous/discrete/binomial properties separately

In [3]:
from xlrd import open_workbook
# Continuous properties: one {element: float} dictionary per property column
# (column 0 holds the element names, so the property columns start at 1).
refbk = open_workbook('continousproperty.xlsx')
refst = refbk.sheet_by_index(0)
contList = [buildDict(column, refst, 'd') for column in range(1, refst.ncols)]

In [4]:
from xlrd import open_workbook
# Discrete properties: one {element: float} dictionary per property column
# (column 0 holds the element names, so the property columns start at 1).
refbk2 = open_workbook('discrictproperty.xlsx')
refst2 = refbk2.sheet_by_index(0)
discList = [buildDict(column, refst2, 'd') for column in range(1, refst2.ncols)]

In [5]:
from xlrd import open_workbook
# Binomial properties: one {element: float} dictionary per property column
# (column 0 holds the element names, so the property columns start at 1).
refbk3 = open_workbook('binominalproperty.xlsx')
refst3 = refbk3.sheet_by_index(0)
binoList = [buildDict(column, refst3, 'd') for column in range(1, refst3.ncols)]

## Get composition information
- function getComp returns a list of atoms with their elemental properties at one type of site
- function findNum parses the name of the perovskite to get the number of atoms of an element
- for every compound in the spreadsheet, generate a list of attribute values

In [6]:
def getComp (row_index, sheetname, col_index, compositionname, dicList):
    """Collect the elements listed for one site of a compound.

    Up to three consecutive columns starting at ``col_index`` are read from
    row ``row_index`` of ``sheetname``; columns with index 8 or higher are
    never read, and empty cells are skipped.

    Returns a list with one entry per element found, each of the form
    ``[name, count, property_1, property_2, ...]`` where ``count`` comes from
    ``findNum(compositionname, name)`` and the properties are looked up in
    the dictionaries of ``dicList``, in order.
    """
    collected = []
    for column in range(col_index, col_index + 3):
        if column >= 8:
            break
        name = sheetname.cell(row_index, column).value
        if name == '':
            continue
        entry = [name, findNum(compositionname, name)]
        entry.extend(attrDic[name] for attrDic in dicList)
        collected.append(entry)
    return collected

In [7]:
def findNum (composition, element):
    """Return the integer written directly after ``element`` in ``composition``.

    Parameters
    ----------
    composition : str
        Formula-like string such as ``'Ba8Ti8O24'``.
    element : str
        Symbol to look for.

    Returns
    -------
    int
        The run of digits that immediately follows the first occurrence of
        ``element`` that is not itself the prefix of a longer symbol.
        ``0`` is returned when ``element`` is empty, when it does not occur
        in ``composition``, or when no digit follows it.
    """
    if element == '':
        return 0
    start = composition.find(element)
    while start != -1:
        pos = start + len(element)
        if pos < len(composition) and composition[pos].islower():
            # A lowercase letter right after the match means we hit the
            # prefix of a longer symbol (e.g. 'C' inside 'Co'); keep looking.
            start = composition.find(element, start + 1)
            continue
        num = 0
        while (pos < len(composition) and composition[pos].isdigit()):
            num = num*10 + int(composition[pos])
            pos += 1
        return num
    # Element not present at all: report zero atoms instead of parsing
    # whatever digits happen to sit at the start of the string.
    return 0

## generate total descriptors

In [8]:
## generate total descriptors

from xlrd import open_workbook
# Open the first sheet of the screened-materials workbook and start the three
# output tables (all descriptors, discrete-only, continuous-only) empty.
book = open_workbook('PerovskiteScreenedMaterialsData_SI.xlsx')
sheet = book.sheet_by_index(0)
examples, dexamples, cexamples = [], [], []
# Build one descriptor row per compound. For every site (A, B, X) getComp
# returns rows of [name, count, property...]; dropping the name leaves a float
# matrix whose column 0 is the atom count and whose remaining columns are the
# properties, in the order of the dictionary list that was passed in.
for row_index in range(1, sheet.nrows):  # skip header line
    composition = sheet.cell(row_index, 0).value
    stability = sheet.cell(row_index, 13).value
    numberofelements = sheet.cell(row_index, 8).value
    ## get all statistics for continuous property
    ## only append max/min for discrete property

    # A-site names are read from columns 1-3, B-site names from columns 4-6 and
    # the X-site name from column 7. The X count is searched only in the last
    # four characters of the composition string.
    # NOTE(review): presumably this avoids matching the X symbol inside an
    # earlier element symbol - confirm against the data.
    Acontsites = getComp(row_index, sheet, 1, composition, contList)
    Bcontsites = getComp(row_index, sheet, 4, composition, contList)
    Xcontsites = getComp(row_index, sheet, 7, composition[-4:], contList)

    Adiscsites = getComp(row_index, sheet, 1, composition, discList)
    Bdiscsites = getComp(row_index, sheet, 4, composition, discList)
    Xdiscsites = getComp(row_index, sheet, 7, composition[-4:], discList)

    # Drop the element-name column (index 0) and convert the rest to floats.
    Adstatics = np.array(Adiscsites)[:,1:len(Adiscsites[0])].astype(float)
    Bdstatics = np.array(Bdiscsites)[:,1:len(Bdiscsites[0])].astype(float)
    Xdstatics = np.array(Xdiscsites)[:,1:len(Xdiscsites[0])].astype(float)
    
    Astatics = np.array(Acontsites)[:,1:len(Acontsites[0])].astype(float)
    Bstatics = np.array(Bcontsites)[:,1:len(Bcontsites[0])].astype(float)
    Xstatics = np.array(Xcontsites)[:,1:len(Xcontsites[0])].astype(float)

    # Total number of atoms on each site (column 0 holds the counts).
    Atot = np.sum(Astatics[:,0], axis=0)
    Btot = np.sum(Bstatics[:,0], axis=0)  # was summed over Astatics by mistake
    Xtot = np.sum(Xstatics[:,0], axis=0) 
    if (Atot != 8 or Btot!=8 or Xtot !=24):
        # Report compositions whose parsed totals differ from 8/8/24.
        print(Atot, Btot, Xtot, row_index, composition)

    # Per-element fraction of each site. When a total is zero the weights are
    # a zero vector (not a scalar) so that np.dot below still returns one
    # value per column and the [1:] slice keeps dropping the count column.
    Acoef = Astatics[:,0] / Atot if Atot != 0 else np.zeros_like(Astatics[:,0])
    Bcoef = Bstatics[:,0] / Btot if Btot != 0 else np.zeros_like(Bstatics[:,0])
    Xcoef = Xstatics[:,0] / Xtot if Xtot != 0 else np.zeros_like(Xstatics[:,0])

    # Discrete properties: composition-weighted value, max, min and range.
    # The [1:] slice removes the entry that belongs to the count column.
    Asite_dweighted = np.dot(Acoef, Adstatics)[1:]
    Asite_dmax = np.amax(Adstatics, axis=0)[1:]
    Asite_dmin = np.amin(Adstatics, axis=0)[1:]
    Asite_dptp = np.ptp(Adstatics, axis=0)[1:]
    Bsite_dweighted = np.dot(Bcoef, Bdstatics)[1:]
    Bsite_dmax = np.amax(Bdstatics, axis=0)[1:]
    Bsite_dmin = np.amin(Bdstatics, axis=0)[1:]
    Bsite_dptp = np.ptp(Bdstatics, axis=0)[1:]
    Xsite_dweighted = np.dot(Xcoef, Xdstatics)[1:]

    # Continuous properties. avg/max/min/ptp keep the count column as their
    # first entry; weighted/std drop it.
    Asite_avg = np.mean(Astatics, axis=0) 
    Asite_weighted = np.dot(Acoef, Astatics)[1:]
    Asite_std = np.std(Astatics, axis=0)[1:]
    Asite_max = np.amax(Astatics, axis=0)
    Asite_min = np.amin(Astatics, axis=0)
    Asite_ptp = np.ptp(Astatics, axis=0)
    Bsite_avg = np.mean(Bstatics, axis=0) 
    Bsite_weighted = np.dot(Bcoef, Bstatics)[1:]
    Bsite_std = np.std(Bstatics, axis=0)[1:]
    Bsite_max = np.amax(Bstatics, axis=0)
    Bsite_min = np.amin(Bstatics, axis=0)
    Bsite_ptp = np.ptp(Bstatics, axis=0)
    Xsite_avg = np.mean(Xstatics, axis=0)
    ## stoichiometry relations (100 is the placeholder for a zero denominator)
    ABratio = Atot/Btot if Btot !=0 else 100
    AXratio = Atot/Xtot if Xtot !=0 else 100
    BXratio = Btot/Xtot if Xtot !=0 else 100

    # Element-wise property ratios between sites; entries that come out as
    # +/-inf or NaN (division by zero) are replaced by 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        ABPRatio = np.true_divide(Asite_weighted,Bsite_weighted)
        AXPRatio = np.true_divide(Asite_weighted,Xsite_avg[1:])
        BXPRatio = np.true_divide(Bsite_weighted,Xsite_avg[1:])
        ABPRatio[ABPRatio == np.inf] = 0
        ABPRatio[ABPRatio == - np.inf] = 0
        ABPRatio = np.nan_to_num(ABPRatio)
        AXPRatio[AXPRatio == np.inf] = 0
        AXPRatio[AXPRatio == - np.inf] = 0
        AXPRatio = np.nan_to_num(AXPRatio)
        BXPRatio[BXPRatio == np.inf] = 0
        BXPRatio[BXPRatio == - np.inf] = 0
        BXPRatio = np.nan_to_num(BXPRatio)

    # Tolerance factor from the first continuous property of each site.
    # NOTE(review): presumably the first column of continousproperty.xlsx is
    # the ionic radius - confirm against the spreadsheet.
    ra = Asite_weighted[0]
    rb = Bsite_weighted[0]
    rx = Xsite_avg[1]
    goldschmidt_TF = (ra+rx)/(np.sqrt(2)*(rb+rx))
                              
    # Full descriptor row: row id first, stability last.
    example = np.r_[row_index, goldschmidt_TF, Atot, Btot, Xtot, numberofelements, ABratio, AXratio, BXratio, Asite_avg, Bsite_avg, 
                    Xsite_avg, Asite_weighted, Bsite_weighted, Asite_dweighted, Bsite_dweighted, Xsite_dweighted, 
                    Asite_dmax, Bsite_dmax, Asite_dmin, Bsite_dmin, Asite_dptp, Bsite_dptp, Asite_std, Bsite_std, 
                    Asite_max, Bsite_max, Asite_min, Bsite_min, Asite_ptp, Bsite_ptp, ABPRatio, AXPRatio, 
                    BXPRatio, stability]
    examples.append(example)
    
    # Discrete-only descriptor row.
    dexample = np.r_[row_index, numberofelements, Asite_dweighted, Bsite_dweighted, Xsite_dweighted, 
                    Asite_dmax, Bsite_dmax, Asite_dmin, Bsite_dmin, Asite_dptp, Bsite_dptp, stability]
    dexamples.append(dexample)
    
    # Continuous-only descriptor row (the averages drop their count entry here).
    cexample = np.r_[row_index, goldschmidt_TF, Asite_avg[1:], Bsite_avg[1:], 
                    Asite_weighted, Bsite_weighted, Asite_std, Bsite_std, 
                    Asite_max, Bsite_max, Asite_min, Bsite_min, Asite_ptp, Bsite_ptp, 
                    stability]
    cexamples.append(cexample)
    
def _make_header (width):
    """Return ``width`` column labels: '' first, 'Ehu' last, indices between.

    The labels are built as Python strings before conversion so that the
    array's string width fits 'Ehu'. Creating the array from the indices
    alone fixes the width to the longest index and would silently truncate
    'Ehu' for tables with 100 columns or fewer.
    """
    labels = [str(position) for position in range(width)]
    labels[0] = ''
    labels[-1] = 'Ehu'
    return np.array(labels)

# One header per output table, sized from the last row built by the loop,
# then placed in front of the data rows.
header = _make_header(example.size)
dheader = _make_header(dexample.size)
cheader = _make_header(cexample.size)
examples.insert(0,header)
dexamples.insert(0,dheader)
cexamples.insert(0,cheader)

## Write into csv file

In [9]:
# Write the full descriptor table (header row first) as comma-separated text.
np.savetxt(fname="new_stability_output_with_descriptors.csv", X=np.array(examples), delimiter=',', fmt="%s")

In [25]:
# Write the continuous-only descriptor table (header row first) as comma-separated text.
np.savetxt(fname="continuous_stability_output_with_descriptors.csv", X=np.array(cexamples), delimiter=',', fmt="%s")

In [26]:
# Write the discrete-only descriptor table (header row first) as comma-separated text.
np.savetxt(fname="disc_stability_output_with_descriptors.csv", X=np.array(dexamples), delimiter=',', fmt="%s")

In [24]:
# Shape of the full descriptor table; the row count includes the header row inserted at index 0.
np.array(examples).shape

(1957, 866)

In [22]:
# Shape of the continuous-only table; the row count includes the header row inserted at index 0.
np.array(cexamples).shape

(1957, 489)

In [23]:
# Shape of the discrete-only table; the row count includes the header row inserted at index 0.
np.array(dexamples).shape

(1957, 210)

In [20]:
# Length of a single continuous-descriptor row (index 0 is the header, so index 2 is the second data row).
cexamples[2].shape

(489,)

In [12]:
import numpy as np
import pandas as pd
# Reload the screened-materials workbook as a DataFrame and pull out column 13,
# the same column index the descriptor loop reads as `stability`.
# The removed `sheetname=` keyword is dropped: pandas reads the first sheet by
# default, so this call works with both old and current pandas releases.
# Note that `book` and `stability` are rebound here; earlier cells used these
# names for the xlrd workbook and a per-row scalar.
book = pd.read_excel('PerovskiteScreenedMaterialsData_SI.xlsx')
stability = book[book.columns[13]]

In [18]:
# Ratio of rows with stability <= 40 to rows with stability > 40.
len(stability.loc[stability.le(40)]) / len(stability.loc[stability.gt(40)])

0.41431670281995664

In [15]:
# NOTE(review): this is the length of the boolean mask, i.e. the total number of
# rows in `stability`, not the number of entries <= 40. If the count of matching
# rows was intended, `(stability<=40).sum()` would give it - confirm the intent.
len(stability<=40)

1956