In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import math

## Mean and Weighted mean

In [6]:
## Mean and weighted mean
def mean_el(prop_el):
    """Return the unweighted arithmetic mean of the values in ``prop_el``."""
    average = np.average(prop_el)
    return average

def wt_mean(prop_el,list_weight):
    """Return the weighted arithmetic mean of ``prop_el``.

    Parameters
    ----------
    prop_el : sequence of numbers
        Values to average.
    list_weight : sequence of numbers
        One weight per value. Weights are normalised by their sum, so raw
        counts such as ``[1, 4]`` can be passed directly.

    Returns
    -------
    number
        ``sum(value_i * weight_i / sum(weights))``; ``0`` when no
        value/weight pair exists.

    Notes
    -----
    Values and weights are paired with ``zip``, so surplus entries in the
    longer sequence are not paired (surplus weights still count towards the
    normalising sum).
    """
    # Sum the weights once. The values themselves are never normalised, so a
    # set of values that happens to sum to zero (e.g. [-1, 1]) is valid input.
    total_weight = sum(list_weight)
    return sum(
        value * (weight / total_weight)
        for value, weight in zip(prop_el, list_weight)
    )


## Geometric mean

In [7]:
def geomean(prop_el):
    """Return the geometric mean of ``prop_el``: the n-th root of the product of its n values."""
    values = np.array(prop_el)
    count = len(values)
    return np.prod(values) ** (1.0 / count)

In [8]:
# Spot-check wt_mean on two values with weights 1 and 4.
wt_mean([26.98,138.9],[1,4])

116.516

In [9]:
# Same spot-check as the previous cell (identical call and arguments).
wt_mean([26.98,138.9],[1,4])

116.516

## Weighted geometric mean

In [10]:
def wt_geomean(prop_el,list_weight):
    """Return the weighted geometric mean of ``prop_el``.

    Every value is raised to its weight, the powers are multiplied together,
    and the product is taken to the power ``1 / sum(list_weight)``.
    """
    # Index-based access on purpose: a weight list shorter than the value
    # list raises IndexError instead of being silently truncated.
    powered = [prop_el[k] ** list_weight[k] for k in range(len(prop_el))]
    product = np.prod(np.asarray(powered))
    exponent = 1 / sum(list_weight)
    return product ** exponent

In [11]:
# Spot-check wt_geomean on two values with weights 2 and 1.
wt_geomean([26.98,138.9],[2,1])

46.58671045576934

## Entropy

In [12]:
def entropy(prop_el):
    """Return the Shannon entropy (natural log) of ``prop_el`` normalised to fractions.

    Each value is divided by the sum of all values to give a fraction ``w``;
    the result is ``sum(-w * ln(w))``.

    Parameters
    ----------
    prop_el : sequence of numbers
        Values to normalise. A value of zero contributes nothing to the
        entropy.

    Returns
    -------
    number
        The entropy; ``0`` for an empty sequence.
    """
    # Sum once instead of on every iteration.
    total = sum(prop_el)
    terms = []
    for value in prop_el:
        w = value / total
        if w == 0:
            # By the usual convention 0 * ln(0) counts as 0. Evaluating it
            # numerically gives 0 * -inf = nan, which would poison the sum.
            continue
        terms.append(-1 * w * np.log(w))
    return sum(terms)

In [13]:
# Spot-check entropy on two values.
entropy([91.28,186])

0.6336091611028696

## Weighted entropy

In [14]:
def wt_entropy(prop_el,list_weight):
    """Return the Shannon entropy (natural log) of weight-scaled property fractions.

    For each index ``i`` the product
    ``(list_weight[i] / sum(list_weight)) * (prop_el[i] / sum(prop_el))`` is
    formed; the products are renormalised to sum to one (``A_i``) and the
    result is ``sum(-A_i * ln(A_i))``.

    Parameters
    ----------
    prop_el : sequence of numbers
        Property values.
    list_weight : sequence of numbers
        One weight per property value (indexed in step with ``prop_el``).

    Returns
    -------
    number
        The weighted entropy; ``0`` for empty input. Terms whose fraction is
        zero contribute nothing.
    """
    # Each total is computed once rather than inside the loops.
    total_prop = sum(prop_el)
    total_weight = sum(list_weight)
    pw = [
        (list_weight[i] / total_weight) * (prop_el[i] / total_prop)
        for i in range(len(prop_el))
    ]

    norm = sum(pw)
    terms = []
    for share in pw:
        a_i = share / norm
        if a_i == 0:
            # 0 * ln(0) counts as 0 by convention; evaluated numerically it
            # would be nan and poison the sum.
            continue
        terms.append(-1 * a_i * np.log(a_i))
    return sum(terms)
        

## Range and Weighted range

In [15]:
def range_el(prop_el):
    """Return the spread of ``prop_el``: largest value minus smallest value."""
    highest = max(prop_el)
    lowest = min(prop_el)
    return highest - lowest
def wt_range_el(prop_el,weight_list):
    """Return the difference between the weight-scaled maximum and minimum of ``prop_el``.

    The maximum and the minimum are each multiplied by their own weight and
    divided by the sum of all weights before subtracting, so the result can
    be negative when the minimum carries the larger weight.
    """
    top_term = max(prop_el) * weight_list[np.argmax(prop_el)] / sum(weight_list)
    bottom_term = min(prop_el) * weight_list[np.argmin(prop_el)] / sum(weight_list)
    return top_term - bottom_term

## Standard deviation and weighted standard deviation

In [16]:
def std_dv(prop_el):
    """Return the standard deviation of ``prop_el`` as computed by ``np.std`` with its defaults."""
    spread = np.std(prop_el)
    return spread

def wt_std_dv(prop_el,weight_list):
    """Return the weighted standard deviation of ``prop_el`` about its weighted mean.

    The result is the square root of the sum over ``i`` of
    ``(weight_i / sum(weights)) * (value_i - weighted_mean) ** 2``, where the
    weighted mean comes from ``wt_mean``.
    """
    centre = wt_mean(prop_el, weight_list)
    contributions = []
    for idx in range(len(prop_el)):
        deviation = prop_el[idx] - centre
        contributions.append(
            weight_list[idx] * (1 / sum(weight_list)) * deviation ** 2
        )
    return sum(contributions) ** (1 / 2)

# Feature engineering

## Get summary statistics of one property across a compound's elements

In [17]:
def get_stats(element_list,composition):
    """Return eleven summary statistics for ``element_list`` weighted by ``composition``.

    The returned list holds, in order: the number of values, then the plain
    and weighted variants of mean, geometric mean, entropy, range and
    standard deviation (plain first, weighted second in every pair).
    """
    stat_pairs = (
        (mean_el, wt_mean),
        (geomean, wt_geomean),
        (entropy, wt_entropy),
        (range_el, wt_range_el),
        (std_dv, wt_std_dv),
    )
    stats = [len(element_list)]
    for plain, weighted in stat_pairs:
        stats.append(plain(element_list))
        stats.append(weighted(element_list, composition))
    return stats


In [29]:
# Spot-check: all eleven statistics for two values weighted 1:2.
get_stats([91.28,186],[1,2])

[2,
 138.64,
 154.42666666666668,
 130.29996162700894,
 146.71251881350724,
 0.6336091611028696,
 0.49625728370029376,
 94.72,
 93.57333333333334,
 47.36,
 44.65143620932652]

In [19]:
import re
# Pattern: a run of ASCII letters followed by a run of digits, captured as two groups.
r = re.compile("([a-zA-Z]+)([0-9]+)")


In [20]:
strings = 'Co12NA4'
# match() anchors at the start of the string, so only the first letter/digit pair is captured.
# NOTE(review): the argument to groups() is the default returned for groups that did not
# participate in the match, not a group count; both groups always participate here, so 3 has no effect.
print(r.match(strings).groups(3))

('Co', '12')


In [22]:
# Scratch version of the tokenizer: split a formula string into alternating
# runs of non-digit characters and digit characters.
s = "C12H1"
sub, char, num = [], "", ""
for letter in s:
    if not letter.isdigit():
        # A non-digit closes any digit run that is still pending.
        if num:
            sub.append(num)
            num = ""
        char += letter
    else:
        # A digit closes any letter run that is still pending.
        if char:
            sub.append(char)
            char = ""
        num += letter
# Flush whichever run was still open when the string ended.
if char:
    sub.append(char)
else:
    sub.append(num)
print(sub)
        

['C', '12', 'H', '1']


In [23]:
def compound_to_string(s):
    """Split a formula string into alternating runs of non-digits and digits.

    Parameters
    ----------
    s : str
        Formula text such as ``'C12H1'``.

    Returns
    -------
    list of str
        The consecutive runs in order, e.g. ``['C', '12', 'H', '1']``.
        An empty string yields an empty list.

    Notes
    -----
    Characters are classified with ``str.isdigit`` only; letter runs are not
    validated as element symbols, so ``'Naaa'`` stays one token.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    from itertools import groupby

    # groupby starts a new group each time the digit/non-digit class changes.
    return ["".join(run) for _, run in groupby(s, key=str.isdigit)]

In [26]:
def compound_to_stat(string):
    """Split a formula string into element tokens and their integer counts.

    The tokens from ``compound_to_string`` at even positions are taken as
    element names and those at odd positions are converted with ``int``.

    Returns a tuple ``(element_list, composition_list)``.

    NOTE(review): this relies on strict name/count alternation starting with
    a name; a formula with an omitted count would leave the two lists with
    different lengths.
    """
    tokens = compound_to_string(string)
    element_list = tokens[0::2]
    composition_list = [int(count) for count in tokens[1::2]]
    return element_list, composition_list
        

In [27]:
# Check tokenisation; letter runs are not validated, so 'Naaa' comes back as one token.
compound_to_string('Ca1H4Naaa5')

['Ca', '1', 'H', '4', 'Naaa', '5']

## Progress check

In [28]:
# Check the split into element tokens and integer counts.
compound_to_stat('Ca1H4Naaa5')

(['Ca', 'H', 'Naaa'], [1, 4, 5])

In [37]:
# Load the property table: the first sheet row is skipped and the first column becomes the index.
# Later cells read d.loc[<property>][<element>], i.e. rows are used as properties and columns as elements.
# NOTE(review): relative path; the workbook has to be in the working directory.
d=pd.read_excel('database.xlsx',index_col=0,skiprows=1)

In [79]:
#d

In [39]:
# Row labels of the table, used below as the list of property names.
properties=d.index.tolist()

In [76]:
# Example compound: el receives the element tokens, qu the integer counts.
t='Si2Na1'
el,qu=compound_to_stat(t)


In [77]:
# Collect one property (row label p) for every element of the compound, then summarise it.
prop1=[]
p='Molar volume'
for i in el:
    # Row p, column i of the property table.
    prop1.append(d.loc[p][i])
# get_stats weights the collected values by the counts in qu; the result is bound to t.
t=get_stats(prop1,qu)


In [84]:
# Build one flat feature vector for a compound: for every property in the
# table, look up the value for each element and append the eleven statistics
# from get_stats.
compound = 'Si2Na1'
el, qu = compound_to_stat(compound)
expand_list = []
for p in properties:
    # Single .loc call with (row, column) instead of chained indexing.
    prop1 = [d.loc[p, i] for i in el]
    t = get_stats(prop1, qu)
    print(t)
    # list.extend mutates in place and returns None, so its result must not
    # be assigned to anything.
    expand_list.extend(t)
# Bind `expand` to the accumulated features, so that it holds the expanded list.
expand = expand_list
    

[2, 12.5, 12.999999999999998, 12.409673645990857, 12.918623821828753, 0.6859298002523728, 0.594880254188934, 3.0, 5.666666666666668, 1.5, 1.414213562373095]
[2, 25.53738464, 26.38658976, 25.409991543265026, 26.272081926870815, 0.6881628421059182, 0.6025294495344129, 5.09523072, 11.060076906666666, 2.54761536, 2.401914795881343]
[2, 7.5, 9.666666666666666, 3.7416573867739413, 5.808785733563703, 0.24493002679463532, 0.1499949236104151, 13.0, 9.0, 6.5, 6.128258770283411]
[2, 3.0, 3.0, 3.0, 3.0, 0.6931471805599453, 0.6365141682948128, 0.0, 0.0, 0.0, 0.0]
[2, 1.5, 1.6666666666666665, 1.4142135623730951, 1.5874010519681994, 0.6365141682948128, 0.500402423538188, 1.0, 1.0, 0.5, 0.4714045207910317]
[2, 1649.0, 1876.0, 1501.8122385970892, 1738.5842184871597, 0.6052657044634085, 0.4590388693960619, 1362.0, 1230.6666666666665, 681.0, 642.0529573173851]
[2, 17.92, 15.966666666666665, 16.93478077803194, 15.122914215302576, 0.6386835763519336, 0.6931219886869282, 11.72, -0.11333333333333417, 5.86, 5

In [86]:
# Number of accumulated features (statistics per property times number of properties).
len(expand_list)

341

In [83]:
# Inspect the value bound to `expand` by the loop cell.
expand

In [51]:
# Inspect the most recently collected per-element property values.
prop1

[12.06, 23.78]

In [None]:
# NOTE(review): this cell looks unfinished: `data` stays empty, `prop` is never used,
# and the result of compound_to_stat is discarded on every iteration.
data={}
for prop in properties:
    compound_to_stat('Ca1H4Naaa5')
    

In [40]:
# Display the property names taken from the table index.
properties

['Atomic number',
 'Relative atomic mass (Ar)',
 'Group in periodic table',
 'Period in periodic table',
 'Block in 2erio3ic table',
 'Density of solid',
 'Molar volume',
 'Thermal conductivity',
 'Melting point',
 'Boiling point',
 'Liquid range',
 'Enthalpy of fusion',
 'Enthalpy of vaporisation',
 'Enthalpy of atomisation',
 'Atomic radius (empirical)',
 'Covalent radius (2008 values)',
 'Molecular single bond covalent radius',
 'Molecular double bond covalent radius',
 'van der Waals radius',
 'electronegativity',
 'Space group number',
 'Structure',
 'a',
 'b',
 'c',
 'α',
 'β',
 'γ',
 ' First ionisation energy',
 ' Second ionisation energy',
 ' Third ionisation energy']