* Ionic liquids are salts that melt below the boiling point of water (100 °C).
* Ionic liquids don't have a measurable vapor pressure at room temperature.     
* With typical vapor pressures on the order of $10^{-10}$ Pa (roughly $10^{-14}$ psi), ionic liquids have essentially no vapor emissions and so look like excellent candidates for "green solvents" to replace hazardous, air-polluting organic solvents like acetone and benzene.
* With dozens of anions and cations to choose from, they can be tailored to specific needs and may be particularly useful as solvents for biocatalysis.

In [1]:
import pandas as pd
import numpy as np
import salty
from salty import check_name
import pickle

# Load the full density table (about 30k rows).
density_all = pd.read_csv('density.csv')

# `d2` keeps pointing at the frame exactly as loaded (volume columns included);
# `density_all` is rebound below to a copy without those columns.
d2 = density_all
density_all = density_all.drop(
    columns=['Molar volume, m<SUP>3</SUP>/mol', 'Specific volume, m<SUP>3</SUP>/kg']
)

# Distinct salt names; the same salt shows up on many rows.
salts = np.array(density_all['salt_name'])
unique_salts = np.unique(salts)
print('there are {} unique salts'.format(len(unique_salts)))

there are 744 unique salts


#### The database has ~30,000 rows covering 744 unique salts; the remaining rows are the same salts measured at different temperatures and pressures

In [2]:
# Bucket every unique salt name by the number of whitespace-separated tokens
# it contains. Each bucket stores the token lists, not the raw strings.
salts = []
two, three, four, more = [], [], [], []
buckets_by_token_count = {2: two, 3: three, 4: four}

for salt_name in unique_salts:
    tokens = salt_name.split()
    # Any other token count (a single token included) falls through to `more`.
    buckets_by_token_count.get(len(tokens), more).append(tokens)

# Two tokens is the plain "<cation> <anion>" case.
print('There are {} salts of 1 each'.format(len(two)))
# Three tokens: still to be confirmed which ion the extra word belongs to.
print('There are {} salts of 2 cations/1anion or 2 anions/1 cation each'.format(len(three)))
print('There are {} salts of 2 each'.format(len(four)))
print('There are {} salts of 2 or more each'.format(len(more)))

There are 668 salts of 1 each
There are 72 salts of 2 cations/1anion or 2 anions/1 cation each
There are 1 salts of 2 each
There are 3 salts of 2 or more each


### Using salty to generate SMILES for salts with 1 cation and 1 anion each

In [3]:
def _lookup_smiles(ion_names):
    """Resolve ion names through ``check_name`` and separate hits from misses.

    Parameters
    ----------
    ion_names : iterable of str
        Ion names to look up, duplicates allowed.

    Returns
    -------
    found : list
        Whatever ``check_name`` returned for each name it accepted, in input order.
    missing : list of str
        The names for which ``check_name`` raised, in input order.
    """
    found, missing = [], []
    for ion_name in ion_names:
        try:
            # Single call per name; a failing lookup appends nothing to `found`.
            found.append(check_name(ion_name))
        except Exception:
            # `Exception` rather than a bare `except:` so Ctrl-C / SystemExit
            # still stop the cell.
            # NOTE(review): unknown names are expected to raise here
            # (historically UnboundLocalError) - confirm against the installed
            # salty version.
            missing.append(ion_name)
    return found, missing


# Every entry of `two` is a [cation, anion] token pair.
cation2 = [tokens[0] for tokens in two]
anion2 = [tokens[1] for tokens in two]

cation2_smiles, error2_cation = _lookup_smiles(cation2)
anion2_smiles, error2_anion = _lookup_smiles(anion2)

print('There are '+ str(len(set(error2_anion)))+' unique missing anions from the data base')
print('There are '+ str(len(set(error2_cation)))+ ' unique missing cations from the data base')

There are 48 unique missing anions from the data base
There are 141 unique missing cations from the data base


#### TODO (future work): add these missing ions to the database that `check_name` searches

#### Building a table of names and SMILES for the 1-cation/1-anion salts (for CNNs), saved to `uniqsalts+smiles+density.csv`

In [17]:
def _smiles_or_nan(ion_name):
    """Return ``check_name(ion_name)``, or NaN when the lookup raises.

    NaN is the missing-value marker, so ``DataFrame.dropna`` can later discard
    salts with an unresolved ion.
    """
    try:
        return check_name(ion_name)
    except Exception:
        # `Exception` rather than a bare `except:` so Ctrl-C still interrupts.
        # NOTE(review): unknown names are expected to raise here (historically
        # UnboundLocalError) - confirm against the installed salty version.
        return np.nan


# Build each SMILES column up front instead of creating float NaN columns and
# writing strings into them row by row, which newer pandas deprecates.
df = pd.DataFrame({
    'cation': cation2,
    'anion': anion2,
    'cation_smiles': [_smiles_or_nan(name) for name in cation2],
    'anion_smiles': [_smiles_or_nan(name) for name in anion2],
})
print(df.shape)

(668, 4)


In [18]:
# Rows where at least one column is NaN, i.e. an ion without a SMILES string.
has_missing_value = df.isnull().any(axis=1)
nan_rows = df[has_missing_value]
print('Removing {} rows'.format(nan_rows.shape[0]))

# Keep only fully populated rows and write them to disk.
df = df.dropna()
print(df.shape)
df.to_csv('uniqsalts+smiles+density.csv')

Removing 221 rows
(447, 4)


### Handling two-word anions

In [4]:
# Three-token names are treated as "<cation> <anion word 1> <anion word 2>"
# when one of the tokens is exactly one of these anion family names.
two_word_anion_markers = ('sulfate', 'phosphate', 'phosphonate', 'carbonate')

anion3 = []
cation3 = []
for tokens in three:
    # `marker in tokens` is list membership: whole-token match, not substring.
    if any(marker in tokens for marker in two_word_anion_markers):
        cation3.append(tokens[0])
        anion3.append(tokens[1] + ' ' + tokens[2])

# Number of three-token salts handled here.
count = len(anion3)

print('handling {} out of {}'.format(count, len(three)))

handling 60 out of 72


In [5]:
# Remove the space from these two anion names to make them compatible with
# the database.
_anion_aliases = {
    'diethyl phosphate': 'diethylphosphate',
    'dimethyl phosphate': 'dimethylphosphate',
}
# Slice assignment rewrites the existing list in place.
anion3[:] = [_anion_aliases.get(name, name) for name in anion3]


def _unresolved(ion_names):
    """Return the names in ``ion_names`` for which ``check_name`` raises.

    Input order and duplicates are preserved.
    """
    missing = []
    for ion_name in ion_names:
        try:
            check_name(ion_name)
        except Exception:
            # `Exception` rather than a bare `except:` so Ctrl-C still interrupts.
            # NOTE(review): unknown names are expected to raise here
            # (historically UnboundLocalError) - confirm against the installed
            # salty version.
            missing.append(ion_name)
    return missing


error3_anion = _unresolved(anion3)
error3_cation = _unresolved(cation3)

print('There are '+ str(len(set(error3_anion)))+ ' unique missing anions from the data base')
print('There are '+ str(len(set(error3_cation)))+ ' unique missing cations from the data base')

There are 2 unique missing anions from the data base
There are 19 unique missing cations from the data base


### Combining descriptors with the salts to 1 database

In [6]:
# Keep only rows whose salt name is exactly two whitespace-separated tokens
# ("<cation> <anion>"). One vectorised mask replaces dropping rows one at a
# time, which re-allocates the frame for every removed row.
# A missing (NaN) name yields a NaN length, compares False, and is dropped.
is_two_token_name = d2['salt_name'].str.split().str.len() == 2

# errors='ignore' keeps the cell re-runnable once the volume columns are gone.
d2 = d2.loc[is_two_token_name].drop(
    columns=['Molar volume, m<SUP>3</SUP>/mol', 'Specific volume, m<SUP>3</SUP>/kg'],
    errors='ignore',
)

In [7]:
# Load the per-ion descriptor tables and tag every column with the ion type so
# cation and anion descriptors stay distinguishable in one merged frame
# (e.g. the join keys become "name-cation" / "name-anion").
cationDescriptors = salty.load_data("cationDescriptors.csv").add_suffix('-cation')
anionDescriptors = salty.load_data("anionDescriptors.csv").add_suffix('-anion')

In [8]:
# Split each salt name into its cation and anion. The tuple unpacking raises
# ValueError if a name does not consist of exactly two tokens.
c, a = [], []
for cation_name, anion_name in (name.split() for name in d2['salt_name']):
    c.append(cation_name)
    a.append(anion_name)

# Add the join keys to the measurement table.
d2['name-cation'] = c
d2['name-anion'] = a

# Inner joins: a measurement survives only if both its cation and its anion
# have a descriptor row.
d3 = d2.merge(cationDescriptors, on='name-cation', how='inner')
d4 = d3.merge(anionDescriptors, on='name-anion', how='inner')
print(d4.shape)
d4.head()

(26317, 196)


Unnamed: 0,"Pressure, kPa","Specific density, kg/m<SUP>3</SUP>","Temperature, K",salt_name,name-cation,name-anion,smiles-cation,steiger-cation,Marsili Partial Charges-cation,BalabanJ-cation,...,VSA_EState2-anion,VSA_EState3-anion,VSA_EState4-anion,VSA_EState5-anion,VSA_EState6-anion,VSA_EState7-anion,VSA_EState8-anion,VSA_EState9-anion,VSA_EState10-anion,Topliss fragments-anion
0,101.325,899.1,298.15,tetradecyl(trihexyl)phosphonium dicyanamide,tetradecyl(trihexyl)phosphonium,dicyanamide,C(CCCCCCCC[P+](CCCCCC)(CCCCCC)CCCCCC)CCCCC,777.0,777.0,4.337445,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,777.0
1,100.0,897.36,298.15,tetradecyl(trihexyl)phosphonium dicyanamide,tetradecyl(trihexyl)phosphonium,dicyanamide,C(CCCCCCCC[P+](CCCCCC)(CCCCCC)CCCCCC)CCCCC,777.0,777.0,4.337445,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,777.0
2,101.325,903.4,288.15,tetradecyl(trihexyl)phosphonium dicyanamide,tetradecyl(trihexyl)phosphonium,dicyanamide,C(CCCCCCCC[P+](CCCCCC)(CCCCCC)CCCCCC)CCCCC,777.0,777.0,4.337445,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,777.0
3,101.325,897.6,298.15,tetradecyl(trihexyl)phosphonium dicyanamide,tetradecyl(trihexyl)phosphonium,dicyanamide,C(CCCCCCCC[P+](CCCCCC)(CCCCCC)CCCCCC)CCCCC,777.0,777.0,4.337445,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,777.0
4,101.325,891.9,308.15,tetradecyl(trihexyl)phosphonium dicyanamide,tetradecyl(trihexyl)phosphonium,dicyanamide,C(CCCCCCCC[P+](CCCCCC)(CCCCCC)CCCCCC)CCCCC,777.0,777.0,4.337445,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,777.0


### Removing rows with NA values in any column (density or descriptors)

In [9]:
# Rows with a NaN in at least one column of the merged table.
has_missing_value = d4.isnull().any(axis=1)
nan_rows = d4[has_missing_value]
print('Removing {} rows'.format(nan_rows.shape[0]))

# Keep only complete rows and write the cleaned table to disk.
d4 = d4.dropna()
d4.to_csv("salts+descriptors+density.csv")
print('Final Shape: {}'.format(d4.shape))

Removing 326 rows
Final Shape: (25991, 196)


In [10]:
# Shape once fully identical rows are collapsed (the first occurrence is kept
# by default). This is only inspected; `d4` itself is not modified.
d4.drop_duplicates().shape

(25215, 196)

In [11]:
# Summary statistics for the numeric columns of the cleaned, merged table.
# NOTE(review): in the recorded output several descriptor columns (steiger,
# Marsili Partial Charges, Topliss fragments) are constant at 777.0 with zero
# std - presumably a placeholder value; confirm before using them as features.
d4.describe()

Unnamed: 0,"Pressure, kPa","Specific density, kg/m<SUP>3</SUP>","Temperature, K",steiger-cation,Marsili Partial Charges-cation,BalabanJ-cation,BertzCT-cation,Ipc-cation,HallKierAlpha-cation,Kappa1-cation,...,VSA_EState2-anion,VSA_EState3-anion,VSA_EState4-anion,VSA_EState5-anion,VSA_EState6-anion,VSA_EState7-anion,VSA_EState8-anion,VSA_EState9-anion,VSA_EState10-anion,Topliss fragments-anion
count,25991.0,25991.0,25991.0,25991.0,25991.0,25991.0,25991.0,25991.0,25991.0,25991.0,...,25991.0,25991.0,25991.0,25991.0,25991.0,25991.0,25991.0,25991.0,25991.0,25991.0
mean,18375.956244,1227.093166,324.836412,777.0,777.0,2.575698,183.084008,897342.6,-0.601849,9.39978,...,0.0,0.0,0.0,0.0,0.0,0.0,55.86363,-3.083439,-4.235906,777.0
std,36332.27262,170.698953,35.222623,0.0,0.0,0.576401,74.246881,5331852.0,0.407139,6.654011,...,0.0,0.0,0.0,0.0,0.0,0.0,52.838949,18.186569,6.242559,0.0
min,86.5,841.3,217.577,777.0,777.0,1.0,2.0,2.0,-2.08,1.96,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-70.380285,-15.239861,777.0
25%,101.325,1091.265,299.65,777.0,777.0,2.19061,166.791393,130.5735,-0.79,6.331803,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-12.406111,-13.443056,777.0
50%,101.325,1212.0,318.15,777.0,777.0,2.422978,190.892591,256.7231,-0.79,7.318578,...,0.0,0.0,0.0,0.0,0.0,0.0,39.0,-6.0,0.0,777.0
75%,20000.0,1363.3,338.15,777.0,777.0,2.837541,215.368085,711.7488,-0.79,9.299206,...,0.0,0.0,0.0,0.0,0.0,0.0,109.1825,8.353739,0.0,777.0
max,300000.0,1741.5,473.15,777.0,777.0,4.739991,525.851293,193697800.0,0.43,33.43,...,0.0,0.0,0.0,0.0,0.0,0.0,222.417323,45.366093,17.194444,777.0
