* Ionic liquids are salts that melt below the boiling point of water.
* Ionic liquids don't have a measurable vapor pressure at room temperature.     
* With typical vapor pressures in the range of 10^-10 pascal (10^-14 psi), ionic liquids have essentially no vapor emissions and so look like excellent candidates for "green solvents" to replace hazardous, air-polluting organic solvents like acetone and benzene.
* With dozens of anions and cations to choose from, they can be tailored to specific needs and may be particularly useful as solvents for biocatalysis.

In [1]:
import pandas as pd
import numpy as np
import salty
from salty import check_name
import pickle

# Load the full heat-capacity table (about 17k rows).
cpt_all = pd.read_csv('cpt.csv')

# Work on an independent copy rather than an alias (the original was
# `d2 = cpt_all`), so that filtering d2 later can never alter the raw table.
d2 = cpt_all.copy()

# Distinct salt names; the same salt appears on many rows.
salts = np.array(cpt_all.salt_name)
unique_salts = np.unique(salts)
print('there are '+str(len(unique_salts)) + ' unique salts')

there are 245 unique salts


#### The database has ~17,000 rows covering 245 unique salts; the remaining rows are repeat measurements of those salts at different temperatures and pressures

In [2]:
salts = []  # rebound to an empty list here

# One bucket per number of whitespace-separated words in the salt name.
# Each bucket entry is the list of words for one salt.
two, three, four, more = [], [], [], []
buckets_by_word_count = {2: two, 3: three, 4: four}

for salt_name in unique_salts:
    words = salt_name.split()
    # Names that are not exactly 2, 3 or 4 words long fall through to `more`.
    buckets_by_word_count.get(len(words), more).append(words)

# TODO (carried over from the original): confirm which of the 3-word names
# are 2 cations/1 anion and which are 1 cation/2 anions.
for bucket, description in (
    (two, ' salts of 1 each'),
    (three, ' salts of 2 cations/1anion or 2 anions/1 cation each'),
    (four, ' salts of 2 each'),
    (more, ' salts of 2 or more each'),
):
    print('There are '+ str(len(bucket)) + description)

There are 226 salts of 1 each
There are 17 salts of 2 cations/1anion or 2 anions/1 cation each
There are 0 salts of 2 each
There are 2 salts of 2 or more each


### Using salty to generate smiles for salts with 1 cation and 1 anion each

In [3]:
# For every two-word salt name, the first word is taken as the cation and the
# second as the anion.
cation2 = [words[0] for words in two]
anion2 = [words[1] for words in two]


def _lookup_smiles(names):
    """Resolve each name with salty's ``check_name``.

    Parameters
    ----------
    names : iterable of str
        Ion names to look up.

    Returns
    -------
    (found, missing) : tuple of lists
        ``found`` holds the ``check_name`` result (presumably a SMILES string
        -- TODO confirm) for every name that resolved, in input order.
        ``missing`` holds every name that did not resolve. Repeated names are
        kept, so both lists may contain duplicates.
    """
    found, missing = [], []
    for name in names:
        try:
            # Look each name up once and reuse the result.
            resolved = check_name(name)
        except UnboundLocalError:
            # The original used a bare `except:` followed by a no-op
            # `UnboundLocalError` expression, which also swallowed
            # KeyboardInterrupt and real bugs. Catch only the exception the
            # author named for an unknown ion.
            missing.append(name)
        else:
            found.append(resolved)
    return found, missing


cation2_smiles, error2_cation = _lookup_smiles(cation2)
anion2_smiles, error2_anion = _lookup_smiles(anion2)

print('There are '+ str(len(set(error2_anion)))+' unique missing anions from the data base')
print('There are '+ str(len(set(error2_cation)))+ ' unique missing cations from the data base')

There are 9 unique missing anions from the data base
There are 35 unique missing cations from the data base


#### Future work: add these missing ions to the database

#### Making a table of just the names and SMILES of the 1-cation/1-anion salts for CNNs, saved to uniqsalts+smiles+cpt.csv

In [4]:
def _smiles_or_nan(name):
    """Return ``check_name(name)``, or NaN when the name cannot be resolved.

    Only ``UnboundLocalError`` is treated as "not found". The original used a
    bare ``except:`` followed by a no-op ``UnboundLocalError`` expression,
    which hid every other failure as well.
    """
    try:
        return check_name(name)
    except UnboundLocalError:
        return np.nan


# Build the SMILES columns in one pass. The original filled float NaN columns
# with strings row by row, using chained indexing to read each name.
# Unresolved names stay NaN so they can be dropped afterwards.
df = pd.DataFrame({
    'cation': cation2,
    'anion': anion2,
    'cation_smiles': [_smiles_or_nan(name) for name in cation2],
    'anion_smiles': [_smiles_or_nan(name) for name in anion2],
})
print(df.shape)

(226, 4)


In [5]:
# Flag every row with a missing value in at least one column.
has_missing = df.isnull().any(axis=1)
nan_rows = df[has_missing]
print('Removing '+str(nan_rows.shape[0]) +' rows')

# Keep only the fully populated rows and write them out.
df = df.dropna(axis=0, how='any')
print(df.shape)
df.to_csv('uniqsalts+smiles+cpt.csv')

Removing 46 rows
(180, 4)


### Handling two word anions 

In [6]:
# A three-word salt name is treated as "cation + two-word anion" when one of
# its words is exactly one of these anion family names.
two_word_anion_markers = ('sulfate', 'phosphate', 'phosphonate', 'carbonate')

# `marker in words` is list membership, i.e. a whole-word match, not a substring test.
matched = [
    words for words in three
    if any(marker in words for marker in two_word_anion_markers)
]

cation3 = [words[0] for words in matched]
anion3 = [words[1]+' '+words[2] for words in matched]
count = len(matched)

print('handling '+ str(count)+ ' out of '+str(len(three)))

handling 10 out of 17


In [7]:
# These anions are spelled without the space for database compatibility, so
# normalise them before lookup. Slice assignment updates anion3 in place, as
# the original loop did.
compact_spelling = {
    'diethyl phosphate': 'diethylphosphate',
    'dimethyl phosphate': 'dimethylphosphate',
}
anion3[:] = [compact_spelling.get(name, name) for name in anion3]


def _names_missing_from_salty(names):
    """Return the names (duplicates kept) that ``check_name`` cannot resolve.

    Only ``UnboundLocalError`` counts as "missing". The original used a bare
    ``except:`` followed by a no-op ``UnboundLocalError`` expression, which
    would also have hidden KeyboardInterrupt and unrelated errors.
    """
    missing = []
    for name in names:
        try:
            check_name(name)
        except UnboundLocalError:
            missing.append(name)
    return missing


error3_anion = _names_missing_from_salty(anion3)
error3_cation = _names_missing_from_salty(cation3)

print('There are '+ str(len(set(error3_anion)))+ ' unique missing anions from the data base')
print('There are '+ str(len(set(error3_cation)))+ ' unique missing cations from the data base')

There are 0 unique missing anions from the data base
There are 4 unique missing cations from the data base


### Combining descriptors with the salts to 1 database

In [8]:
# Keep only the rows whose salt name is exactly two whitespace-separated words.
# A single boolean mask replaces the original row-by-row
# `drop(..., inplace=True)` loop, which did one full drop per removed row and
# mutated the frame while walking its index.
word_counts = d2['salt_name'].str.split().str.len()

# .copy() gives an independent frame, so adding columns to d2 later is safe.
# Original index labels are kept (no reset), as before.
d2 = d2[word_counts == 2].copy()

In [9]:
# Load the descriptor tables via salty and tag every column with the ion type,
# so cation and anion columns stay distinguishable in one combined table.
cationDescriptors = salty.load_data("cationDescriptors.csv").add_suffix('-cation')
anionDescriptors = salty.load_data("anionDescriptors.csv").add_suffix('-anion')

In [10]:
# Split each salt name into its two words. The unpacking below raises
# ValueError if a name is not exactly two words long.
split_names = [salt_name.split() for salt_name in d2['salt_name']]
c = [cation_name for cation_name, _ in split_names]
a = [anion_name for _, anion_name in split_names]

# These two columns are the join keys for the descriptor tables.
d2['name-cation'] = c
d2['name-anion'] = a

# Inner joins: rows whose cation or anion has no descriptor entry are dropped.
d3 = d2.merge(cationDescriptors, how="inner", on="name-cation")
d4 = d3.merge(anionDescriptors, how="inner", on='name-anion')
print(d4.shape)
d4

(14274, 196)


Unnamed: 0,"Heat capacity at constant pressure, J/K/mol","Pressure, kPa","Temperature, K",salt_name,name-cation,name-anion,smiles-cation,steiger-cation,Marsili Partial Charges-cation,BalabanJ-cation,...,VSA_EState2-anion,VSA_EState3-anion,VSA_EState4-anion,VSA_EState5-anion,VSA_EState6-anion,VSA_EState7-anion,VSA_EState8-anion,VSA_EState9-anion,VSA_EState10-anion,Topliss fragments-anion
0,434.6,101.325,298.10,1-(3-cyanopropyl)-3-methylimidazolium cyanocya...,1-(3-cyanopropyl)-3-methylimidazolium,cyanocyanamide,N#CCCCn1cc[n+](c1)C,777.0,777.0,2.341442,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,20.000000,0.000000,777.0
1,602.0,101.325,298.10,"3-(3-cyanopropyl)-1-methylimidazolium 1,1,1-tr...",3-(3-cyanopropyl)-1-methylimidazolium,"1,1,1-trifluoro-N-[(trifluoromethyl)sulfonyl]m...",N#CCCCn1cc[n+](c1)C,777.0,777.0,2.341442,...,0.0,0.0,0.0,0.0,0.0,0.0,109.1825,-12.406111,-13.443056,777.0
2,578.0,101.325,298.15,"1-butyl-3-methylpyridinium 1,1,1-trifluoro-N-[...",1-butyl-3-methylpyridinium,"1,1,1-trifluoro-N-[(trifluoromethyl)sulfonyl]m...",CCCC[n+]1cccc(c1)C,777.0,777.0,2.564805,...,0.0,0.0,0.0,0.0,0.0,0.0,109.1825,-12.406111,-13.443056,777.0
3,622.0,101.325,298.00,"1-butyl-3-methylpyridinium 1,1,1-trifluoro-N-[...",1-butyl-3-methylpyridinium,"1,1,1-trifluoro-N-[(trifluoromethyl)sulfonyl]m...",CCCC[n+]1cccc(c1)C,777.0,777.0,2.564805,...,0.0,0.0,0.0,0.0,0.0,0.0,109.1825,-12.406111,-13.443056,777.0
4,641.0,101.325,323.00,"1-butyl-3-methylpyridinium 1,1,1-trifluoro-N-[...",1-butyl-3-methylpyridinium,"1,1,1-trifluoro-N-[(trifluoromethyl)sulfonyl]m...",CCCC[n+]1cccc(c1)C,777.0,777.0,2.564805,...,0.0,0.0,0.0,0.0,0.0,0.0,109.1825,-12.406111,-13.443056,777.0
5,541.0,101.325,293.10,"1-butyl-3-methylpyridinium 1,1,1-trifluoro-N-[...",1-butyl-3-methylpyridinium,"1,1,1-trifluoro-N-[(trifluoromethyl)sulfonyl]m...",CCCC[n+]1cccc(c1)C,777.0,777.0,2.564805,...,0.0,0.0,0.0,0.0,0.0,0.0,109.1825,-12.406111,-13.443056,777.0
6,542.0,101.325,294.10,"1-butyl-3-methylpyridinium 1,1,1-trifluoro-N-[...",1-butyl-3-methylpyridinium,"1,1,1-trifluoro-N-[(trifluoromethyl)sulfonyl]m...",CCCC[n+]1cccc(c1)C,777.0,777.0,2.564805,...,0.0,0.0,0.0,0.0,0.0,0.0,109.1825,-12.406111,-13.443056,777.0
7,544.0,101.325,295.10,"1-butyl-3-methylpyridinium 1,1,1-trifluoro-N-[...",1-butyl-3-methylpyridinium,"1,1,1-trifluoro-N-[(trifluoromethyl)sulfonyl]m...",CCCC[n+]1cccc(c1)C,777.0,777.0,2.564805,...,0.0,0.0,0.0,0.0,0.0,0.0,109.1825,-12.406111,-13.443056,777.0
8,546.0,101.325,296.10,"1-butyl-3-methylpyridinium 1,1,1-trifluoro-N-[...",1-butyl-3-methylpyridinium,"1,1,1-trifluoro-N-[(trifluoromethyl)sulfonyl]m...",CCCC[n+]1cccc(c1)C,777.0,777.0,2.564805,...,0.0,0.0,0.0,0.0,0.0,0.0,109.1825,-12.406111,-13.443056,777.0
9,547.0,101.325,297.10,"1-butyl-3-methylpyridinium 1,1,1-trifluoro-N-[...",1-butyl-3-methylpyridinium,"1,1,1-trifluoro-N-[(trifluoromethyl)sulfonyl]m...",CCCC[n+]1cccc(c1)C,777.0,777.0,2.564805,...,0.0,0.0,0.0,0.0,0.0,0.0,109.1825,-12.406111,-13.443056,777.0


### Removing rows with NA values in any column

In [11]:
# Rows of the merged table with a missing value in at least one column.
incomplete = d4.isna().any(axis='columns')
nan_rows = d4[incomplete]
print('Removing '+str(nan_rows.shape[0]) +' rows')

# Drop those rows, save the cleaned table, and report its size.
d4 = d4.dropna(how='any')
d4.to_csv("salts+descriptors+cpt.csv")
print('Final Shape: '+str(d4.shape))

Removing 40 rows
Final Shape: (14234, 196)


# Scratch check: which rows repeat a (pressure, temperature) pair already seen
# on an earlier row?
# Fixes to the original snippet: `df4` was an undefined name, and `.col1` was a
# placeholder that is not a column of d4.
condition_cols = ['Pressure, kPa', 'Temperature, K']

# Pressure is constant inside each (pressure, temperature) group, so its idxmin
# is the index label of the first row of that group.
first_seen_index = d4.groupby(condition_cols)['Pressure, kPa'].transform('idxmin')

# The helper column is attached to a temporary copy only, so d4 itself keeps
# its column count.
d4.assign(index_original=first_seen_index)[d4.duplicated(subset=condition_cols, keep='first')]

In [18]:
# Shape after removing rows that are exact duplicates across every column
# (the first occurrence is kept, which is the pandas default).
d4.drop_duplicates().shape

(13884, 196)

In [22]:
# Shape of the subset measured above 10000 kPa.
d4[d4['Pressure, kPa'].gt(10000)].shape

(292, 196)

In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) for the cleaned, merged table.
d4.describe()

Unnamed: 0,"Heat capacity at constant pressure, J/K/mol","Pressure, kPa","Temperature, K",steiger-cation,Marsili Partial Charges-cation,BalabanJ-cation,BertzCT-cation,Ipc-cation,HallKierAlpha-cation,Kappa1-cation,...,VSA_EState2-anion,VSA_EState3-anion,VSA_EState4-anion,VSA_EState5-anion,VSA_EState6-anion,VSA_EState7-anion,VSA_EState8-anion,VSA_EState9-anion,VSA_EState10-anion,Topliss fragments-anion
count,14234.0,14234.0,14234.0,14234.0,14234.0,14234.0,14234.0,14234.0,14234.0,14234.0,...,14234.0,14234.0,14234.0,14234.0,14234.0,14234.0,14234.0,14234.0,14234.0,14234.0
mean,411.448022,801.824384,259.117504,777.0,777.0,2.446059,205.382597,428261.3,-0.733409,9.222415,...,0.0,0.0,0.0,0.0,0.0,0.0,50.60049,-5.512349,-3.323904,777.0
std,329.333945,4923.019162,120.006801,0.0,0.0,0.479708,67.443651,2320825.0,0.30382,5.363097,...,0.0,0.0,0.0,0.0,0.0,0.0,47.012925,12.088476,6.457344,0.0
min,0.0,100.0,0.0,777.0,777.0,1.632993,2.754888,2.754888,-2.05,2.96,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-70.380285,-13.443056,777.0
25%,207.5,101.325,170.0,777.0,777.0,2.153243,178.796761,147.2798,-0.79,7.25929,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-12.406111,-13.443056,777.0
50%,387.0,101.325,294.66,777.0,777.0,2.422978,190.892591,256.7231,-0.79,7.318578,...,0.0,0.0,0.0,0.0,0.0,0.0,39.0,-6.0,0.0,777.0
75%,506.0,101.325,337.0445,777.0,777.0,2.613882,215.368085,711.7488,-0.79,9.299206,...,0.0,0.0,0.0,0.0,0.0,0.0,109.1825,0.0,0.0,777.0
max,9160.0,60000.0,663.1,777.0,777.0,4.391491,714.705652,13127210.0,0.43,33.43,...,0.0,0.0,0.0,0.0,0.0,0.0,222.417323,45.366093,17.194444,777.0
