* Ionic liquids are salts that melt below the boiling point of water (100 °C).
* Ionic liquids don't have a measurable vapor pressure at room temperature.     
* With typical vapor pressures on the order of $10^{-10}$ Pa (about $10^{-14}$ psi), ionic liquids have essentially no vapor emissions, so they look like excellent candidates for "green solvents" to replace hazardous, air-polluting organic solvents such as acetone and benzene.
* With dozens of anions and cations to choose from, they can be tailored to specific needs and may be particularly useful as solvents for biocatalysis.

In [17]:
import pandas as pd
import numpy as np
import salty
from salty import check_name
import pickle

# Load the full dataset (~30k rows) and keep a separate working copy in `d2`.
total = pd.read_csv('intersection_all.csv')
#total=total.drop(['Molar volume, m<SUP>3</SUP>/mol','Specific volume, m<SUP>3</SUP>/kg'],axis=1)
d2 = total.copy()

# np.unique returns the sorted, de-duplicated salt names.
salts = np.array(total['salt_name'])
unique_salts = np.unique(salts)
print('there are ' + str(unique_salts.size) + ' unique salts')

there are 106 unique salts


# Bucket every unique salt name by its number of whitespace-separated tokens.
salts = []
two, three, four, more = [], [], [], []
buckets_by_length = {2: two, 3: three, 4: four}
for salt_name in unique_salts:
    tokens = salt_name.split()
    # Any name that is not exactly 2, 3 or 4 tokens long lands in `more`.
    buckets_by_length.get(len(tokens), more).append(tokens)

# One summary line per bucket; the 3-token bucket still needs manual
# confirmation of which tokens belong to the cation and which to the anion.
for bucket, label in (
    (two, ' salts of 1 each'),
    (three, ' salts of 2 cations/1anion or 2 anions/1 cation each'),
    (four, ' salts of 2 each'),
    (more, ' salts of 2 or more each'),
):
    print('There are ' + str(len(bucket)) + label)

### Using salty to generate smiles for salts with 1 cation and 1 anion each

def _lookup_smiles(names):
    """Resolve ion names through ``check_name``, separating hits from failures.

    Parameters
    ----------
    names : iterable of str
        Ion names to look up.

    Returns
    -------
    found : list
        The value returned by ``check_name`` for every name whose lookup
        did not raise, in input order.
    missing : list of str
        The names whose lookup raised, in input order (duplicates kept).
    """
    found, missing = [], []
    for name in names:
        try:
            # Look the name up once and reuse the result.
            found.append(check_name(name))
        except Exception:
            # Best-effort lookup: record the failure and keep going.
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
            missing.append(name)
    return found, missing


# Each entry of `two` is a [cation, anion] token pair.
cation2 = [pair[0] for pair in two]
anion2 = [pair[1] for pair in two]

cation2_smiles, error2_cation = _lookup_smiles(cation2)
anion2_smiles, error2_anion = _lookup_smiles(anion2)

print('There are '+ str(len(set(error2_anion)))+' unique missing anions from the data base')
print('There are '+ str(len(set(error2_cation)))+ ' unique missing cations from the data base')

#### TODO: add these missing ions to the database (future work)

#### Building a table of names and SMILES for the 1-cation/1-anion salts (for CNNs) and saving it to `uniqsalts+smiles+all.csv`

def _smiles_or_nan(ion_name):
    """Return ``check_name(ion_name)``, or NaN when the lookup raises."""
    try:
        return check_name(ion_name)
    except Exception:
        # Best-effort: a failed lookup becomes NaN so the row is dropped below.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return np.nan


# Build the lookup columns as whole lists instead of writing strings cell by
# cell into float-NaN columns, which newer pandas reports as deprecated upcasting.
df = pd.DataFrame({'cation': cation2, 'anion': anion2})
df['cation_smiles'] = [_smiles_or_nan(name) for name in df['cation']]
df['anion_smiles'] = [_smiles_or_nan(name) for name in df['anion']]
print(df.shape)

# Rows where at least one column is missing.
nan_rows = df[df.isnull().any(axis=1)]
print('Removing '+str(nan_rows.shape[0]) +' rows')
df = df.dropna()
print(df.shape)
df.to_csv('uniqsalts+smiles+all.csv')

### Handling two-word anions

# A 3-token salt name is treated as "<cation> <anion word 1> <anion word 2>"
# when one of its tokens is exactly one of these words.
_TWO_WORD_ANION_MARKERS = ('sulfate', 'phosphate', 'phosphonate', 'carbonate')

# Two-word spellings rewritten without the space to make them compatible
# with the database.
_ANION_NAME_FIXES = {
    'diethyl phosphate': 'diethylphosphate',
    'dimethyl phosphate': 'dimethylphosphate',
}


def _failed_lookups(names):
    """Return the names for which ``check_name`` raises, in input order."""
    failed = []
    for name in names:
        try:
            check_name(name)
        except Exception:
            # Best-effort check: record the miss and keep going.
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
            failed.append(name)
    return failed


anion3 = []   # two-word anion names
cation3 = []  # matching cation names
for tokens in three:
    # `in` on a token list is an exact-token match, not a substring test.
    if any(marker in tokens for marker in _TWO_WORD_ANION_MARKERS):
        cation3.append(tokens[0])
        anion3.append(tokens[1] + ' ' + tokens[2])
count = len(anion3)

print('handling '+ str(count)+ ' out of '+str(len(three)))

# Apply the spelling fixes; every other name is kept unchanged.
anion3 = [_ANION_NAME_FIXES.get(name, name) for name in anion3]

error3_anion = _failed_lookups(anion3)
error3_cation = _failed_lookups(cation3)

print('There are '+ str(len(set(error3_anion)))+ ' unique missing anions from the data base')
print('There are '+ str(len(set(error3_cation)))+ ' unique missing cations from the data base')

### Combining descriptors with the salts to 1 database

In [20]:
# Keep only the rows whose salt name is exactly two whitespace-separated
# tokens. A single boolean mask replaces the row-by-row in-place drop, which
# rebuilt the frame on every removed row.
is_two_token_name = d2['salt_name'].str.split().str.len() == 2
# .copy() so that later column assignments on d2 do not act on a slice.
d2 = d2.loc[is_two_token_name].copy()
d2.shape

(25461, 7)

In [21]:
# Load the two descriptor tables and tag every column with its ion type so
# cation and anion columns stay distinguishable after merging.
cationDescriptors = salty.load_data("cationDescriptors.csv").add_suffix('-cation')
anionDescriptors = salty.load_data("anionDescriptors.csv").add_suffix('-anion')

In [22]:
# Split every salt name into its cation and anion token in one vectorized
# pass instead of a per-row Python loop with chained label indexing.
name_parts = d2['salt_name'].str.split()
# Same guard the tuple-unpacking used to provide: exactly two tokens per name.
if not (name_parts.str.len() == 2).all():
    raise ValueError("every salt_name must be exactly '<cation> <anion>'")
c = name_parts.str[0].tolist()
a = name_parts.str[1].tolist()
d2['name-cation'] = c
d2['name-anion'] = a

# Inner joins: rows whose cation or anion has no descriptor entry are dropped.
d3 = pd.merge(d2, cationDescriptors, on='name-cation', how='inner')
d4 = pd.merge(d3, anionDescriptors, on='name-anion', how='inner')
print(d4.shape)
d4.head()

(25434, 199)


Unnamed: 0.1,Unnamed: 0,"Pressure, kPa","Specific density, kg/m<SUP>3</SUP>","Temperature, K",salt_name,"Heat capacity at constant pressure, J/K/mol","Viscosity, Pa&#8226;s",name-cation,name-anion,smiles-cation,...,VSA_EState2-anion,VSA_EState3-anion,VSA_EState4-anion,VSA_EState5-anion,VSA_EState6-anion,VSA_EState7-anion,VSA_EState8-anion,VSA_EState9-anion,VSA_EState10-anion,Topliss fragments-anion
0,0,101.325,1518.9,298.15,1-ethyl-3-methylimidazolium bis[(trifluorometh...,504.8,0.0365,1-ethyl-3-methylimidazolium,bis[(trifluoromethyl)sulfonyl]imide,CCn1cc[n+](c1)C,...,0.0,0.0,0.0,0.0,0.0,0.0,109.1825,-12.406111,-13.443056,777.0
1,1,101.325,1518.9,298.15,1-ethyl-3-methylimidazolium bis[(trifluorometh...,504.8,0.0341,1-ethyl-3-methylimidazolium,bis[(trifluoromethyl)sulfonyl]imide,CCn1cc[n+](c1)C,...,0.0,0.0,0.0,0.0,0.0,0.0,109.1825,-12.406111,-13.443056,777.0
2,2,101.325,1518.9,298.15,1-ethyl-3-methylimidazolium bis[(trifluorometh...,504.8,0.03429,1-ethyl-3-methylimidazolium,bis[(trifluoromethyl)sulfonyl]imide,CCn1cc[n+](c1)C,...,0.0,0.0,0.0,0.0,0.0,0.0,109.1825,-12.406111,-13.443056,777.0
3,3,101.325,1518.9,298.15,1-ethyl-3-methylimidazolium bis[(trifluorometh...,504.8,0.03251,1-ethyl-3-methylimidazolium,bis[(trifluoromethyl)sulfonyl]imide,CCn1cc[n+](c1)C,...,0.0,0.0,0.0,0.0,0.0,0.0,109.1825,-12.406111,-13.443056,777.0
4,4,101.325,1518.9,298.15,1-ethyl-3-methylimidazolium bis[(trifluorometh...,504.8,0.0334,1-ethyl-3-methylimidazolium,bis[(trifluoromethyl)sulfonyl]imide,CCn1cc[n+](c1)C,...,0.0,0.0,0.0,0.0,0.0,0.0,109.1825,-12.406111,-13.443056,777.0


### Removing rows with NA values

In [23]:
# Rows with a missing value in at least one column (row-wise any, without
# transposing the wide frame twice).
nan_rows = d4[d4.isnull().any(axis=1)]
print('Removing '+str(nan_rows.shape[0]) +' rows')
d4 = d4.dropna()
d4.to_csv("salts+descriptors+all1.csv")
# NOTE(review): this cell only drops rows containing NaN; it does not
# de-duplicate. Confirm where (or whether) duplicates are removed.
print('Final Shape: '+str(d4.shape))

Removing 1452 rows
Final Shape: (23982, 199)


In [24]:
# Non-null counts per column for every (salt, temperature, pressure) combination.
condition_keys = ['salt_name', 'Temperature, K', 'Pressure, kPa']
d4.groupby(condition_keys).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 0,"Specific density, kg/m<SUP>3</SUP>","Heat capacity at constant pressure, J/K/mol","Viscosity, Pa&#8226;s",name-cation,name-anion,smiles-cation,steiger-cation,Marsili Partial Charges-cation,BalabanJ-cation,...,VSA_EState2-anion,VSA_EState3-anion,VSA_EState4-anion,VSA_EState5-anion,VSA_EState6-anion,VSA_EState7-anion,VSA_EState8-anion,VSA_EState9-anion,VSA_EState10-anion,Topliss fragments-anion
salt_name,"Temperature, K","Pressure, kPa",Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
"1,2-diethylpyridinium ethylsulfate",298.15,101.325,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
"1,2-diethylpyridinium ethylsulfate",308.15,101.325,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
"1,3-dimethylimidazolium dimethylphosphate",298.15,101.325,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
"1,3-dimethylimidazolium dimethylphosphate",303.15,101.325,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
"1,3-dimethylimidazolium dimethylphosphate",313.15,101.325,8,8,8,8,8,8,8,8,8,8,...,8,8,8,8,8,8,8,8,8,8
"1,3-dimethylimidazolium dimethylphosphate",323.15,101.325,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
"1,3-dimethylimidazolium methylsulfate",318.15,101.325,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
"1,3-dimethylimidazolium methylsulfate",328.15,101.325,8,8,8,8,8,8,8,8,8,8,...,8,8,8,8,8,8,8,8,8,8
1-benzyl-3-methyl-1H-imidazolium chloride,298.15,101.325,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1-butyl-1-methylpiperidinium bis(trifluoromethanesulfonyl)imide,298.15,101.325,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4


In [25]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns of d4.
d4.describe()

Unnamed: 0.1,Unnamed: 0,"Pressure, kPa","Specific density, kg/m<SUP>3</SUP>","Temperature, K","Heat capacity at constant pressure, J/K/mol","Viscosity, Pa&#8226;s",steiger-cation,Marsili Partial Charges-cation,BalabanJ-cation,BertzCT-cation,...,VSA_EState2-anion,VSA_EState3-anion,VSA_EState4-anion,VSA_EState5-anion,VSA_EState6-anion,VSA_EState7-anion,VSA_EState8-anion,VSA_EState9-anion,VSA_EState10-anion,Topliss fragments-anion
count,23982.0,23982.0,23982.0,23982.0,23982.0,23982.0,23982.0,23982.0,23982.0,23982.0,...,23982.0,23982.0,23982.0,23982.0,23982.0,23982.0,23982.0,23982.0,23982.0,23982.0
mean,22824.663456,119.514273,1293.78339,309.517938,462.135696,0.086175,777.0,777.0,2.501594,189.699708,...,0.0,0.0,0.0,0.0,0.0,0.0,66.08843,-7.53213,-5.148047,777.0
std,12897.967977,536.527726,135.122477,15.692692,145.878712,0.158172,0.0,0.0,0.318132,24.403902,...,0.0,0.0,0.0,0.0,0.0,0.0,37.259162,9.088634,6.436078,0.0
min,0.0,100.0,847.5,278.15,231.8,0.00316,777.0,777.0,1.806454,82.568425,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-70.380285,-13.443056,777.0
25%,10742.25,101.325,1195.0,298.15,369.0,0.0289,777.0,777.0,2.422978,190.892591,...,0.0,0.0,0.0,0.0,0.0,0.0,39.0,-12.406111,-13.443056,777.0
50%,23683.5,101.325,1275.7,303.15,413.6,0.0516,777.0,777.0,2.422978,190.892591,...,0.0,0.0,0.0,0.0,0.0,0.0,39.0,-6.0,0.0,777.0
75%,33873.75,101.325,1414.0,318.15,536.0,0.1,777.0,777.0,2.422978,190.892591,...,0.0,0.0,0.0,0.0,0.0,0.0,109.1825,-6.0,0.0,777.0
max,43082.0,20000.0,1557.1,373.15,1764.0,10.2,777.0,777.0,4.391491,475.748716,...,0.0,0.0,0.0,0.0,0.0,0.0,222.417323,35.0,3.699074,777.0


In [30]:
# Number of distinct salt names present in d4.
arr = np.asarray(d4['salt_name'])
np.unique(arr).size

99