# Introduction 

This notebook has been developed to extract data used for model training from this article:

Kamil Paduszynski, Extensive Databases and Group Contribution QSPRs of Ionic Liquids Properties. 1. Density, Ind. Eng. Chem. Res. 2019, 58, 13, 5322–5338
https://pubs.acs.org/doi/abs/10.1021/acs.iecr.9b00130

The supporting information file was downloaded (ESI01.xlsx).

# Load Libraries and Define Functions

In [1]:
import tqdm
import pandas as pd 

filename = './Data/ESI01.xlsx'

In [2]:
def getErrorStats(df_1, df_2, unit_conversion = 1):
    """
    Calculate and print various error statistics between two dataframes.
    
    Parameters:
    df_1 (DataFrame): First dataframe to compare.
    df_2 (DataFrame): Second dataframe to compare.
    unit_conversion (float): Factor by which to divide the dataframes for unit conversion (default is 1, meaning no conversion).
    
    Returns:
    None
    """
    # Apply unit conversion to the dataframes
    df_1 = df_1 / unit_conversion
    df_2 = df_2 / unit_conversion
    
    # Calculate the difference between the two dataframes
    diff = df_1 - df_2
    
    # Calculate Average Absolute Relative Deviation (AARD)
    AARD = (diff / df_1 * 100).abs().mean()
    
    # Calculate Mean Absolute Error (MAE)
    MAE = diff.abs().mean()
    
    # Calculate the standard deviation of the differences
    std = diff.std()
    
    # Find the minimum error in the differences
    min_error = diff.min()
    
    # Find the maximum error in the differences
    max_error = diff.max()
    
    # Print the calculated error statistics
    print('AARD: {:.4f}%'.format(AARD))
    print('MAE: {:.4f}'.format(MAE))
    print('std: {:.4f}'.format(std))
    print('Minimum error: {:.4f}'.format(min_error))
    print('Maximum error: {:.4f}'.format(max_error))

# Read the Density Data (Actual and Predicted by the Referenced Article)

From the excel file (ESI01.xlsx), we only need sheet: S8 | Modeling vs "raw" database.
This sheet contains the entire dataset of densities, temperatures, pressures and what the molecule is.
It also contains predictions that are referenced in the article.

In [2]:
# Load the database
df_S8 = pd.read_excel(filename, 
                      sheet_name = 'S8 | Modeling vs "raw" database')
display(df_S8)

Unnamed: 0,Dataset ID,IL ID,Cation,Anion,Cationic family,Anionic family,Excluded IL,Accepted dataset,T / K,p / MPa,ρ / kg/m3,SWMLR (v0) + FFANN (f),SWMLR (v0) + FFANN (f).1,FFANN (v0) + FFANN (f),FFANN (v0) + FFANN (f).1,LSSVM (v0) + FFANN (f),LSSVM (v0) + FFANN (f).1
0,1,1,"azp-2o1,1",ntf2,azepanium,NTf2 derivatives,no,no,298.20,0.1,1415.6,1407.849325,1407.945826,1413.202477,1413.299344,1408.884942,1408.981514
1,2,1,"azp-2o1,1",ntf2,azepanium,NTf2 derivatives,no,no,293.10,0.1,1420.6,1412.431626,1412.592777,1417.802200,1417.963964,1413.470613,1413.631883
2,2,1,"azp-2o1,1",ntf2,azepanium,NTf2 derivatives,no,no,298.10,0.1,1415.6,1407.938823,1408.036746,1413.292314,1413.390610,1408.974505,1409.072501
3,2,1,"azp-2o1,1",ntf2,azepanium,NTf2 derivatives,no,no,303.10,0.1,1411.1,1403.478570,1403.499373,1408.815102,1408.835984,1404.510972,1404.531790
4,2,1,"azp-2o1,1",ntf2,azepanium,NTf2 derivatives,no,no,313.10,0.1,1402.1,1394.625131,1394.466382,1399.927999,1399.768647,1395.651020,1395.492155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41245,4318,2267,"turea-8,(2,0),(2,0)",ntf2,thiouronium,NTf2 derivatives,no,yes,318.15,0.1,1229.6,1256.825230,1259.502776,1246.513135,1249.168712,1251.956151,1254.623324
41246,4318,2267,"turea-8,(2,0),(2,0)",ntf2,thiouronium,NTf2 derivatives,no,yes,328.15,0.1,1220.8,1246.486259,1250.591570,1236.258993,1240.330620,1241.657234,1245.746640
41247,4318,2267,"turea-8,(2,0),(2,0)",ntf2,thiouronium,NTf2 derivatives,no,yes,338.15,0.1,1213.7,1235.505553,1241.801170,1225.368383,1231.612345,1230.719068,1236.990295
41248,4318,2267,"turea-8,(2,0),(2,0)",ntf2,thiouronium,NTf2 derivatives,no,yes,348.15,0.1,1205.3,1223.689462,1233.167467,1213.649242,1223.049480,1218.948755,1228.390041


# Select Only Accepted Data

The authors have performed quality analysis of the results so these notebooks rely on that. No extra data analysis was performed.
The author included that information in two columns: Excluded IL and Accepted dataset.

In [3]:
# Narrow down the database to only good data
df_S8 = df_S8.loc[(df_S8['Excluded IL'] == 'no') & (df_S8['Accepted dataset'] == 'yes'), :]
display(df_S8)

Unnamed: 0,Dataset ID,IL ID,Cation,Anion,Cationic family,Anionic family,Excluded IL,Accepted dataset,T / K,p / MPa,ρ / kg/m3,SWMLR (v0) + FFANN (f),SWMLR (v0) + FFANN (f).1,FFANN (v0) + FFANN (f),FFANN (v0) + FFANN (f).1,LSSVM (v0) + FFANN (f),LSSVM (v0) + FFANN (f).1
10,3,1,"azp-2o1,1",ntf2,azepanium,NTf2 derivatives,no,yes,293.15,0.1,1422.6,1412.386510,1412.547114,1417.756913,1417.918128,1413.425464,1413.586187
11,3,1,"azp-2o1,1",ntf2,azepanium,NTf2 derivatives,no,yes,298.15,0.1,1417.9,1407.894072,1407.991285,1413.247394,1413.344976,1408.929722,1409.027007
12,3,1,"azp-2o1,1",ntf2,azepanium,NTf2 derivatives,no,yes,303.15,0.1,1413.4,1403.434105,1403.454083,1408.770468,1408.790522,1404.466474,1404.486466
13,3,1,"azp-2o1,1",ntf2,azepanium,NTf2 derivatives,no,yes,313.15,0.1,1404.2,1394.581017,1394.421317,1399.883717,1399.723410,1395.606874,1395.447056
14,3,1,"azp-2o1,1",ntf2,azepanium,NTf2 derivatives,no,yes,323.15,0.1,1395.2,1385.771961,1385.416417,1391.041166,1390.684270,1386.791338,1386.435532
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41245,4318,2267,"turea-8,(2,0),(2,0)",ntf2,thiouronium,NTf2 derivatives,no,yes,318.15,0.1,1229.6,1256.825230,1259.502776,1246.513135,1249.168712,1251.956151,1254.623324
41246,4318,2267,"turea-8,(2,0),(2,0)",ntf2,thiouronium,NTf2 derivatives,no,yes,328.15,0.1,1220.8,1246.486259,1250.591570,1236.258993,1240.330620,1241.657234,1245.746640
41247,4318,2267,"turea-8,(2,0),(2,0)",ntf2,thiouronium,NTf2 derivatives,no,yes,338.15,0.1,1213.7,1235.505553,1241.801170,1225.368383,1231.612345,1230.719068,1236.990295
41248,4318,2267,"turea-8,(2,0),(2,0)",ntf2,thiouronium,NTf2 derivatives,no,yes,348.15,0.1,1205.3,1223.689462,1233.167467,1213.649242,1223.049480,1218.948755,1228.390041


# Load Cation-Anion Pairs Information

Ionic liquids are built with a cation and anion. The sheet before does not include molecular descriptor for each cation and anion. The sheet that has this information is: S2 | Ions.

In [4]:
# Load database containing molecular descriptors
df_S2 = pd.read_excel(filename, 
                      sheet_name = 'S2 | Ions', 
                      index_col = 'Abbreviation')
display(df_S2)

Unnamed: 0_level_0,Ion type,Chemical name,SMILES,Charge,Family,M / g/mol,Number of ILs composed of the ion,Im,Im1,Im12,...,aNCHBr,aNCD3,aNCD2,aNNH2,cycNCH3,cycNCH2,cycNCH,cycNC(O),cycSCH3,cycSCH2
Abbreviation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
im,cation,1H-imidazol-3-ium,c1c[nH+]c[nH]1,1,imidazolium,69.085,4,1,0,0,...,0,0,0,0,0,0,0,0,0,0
im-1,cation,1-methyl-1H-imidazol-3-ium,Cn1cc[nH+]c1,1,imidazolium,83.111,8,0,1,0,...,0,0,0,0,0,0,0,0,0,0
im-2,cation,1-ethyl-1H-imidazol-3-ium,CCn1cc[nH+]c1,1,imidazolium,97.138,5,0,1,0,...,0,0,0,0,0,0,0,0,0,0
im-3,cation,1-propyl-1H-imidazol-3-ium,CCCn1cc[nH+]c1,1,imidazolium,111.164,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
im-4,cation,1-butyl-1H-imidazol-3-ium,CCCCn1cc[nH+]c1,1,imidazolium,125.191,6,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
triazolide-no2[2],anion,"3-nitro-4H-1,2,4-triazol-4-ide",[O-][N+](=O)c1nnc[n-]1,-1,heterocyclic amines,113.057,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
imidazolide-no2[4],anion,4-nitro-1H-imidazol-1-ide,[O-][N+](=O)c1c[n-]cn1,-1,heterocyclic amines,112.069,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"imidazolide-cl[4],cl[5]",anion,"4,5-dichloro-1H-imidazol-1-ide",Clc1nc[n-]c1Cl,-1,heterocyclic amines,135.960,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"imidazolide-cn[4],cn[5]",anion,"4,5-dicyano-1H-imidazol-1-ide",N#Cc1nc[n-]c1C#N,-1,heterocyclic amines,117.092,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Map Cation-Anion Pairs With Their Molecular Descriptors and Densities

The idea of the models is to correlate the molecular descriptors with density of the molecules. That's why we need to map all ILs extracted before with their molecular descriptors.

In [5]:
# Initiate empty df containers for:
# Molecular descriptors
structure = pd.DataFrame(index = df_S8.index, 
                         columns = df_S2.columns[7:])

# Molar mass
M = pd.DataFrame(index = df_S8.index, 
                 columns = ['M g/mol'])

# Iterate through every available IL
for i in tqdm.tqdm(df_S8.index):
    cation = df_S8.loc[i, 'Cation']
    anion = df_S8.loc[i, 'Anion']
    IL = f'{cation} {anion}'
    
    # Take only those ILs that have cation:anion ratio of 1:1
    if df_S2.loc[cation, 'Charge'] != 1 or df_S2.loc[cation, 'Charge'] != 1:
        print(cation, df_S2.loc[cation, 'Charge'], anion, df_S2.loc[cation, 'Charge'])
        continue
    
    try:
        structure.loc[i, :] = df_S2.loc[cation].iloc[7:] + df_S2.loc[anion].iloc[7:]
        M.loc[i, 'M g/mol'] = df_S2.loc[cation, 'M / g/mol']+df_S2.loc[anion, 'M / g/mol']
        
    except Exception as e:
        print(cation, anion, e)

  3%|▎         | 776/23989 [00:00<00:16, 1374.91it/s]

dc[im-1,1]-5-n-1,1,1 2 ntf2 2
dc[im-1,1]2-1[1o1]31 2 ntf2 2
dc[im-1,1]2-3[1i] 2 ntf2 2
dc[im-1,1]2-3[2i] 2 ntf2 2
dc[im-1,1]2-3[2ichch] 2 ntf2 2
dc[im-1,1]2-3 2 ntf2 2
dc[im-1,1]2-4[1i,4i] 2 ntf2 2
dc[im-1,1]2-4[1i] 2 ntf2 2
dc[im-1,1]2-4[2chch] 2 ntf2 2
dc[im-1,1]2-4 2 ntf2 2
dc[im-1,1]2-5[3i,3i] 2 ntf2 2
dc[im-1,1]2-5[3i] 2 ntf2 2
dc[im-1,1]2-5[3t] 2 ntf2 2
dc[im-1,1]2-5 2 ntf2 2
dc[im-1,1]2-5 2 ntf2 2
dc[im-1,1]2-7 2 ntf2 2
dc[im-1,1]2-9 2 bf4 2
dc[im-1,1]2-9 2 ntf2 2
dc[im-1]-11[n3]1-im-v 2 br 2
dc[im-1]-11[n3]1-im-v 2 dca 2
dc[im-1]-2-n-1,1,1 2 dca 2
dc[im-1]-2-n-a,1,1 2 dca 2
dc[im-1]-2-n-a,1,1 2 dca 2
dc[im-1]-2-n-a,1,1 2 dca 2
dc[im-1]-2-n-a,1,1 2 dca 2
dc[im-1]-2-n-a,1,1 2 dca 2
dc[im-1]-2-n-a,1,1 2 dca 2
dc[im-1]-2-n-a,1,1 2 dca 2
dc[im-1]-5-n-1,1,1 2 ntf2 2
dc[im-1]-5-n-1,1,1 2 ntf2 2
dc[im-1]-5-n-1,1,1 2 otf 2
dc[im-1]-5-n-1,1,1 2 pf6 2
dc[im-1]2-10 2 bf4 2
dc[im-1]2-10 2 npf2 2
dc[im-1]2-10 2 ntf2 2
dc[im-1]2-12 2 bf4 2
dc[im-1]2-12 2 br 2
dc[im-1]2-12 2 npf2 2
dc[im-1]2-1

 28%|██▊       | 6707/23989 [00:07<00:19, 894.42it/s] 

im-4,1 febr4 'febr4'


100%|█████████▉| 23894/23989 [00:26<00:00, 907.08it/s]

tc[im-1]im-10,10[im-1] 3 ntf2 3
tc[im-1]im-3,3[im-1] 3 ntf2 3
tc[im-1]im-6,6[im-1] 3 ntf2 3
tc[im-1ph]im-10,10[im-1ph] 3 ntf2 3
tc[im-1ph]im-3,3[im-1ph] 3 ntf2 3
tc[im-1ph]im-6,6[im-1ph] 3 ntf2 3
tc[im-4]im-10,10[im-4] 3 bf4 3
tc[im-4]im-10,10[im-4] 3 ntf2 3
tc[im-4]im-10,10[im-4] 3 otf 3
tc[im-4]im-3,3[im-4] 3 ntf2 3
tc[im-4]im-6,6[im-4] 3 ntf2 3


100%|██████████| 23989/23989 [00:26<00:00, 910.50it/s]


# Drop Invalid Data

In [6]:
# Drop any missing data points
df = pd.concat([df_S8, M, structure], axis = 1)
df.dropna(axis = 0, inplace = True)
df

Unnamed: 0,Dataset ID,IL ID,Cation,Anion,Cationic family,Anionic family,Excluded IL,Accepted dataset,T / K,p / MPa,...,aNCHBr,aNCD3,aNCD2,aNNH2,cycNCH3,cycNCH2,cycNCH,cycNC(O),cycSCH3,cycSCH2
10,3,1,"azp-2o1,1",ntf2,azepanium,NTf2 derivatives,no,yes,293.15,0.1,...,0,0,0,0,1,1,0,0,0,0
11,3,1,"azp-2o1,1",ntf2,azepanium,NTf2 derivatives,no,yes,298.15,0.1,...,0,0,0,0,1,1,0,0,0,0
12,3,1,"azp-2o1,1",ntf2,azepanium,NTf2 derivatives,no,yes,303.15,0.1,...,0,0,0,0,1,1,0,0,0,0
13,3,1,"azp-2o1,1",ntf2,azepanium,NTf2 derivatives,no,yes,313.15,0.1,...,0,0,0,0,1,1,0,0,0,0
14,3,1,"azp-2o1,1",ntf2,azepanium,NTf2 derivatives,no,yes,323.15,0.1,...,0,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41245,4318,2267,"turea-8,(2,0),(2,0)",ntf2,thiouronium,NTf2 derivatives,no,yes,318.15,0.1,...,0,0,0,0,0,0,0,0,0,0
41246,4318,2267,"turea-8,(2,0),(2,0)",ntf2,thiouronium,NTf2 derivatives,no,yes,328.15,0.1,...,0,0,0,0,0,0,0,0,0,0
41247,4318,2267,"turea-8,(2,0),(2,0)",ntf2,thiouronium,NTf2 derivatives,no,yes,338.15,0.1,...,0,0,0,0,0,0,0,0,0,0
41248,4318,2267,"turea-8,(2,0),(2,0)",ntf2,thiouronium,NTf2 derivatives,no,yes,348.15,0.1,...,0,0,0,0,0,0,0,0,0,0


# Save Data

In [7]:
# Save the dataset
df.to_csv('./Data/data combined.csv')

# Check Referenced Article Predictions

unit_conversion = 1000 was used to convert densities from kg/m3 to g/cm3. This is to ensure comparability to later studies.

In [9]:
print('\nStats for SWMLR (v0) + FFANN (f)')
getErrorStats(df['ρ / kg/m3'], df['SWMLR (v0) + FFANN (f)'], 1000)

print('\nStats for FFANN (v0) + FFANN (f)')
getErrorStats(df['ρ / kg/m3'], df['FFANN (v0) + FFANN (f)'], 1000)

print('\nStats for LSSVM (v0) + FFANN (f)')
getErrorStats(df['ρ / kg/m3'], df['LSSVM (v0) + FFANN (f)'], 1000)


Stats for SWMLR (v0) + FFANN (f)
AARD: 2.3064%
MAE: 0.0283
std: 1.0386
Minimum error: -8.2703
Maximum error: 158.9015

Stats for FFANN (v0) + FFANN (f)
AARD: 1.6021%
MAE: 0.0200
std: 0.1271
Minimum error: -11.7237
Maximum error: 0.7870

Stats for LSSVM (v0) + FFANN (f)
AARD: 3.6354%
MAE: 0.0435
std: 3.6665
Minimum error: -564.9361
Maximum error: 0.7825
