# Data enhancement by ion specific features

In this notebook, all dataframes loaded in `data_loading.ipynb` are enhanced with ion-specific features. These features are subsequently scaled using a selected range for each feature.

In [None]:
# dependencies

import pandas as pd
import numpy as np

__Restore all dataframes created in `data_loading.ipynb`__

In [None]:
# IPython %store magic with -r: restore variables that were saved earlier with %store
%store -r all_cation_dfs
%store -r all_anion_dfs

In [None]:
# Ionic species used in the data.
# The position of an ion in these lists fixes the position of its value
# in every per-ion feature list.
cations = [
    "Na",
    "K",
    "Li",
    "NH4",
    "H",
]
anions = [
    "Br",
    "Cl",
    "I",
    "2SO4",
    "2CO3",
    "NO3",
    "OH",
]

## Ion specific features data
Additional data for model enhancement. Every feature list is ordered by ion in the same way as the `cations` and `anions` lists defined above.


In [None]:
# Charges of the ionic species (c_* = cations, a_* = anions),
# stored as positive numbers for both kinds of ion.
# The order follows the cations / anions lists;
# the same holds for every additional feature.

# cations: Na, K, Li, NH4, H
c_q = [1, 1, 1, 1, 1]
# anions: Br, Cl, I, SO4, CO3, NO3, OH
a_q = [1, 1, 1, 2, 2, 1, 1]

In [None]:
# Molar weight of each ion.
# cations: Na, K, Li, NH4, H
c_mw = [22.990, 39.098, 6.941, 18.038, 1.008]
# anions: Br, Cl, I, SO4, CO3, NO3, OH
a_mw = [79.904, 35.453, 126.904, 96.063, 60.009, 62.005, 17.007]

In [None]:
# Number of protons in each ion (for polyatomic ions the values are
# the sum over all atoms, e.g. 48 for SO4).
# cations: Na, K, Li, NH4, H
c_Z = [11, 19, 3, 11, 1]
# anions: Br, Cl, I, SO4, CO3, NO3, OH
a_Z = [35, 17, 53, 48, 30, 31, 9]

In [None]:
# Entropy of solvation of each ion.
# cations: Na, K, Li, NH4, H
c_es = [59, 102.5, 13.4, 96.9, 0]
# anions: Br, Cl, I, SO4, CO3, NO3, OH
a_es = [82.4, 56.5, 111.3, 18.8, -43.2, 146.6, -10.7]

In [None]:
# Ionic radius of each ion.
# cations: Na, K, Li, NH4, H
c_r = [0.102, 0.138, 0.069, 0.148, 0.03]
# anions: Br, Cl, I, SO4, CO3, NO3, OH
a_r = [0.196, 0.181, 0.22, 0.23, 0.178, 0.179, 0.133]

In [None]:
# Hydration shell width of each ion.
# cations: Na, K, Li, NH4, H
c_dr = [0.116, 0.074, 0.172, 0.065, 0.3]
# anions: Br, Cl, I, SO4, CO3, NO3, OH
a_dr = [0.035, 0.043, 0.026, 0.043, 0.076, 0.044, 0.079]

In [None]:
# Hydration number of each ion.
# cations: Na, K, Li, NH4, H
c_hn = [3.5, 2.6, 5.2, 2.4, 12]
# anions: Br, Cl, I, SO4, CO3, NO3, OH
a_hn = [1.8, 2, 1.6, 3.1, 4, 2, 2.7]

In [None]:
# Gibbs electrical energy of each ion.
# cations: Na, K, Li, NH4, H
c_ge = [-440, -372, -558, -358, -1058]
# anions: Br, Cl, I, SO4, CO3, NO3, OH
a_ge = [-297, -315, -272, -1010, -1195, -317, -380]

In [None]:
# Gibbs hydration energy of each ion.
# cations: Na, K, Li, NH4, H
c_gh = [-365, -295, -475, -285, -1050]
# anions: Br, Cl, I, SO4, CO3, NO3, OH
a_gh = [-315, -340, -275, -1080, -1315, -300, -430]

In [None]:
# Acid pKa (anion lists) and base pKb (cation lists).
# cations: Na, K, Li, NH4, H
c_pK = [0.2, 0.5, -0.36, 4.75, -1.7]
# anions: Br, Cl, I, SO4, CO3, NO3, OH
a_pK = [-9, -8, -10, -3, 6.37, -1.3, 15.7]

In [None]:
# *_features collects all per-ion feature lists;
# *_fnames holds the matching feature names in the same order,
# so the two can be zipped together.

anion_fnames = [
    "a_es", "a_Z", "a_r", "a_dr", "a_hn",
    "a_ge", "a_gh", "a_mw", "a_q", "a_pK",
]
anion_features = [
    a_es, a_Z, a_r, a_dr, a_hn,
    a_ge, a_gh, a_mw, a_q, a_pK,
]

cation_fnames = [
    "c_es", "c_Z", "c_r", "c_dr", "c_hn",
    "c_ge", "c_gh", "c_mw", "c_q", "c_pK",
]
cation_features = [
    c_es, c_Z, c_r, c_dr, c_hn,
    c_ge, c_gh, c_mw, c_q, c_pK,
]

## Ion specific features ranges
Definition of ranges for all ion-specific features. Each range is a `(minimum, maximum)` tuple of the possible values of the feature. The ranges are used later to scale every feature by its own range.


In [None]:
# Feature ranges as (minimum, maximum) tuples.
# rc_* = cation range, ra_* = anion range.
# The trailing comments name the species responsible for the limits.

# charge
rc_q = (1, 2)
ra_q = (1, 2)

# molar weight
rc_mw = (1.008, 87.62)      # (H, Sr)
ra_mw = (17.007, 126.904)   # (OH, I)

# number of protons
rc_Z = (1, 38)              # (H, Sr)
ra_Z = (9, 53)              # (OH, I)

# entropy of solvation
rc_es = (-130, 130)
ra_es = (-50, 150)

# hydration shell width
rc_dr = (0.06, 0.300)       # (Rb+ to H+)
ra_dr = (0.019, 0.079)      # (ClO4- to F-)

# ionic radius
rc_r = (0.03, 0.149)        # (H+ to Rb+)
ra_r = (0.133, 0.250)       # (F- to ClO4-)

# hydration number
rc_hn = (2.4, 12)           # (Rb+ to H+)
ra_hn = (1.4, 4)            # (ClO4-, CO32-)

# Gibbs electrical energy
rc_ge = (-3225, -356)       # (Be2+, Rb+)
ra_ge = (-1195, -245)       # (CO32-, ClO4-)

# Gibbs hydration energy
rc_gh = (-2395, -275)       # (Be2+, Rb+)
ra_gh = (-1315, -275)       # (CO32-, I-)

# acid pKa / base pKb
ra_pK = (-10, 15.7)         # (HI, H2O)
rc_pK = (-1.7, 5.4)         # (H2O, BeOH2)



In [None]:
# *_natural_ranges lists all feature range tuples,
# in the same feature order as the feature-name lists
# (es, Z, r, dr, hn, ge, gh, mw, q, pK)

a_natural_ranges = [
    ra_es, ra_Z, ra_r, ra_dr, ra_hn,
    ra_ge, ra_gh, ra_mw, ra_q, ra_pK,
]
c_natural_ranges = [
    rc_es, rc_Z, rc_r, rc_dr, rc_hn,
    rc_ge, rc_gh, rc_mw, rc_q, rc_pK,
]

## Addition of ion specific features to dataframes

Addition is done through the __all_cation_dfs__ list.

__Helper functions__

In [None]:
def add_anion_feature(df_list, *, feature, values):
    """
    Write one anion feature column into the dataframes of df_list.

    df_list -- dataframes ordered by anion
    feature -- name of the column to set
    values  -- one value per anion, ordered the same way as df_list

    The dataframes are modified in place and nothing is returned.
    Empty dataframes are left untouched.
    """
    for position, frame in enumerate(df_list):
        if not frame.empty:
            # a single scalar fills the whole column of this anion's dataframe
            frame[feature] = values[position]

In [None]:
def add_cation_feature(dfs, *, feature, values):
    """
    Write one cation feature column into every dataframe of dfs.

    dfs     -- list of dataframe lists, the outer list ordered by cation
    feature -- name of the column to set
    values  -- one value per cation, ordered the same way as dfs

    All dataframes in the inner list of a cation receive that cation's value.
    The dataframes are modified in place and nothing is returned.
    Empty dataframes are left untouched.
    """
    for position, cation_frames in enumerate(dfs):
        for frame in cation_frames:
            if not frame.empty:
                frame[feature] = values[position]

### Addition of all ion specific features to all electrolyte DataFrames

In [None]:
# anion features
# each inner list of all_cation_dfs is passed to add_anion_feature,
# which expects it to be ordered by anion
for anion_frames in all_cation_dfs:
    for column, per_anion_values in zip(anion_fnames, anion_features):
        add_anion_feature(anion_frames, feature=column, values=per_anion_values)

In [None]:
# cation features
# one call per feature; add_cation_feature walks all dataframes itself
for column, per_cation_values in zip(cation_fnames, cation_features):
    add_cation_feature(
        all_cation_dfs,
        feature=column,
        values=per_cation_values,
    )

In [None]:
# Label every dataframe with its electrolyte name (cation name + anion name).
# This column is informative only; it is not a modelling feature.

for cation, anion_frames in zip(cations, all_cation_dfs):
    for anion, frame in zip(anions, anion_frames):
        electrolyte = cation + anion
        frame["medium"] = electrolyte

## Creation of dataset for ML task

In [None]:
# Stacks all enhanced electrolyte dataframes into one dataframe.
# pd.concat is used for both levels because DataFrame.append was deprecated
# in pandas 1.4 and removed in pandas 2.0.

# one dataframe per cation; rows are renumbered from 0 within each cation
per_cation_frames = [
    pd.concat(list(df_list), ignore_index=True, sort=False)
    for df_list in all_cation_dfs
]

# The outer concatenation does not renumber the rows, so index values
# repeat across cations. pd.concat raises on an empty list, hence the
# explicit empty-dataframe fallback.
if per_cation_frames:
    dataframe = pd.concat(per_cation_frames, sort=False)
else:
    dataframe = pd.DataFrame([])

In [None]:
# display the (rows, columns) size of the combined dataframe
dataframe.shape

__Load datasets for outside matrix test__<br>
To add the derived features to the RbI dataset used for the outside matrix test, uncomment and run the lines below. That dataset already contains the remaining ion-specific features, so only the cells from here down need to be run.

In [None]:
# uncomment to enrich and scale outside matrix test data

#path = r"../data/outtest/" 
#dataframe = pd.read_csv(path + "water_RbI.csv", index_col=0)
#dataframe.shape

## Addition of derived features and their ranges

In [None]:
# Number of ions released by complete dissociation of the electrolyte:
# the sum of the anion and cation charge columns.

anion_charge = dataframe["a_q"]
cation_charge = dataframe["c_q"]
dataframe["moles"] = anion_charge + cation_charge

# the range: element-wise sum of the two charge ranges
r_moles = tuple(low_or_high_a + low_or_high_c
                for low_or_high_a, low_or_high_c in zip(ra_q, rc_q))

In [None]:
# Geometric mean of the cation and anion proton numbers.

proton_product = dataframe["a_Z"] * dataframe["c_Z"]
dataframe["Z"] = np.sqrt(proton_product)

# the range: geometric mean of the two lower limits and of the two upper limits
r_Z = tuple(np.sqrt(np.prod(limits)) for limits in zip(ra_Z, rc_Z))

In [None]:
# "Kw" feature (labelled as water equilibrium constant): exp(-1 / T)
# NOTE(review): the unit of the T column is not visible in this notebook - confirm
temperature = dataframe["T"]
dataframe["Kw"] = np.exp(-1 / temperature)

In [None]:
# transformed molality feature: the c column raised to the power 2/3
dataframe["cm"] = dataframe["c"].pow(2 / 3)

In [None]:
# Ionic strength:
#   0.5 * (c * a_q^2 * c_q  +  c * c_q^2 * a_q)
# i.e. each ion's squared charge is weighted by the charge of its
# counter-ion, which is used here as the ion's stoichiometric count.
molality = dataframe["c"]
anion_charge = dataframe["a_q"]
cation_charge = dataframe["c_q"]

anion_term = molality * np.power(anion_charge, 2) * cation_charge
cation_term = molality * np.power(cation_charge, 2) * anion_charge

dataframe["is"] = 0.5 * (anion_term + cation_term)

In [None]:
# names of all derived feature columns
derived = [
    "moles",
    "Z",
    "Kw",
    "cm",
    "is",
]

## Scaling by a range for all ion specific features
The following scaling relation is employed to squash all ion specific features into $[-1,1]$ range:<br><br>
<center>$x' = \frac{(2x - max - min)}{(max - min)}$ </center>

In [None]:
# scale anion features into [-1, 1] using each feature's own range:
# x' = (2x - max - min) / (max - min)

for column, (lower, upper) in zip(anion_fnames, a_natural_ranges):
    span = upper - lower
    dataframe[column] = (2 * dataframe[column] - upper - lower) / span

In [None]:
# scale cation features into [-1, 1] using each feature's own range:
# x' = (2x - max - min) / (max - min)

for column, (lower, upper) in zip(cation_fnames, c_natural_ranges):
    span = upper - lower
    dataframe[column] = (2 * dataframe[column] - upper - lower) / span

In [None]:
# scale derived features
# only Z and moles have a range defined; the same relation is used:
# x' = (2x - max - min) / (max - min)

z_lower, z_upper = r_Z[0], r_Z[1]
dataframe["Z"] = (2 * dataframe["Z"] - z_upper - z_lower) / (z_upper - z_lower)

moles_lower, moles_upper = r_moles[0], r_moles[1]
dataframe["moles"] = (
    (2 * dataframe["moles"] - moles_upper - moles_lower)
    / (moles_upper - moles_lower)
)

## Dataset saving

In [None]:
# Order the columns inside the dataframe.
# Selecting an explicit column list also guarantees that no other
# column is kept; some raw CSV files might have additional columns,
# such as density.

ordered_columns = [
    "T",
    "c",
    *anion_fnames,
    *cation_fnames,
    *derived,
    "medium",
    "sound",
]
dataframe = dataframe[ordered_columns]

### Drop all features not considered for model training

In [None]:
# The proton number features are dropped because they are very similar
# to molar weight; they were only needed to create the Z feature.

dataframe = dataframe.drop(columns=["c_Z"])
dataframe = dataframe.drop(columns=["a_Z"])

# The cation charge feature is dropped because only data for singly
# charged cations are available. Delete this line if data for doubly
# charged cations become available.

dataframe = dataframe.drop(columns=["c_q"])

In [None]:
# display the dataframe size as (rows, columns)
dataframe.shape

In [None]:
# Sanity check: confirm that scaling and feature dropping went well
# and that no data is missing.
#
# The statistics are also worth exploring: no ion specific feature
# should have a very low std, because the features have to remain
# discriminative.

# one row per column of the dataframe
dataframe.describe().T

In [None]:
# directory (relative path) in which the datasets are saved
path = r"../datasets/"

In [None]:
# save the dataframe into a CSV file inside the path directory
# use the commented-out outtest_dataset line instead when handling RbI data

dataframe.to_csv(path + "dataset.csv")
#dataframe.to_csv(path + "outtest_dataset.csv")