# Data loading and visualizations

This is the first notebook used in the speed of sound prediction experiment. It loads all input CSV files. There are several data visualization functions in the second part of the notebook.

In [None]:
# dependencies

import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
import sys
import os

%matplotlib inline

## Definition of all chemistry related variables
All chemical names and properties used in the notebook are defined here so that they can be referenced later in the code.

In [None]:
# lists storing used ionic species
# NOTE: the order of both lists matters - the lists of DataFrames built
# later in the notebook are paired with them positionally through zip()
# the names are concatenated (cation + anion) to build the CSV file names,
# which is why the leading "2" of "2SO4" / "2CO3" is part of the anion name

cations = ["Na", "K", "Li", "NH4", "H"]
anions = ["Br","Cl", "I", "2SO4", "2CO3", "NO3", "OH"]

In [None]:
# dictionary converting species to its textual representation
# with super / subscripts
# keys are the entries of `cations` and `anions`; values are raw strings
# holding LaTeX-style math markup that is used for matplotlib axis and
# tick labels in the plotting functions below

chem_repr = {"Na" : r'$\mathrm{Na}^+$',
             "K" : r'$\mathrm{K}^+$',
             "NH4" : r'$\mathrm{NH}_4^+$',
             "Li" : r'$\mathrm{Li}^+$',
             "H" : r'$\mathrm{H}^+$',
             "Br" : r'$\mathrm{Br}^-$',
             "Cl" : r'$\mathrm{Cl}^-$',
             "I" : r'$\mathrm{I}^-$',
             "2SO4" : r'$\mathrm{SO}_4^{2-}$',
             "2CO3" : r'$\mathrm{CO}_3^{2-}$',
             "NO3" : r'$\mathrm{NO}_3^-$',
             "OH" : r'$\mathrm{OH}^-$'}

In [None]:
# experimental dimensions
# these are the DataFrame column names the plotting functions look up:
# "T" and "c" (and "sound" for ranges / boxplots) can be chosen as
# `parameter`, "sound" is the quantity plotted by plot_sound
parameters = ["T", "sound", "c"]

In [None]:
# units for all experimental physical dimensions
# keyed by the same names as in `parameters`; used for axis labels
units = {"T":"K", "c": "mol/kg", "sound": "m/s"}

## Load CSV files into pandas DataFrames
CSV files are loaded from the `data/` directory (referenced as `../data/` relative to the notebook). Each file name has the format `water_<electrolyte>.csv`, where the electrolyte name is the concatenation of the cation and anion names, e.g. `water_NaCl.csv`.

In [None]:
# the path to the directory holding the csv files
# it is relative, so the current working directory must be
# the notebook's directory
# if it is not, use os.chdir(path) to set it there
# load_by_ion() looks for files named water_<cation><anion>.csv in here

file_path = r"../data/"

__Helper functions for data loading__

In [None]:
def csv_read(filename):
    """
    Loads CSV file. Filename is the path to file.
    Returns CSV contents as pandas DataFrame, or None when the file
    could not be read (a message is printed in that case).
    Accepts ',' or ';' separators.
    First row is header, next two are skipped.
    First column is index.
    """

    try:
        with open(filename, 'r') as csv_file:
            # the separator is a regular expression, which only the
            # python parser engine supports; naming the engine explicitly
            # avoids the ParserWarning pandas emits on the implicit fallback
            return pd.read_csv(csv_file, header=0, sep="[,;]",
                               engine="python",
                               skiprows=[1, 2], index_col=0)

    except FileNotFoundError:
        print("{0} does not exist".format(filename))
    except Exception as err:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not swallowed
        print("Unexpected error with {0}: {1}"
              .format(filename, type(err)))

    # reached only after one of the handled errors
    return None

In [None]:
def cation_anion_switch(element):
    """
    Gives the list of possible counter ions for element:
    the anions list for a cation and the cations list for an anion.
    Raises NameError when element is found in neither list.
    """

    # cations are tested first; the list of the opposite kind is returned
    for own_kind, counter_kind in ((cations, anions), (anions, cations)):
        if element in own_kind:
            return counter_kind

    raise NameError("Unknown element {0}".format(element))

In [None]:
def load_by_ion(ion):
    """
    Loads and creates list of all dataframes containing input ion.
    The list has one entry per counter ion, in the order returned
    by cation_anion_switch(ion).
    Empty dataframe is inserted for nonavailable cation/anion
    combination, i.e. when the file is missing or could not be read.
    Files are loaded from file_path folder and are named
    water_<cation><anion>.csv.
    Raises NameError (from cation_anion_switch) for an unknown ion.
    """

    counterions = cation_anion_switch(ion)
    ion_is_cation = ion in cations
    dataframes = []

    for counterion in counterions:

        # the cation always comes first in the file name
        if ion_is_cation:
            electrolyte = ion + counterion
        else:
            electrolyte = counterion + ion
        filename = os.path.join(file_path,
                                "water_{0}.csv".format(electrolyte))

        df = None
        if os.path.exists(filename):
            df = csv_read(filename)
        else:
            print("{0} does not exist".format(filename))

        # csv_read returns None when reading fails; fall back to an
        # empty frame so that every entry supports .empty / .shape
        # (a fresh frame per slot, so entries never alias each other)
        if df is None:
            df = pd.DataFrame()
        dataframes.append(df)

    return dataframes
    

### Load all datasets

In [None]:
# load the datasets cation by cation
# every variable is a list with one DataFrame per anion (ordered as in
# `anions`); combinations without a file are empty DataFrames
NH4 = load_by_ion("NH4")
Na = load_by_ion("Na")
K = load_by_ion("K")
Li = load_by_ion("Li")
H = load_by_ion("H")

In [None]:
# stores all DataFrames as a list of lists of DataFrames
# each for one cation
# the order has to follow the `cations` list, because both lists
# are zipped together when the data is printed and plotted
all_cation_dfs = [Na, K, Li, NH4, H]

In [None]:
# load the same datasets again, this time grouped anion by anion
# every variable is a list with one DataFrame per cation (ordered as in
# `cations`); the sulphate / carbonate names carry the leading "2"
# that is part of the file names
Cl = load_by_ion("Cl")
Br = load_by_ion("Br")
I = load_by_ion("I")
SO4 = load_by_ion("2SO4")
CO3 = load_by_ion("2CO3")
NO3 = load_by_ion("NO3")
OH = load_by_ion("OH")

In [None]:
# stores all DataFrames as a list of lists of DataFrames
# each for one anion
# the order follows the `anions` list
all_anion_dfs = [Br, Cl, I, SO4, CO3, NO3, OH]

In [None]:
# report the number of rows of every cation / anion dataset
# (combinations without data are empty DataFrames and show size 0)

size_report = "Size of {0}{1} dataset: {2}"
for cation, dfs in zip(cations, all_cation_dfs):
    for anion, df in zip(anions, dfs):
        n_rows = df.shape[0]
        print(size_report.format(cation, anion, n_rows))

In [None]:
# store the lists of DataFrames so that they can be accessed from all the other notebooks
%store all_cation_dfs
%store all_anion_dfs

## Data visualizations:
Every visualization has a configurable parameter (`T`, `c` or `sound`).
 - plot_parameter_range: displays min and max value
 - boxplot_parameter: displays boxplot
 - plot_sound: displays variation of sound with given parameter
    
Every visualization can be plotted for single ion or for more ions. To plot all electrolyte data use these functions:
 - plot_all_parameter_ranges
 - boxplot_all_parameter
 - plot_all_sound

__Helper functions__

In [None]:
# this function operates figures with more than one subplot

def plot_for_all(*data, parameter = "T", function):
    """
    Creates one figure with a subplot for every ion and calls
    function once per subplot.

    Non named parameters are passed as follows:
    df_list_1, ..., df_list_N, ion_1, ..., ion_N
    parameter is forwarded to function. function is the single-ion
    plotting function; it is called as
    function(df_list, ion, parameter=parameter, subplot=axes).

    At least two ions are needed. With fewer ions, or with an odd
    number of positional arguments, a message is printed and nothing
    is plotted.
    A failure while plotting one ion is reported and the remaining
    ions are still plotted.
    """
    # determine the number of used species
    total_params = len(data)
    size = total_params // 2

    # every df_list needs its ion name, otherwise the pairing below
    # would silently combine the wrong arguments
    if total_params % 2 != 0:
        print("Incorrect data input")
        return

    # check that at least two species are plotted
    # (no species at all makes plt.subplots fail and with a single
    # species the returned axes object cannot be indexed)
    if size < 2:
        print("This function is not intended for one species. "
              "Use {0}() instead."
              .format(getattr(function, "__name__",
                              "the single-ion plotting function")))
        return

    # create the figure
    fig, ax = plt.subplots(nrows=1, ncols=size,
                           sharex=False, sharey=True)
    # stack figures next to each other
    plt.subplots_adjust(wspace=0)

    # first half of data are the DataFrame lists, second half the ions
    df_lists, ions = data[:size], data[size:]

    # call the plotting function for each input ion
    for axes, dfs, ion in zip(ax, df_lists, ions):
        try:
            function(dfs, ion, parameter=parameter, subplot=axes)
        except Exception as err:
            # keep plotting the remaining ions, but say what went wrong
            print("Incorrect data input for {0}: {1}".format(ion, err))

    plt.show()

### Parameter range plotting
Displays min and max values for selected parameter "T", "c" or "sound"

In [None]:
# this function does not plot
# it extracts ranges used for plot_parameter_range

def parameter_range(df_list, ion, *, parameter="T", printing = True):
    """
    Returns list of triples (label, min, max) for input ion
    and input dataframes (df_list).
    There is one triple per dataframe / counter ion pair, in input
    order. label is the chem_repr text of the counter ion; min and
    max are the extremes of the parameter column.
    For an empty dataframe, or one without the parameter column,
    min and max are both the string "NA".
    If printing == True then the ranges are printed out
    with supporting text.
    """

    if printing:
        print("Range of " + parameter + " values for ion " + ion)

    range_data = []
    pair_ions = cation_anion_switch(ion)

    # print and/or save the range for every counter ion to input ion
    for (df, i) in zip(df_list, pair_ions):
        label = chem_repr[i]

        # append NA tag to result for non-available datasets
        if df.empty:
            range_data.append((label, "NA", "NA"))
            continue

        if printing:
            print(i, end = "\t")
        try:
            column = df[parameter]
        except KeyError:
            if printing:
                print("Nonexistent column " + parameter +
                      " for " + ion + " with " + i)
            # still add an entry: callers use the position in the
            # result to place and label the counter ion, so skipping
            # it would shift every following entry
            range_data.append((label, "NA", "NA"))
            continue

        minimum = column.min()
        maximum = column.max()
        range_data.append((label, minimum, maximum))
        if printing:
            print("{0:.2f}".format(minimum), end = "    ")
            print("{0:.2f}".format(maximum))

    # return the obtained ranges
    return range_data

In [None]:
def plot_parameter_range(df_list, ion,*, parameter="T", subplot = None):
    """
    Plots min and max values for parameter.
    df_list stores dataframes for ion, one per counter ion.
    Every counter ion gets its own x position: two red points mark
    the minimum and maximum, the text "NA" marks a dataset without
    values.
    Figure axes can be passed through subplot parameter, otherwise
    a new figure is created.
    The title and the y-axis label are written to the current figure.
    """
    # obtain the ranges
    data = parameter_range(df_list, ion,
                           parameter = parameter, printing = False)

    # y coordinates for displaying NA values
    y = {"T":300, "sound":1500, "c":0.5}

    # create subplot if not provided
    if subplot is None:
        fig, subplot = plt.subplots(ncols=1, nrows=1)

    # tick 0 carries an empty label, the data sit at x = 1, 2, ...
    labels = [""]

    # plot either NA or min/max range
    for position, (label, minimum, maximum) in enumerate(data, start=1):
        # the type check keeps numeric values from being compared
        # with a string
        if isinstance(minimum, str) and minimum == "NA":
            subplot.text(s="NA", x=position, y=y[parameter],
                         horizontalalignment='center',
                         verticalalignment='center', color='k')
        else:
            subplot.scatter([position]*2, [minimum, maximum],
                            color="r")
        # label is the counter ion to input ion
        labels.append(label)

    # set the labels as xticks
    subplot.set_xticks(list(range(len(data) + 1)))
    subplot.set_xticklabels(labels)
    # the x-axis name is the input ion
    subplot.set_xlabel(chem_repr[ion])

    # put graph title and y-axis label on the current figure
    fig = plt.gcf()
    fig.suptitle("Range of {0} values for specified electrolytes"
                 .format(parameter), y = 0.93)

    # the y-axis label needs a unit; parameters without a known unit
    # simply get no label instead of hiding every error in a bare except
    if parameter in units:
        fig.text(0.04, 0.5, "{0} [{1}]".
                 format(parameter, units[parameter]), rotation = 90)


In [None]:
# example call: molality ("c") ranges of all lithium electrolytes
plot_parameter_range(Li, "Li", parameter = "c")

In [None]:
def plot_all_parameter_ranges(*data, parameter = "T"):
    """
    Multi-ion version of plot_parameter_range: every ion is drawn
    in its own subplot of one figure.
    Positional arguments are the dataframe lists followed by
    the ion names in the same order:
    df_list_1, ..., df_list_N, ion_1, ..., ion_N
    """

    # plot_for_all builds the figure and calls the single-ion function
    single_ion_plot = plot_parameter_range
    plot_for_all(*data, function=single_ion_plot, parameter=parameter)
    

In [None]:
# example call: sound speed ranges of the Li and H electrolytes side by side
# (dataframe lists first, then the ion names in the same order)
plot_all_parameter_ranges(Li, H, "Li", "H", parameter = "sound")

### Boxplot plotting
Displays boxplot for selected parameter "T", "c" or "sound"

In [None]:
def boxplot_parameter(df_list, ion, parameter="T", subplot = None):
    """
    Plots boxplots for parameter. df_list stores dataframes for ion.
    One box is drawn per dataframe / counter ion pair; the text "NA"
    is shown instead for an empty dataframe or a dataframe without
    the parameter column.
    df_list may be shorter than the list of counter ions (the matrix
    plotting passes a single dataframe); only the supplied dataframes
    get a tick and a label.
    Figure axes can be passed through subplot parameter, otherwise
    a new figure is created.
    The title and the y-axis label are written to the current figure.
    """

    pair_ions = cation_anion_switch(ion)

    # y coordinates for displaying NA values
    y = {"T":300, "sound":1500, "c":0.5}

    if subplot is None:
        fig, subplot = plt.subplots(ncols=1, nrows=1)

    columns = []
    # tick 0 carries an empty label, the boxes sit at x = 1, 2, ...
    labels = [""]

    for pos, (df, i) in enumerate(zip(df_list, pair_ions), start=1):
        # xticks are counter ion names
        labels.append(chem_repr[i])

        # values are available only for a non-empty dataset
        # that has the requested column
        column = None
        if not df.empty:
            try:
                column = df[parameter]
            except KeyError:
                column = None

        if column is None:
            # plot NA for nonavailable dataset; the empty list keeps
            # the box positions of the other datasets unchanged
            subplot.text(s="NA", x=pos, y=y[parameter],
                         horizontalalignment='center',
                         verticalalignment='center', color='k')
            columns.append([])
        else:
            columns.append(column)

    # plot the boxplots
    subplot.boxplot(columns, medianprops=dict(linestyle='-', color="r"))

    # ticks and labels are both derived from the plotted datasets, so
    # their numbers always agree (matplotlib raises ValueError when the
    # number of labels differs from the number of fixed ticks)
    subplot.set_xticks(list(range(len(columns) + 1)))
    subplot.set_xticklabels(labels)
    subplot.set_xlabel(chem_repr[ion])

    # put graph title and y-axis label on the current figure
    fig = plt.gcf()
    fig.suptitle("Boxplot of {0} dimension for specified electrolytes"
                 .format(parameter), y = 0.93)

    # the y-axis label needs a unit; parameters without a known unit
    # simply get no label instead of hiding every error in a bare except
    if parameter in units:
        fig.text(0.04, 0.5, "{0} [{1}]"
                 .format(parameter, units[parameter]), rotation = 90)
    


In [None]:
# example call: temperature ("T") boxplots of all lithium electrolytes
boxplot_parameter(Li, "Li", parameter = "T")

In [None]:
def boxplot_all_parameter(*data, parameter = "T"):
    """
    Multi-ion version of boxplot_parameter: every ion is drawn
    in its own subplot of one figure.
    Positional arguments are the dataframe lists followed by
    the ion names in the same order:
    df_list_1, ..., df_list_N, ion_1, ..., ion_N
    """

    # plot_for_all builds the figure and calls the single-ion function
    single_ion_plot = boxplot_parameter
    plot_for_all(*data, function=single_ion_plot, parameter=parameter)

In [None]:
# example call: molality ("c") boxplots of the Li and H electrolytes side by side
# (dataframe lists first, then the ion names in the same order)
boxplot_all_parameter(Li, H, "Li", "H", parameter = "c")

### Plots of speed of sound variation along different dimensions
Displays the variation of the speed of sound along either the temperature or the molality dimension.

In [None]:
def plot_sound(df_list, ion,*, parameter = "T", 
               subplot = None, pair_ions = None):
    """
    Plots variation in speed of sound with parameter.
    df_list stores dataframes for ion, ordered like the list
    returned by cation_anion_switch(ion).
    Every non-empty dataframe that has the columns parameter,
    "sound" and "c" is drawn in its own colour and labelled with its
    counter ion; dataframes lacking one of the columns are skipped.
    Figure axes can be passed through subplot parameter, otherwise
    a new figure is created.
    The number of plotted electrolytes can be limited through setting
    pair_ions parameter to a list of counter ions.
    A legend is added when the axes hold lines afterwards, otherwise
    "NA" is written in the middle of the axes.
    """
    ion_pairs = cation_anion_switch(ion)

    # list of colours so that every electrolyte has different color
    colors = ["r", "b", "k", "m", "c", "y", "orange"]

    if subplot is None:
        fig, subplot = plt.subplots(nrows=1, ncols=1)

    # plot dataframe by dataframe
    for n, (df, i) in enumerate(zip(df_list, ion_pairs)):
        if df.empty:
            continue
        # skip counter ions excluded through pair_ions
        if pair_ions is not None and i not in pair_ions:
            continue

        # a dataset without one of the needed columns is not plotted
        try:
            for needed in (parameter, "sound", "c"):
                df[needed]
        except KeyError:
            continue

        # wrap around instead of failing when there are more
        # datasets than colours
        color = colors[n % len(colors)]

        # last line drawn for THIS dataframe; kept per dataframe so
        # that a dataset yielding no groups can never relabel a line
        # of the previous dataset
        last_line = None

        # groups dataframes by molality to disconnect unrelated data
        for name, group in df.groupby("c"):
            # some dataframes have continuous molality change
            # and are plotted as points
            if group.shape[0] == 1:
                line_style = dict(marker='o', markersize=1)
            else:
                line_style = dict(linewidth=2)
            last_line, = subplot.plot(group[parameter].values,
                                      group["sound"].values,
                                      color=color, **line_style)

        # only one line per electrolyte is labelled by the counter ion,
        # which gives a single legend entry per electrolyte
        if last_line is not None:
            last_line.set_label("{0}".format(chem_repr[i]))

    # x-label is the input ion
    subplot.set(xlabel="{0}".format(chem_repr[ion]))

    # put legend lowerleft
    if subplot.lines:
        subplot.legend(loc = 3)

    # in case of no data being plotted
    # (missing dataframe in electrolyte matrix visualization)
    else:
        subplot.text(s="NA", x=0.5, y=0.5,
                     horizontalalignment='center',
                     verticalalignment='center', color='k',
                     transform=subplot.transAxes)


In [None]:
# example call: sound speed against temperature for the NaOH dataset only
# (pair_ions limits the sodium electrolytes to the listed counter ions)
plot_sound(Na, "Na", parameter = "T", pair_ions=["OH"])

In [None]:
def plot_all_sound(*data, parameter="T"):
    """
    Multi-ion version of plot_sound: every ion is drawn
    in its own subplot of one figure.
    Positional arguments are the dataframe lists followed by
    the ion names in the same order:
    df_list_1, ..., df_list_N, ion_1, ..., ion_N
    """
    # plot_for_all builds the figure and calls the single-ion function
    single_ion_plot = plot_sound
    plot_for_all(*data, function=single_ion_plot, parameter=parameter)

In [None]:
# example call: sound speed against temperature for the Li and Na
# electrolytes side by side
plot_all_sound(Li, Na, "Li","Na", parameter = "T")

## Data visualization matrix
This visualization reuses the previously defined plotting functions to show the data in the form of a matrix.
Rows correspond to cations, while columns correspond to anions. Therefore each cell is an electrolyte.

In [None]:
def plot_electrolyte_matrix(cations, anions, *data, 
                            typ = "boxplot", parameter = "T"):
    """
    This function provides a way to visualize all electrolyte data
    using matrix system.
    Cations form rows, while anions form columns.
    In every cell data is plotted for corresponding electrolyte
    using the plotting function selected by the typ parameter:
    "boxplot" (boxplot_parameter), "range" (plot_parameter_range)
    or "sound" (plot_sound). For any other typ a message is printed
    and nothing is plotted.
    data holds one list of dataframes per cation, in the order of
    cations; the dataframes of a list fill the columns of its row.
    parameter is forwarded to the plotting function.
    The figure size is 2 x 2 inches per cell; matplotlib's global
    rcParams are left untouched.
    Calling: plot_electrolyte_matrix(cations,anions, 
                                    *all_cation_dfs, **kwargs)
    """

    # check that accepted plotting type was used
    if typ not in ("boxplot", "range", "sound"):
        print("Accepted types are: boxplot, range, sound")
        return

    # assign correct plot function
    plot_function = {"boxplot": boxplot_parameter,
                     "sound": plot_sound,
                     "range": plot_parameter_range}[typ]

    # matrix dimensions
    rows = len(cations)
    cols = len(anions)
    df_cations = data[:rows]

    # the size is given to this figure only, so the global rcParams
    # need neither be changed nor be reset to the library defaults;
    # squeeze=False keeps ax two-dimensional for a single row / column
    fig, ax = plt.subplots(nrows=rows, ncols=cols,
                           sharey=True, sharex=True,
                           figsize=(2*cols, 2*rows), squeeze=False)

    # no white space between subplots
    plt.subplots_adjust(wspace = 0, hspace = 0)

    # plot row by row
    for row, (dfs, ion) in enumerate(zip(df_cations, cations)):
        for col, df in enumerate(dfs):
            cell = ax[row][col]
            # each cell shows one electrolyte, hence the one-item list
            plot_function([df], ion,
                          parameter = parameter,
                          subplot = cell)

            # remove legend created by plot_sound function
            if typ == "sound" and cell.lines:
                legend = cell.get_legend()
                if legend is not None:
                    legend.remove()

    # set cation / anion labels on y / x-axis
    for row in range(rows):
        ax[row][0].set_ylabel(chem_repr[cations[row]])
    for col in range(cols):
        ax[rows-1][col].set_xlabel(chem_repr[anions[col]])

        # remove x-ticks for boxplot and range type
        # (only sound has units there)
        if typ != "sound":
            ax[rows-1][col].set_xticks([])

    # add axis labels and title
    if typ == "sound":
        fig.suptitle("Matrix showing sound speed variation with {0}"
                     .format(parameter), y = 0.92)
        fig.text(0.05, 0.5, "Sound speed [m/s]", rotation=90)
        fig.text(0.5,0.04, "{0} [{1}]"
                 .format(parameter, units[parameter]))
    elif typ == "boxplot":
        fig.suptitle("Matrix showing boxplot for {0} parameter"
                     .format(parameter), y = 0.92)
    elif typ == "range":
        fig.suptitle("Matrix showing range of values for {0} parameter"
                     .format(parameter), y = 0.92)


### Electrolyte matrix plots for all three visualizations and parameters

__Variation of speed of sound with temperature__

In [None]:
# parameter is left at its default "T", so sound speed is plotted against temperature
plot_electrolyte_matrix(cations,anions, *all_cation_dfs, typ = "sound")

__Variation of speed of sound with molality__

In [None]:
# parameter = "c" puts molality on the x-axis
plot_electrolyte_matrix(cations,anions, *all_cation_dfs,
                        typ = "sound", parameter = "c")

__Range of temperature values__

In [None]:
# parameter is left at its default "T": min / max temperature per electrolyte
plot_electrolyte_matrix(cations,anions, *all_cation_dfs, typ = "range")

__Range of molality values__

In [None]:
# min / max molality ("c") per electrolyte
plot_electrolyte_matrix(cations,anions, *all_cation_dfs, 
                        typ = "range", parameter = "c")

__Range of sound speed values__

In [None]:
# min / max sound speed per electrolyte
plot_electrolyte_matrix(cations,anions, *all_cation_dfs, 
                        typ = "range", parameter = "sound")

__Boxplot of temperature values__

In [None]:
# parameter is left at its default "T": temperature boxplot per electrolyte
plot_electrolyte_matrix(cations,anions,*all_cation_dfs,typ = "boxplot")

__Boxplot of molality values__

In [None]:
# molality ("c") boxplot per electrolyte
plot_electrolyte_matrix(cations,anions, *all_cation_dfs, 
                        typ = "boxplot", parameter = "c")

__Boxplot of sound speed values__

In [None]:
# sound speed boxplot per electrolyte
plot_electrolyte_matrix(cations,anions, *all_cation_dfs, 
                        typ = "boxplot", parameter = "sound")