# How to select an objective function using information theory
Reproduce figure 3 of the paper.

In [None]:
# compute likelihood
import numpy as np
from scipy.stats import pearsonr
import scipy.stats


def normal_ll(y, y_hat, transform=None, gradient=1):
    '''Gaussian log likelihood of the residuals, with optional change of variable.

    The normal distribution is the formal likelihood for the mean squared
    error (MSE). When `transform` is given, the residuals are formed in the
    transformed space and the log of the absolute `gradient` is added as the
    change-of-variable correction.

    Parameters
    ----------
    y : array_like
        Observations. Must support ``-`` and yield an object with ``.std()``
        and ``.sum()`` (e.g. a NumPy array or a pandas Series).
    y_hat : array_like
        Predictions, same shape as `y`.
    transform : callable, optional
        Change of variable applied to both `y` and `y_hat`.
    gradient : scalar or array_like, optional
        Value(s) of the derivative of `transform`; only the absolute value is
        used. The default of 1 contributes nothing to the likelihood.

    Returns
    -------
    float
        The log likelihood, summed over all samples.

    Notes
    -----
    The scale is estimated with ``e.std()`` using the container's default
    degrees of freedom (NumPy: ddof=0, pandas: ddof=1), so the two input
    types do not give bit-identical results.

    Proof
    -----
    https://www.statlect.com/probability-distributions/normal-distribution
    '''
    if transform is None:
        obs, sim = y, y_hat
    else:
        obs, sim = transform(y), transform(y_hat)

    residual = obs - sim
    count = len(residual)
    scale = residual.std()

    # Change-of-variable (Jacobian) contribution.
    jacobian_term = np.sum(np.log(np.abs(gradient)))

    # The three classic pieces of the Gaussian log likelihood.
    scale_term = -count * np.log(scale)
    constant_term = count/2*np.log(2*np.pi)
    misfit_term = 1/(2*scale**2) * (residual**2).sum()

    return scale_term - constant_term - misfit_term + jacobian_term


def laplace_ll(y, y_hat, transform=None, gradient=1):
    '''Log likelihood for Laplace distribution with change of variable

    The laplace distribution is the formal likelihood for the mean absolute
    error (MAE).

    Parameters
    ----------
    y : array_like
        Observations (NumPy array or pandas Series).
    y_hat : array_like
        Predictions, same shape as `y`.
    transform : callable, optional
        Change of variable transformation applied to both `y` and `y_hat`.
    gradient : scalar or array_like, optional
        Value(s) of the derivative of `transform`; only the absolute value is
        used. The default of 1 contributes nothing to the likelihood.

    Returns
    -------
    float
        The log likelihood, summed over all samples.
    '''
    if transform is not None:
        y = transform(y)
        y_hat = transform(y_hat)

    # np.abs works for both NumPy arrays and pandas objects; the `.abs()`
    # method exists only on pandas objects.
    e = np.abs(y - y_hat)
    n = len(e)
    # Scale estimate of the Laplace distribution: the mean absolute error.
    b = e.mean()
    log_gradient = np.sum(np.log(np.abs(gradient)))
    ll = -n * np.log(2*b) - 1/b * e.sum() + log_gradient
    return ll
                                   

def msre_ll(y, y_hat):
    '''Log likelihood for mean squared square-root error

    Evaluates `normal_ll` after a square-root change of variable.

    Parameters
    ----------
    y : array_like
        Observations.
    y_hat : array_like
        Predictions.
    '''
    # Only the absolute value of the gradient is used by normal_ll, so the
    # leading minus sign has no effect on the result.
    sqrt_gradient = -1/(2*np.sqrt(y))
    return normal_ll(y, y_hat, transform=np.sqrt, gradient=sqrt_gradient)


def mare_ll(y, y_hat):
    '''Log likelihood for mean absolute square-root error

    Evaluates `laplace_ll` after a square-root change of variable.

    Parameters
    ----------
    y : array_like
        Observations.
    y_hat : array_like
        Predictions.
    '''
    # Only the absolute value of the gradient is used by laplace_ll, so the
    # leading minus sign has no effect on the result.
    sqrt_gradient = -1/(2*np.sqrt(y))
    return laplace_ll(y, y_hat, transform=np.sqrt, gradient=sqrt_gradient)


def lognormal_ll(y, y_hat):
    '''Lognormal log likelihood

    The lognormal distribution is the formal likelihood for the mean squared
    log error (MSLE). Evaluates `normal_ll` after a natural-log change of
    variable, whose gradient is ``1/y``.

    Parameters
    ----------
    y : array_like
        Observations.
    y_hat : array_like
        Predictions.
    '''
    log_gradient = 1/y
    return normal_ll(y, y_hat, transform=np.log, gradient=log_gradient)


def mspe_ll(y, y_hat):
    '''Log likelihood for mean squared percentage error

    Evaluates `normal_ll` after dividing both series by the observations.

    Parameters
    ----------
    y : array_like
        Observations.
    y_hat : array_like
        Predictions.
    '''
    def relative_to_obs(x):
        # Scale by the original observations captured from the enclosing call.
        return x/y

    # Only the absolute value of the gradient is used by normal_ll.
    return normal_ll(y, y_hat, transform=relative_to_obs, gradient=-1/(y**2))


def nse_ll(y, y_hat, group='gage_id'):
    '''Log likelihood for normalized squared error (NSE)

    NSE is equivalent to the Nash–Sutcliffe model efficiency coefficient.
    Both series are divided by the per-group standard deviation of the
    observations before the normal log likelihood is evaluated.

    Parameters
    ----------
    y : pandas.Series
        Observations. Must be groupable by `group` (e.g. an index level of
        that name).
    y_hat : pandas.Series
        Predictions, indexed like `y`.
    group : str, optional
        Key passed to ``y.groupby`` to define the groups over which the
        standard deviation is computed. Defaults to ``'gage_id'``.
    '''
    # Per-group standard deviation, broadcast back onto every row of `y`.
    sigma_o = y.groupby(group).transform(lambda x: x.std())
    return normal_ll(y, y_hat, transform=lambda x: x/sigma_o, gradient=1/sigma_o)


def loglaplace_ll(y, y_hat):
    '''Log likelihood for log Laplace distribution

    Evaluates `laplace_ll` after a natural-log change of variable, whose
    gradient is ``1/y``.

    Parameters
    ----------
    y : array_like
        Observations.
    y_hat : array_like
        Predictions.
    '''
    log_gradient = 1/y
    return laplace_ll(y, y_hat, transform=np.log, gradient=log_gradient)


def uniform_ll(y, y_hat):
    '''Log likelihood for uniform distribution.

    The uniform log likelihood minimizes the maximum error.

    Parameters
    ----------
    y : array_like
        Observations.
    y_hat : array_like
        Predictions.

    Returns
    -------
    float
        ``-n * log(max |y - y_hat|)``.
    '''
    residual = y - y_hat
    abs_error = np.abs(residual)
    # The support is taken as [0, max error]; the standard formulation would
    # use (max error - min error) as the width instead.
    width = abs_error.max()
    return -len(abs_error) * np.log(width)


def bernoulli_ll(y, y_hat, groupby=None):
    '''Placeholder for a Bernoulli log likelihood (not implemented).

    TODO: implement and use within zi_ll. Currently performs no computation
    and returns None.

    Parameters
    ----------
    y : array_like
    y_hat : array_like
    groupby : optional
        Unused.
    '''
    return None



def zi_ll(y, y_hat, ll=normal_ll, threshold=0.01, groupby=None):
    '''Zero-inflated log likelihood.

    Values at or below `threshold` are treated as zero flow. The result is
    the sum of two parts:

    1. a zero-flow term ``n1*log(rho) + n2*log(1 - rho)``, where ``n1``
       counts samples in which both observation and prediction are zero,
       ``n2`` counts samples in which exactly one of them is zero, and
       ``rho = n1 / (n1 + n2)``;
    2. `ll` evaluated on the samples in which both observation and
       prediction are above the threshold.

    Samples counted in ``n2`` are therefore not passed to `ll`.

    Parameters
    ----------
    y : array_like
        Observations.
    y_hat : array_like
        Predictions.
    ll : function
        Log likelihood ``ll(y, y_hat)`` applied to the non-zero samples.
    threshold : float
        Value at or below which a flow is treated as zero.
    groupby : string
        Optional groupby term (testing). When given, ``n1`` and ``n2`` are
        counted per group, which requires pandas inputs.
    '''
    # Boolean masks marking zero flow in the observations and predictions.
    y_o = y <= threshold
    y_hat_o = y_hat <= threshold
    
    if groupby is None:
        n1 = (y_o & y_hat_o).sum() # both zero: correct zero-flow prediction
        n2 = (y_o ^ y_hat_o).sum() # exactly one zero: incorrect zero-flow prediction
    else:
        n1 = (y_o & y_hat_o).groupby(groupby).sum() # correct zero-flow prediction, per group
        n2 = (y_o ^ y_hat_o).groupby(groupby).sum() # incorrect zero-flow prediction, per group

    n3 = (~y_o & ~y_hat_o) # mask (not a count): both above threshold
    
    # Fraction of zero-flow cases that were predicted correctly; set to 0
    # where there are no zero-flow cases at all. Note that n1 / (n1 + n2) is
    # still evaluated for every element before np.where selects, so NumPy may
    # emit a divide warning when n1 + n2 == 0.
    rho = np.where( (n1+n2) == 0, 0, n1 / (n1 + n2))
    n_rho = 1-rho
    
    # n1 * np.log(rho) + n2 * np.log(1-rho), with the masks skipping the
    # entries whose log argument would be zero.
    ll_zero = n1[rho!=0] * np.log(rho[rho!=0]) + n2[n_rho!=0]* np.log(n_rho[n_rho!=0])
    
    return ll_zero.sum() + ll(y[n3], y_hat[n3])


def zilognormal_ll(y, y_hat):
    '''Log likelihood for zero-inflated lognormal.

    Delegates to `zi_ll` with `lognormal_ll` for the non-zero samples and a
    zero-flow threshold of 0.01.

    Parameters
    ----------
    y : array_like
        Observations.
    y_hat : array_like
        Predictions.
    '''
    return zi_ll(y, y_hat, threshold=0.01, ll=lognormal_ll)


def ziloglaplace_ll(y, y_hat):
    '''Log likelihood for zero-inflated log Laplace.

    Delegates to `zi_ll` with `loglaplace_ll` for the non-zero samples and a
    zero-flow threshold of 0.01.

    Parameters
    ----------
    y : array_like
        Observations.
    y_hat : array_like
        Predictions.
    '''
    return zi_ll(y, y_hat, threshold=0.01, ll=loglaplace_ll)


Redefine the NSE log likelihood so that `sigma_o` can be passed in as an argument; this keeps `sigma_o` out of the bootstrap (it is not re-estimated from each resample).

In [None]:

def nse_ll(y, y_hat, sigma_o=None, group='gage_id'):
    '''Log likelihood for normalized squared error (NSE)

    NSE is equivalent to the Nash–Sutcliffe model efficiency coefficient.
    Both series are divided by `sigma_o` before the normal log likelihood is
    evaluated.

    Parameters
    ----------
    y : array_like
        Observations. Must be a pandas Series groupable by `group` when
        `sigma_o` is not supplied.
    y_hat : array_like
        Predictions, indexed like `y`.
    sigma_o : array_like, optional
        Standard deviation of the observations for every sample, aligned
        with `y`. When omitted it is computed from `y` as the per-group
        population standard deviation (``ddof=0``).
    group : str, optional
        Key passed to ``y.groupby`` when `sigma_o` has to be computed.
        Defaults to ``'gage_id'``.
    '''
    if sigma_o is None:
        # Per-group standard deviation, broadcast back onto every row of `y`.
        sigma_o = y.groupby(group).transform(lambda x: x.std(ddof=0))

    return normal_ll(y, y_hat, transform=lambda x: x/sigma_o, gradient=1/sigma_o)


In [None]:
# read local copy
import pandas as pd
import numpy as np

# Load the local copy of the data set.
df = pd.read_parquet('gages2_nndar.parquet')
# Floor every entry below 0.01 at 0.01, so all values are strictly positive.
# 0.01 is also the zero-flow threshold used by the zero-inflated likelihoods.
df[df < 0.01] = 0.01

# Per-gage standard deviation of the observations, broadcast to every row.
# NOTE(review): x.std() uses the pandas default (ddof=1), whereas nse_ll uses
# ddof=0 when it computes sigma_o itself -- confirm this is intended.
sigma_global = df['obs'].groupby('gage_id').transform(lambda x: x.std())


# create a convenience function for nse log likelihood that uses the global variance
def nse_g(y, y_hat):
    '''NSE log likelihood using an externally supplied `sigma_o`.

    `sigma_o` is not a parameter: it is looked up as a module-level name at
    call time, so it must have been assigned before this function is called.

    Parameters
    ----------
    y : array_like
        Observations.
    y_hat : array_like
        Predictions.
    '''
    return nse_ll(y, y_hat, sigma_o=sigma_o)

In [None]:
# Registry of objective functions: short key -> display name and the
# log-likelihood function 'f' that implements it. The row order of obj_df
# follows the order of the entries below.
objectives = {
    key: {'name': label, 'f': func}
    for key, label, func in (
        ('U', 'uniformly distributed error', uniform_ll),
        ('MSE', 'mean squared error', normal_ll),
        ('NSE', 'normalized squared error', nse_g),
        ('MAE', 'mean absolute error', laplace_ll),
        ('MSPE', 'mean squared percent error', mspe_ll),
        ('MSLE', 'mean squared log error*', lognormal_ll),
        ('MALE', 'mean absolute log error*', loglaplace_ll),
        ('ZMSLE', 'zero-inflated MSLE', zilognormal_ll),
        ('ZMALE', 'zero-inflated MALE', ziloglaplace_ll),
        ('MARE', 'mean absolute square root error', mare_ll),
    )
}

# One row per objective, with columns 'name' and 'f'.
obj_df = pd.DataFrame.from_dict(objectives, orient='index')

In [None]:
# break the bootstrapping into two chunks, in case anything fails
output = pd.DataFrame()
filename = 'convergence_bootstrap.csv'

# first run
columns = ['MSPE', 'U', 'MSE', 'NSE', 'MAE', 'MSLE', 'MARE', 'ZMSLE', 'MALE', 'ZMALE']
results_dict = {k: [] for k in columns}
results_dict['n'] = []

# One global (non-bootstrapped) standard deviation per gage. sigma_global is
# constant within a gage, so the first value of each group is that gage's sigma.
sigma_by_gage = sigma_global.groupby('gage_id').first()

for i in range(30, 3000, 10):
    print(i)
    # Bootstrap resample: i rows per gage, drawn with replacement; the seed is
    # tied to i so every sample size is reproducible.
    temp_df = df.groupby('gage_id').sample(i, replace=True, random_state=12345 * i)

    # nse_g reads the module-level name `sigma_o`. It has to carry exactly the
    # same index as temp_df: pandas aligns `x / sigma_o` on index labels, so
    # taking the first i rows per gage (`head(i)`) would leave NaN for every
    # resampled row that is not among those first i rows.
    sampled_gages = temp_df.index.get_level_values('gage_id')
    sigma_o = pd.Series(
        sigma_by_gage.reindex(sampled_gages).to_numpy(),
        index=temp_df.index,
    )

    # Negative log likelihood per sample, converted from nats to bits.
    for index, row in obj_df.iterrows():
        results_dict[index].append(
            -row.f(temp_df['obs'], temp_df['NNDAR']) / len(temp_df) / np.log(2)
        )

    results_dict['n'].append(i)

    # Rewrite the CSV after every sample size so partial results survive a crash.
    output = pd.DataFrame(data=results_dict)
    output.to_csv(filename)

In [None]:
# second_run
output = pd.DataFrame()
filename = 'convergence_bootstrap_2.csv'

columns = ['MSPE', 'U', 'MSE', 'NSE', 'MAE', 'MSLE', 'MARE', 'ZMSLE', 'MALE', 'ZMALE']
results_dict = {k: [] for k in columns}
results_dict['n'] = []

# One global (non-bootstrapped) standard deviation per gage. sigma_global is
# constant within a gage, so the first value of each group is that gage's sigma.
sigma_by_gage = sigma_global.groupby('gage_id').first()

for i in range(3000, 5010, 10):
    print(i)
    # Bootstrap resample: i rows per gage, drawn with replacement; the seed is
    # tied to i so every sample size is reproducible.
    temp_df = df.groupby('gage_id').sample(i, replace=True, random_state=12345 * i)

    # nse_g reads the module-level name `sigma_o`. It has to carry exactly the
    # same index as temp_df: pandas aligns `x / sigma_o` on index labels, so
    # taking the first i rows per gage (`head(i)`) would leave NaN for every
    # resampled row that is not among those first i rows.
    sampled_gages = temp_df.index.get_level_values('gage_id')
    sigma_o = pd.Series(
        sigma_by_gage.reindex(sampled_gages).to_numpy(),
        index=temp_df.index,
    )

    # Negative log likelihood per sample, converted from nats to bits.
    for index, row in obj_df.iterrows():
        results_dict[index].append(
            -row.f(temp_df['obs'], temp_df['NNDAR']) / len(temp_df) / np.log(2)
        )

    results_dict['n'].append(i)

    # Rewrite the CSV after every sample size so partial results survive a crash.
    output = pd.DataFrame(data=results_dict)
    output.to_csv(filename)

In [None]:
# join the chunks
import pandas as pd

# Load the two bootstrap result files written by the runs above.
# NOTE: this rebinds `df`, which previously held the observation data.
df = pd.read_csv('convergence_bootstrap.csv')
df2 = pd.read_csv('convergence_bootstrap_2.csv')

# Stack the second chunk under the first and renumber the rows 0..N-1.
df = pd.concat([df, df2], ignore_index=True)

## make plot

In [None]:
import matplotlib.pyplot as plt


import matplotlib

# Serif text and extra-small tick labels on both axes.
matplotlib.rcParams['font.family'] = 'serif'
matplotlib.rcParams['xtick.labelsize'] = 'x-small'
matplotlib.rcParams['ytick.labelsize'] = 'x-small'

# Font type 42 embeds TrueType fonts in PDF/PS output (instead of Type 3).
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

In [None]:
# Objective columns in plotting order, followed by the sample-size column.
columns = [
    'MSPE', 'U', 'MSE', 'NSE', 'MAE',
    'MSLE', 'MARE', 'ZMSLE', 'MALE', 'ZMALE',
    'n',
]

# Keep only those columns and index the table by sample size.
df = df[columns].set_index('n')

In [None]:
# Reference value per objective: the mean over the last five rows.
solution = df.iloc[-5:].mean()

# Absolute deviation of every row from that reference, column by column.
df_deviations = df.sub(solution, axis='columns').abs()

In [None]:
# 2 x 5 grid of panels, one per objective; 5.51181 in is 140 mm wide.
fig, axes = plt.subplots(2, 5, figsize=(5.51181, 2.7))

# Draw each column of df_deviations into its own panel, as markers only.
df_deviations.plot(subplots=True, ls='', marker='.', ax=axes, color=['#af2d46'], markersize=1)


# Apply the same styling to every panel; `i` is the row index (0 = top, 1 = bottom).
for i, ax_r in enumerate(axes):
    #ax_r[0].set_ylabel('Divergence in bits')
    for ax in ax_r:
        # Use the legend purely as a text label: no frame, zero-length handle,
        # and the handle artists themselves hidden.
        # NOTE(review): `legend_handles` is the Matplotlib >= 3.7 attribute name.
        leg = ax.legend(frameon=False, loc='upper right', handlelength=0, handletextpad=0, fontsize=8)
        for item in leg.legend_handles:
            item.set_visible(False)
        # Common axis limits for all panels; deviations above 2 are not shown.
        ax.set_ylim((0,2))
        ax.set_xlim((0,5000))
        # Drop the top/right spines and offset the remaining two outward by 5 points.
        ax.spines[['right', 'top']].set_visible(False)
        ax.spines[['bottom','left']].set_position(('outward',5))
        #ax.spines[['bottom']].set_position(('outward',5))

        # Hide the y axis on every panel; it is re-enabled for the first column below.
        ax.get_yaxis().set_ticks([])
        ax.get_yaxis().set_visible(False)

        # Top row: no x axis at all.
        if i ==0:
            ax.get_xaxis().set_ticks([])
            ax.get_xaxis().set_visible(False)

        # Bottom row: three x ticks only.
        if i==1:
            ax.get_xaxis().set_ticks([0,2500,5000])
        
        # Remove the per-panel x label set by pandas and make every panel square.
        ax.set_xlabel(None)
        #ax.aspect_ratio('equal')
        ax.set_box_aspect(1)


# Shared axis labels are attached to the bottom-left panel and positioned by
# hand in that panel's axes coordinates (presumably to centre them on the
# whole grid -- confirm visually).
axes[1][0].set_ylabel('Absolute error in bits', fontsize=8)
axes[1][0].yaxis.set_label_coords(-0.5, 1.45)
#axes[1][0].annotate('Absolute error in bits', (0.03,0.5), xycoords='figure fraction', fontsize=8, rotation=90)
#axes[1][0].annotate('Sample size (n)', (0.5, 0), xycoords='figure fraction', fontsize=8)
axes[1][0].set_xlabel('Sample size (n)', fontsize=8)
axes[1][0].xaxis.set_label_coords(3.6, -0.55)

# Re-enable the y axis, with ticks at 0, 1 and 2, on the first column of both rows.
axes[1][0].get_yaxis().set_ticks([0,1,2])
axes[1][0].get_yaxis().set_visible(True)
axes[0][0].get_yaxis().set_ticks([0,1,2])
axes[0][0].get_yaxis().set_visible(True)


# Widen the horizontal gap between panels, then write the figure to disk.
fig.subplots_adjust(wspace=0.5)
fig.savefig('figure3.pdf')