In [2]:
%load_ext autoreload
%autoreload 2
import sympy as sp
from sympy.printing import latex
from sympy.printing.pycode import pycode
import numpy as np
from scipy.integrate import quad
import definitions as defs
import theorems as thms

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

# Symbolic verification of the analysis in Section 4.2

## Theorem 1
We start with a symbolic verification of Theorem 1.

In [3]:
# Positive, real sympy symbols shared by the cells below.
# d_q is the query length (it is the quantity optimised in Theorem 2); d_e is
# presumably the event length and gamma the annotator presence threshold --
# TODO confirm against the paper's notation.
d_e, d_q, gamma = sp.symbols('d_e, d_q, gamma', real=True, positive=True)

In [4]:
# Assumption 1. The annotator presence criterion can be fulfilled (d_q >= gamma*d_e)

def _trapezoid_area(f_left, f_right, t_left, t_right):
    """Return the trapezoid area (f_left + f_right)/2 * (t_right - t_left).

    The factor 1/2 is an exact sympy Rational, so plain Python integer
    function values (e.g. 1 and 1) are not turned into the float 1.0 by
    Python's true division before sympy sees them.
    """
    return sp.Rational(1, 2) * (f_left + f_right) * (t_right - t_left)


# Case i: d_e >= d_q
# The timings for the discontinuities (from Table 2, Appendix A.1)
t0_i = 0
t1_i = gamma * d_e
t2_i = d_q
t3_i = d_e

# The function values at the discontinuities (from Table 2, Appendix A.1)
f_t0_i = 1
f_t1_minus_i = (d_q - gamma*d_e)/d_q
f_t1_plus_i = gamma*d_e/d_q
f_t2_i = 1
f_t3_i = 1

# The areas as functions of d_e, d_q and gamma (from Eq. 25-27, Appendix A.1)
A1_i = _trapezoid_area(f_t0_i, f_t1_minus_i, t0_i, t1_i)
A2_i = _trapezoid_area(f_t1_plus_i, f_t2_i, t1_i, t2_i)
A3_i = _trapezoid_area(f_t2_i, f_t3_i, t2_i, t3_i)
A_i = 2*A1_i + 2*A2_i + A3_i  # (from Eq. 21, Appendix A.1)

expected_label_accuracy_given_overlap_i = A_i / (d_e + d_q)  # (Eq. 28, Appendix A.1)
expected_label_accuracy_given_overlap_i = sp.simplify(expected_label_accuracy_given_overlap_i)

# Case ii: d_e < d_q
# The timings for the discontinuities (from Table 2, Appendix A.1)
t0_ii = 0
t1_ii = gamma * d_e
t2_ii = d_e
t3_ii = d_q

# The function values at the discontinuities (from Table 2, Appendix A.1)
f_t0_ii = 1
f_t1_minus_ii = (d_q - gamma*d_e)/d_q
f_t1_plus_ii = gamma*d_e/d_q
f_t2_ii = d_e/d_q
f_t3_ii = d_e/d_q

# The areas as functions of d_e, d_q and gamma (from Eq. 25-27, Appendix A.1)
A1_ii = _trapezoid_area(f_t0_ii, f_t1_minus_ii, t0_ii, t1_ii)
A2_ii = _trapezoid_area(f_t1_plus_ii, f_t2_ii, t1_ii, t2_ii)
A3_ii = _trapezoid_area(f_t2_ii, f_t3_ii, t2_ii, t3_ii)
A_ii = 2*A1_ii + 2*A2_ii + A3_ii  # (from Eq. 21, Appendix A.1)

expected_label_accuracy_given_overlap_ii = A_ii / (d_e + d_q)  # (Eq. 28, Appendix A.1)
expected_label_accuracy_given_overlap_ii = sp.simplify(expected_label_accuracy_given_overlap_ii)

# Assert that the two cases are equal
assert expected_label_accuracy_given_overlap_i.equals(expected_label_accuracy_given_overlap_ii), "The two cases should be equal"
print("Theorem 1: if d_q >= gamma*d_e, then the expected label accuracy given overlap is: ")
expected_label_accuracy_given_overlap_ii

Theorem 1: if d_q >= gamma*d_e, then the expected label accuracy given overlap is: 


d_e*(-2*d_e*gamma**2 + 2*d_q*gamma + d_q)/(d_q*(d_e + d_q))

In [5]:
# Assumption 2. The annotator presence criterion can not be fulfilled (d_q < gamma*d_e)

# The timings for the discontinuities (Appendix A.1, Assumption 2)
t_0 = 0
t_1 = d_q
t_2 = d_e
t_3 = d_e + d_q

# The function values at the discontinuities (Appendix A.1, Assumption 2).
# Exact sympy integers are used so that the division by 2 below stays a
# rational 1/2 instead of the Python float 0.5 (which would leak a spurious
# 1.0 factor into the result).
f_t0 = sp.Integer(1)
f_t1 = sp.Integer(0)
f_t2 = sp.Integer(0)
f_t3 = sp.Integer(1)

# The areas as functions of d_e, d_q and gamma (from Figure 16, Appendix A.1)
A_1 = (f_t0 + f_t1)/2 * (t_1 - t_0)

# With the values listed above, the segment [t_2, t_3] has the same area as A_1
# and the segment [t_1, t_2] contributes zero, hence the total area 2*A_1.
expected_label_accuracy_given_overlap_no_presence = 2*A_1 / (d_e + d_q)
print("Theorem 1: if d_q < gamma*d_e, then the expected label accuracy given overlap is: ")
# Note: both cases (presumably d_e >= d_q and d_e < d_q) yield the same expression.
sp.simplify(expected_label_accuracy_given_overlap_no_presence)

Theorem 1: if d_q < gamma*d_e, then the expected label accuracy given overlap is: 


1.0*d_q/(d_e + d_q)

## Theorem 2

In [7]:
# Verifying some key steps from the proof of Theorem 2 in Appendix A.2

# Expanding, collecting and rearranging the terms to form a quadratic equation in d_q
exp1 = (2*gamma + 1)*d_q*(d_e + d_q) - (-2*d_e*gamma**2 + 2*d_q*gamma + d_q)*(d_e + 2*d_q)
exp2 = (-2*gamma - 1)*d_q**2 + 4*gamma**2 * d_e * d_q + 2*d_e**2 * gamma**2
assert exp1.equals(exp2), "The two expressions should be equal"

# Both sides of the equation are multiplied by -1; now we have a quadratic function in d_q
exp3 = -exp2

# Solve exp3 for d_q using sympy
d_q_opt = sp.solve(exp3, d_q)

# Choose the critical point that makes d_q > 0. The root is selected by its sign
# at a sample point (d_e = 1, gamma = 1/2) rather than by its position in the
# list, because the order and number of roots returned by sp.solve is not
# something this cell should depend on.
positive_roots = [
    root for root in d_q_opt
    if float(root.subs({d_e: 1, gamma: sp.Rational(1, 2)})) > 0
]
assert len(positive_roots) == 1, "Expected exactly one positive root for d_q"
positive_roots[0]  # the positive sign solution, since d_q > 0

d_e*gamma*(2*gamma + sqrt(4*gamma**2 + 4*gamma + 2))/(2*gamma + 1)

In [8]:
# Numerical verification: find the critical points of the expected label accuracy
# with respect to d_q, compute the second derivative, and evaluate it numerically
# at each critical point to identify the local maxima.
derivative = sp.diff(expected_label_accuracy_given_overlap_ii, d_q)
critical_points = sp.solve(derivative, d_q)
second_derivative = sp.diff(derivative, d_q)

# Grid of gamma values used for the sign check (same grid for every critical point)
gamma_vals = np.linspace(0.0001, 0.999, 1000)

# Evaluate the second derivative at each critical point to find local maxima
local_maxima = []
for point in critical_points:
    # d_e is fixed to 1 for the numerical check; only gamma is varied
    second_derivative_at_point = second_derivative.subs(d_q, point).subs(d_e, 1)
    vs = [second_derivative_at_point.subs(gamma, gamma_val).evalf() for gamma_val in gamma_vals]

    # A critical point is kept only if the second derivative is negative on the whole grid
    if all(v < 0 for v in vs):
        local_maxima.append(point)

assert local_maxima, "No critical point with a negative second derivative was found"

q_max = local_maxima[0]
q_max = sp.simplify(q_max)
print("Theorem 2: The optimal query length is given by")
q_max

Theorem 2: The optimal query length is given by


d_e*gamma*(2*gamma + sqrt(4*gamma**2 + 4*gamma + 2))/(2*gamma + 1)

## Theorem 3

In [12]:
# Closed-form maximum as stated in Theorem 3
f_max_paper = 1 + 2*gamma*(2*gamma + 1 - sp.sqrt(4*gamma**2 + 4*gamma + 2))

# Plug the optimal query length q_max into the Theorem 1 expression and simplify
f_max = sp.simplify(
    expected_label_accuracy_given_overlap_ii.subs({d_q: q_max})
)

# Equality of the two expressions verifies Theorem 3.
assert f_max.equals(f_max_paper), "The two expressions should be equal"
print("Theorem 3: The maximum expected label accuracy given overlap is: ")
f_max_paper

Theorem 3: The maximum expected label accuracy given overlap is: 


2*gamma*(2*gamma - sqrt(4*gamma**2 + 4*gamma + 2) + 1) + 1

In [10]:
# Verification of the simplification that is omitted in the proof in Appendix A.3

# Define the left-hand side (LHS)
lhs = (
    sp.sqrt(4*gamma**2 + 4*gamma + 2) * (2*gamma + 1)**2
) / (
    (2*gamma + sp.sqrt(4*gamma**2 + 4*gamma + 2)) *
    (2*gamma + 1 + 2*gamma**2 + gamma*sp.sqrt(4*gamma**2 + 4*gamma + 2))
)

# Define the right-hand side (RHS)
rhs = 2*gamma * (2*gamma + 1 - sp.sqrt(4*gamma**2 + 4*gamma + 2)) + 1

# Simplified difference, reported if the identity check below fails
difference = sp.simplify(lhs - rhs)

# Fail loudly (like the other verification cells) instead of only printing the result
identity_holds = lhs.equals(rhs)
assert identity_holds, f"LHS and RHS should be equal, but differ by: {difference}"
print("LHS equals RHS:", identity_holds)

LHS equals RHS: True


## Theorem 5
An additional result in which we combine everything to express the expected label accuracy of an audio recording of length $T$ containing $M$ events, given that the events are spaced at least $d_q$ apart.

In [17]:
# T and M are the symbols for the recording-level expression below
T, M = sp.symbols('T M', positive=True, real=True)

# M overlap regions each contribute the area A_ii; the remaining length
# T - M*(d_e + d_q) enters with weight 1. The sum is normalised by T.
expected_label_accuracy_all_cases = (A_ii*M + T-M*(d_e + d_q))/T
expected_label_accuracy_all_cases = sp.simplify(expected_label_accuracy_all_cases)
# The message describes the whole recording: this expression is not conditioned on overlap.
print("Theorem 5: The expected label accuracy of the whole recording is: ")
expected_label_accuracy_all_cases

Theorem 5: The expected label accuracy given overlap is: 


-2*M*d_e**2*gamma**2/(T*d_q) + 2*M*d_e*gamma/T - M*d_q/T + 1

# Relative Interpretation $d_q = \delta d_e$


In [18]:
# Relative interpretation: substitute d_e = d and d_q = delta*d (delta = d_q/d_e)
d, delta = sp.symbols('d delta', positive=True, real=True)
expected_label_accuracy_given_overlap_ratio = expected_label_accuracy_given_overlap_ii.subs({d_e: d, d_q: delta*d})
expected_label_accuracy_given_overlap_ratio = sp.simplify(expected_label_accuracy_given_overlap_ratio)
# This provides a relative interpretation of the expected label accuracy in case of overlap.
print("When delta = d_q/d_e theorem 1 can be re-written as")
expected_label_accuracy_given_overlap_ratio

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
When delta = d_q/d_q theorem 1 can be re-written as


(2*delta*gamma + delta - 2*gamma**2)/(delta*(delta + 1))

In [19]:
# Rewrite Theorem 5 in terms of the ratio delta by substituting d_e = d and d_q = delta*d
expected_label_accuracy_all_cases_ratio = expected_label_accuracy_all_cases.subs({d_e: d, d_q: delta*d})
expected_label_accuracy_all_cases_ratio = sp.simplify(expected_label_accuracy_all_cases_ratio)
print("When delta = d_q/d_e Theorem 5 can be re-written as")
expected_label_accuracy_all_cases_ratio

When delta = d_q/d_q Theorem 5 can be re-written as


(M*d*delta*(-delta + 2*gamma) - 2*M*d*gamma**2 + T*delta)/(T*delta)