# "Bayesian Parameter Estimation"
> "We follow Chapters 2 and 3 of Sivia and Skilling's book to illustrate a simple Bayesian Parameter Estimation workflow"

- toc: true
- branch: master
- badges: true
- comments: true
- author: John J. Molina
- categories: [Data Analysis, Parameter Estimation]

In [3]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

$$\begin{align}
P(X\lvert I) + P(\overline{X}\lvert I) &= 1 &\textrm{(Sum Rule)}\label{e:sum_rule}\\
P(X,Y\lvert I) &= P(X\lvert Y, I)\times P(Y\lvert I) &\textrm{(Product Rule)}\notag\\
&=P(Y\lvert X, I)\times P(X\lvert I) \label{e:product_rule}
\end{align}$$

$$\begin{align*}
\overbrace{P(X \lvert Y, I)}^{\small{\mathrm{posterior}}} &= \frac{\overbrace{P(Y\lvert X,I)}^{\small{\mathrm{likelihood}}} 
                                          \times\overbrace{P(X\lvert I)}^{\small{\mathrm{prior}}}}{\underbrace{P(Y\lvert I)}_{\small{\mathrm{evidence}}}}&\textrm{(Bayes' Theorem)}\label{e:bayes} \\
P(X\lvert I) &= \int\textrm{d}Y P(X, Y\lvert I) =\int\textrm{d}Y P(X\lvert Y, I)\times P(Y\lvert I) &\textrm{(Marginalization)}
\label{e:marginalization}
\end{align*}$$

$$\begin{align*}
P(\textrm{Hypothesis}\lvert \textrm{Data}, I) &= \frac{P(\textrm{Data}\lvert \textrm{Hypothesis}, I)\times P(\textrm{Hypothesis}\lvert I)}{P(\textrm{Data}\lvert I)}\label{e:bayes_hypothesis}
\end{align*}$$

$$\begin{align*}
P(\Theta\lvert D, I) &= \frac{P(D\lvert\Theta, I) \times P(\Theta\lvert I)}{P(D\lvert I)} \label{e:bayes_theta}\\
&\propto P(D\lvert\Theta, I)\times P(\Theta\lvert I)
\end{align*}$$

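Assume that the data consist of $N$ measurements, $D = \{D_k\}_{k=1}^N$, which are independent of each other given the parameters $\Theta$, i.e., $P(D_k\lvert D_j, \Theta, I) = P(D_k\lvert \Theta, I)$ for $j\neq k$.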

$$\begin{align*}
P(\Theta\lvert D,I) &\propto P(\{D_k\}_{k=1}^N\lvert \Theta, I)\times P(\Theta\lvert I) \\
&= P(\{D_k\}_{k=2}^N\lvert D_1, \Theta, I) \times P(D_1\lvert \Theta, I)\times P(\Theta\lvert I) \\
&= P(\{D_k\}_{k=2}^N\lvert \Theta, I)\times P(D_1\lvert \Theta, I)\times P(\Theta\lvert I) \\ 
&= P(\{D_k\}_{k=3}^N\lvert \Theta, I)\times P(D_2\lvert \Theta, I)\times P(D_1\lvert \Theta, I) \times P(\Theta\lvert I) \\
&\vdots \\
&= \left(\Pi_{k=1}^N P(D_k\lvert \Theta, I)\right) \times P(\Theta\lvert I)
\end{align*}$$

$$\begin{align*}
P(\Theta\lvert D,I)&\propto \big(\Pi_{k=m+1}^{N} P(D_k\lvert \Theta, I)\big) \big(\Pi_{j=1}^{m} P(D_j\lvert\Theta, I)\big) P(\Theta\lvert I) \\
&\propto\big(\Pi_{k=m+1}^{N} P(D_k\lvert \Theta, I)\big) \times P(\Theta\lvert \{D_j\}_{j=1}^m, I)
\end{align*}$$

# Laplace's Approximation, Best Estimates and Error Bars

$$\begin{align*}
L &= \ln{P(\Theta\lvert D, I)} = \ln P \\
&= L\lvert_{\Theta_0} + \nabla L\lvert_{\Theta_0}\cdot(\Theta - \Theta_0) 
+ \frac{1}{2} (\Theta-\Theta_0)^{t}\cdot\nabla\nabla L\lvert_{\Theta_0}\cdot(\Theta-\Theta_0) + \mathcal{O}\big((\Theta-\Theta_0)^3\big) \\
&\simeq L\lvert_{\Theta_0} + \frac{1}{2} (\Theta-\Theta_0)^{t}\cdot\nabla\nabla L\lvert_{\Theta_0}\cdot(\Theta-\Theta_0)
\end{align*}$$

$$\begin{align*}
P(\Theta\lvert D,I)\propto \exp{\left[-\frac{1}{2} \left(\Theta-\Theta_0\right)^t \cdot 
\left(-\nabla\nabla L\lvert_{\Theta_0}\right)\cdot\left(\Theta-\Theta_0\right)\right]}
\end{align*}$$

$$\begin{align*}
P_{\textrm{Gaussian}}(A \lvert \mu, \Sigma) &=
\frac{1}{\sqrt{(2\pi)^d \det{\Sigma}}} \exp{\left[-\frac{1}{2}\left(A-\mu\right)^t \cdot \Sigma^{-1}\cdot\left(A-\mu\right)\right]} \\
\left\langle A \right\rangle &= \mu \\
\left\langle(A^i - \mu^i)(A^j - \mu^j)\right\rangle &= \Sigma^{ij}
\end{align*}$$

$$\begin{align*}
A = \mu\pm \sqrt{\textrm{diag}({\Sigma})}
\end{align*}$$

$$\begin{align*}
\mu&\rightarrow \Theta_0 \\
\Sigma&\rightarrow (-\nabla\nabla L\lvert_{\Theta_0})^{-1}
\end{align*}$$

$$\begin{align*}
P(\Theta\lvert D, I) &\propto P(D\lvert \Theta, I)\times P(\Theta\lvert I) \\
&\propto \Pi_k P(y_k\lvert \Theta, I) \\
&=\Pi_k \frac{1}{\sqrt{2\pi} \sigma_k} \exp{\left[-\frac{\big(y_k - f(X_k; \Theta)\big)^2}{2\sigma_k^2}\right]} \\
\end{align*}$$

$$\begin{align*}
L &= \textrm{constant} -\sum_k \frac{\left(y_k - f(X_k; \Theta)\right)^2}{2\sigma_k^2} \\
&= \textrm{constant} - \frac{1}{2}\chi^2
\end{align*}$$

$$\begin{align*}
\nabla L = -\frac{1}{2}\nabla \chi^2 = 0 \Longrightarrow \nabla\chi^2 = 0
\end{align*}$$

$$\begin{align}
\nabla\nabla L = -\frac{1}{2}\nabla\nabla\chi^2
\end{align}$$

$$\begin{align}
\Sigma = 2 \left(\nabla\nabla\chi^2\lvert_{\Theta_0}\right)^{-1}
\end{align}$$

$$\begin{align}
P(\Theta\lvert D, I) &=\int\textrm{d}\sigma\, P(\Theta, \sigma\lvert D, I)\\
&\propto \int\textrm{d}\sigma\, P(D\lvert \Theta, \sigma, I) \times P(\Theta, \sigma\lvert I)
\end{align}$$


$$\begin{align}
P(\Theta\lvert D, I) &\propto \int_0^\infty\textrm{d}\sigma \left(\Pi_k \frac{1}{\sqrt{2\pi}\sigma} \exp{\left[-\frac{\left(y_k - f(X_k; \Theta)\right)^2}{2\sigma^2}\right]}\right) \times \frac{1}{\sigma} \\
&= (2\pi)^{-N/2}\int_0^\infty\frac{\textrm{d}\sigma}{\sigma} \sigma^{-N}\exp{\left[-\frac{1}{2\sigma^2}\sum_k\left(y_k - f(X_k; \Theta)\right)^2\right]}
\end{align}$$

$$\begin{align}
t &= \frac{\sum_k \left(y_k - f(X_k; \Theta)\right)^2}{2 \sigma^2}  = \frac{S}{2\sigma^2}\\
\frac{\textrm{d}\sigma}{\sigma} &= -\frac{\textrm{d}t}{2t}
\end{align}$$

$$\begin{align}
P(\Theta\lvert D, I) &\propto (2\pi)^{-N/2}\int_0^\infty \frac{\textrm{d}t}{t} \left(\frac{2t}{S}\right)^{N/2} e^{-t} \\
&= \pi^{-N/2}\underbrace{\left(\int_0^\infty \frac{\textrm{d}t}{t} t^{N/2} e^{-t}\right)}_{= \Gamma(N/2)}S^{-N/2} 
\end{align}$$

$$\begin{align}
L &= \textrm{constant} - \frac{N}{2} \ln{S}
\end{align}$$

$$\begin{align}
\nabla L &= -\frac{N}{2} \frac{\nabla S}{S}= 0 \Longrightarrow \nabla S = 0
\end{align}$$

$$\begin{align}
\nabla\nabla L &= -\frac{N}{2}\left[\frac{\nabla\nabla S}{S} - \frac{\left(\nabla S\right)\left(\nabla S\right)}{S^2}\right]
\end{align}$$

$$\begin{align}
\nabla\nabla L\lvert_{\Theta_0} &= -\frac{N}{2} \frac{\nabla\nabla S\lvert_{\Theta_0}}{S_0} = -\frac{1}{2} \left.\nabla\nabla\left(\frac{S}{S_0/N}\right)\right\lvert_{\Theta_0}
\end{align}$$

$$\begin{align}
\Sigma &= 2\left(\left.\nabla\nabla\left(\frac{S}{S_0/N}\right)\right\lvert_{\Theta_0}\right)^{-1}
\end{align}$$

$$\begin{align}
\chi^2 = \sum_k\frac{\left(y_k - f(X_k;\Theta)\right)^2}{\sigma_k^2} &\longrightarrow \frac{S}{S_0/N} = \frac{N}{S_0}\sum_k\left(y_k - f(X_k;\Theta)\right)^2 \\
\sigma_k^2 &\longrightarrow \frac{S_0}{N} = \frac{1}{N}\sum_{k}\left(y_k - f(X_k;\Theta_0)\right)^2
\end{align}$$

$$\begin{align}
P(\sigma\lvert D, I) &= \int\textrm{d}\Theta P(\sigma, \Theta\lvert D, I) \\
&\propto \int\textrm{d}\Theta P(D\lvert \sigma, \Theta, I) P(\sigma, \Theta\lvert I)
\end{align}$$

$$\begin{align}
P(\sigma\lvert D, I)\propto \sigma^{-(N+1)}\int\textrm{d}\Theta\exp{\left[-\frac{S(\Theta)}{2\sigma^2}\right]}
\end{align}$$

$$\begin{align}
S \simeq S_0 + \nabla S\lvert_{\Theta_0} \cdot \left(\Theta-\Theta_0\right) + \frac{1}{2}\left(\Theta - \Theta_0\right)^t\cdot\nabla\nabla S\lvert_{\Theta_0}\cdot\left(\Theta-\Theta_0\right)
\end{align}$$

$$\begin{align}
P(\sigma\lvert D, I)&\propto \sigma^{-(N+1)}\exp{\left[-\frac{S_0}{2\sigma^2}\right]}\int\textrm{d}\Theta \exp{\left[-\frac{1}{2}\left(\Theta-\Theta_0\right)^t\cdot
\frac{\nabla\nabla S\lvert_{\Theta_0}}{2\sigma^2}\cdot\left(\Theta-\Theta_0\right)\right]} \\
&\propto\sigma^{d-(N+1)}\exp{\left[-\frac{S_0}{2\sigma^2}\right]}
\end{align}$$

$$\begin{align}
L &= \ln{P(\sigma\lvert D, I)} = \textrm{const} + \left(d - N - 1\right)\ln{\sigma} - \frac{S_0}{2\sigma^2} \\
\frac{\textrm{d}L}{\textrm{d}\sigma} &= (d-N-1)\frac{1}{\sigma} + \frac{S_0}{\sigma^3}\\
\frac{\textrm{d}^2L}{\textrm{d}\sigma^2} &= (N+1-d)\frac{1}{\sigma^2} - \frac{3 S_0}{\sigma^4}
\end{align}$$

$$\begin{align}
\sigma_0^2 = \frac{S_0}{N+1-d}
\end{align}$$

$$\begin{align}
\left.\frac{\textrm{d}^2L}{\textrm{d}\sigma^2}\right\lvert_{\sigma_0} &= \frac{(N+1-d)\sigma_0^2 - 3S_0}{\sigma_0^4}\\
&= \frac{-2(N+1-d)}{\sigma_0^2}
\end{align}$$

$$\begin{align}
\sigma = \sigma_0 \pm \frac{\sigma_0}{\sqrt{2(N+1-d)}}
\end{align}$$