\section{Introduction to regression models}

\section{Hierarchical linear models}

\section{Generalized linear models}

\section{Models for robust inference}

\section{Models for missing data}

\section{Bayesian linear regression}
\subsubsection{R code}
\begin{minted}[breaklines]{R}
library(Bolstad)

bayes.lin.reg(1:10, (1:10+rnorm(10, 1, 0.1)), slope.prior = "normal", plot.data = FALSE, mb0=10000, sb0=1)
\end{minted}

\section{Variational inference}
\subsection{Variational Bayes}
In practice, the computations required for exact Bayesian inference are intractable even for simple cases. Hence methods for Bayesian inference are either significantly approximate or obtain samples from the exact solution at significant computational expense, e.g.\ Markov chain Monte Carlo. The variational Bayes method instead facilitates analytical calculation of an approximation to the posterior distribution over a model.

For a general model it may not be possible (let alone easy) to evaluate the posterior probability distribution analytically. Hence we might approximate the posterior with a simpler form $q(\mathbf{w})$, which itself is parameterised by a series of hyper-parameters. We can measure the fit of this approximate distribution to the true posterior via the free energy:
\[
F = \int q(\mathbf{w}) \log \bigg(\frac{P(\mathbf{y}|\mathbf{w})P(\mathbf{w})}{q(\mathbf{w})} \bigg)d\mathbf{w}
\]
Inferring the posterior distribution $P(\mathbf{w}|\mathbf{y})$ is now a matter of estimation of the correct $q(\mathbf{w})$, which is achieved by maximising the free energy over $q(\mathbf{w})$. 

Consider the log evidence:
\[
\log P(\mathbf{y}) = \log \int P(\mathbf{y}|\mathbf{w}) P(\mathbf{w}) d\mathbf{w}
\]
\[
= \log \int q(\mathbf{w}) \frac{P(\mathbf{y}|\mathbf{w})P(\mathbf{w})}{q(\mathbf{w})}d\mathbf{w}
\]
\[
\geq \int q(\mathbf{w}) \log \frac{P(\mathbf{y} | \mathbf{w})P(\mathbf{w})}{q(\mathbf{w})}d\mathbf{w}
\]
using Jensen's inequality on the concave logarithm $\log \bigg(\frac{\sum_{i=1}^n x_i}{n} \bigg) \geq \frac{\sum_{i=1}^n \log(x_i)}{n}$. The latter quantity is identified as the free energy and the equality holds when $q(\mathbf{w}) = P(\mathbf{w}|\mathbf{y})$. Thus the process of seeking the best approximation $q(\mathbf{w})$ becomes a process of maximization of the free energy. 

The maximisation of $F$ is equivalent to minimising the Kullback--Leibler (KL) divergence between $q(\mathbf{w})$ and the true posterior. Start with the log evidence:
\[
\log P(\mathbf{y}) = \log \frac{P(\mathbf{y}, \mathbf{w})}{P(\mathbf{w}|\mathbf{y})}
\]
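take the expectation with respect to the arbitrary density $q(\mathbf{w})$; since $q(\mathbf{w})$ integrates to one and $\log P(\mathbf{y})$ does not depend on $\mathbf{w}$, the left-hand side is unchanged:
\[
\log P(\mathbf{y}) = \int q(\mathbf{w}) \log \frac{P(\mathbf{y}, \mathbf{w})}{P(\mathbf{w}|\mathbf{y})}d\mathbf{w}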
\]
\[
= \int q(\mathbf{w})\log \bigg(\frac{P(\mathbf{y}, \mathbf{w})}{P(\mathbf{w}|\mathbf{y})} \frac{q(\mathbf{w})}{q(\mathbf{w})}\bigg) d\mathbf{w}
\]
\[
= \int q(\mathbf{w}) \log \frac{P(\mathbf{y}, \mathbf{w})}{q(\mathbf{w})}d\mathbf{w} +
\int q(\mathbf{w}) \log \frac{q(\mathbf{w})}{P(\mathbf{w}|\mathbf{y})}d\mathbf{w}
\]
\[
= F + \mathrm{KL}
\]
\[
= \mathrm{ELBO} + \mathrm{KL}
\]
where $\mathrm{KL}$ is the KL divergence between $q(\mathbf{w})$ and $P(\mathbf{w}|\mathbf{y})$. The free energy is also known as the evidence lower bound (ELBO).

\subsubsection{Variational approach}
To make the integrals tractable the variational method chooses a mean-field approximation for $q(\mathbf{w})$:
\[
q(\mathbf{w}) = \prod_i q_{w_i}(\mathbf{w}_i)
\]
where we have collected the parameters in $\mathbf{w}$ into separate groups $\mathbf{w}_i$, each with its own approximate posterior distribution $q_{w_i}(\mathbf{w}_i)$. Each $q_{w_i}(\mathbf{w}_i)$ is computed by maximising $F$ over $q_{w_i}(\mathbf{w}_i)$; by application of the calculus of variations this gives:
\[
\log q_{w_i}(\mathbf{w}_i) = \int q_{w_{\not i}}(\mathbf{w}_{\not i})\log P(\mathbf{y}|\mathbf{w})P(\mathbf{w})d\mathbf{w}_{\not i} + \mathrm{const}
\]
where $\mathbf{w}_{\not i}$ refers to the parameters not in the $i$th group.

Proof: We wish to maximise the free energy:
\[
F = \int q(\mathbf{w}) \log \frac{P(\mathbf{y}|\mathbf{w})P(\mathbf{w})}{q(\mathbf{w})}d\mathbf{w}
\]
with respect to each factorised posterior distribution in turn. $F$ is a functional (a function of a function), i.e. $F=\int f(\mathbf{w}, q(\mathbf{w}))d\mathbf{w}$, hence to maximise $F$ we need to turn to the calculus of variations. We require the maximum of $F$ with respect to a subset of the parameters, $\mathbf{w}_i$, thus we write the functional in terms of these parameters alone as:
\[
F = \int g(\mathbf{w}_i, q_{w_i}(\mathbf{w}_i)) d\mathbf{w}_i
\]
where:
\[
g(\mathbf{w}_i, q_{w_i}(\mathbf{w}_i)) = \int f(\mathbf{w}, q(\mathbf{w}))d\mathbf{w}_{\not i}
\]
From variational calculus the maximum of $F$ is the solution of the Euler differential equation:
\[
\frac{\partial}{\partial q_{w_i}(\mathbf{w}_i)}\bigg( g(\mathbf{w}_i, q(\mathbf{w}_i), q'(\mathbf{w}_i))\bigg) -
\frac{d}{d\mathbf{w}_i} \bigg( \frac{\partial}{\partial q_{w_i}'(\mathbf{w}_i)} [g(\mathbf{w}_i, q(\mathbf{w}_i), q'(\mathbf{w}_i))]\bigg) = 0
\]
where the second term is zero, in this case, as $g$ is not dependent upon $q_{w_i}'(\mathbf{w}_i)$. This can be written as
\[
\frac{\partial}{\partial q_{w_i}(\mathbf{w}_i)} \int q(\mathbf{w}) \log \frac{P(\mathbf{y}|\mathbf{w})P(\mathbf{w})}{q(\mathbf{w})}d\mathbf{w}_{\not i}
\]
\[
= \int q_{w_{\not i}}(\mathbf{w}_{\not i})\log P(\mathbf{y}|\mathbf{w})P(\mathbf{w})d\mathbf{w}_{\not i} - 
\int q_{w_{\not i}}(\mathbf{w}_{\not i})\log q_{w_{\not i}}(\mathbf{w}_{\not i})d\mathbf{w}_{\not i} -
\int q_{w_{\not i}}(\mathbf{w}_{\not i})\log q_{w_i}(\mathbf{w}_{i})d\mathbf{w}_{\not i} - 1 = 0
\]

\subsubsection{}