In [1]:
import numpy as np

# Entropy, joint entropy, conditional entropy, and mutual information

Consider the discrete source (the transmitter) $X$ with $n=3$ symbols with $p(x_i) = [0.125, 0.25, 0.625]$. Compute the entropy of the source $H_X$ 
$$ H(X) = - \sum_{i=1}^n p(x_i) \log_2(p(x_i)) $$
and the maximal entropy in the case of a uniform distribution.

In [None]:
from math import log2
p = [0.125, 0.25, 0.625]
Hx = - sum(pxi * log2(pxi) for pxi in p)
print(f"H(x) = {Hx} bit")

puniform = [1/3, 1/3, 1/3]
Hxmax = - sum(pxi * log2(pxi) for pxi in puniform)
print(f"Max H(x) = {Hxmax} bit")

H(x) = 1.2987949406953985
Max H(x) = 1.584962500721156


Consider a receiver $Y$ with $m=4$ symbols with the following conditional probabilities 
$$ \begin{array}{c}
p(y_1|x_1)=0.75\\
p(y_2|x_1)=0.25\\
p(y_2|x_2)=0.5\\
p(y_3|x_2)=0.5\\
p(y_3|x_3)=0.25\\
p(y_4|x_3)=0.75\\
\end{array} $$

The other probabilies are equal to zero. Note that for each transmitted symbol $x_i$, it has been assured that $$\sum_{j=1}^m p(y_j|x_i) = 1 \qquad \forall x_i \text{ with } i \in [1,n]$$ as the sum over all possible probabilities must be equal to one. 

In [4]:
P_Y_cond_X = [[0.75, 0.25, 0, 0],
              [0, 0.5, 0.5, 0],
              [0, 0, 0.25, 0.75]]

Check that row of the matrix represents a probability.

In [5]:
row_sums = [sum(row) for row in P_Y_cond_X]
print(f"Rowsums: {row_sums}")

Rowsums: [1.0, 1.0, 1.0]


Compute the joint probabilities using Bayes' rule 
$$ p(x_i, y_j) = p(y_j|x_i) p(x_i), $$
the probability of $y_j$ as marginal of $p(x_i, y_j)$ 
$$p(y_j) = \sum_{i=1}^n p(x_i, y_j)$$
and the conditional probabilities $p(x_{i}|y_{j})$ using Bayes' rule 
$$ p(x_i|y_j) = \frac{p(x_i, y_j)}{p(y_j)} = \frac{p(y_j|x_i) p(x_i)}{p(y_j)}.$$

In [24]:
joint_probs = [[pYcondX * pxi for pYcondX in rowpYCondX] for pxi, rowpYCondX in zip(p, P_Y_cond_X)]
print(f"Joint Probabilities p(xi, yi) = {joint_probs}")

py = [float(sum(row)) for row in np.transpose(joint_probs)]
print(f"p(yj) = {py}")

P_X_cond_Y_2 = [[pxy / pyi for pxy in rowpxy] for pyi, rowpxy in zip(py, joint_probs)] # bad for some reason
P_X_cond_Y = [[pYcondX * px / pyi for pyi, pYcondX in zip(py, rowpYcondX)] for px, rowpYcondX in zip(p, P_Y_cond_X)]

print(f"p(x|y) = {P_X_cond_Y}")
print(f"p(x|y) v2 = {P_X_cond_Y_2}")

Joint Probabilities p(xi, yi) = [[0.09375, 0.03125, 0.0, 0.0], [0.0, 0.125, 0.125, 0.0], [0.0, 0.0, 0.15625, 0.46875]]
p(yj) = [0.09375, 0.15625, 0.28125, 0.46875]
p(x|y) = [[1.0, 0.2, 0.0, 0.0], [0.0, 0.8, 0.4444444444444444, 0.0], [0.0, 0.0, 0.5555555555555556, 1.0]]
p(x|y) v2 = [[1.0, 0.3333333333333333, 0.0, 0.0], [0.0, 0.8, 0.8, 0.0], [0.0, 0.0, 0.5555555555555556, 1.6666666666666667]]


Compute the entropy $H(X)$
$$ H(Y) = - \sum_{j=1}^m p(y_j) \log_2(p(y_j))$$
and the joint entropy
$$ H(X,Y)=-\sum_{i=1}^{n}\sum_{j=1}^{m}p(x_{i},y_{j})\log p(x_{i},y_{j})$$
and the the conditional entropies
$$ H(Y|X)=-\sum_{i=1}^{n}\sum_{j=1}^{m}p(x_{i},y_{j})\log p(y_{j}|x_{i}) $$
and
$$ H(X|Y)=-\sum_{i=1}^{n}\sum_{j=1}^{m}p(x_{i},y_{j})\log p(x_{i}|y_{j}) .$$


In [25]:
Hy = - sum(pyi * log2(pyi) for pyi in py)
print(f"H(Y) = {Hy} bit")

Hxy = - sum(sum(pxy * log2(pxy) for pxy in row if pxy > 0) for row in joint_probs)
print(f"H(X, Y) = {Hxy} bit")

HycondX = - sum(sum(pxy * log2(pycondx) for pxy, pycondx in zip(rowJoint, rowCond) if pycondx > 0) for rowJoint, rowCond in zip(joint_probs, P_Y_cond_X))
print(f"H(Y|X) = {HycondX} bit")

HxcondY = - sum(sum(pxy * log2(pxcondy) for pxy, pxcondy in zip(rowJoint, rowCond) if pxcondy > 0) for rowJoint, rowCond in zip(joint_probs, P_X_cond_Y))
print(f"H(X|Y) = {HxcondY} bit")

H(Y) = 1.7657121273840979 bit
H(X, Y) = 2.1572535340397483 bit
H(Y|X) = 0.8584585933443496 bit
H(X|Y) = 0.39154140665565035 bit


Compute the mutual information between $X$ and $Y$. 
$I(X;Y) = H(X) - H(X|Y).$ 

In [None]:
Ixy = Hx - HxcondY
print(f"I(X,Y) = {Ixy} bit")

I(X,Y) = 0.9072535340397481


Verify the following theoretical results using the numerical results and the venn-diagrams
$$ H(Y|X) = H(X,Y)-H(X) $$
$$ H(Y|X)	\le	H(Y) $$
$$ H(X|Y) = H(X,Y)-H(Y) $$
$$ H(X|Y)	\le	H(X) $$
$$ 0\le\max\left[H(X),H(Y)\right]\le H(X,Y)\le H(X)+H(Y) $$
$$ I(X;Y) = H(X)-H(X|Y) $$
$$ I(X;Y) = H(X)+H(Y)-H(X,Y) $$
$$ I(X;Y) = H(Y)-H(Y|X) $$
$$ I(X;Y) = H(X,Y)-H(X|Y)-H(Y|X) $$
$$ 0\le I(X;Y)\le H(X)$$

In [41]:
print(f"H(Y|X) = {HycondX} = {Hxy - Hx} = {Hxy} - {Hx} = H(X, Y) - H(X)")
print(f"H(Y|X) = {HycondX} ≤ {Hy} = H(Y)")
print(f"H(X|Y) = {HxcondY} = {Hxy - Hy} = {Hxy} - {Hy} = H(X, Y) - H(Y)")
print(f"H(X|Y) = {HxcondY} ≤ {Hx} = H(X)")
print(f"0 ≤ Max[H(X), H(X)] = {max(Hx, Hy)} ≤ H(X, Y) = {Hxy} ≤ H(x) + H(Y) = {Hx + Hy} = {Hx} + {Hy}")
print(f"I(X; Y) = {Ixy} = {Hx - HxcondY} = {Hx} - {HxcondY} = H(X) - H(X|Y)")
print(f"I(X; Y) = {Ixy} = {Hx + Hy - Hxy} = {Hx} + {Hy} - {Hxy} = H(X) + H(Y) - H(X,Y)")
print(f"I(X; Y) = {Ixy} = {Hy - HycondX} = {Hy} - {HycondX} = H(Y) - H(Y|X)")
print(f"I(X; Y) = {Ixy} = {Hxy - HxcondY - HycondX} = {Hxy} - {HxcondY} - {HycondX} = H(X, Y) - H(X|Y) - H(Y|X)")
print(f"0 ≤ I(X; Y) = {Ixy} ≤ {Hx} = H(X)")




H(Y|X) = 0.8584585933443496 = 0.8584585933443498 = 2.1572535340397483 - 1.2987949406953985 = H(X, Y) - H(X)
H(Y|X) = 0.8584585933443496 ≤ 1.7657121273840979 = H(Y)
H(X|Y) = 0.39154140665565035 = 0.3915414066556504 = 2.1572535340397483 - 1.7657121273840979 = H(X, Y) - H(Y)
H(X|Y) = 0.39154140665565035 ≤ 1.2987949406953985 = H(X)
0 ≤ Max[H(X), H(X)] = 1.7657121273840979 ≤ H(X, Y) = 2.1572535340397483 ≤ H(x) + H(Y) = 3.0645070680794966 = 1.2987949406953985 + 1.7657121273840979
I(X; Y) = 0.9072535340397481 = 0.9072535340397481 = 1.2987949406953985 - 0.39154140665565035 = H(X) - H(X|Y)
I(X; Y) = 0.9072535340397481 = 0.9072535340397483 = 1.2987949406953985 + 1.7657121273840979 - 2.1572535340397483 = H(X) + H(Y) - H(X,Y)
I(X; Y) = 0.9072535340397481 = 0.9072535340397483 = 1.7657121273840979 - 0.8584585933443496 = H(Y) - H(Y|X)
I(X; Y) = 0.9072535340397481 = 0.9072535340397483 = 2.1572535340397483 - 0.39154140665565035 - 0.8584585933443496 = H(X, Y) - H(X|Y) - H(Y|X)
0 ≤ I(X; Y) = 0.9072535340