In [1]:
import math
import numpy as np
import pandas as pd

# Entropy of data

$Surprise=\log_{y}(\frac{1}{p(x)})$
, where $y$ is the number of possible outputs (the base of the logarithm) and $p(x)$ is the probability of the observed output $x$.

Suppose a biased coin has $p(heads) = 0.6$ and $p(tails) = 0.4$. If we toss it three times and get heads twice and tails once, the surprise of that sequence is:

$Surprise=\log_{2}(\frac{1}{0.6 \cdot 0.6 \cdot 0.4})$

$=\log_{2}(1) - \log_{2}(0.6 \cdot 0.6 \cdot 0.4)$

$=0 -\log_{2}(0.6)-\log_{2}(0.6)-\log_{2}(0.4)$

The total surprise is the sum of the individual surprises.

In [2]:
def suprise(probabilities: list[float], outputs_number: int) -> None:
    """Print the surprise of each outcome and the total surprise of the sequence.

    The surprise of an outcome with probability ``p`` is ``-log(p)`` taken in
    base ``outputs_number``; the total is the sum over all outcomes.

    Args:
        probabilities: Probability of each observed outcome. Every value must
            be greater than 0, because ``math.log`` rejects non-positive input.
        outputs_number: Base of the logarithm, i.e. the number of possible
            outputs (2 for a coin).

    Returns:
        None. The individual and total surprises are printed, each rounded to
        3 decimals.
    """
    raw = [-math.log(p, outputs_number) for p in probabilities]
    # Convert to plain floats so the list prints as [0.737, ...] instead of
    # depending on how NumPy renders its scalar types.
    individual = [float(np.round(value, 3)) for value in raw]
    # Round the total once, from the unrounded values, so per-item rounding
    # errors do not accumulate in the sum.
    total = float(np.round(sum(raw), 3))
    print(f"Individual suprises: {individual};\nTotal suprise: {total}")


suprise([0.6, 0.6, 0.4], 2)

Individual suprises: [0.737, 0.737, 1.322];
Total suprise: 2.796


In [3]:
# Probability and surprise of each coin face; the surprise values are the
# rounded results printed by the previous cell.
df = pd.DataFrame(
    [[0.6, 0.4], [0.737, 1.322]],
    index=["Probability", "Suprise"],
    columns=["Heads", "Tails"],
)
df

Unnamed: 0,Heads,Tails
Probability,0.6,0.4
Suprise,0.737,1.322


- **Expected value:**

$E(X) = \sum_{x} x \, P(X=x)$

- **Entropy is the expected value of the surprise:**

$Entropy = E(Surprise) = \sum_{x} p(x)\log_{y}(\frac{1}{p(x)})$

$=(0.6 * 0.737)+(0.4 * 1.322)$

In [4]:
# Entropy: each column holds (probability, surprise), so the product of a
# column is one term p(x) * surprise(x); add the terms over all columns.
sum(df[label].to_numpy().prod() for label in df.columns)

0.9710000000000001

In [5]:
def suprise_calculator(probabilities: list[float], outputs_number: int) -> list[float]:
    """Return the surprise ``-log(p)`` of each probability, rounded to 3 decimals.

    Args:
        probabilities: Probabilities of the outcomes. Every value must be
            greater than 0, because ``math.log`` rejects non-positive input.
        outputs_number: Base of the logarithm, i.e. the number of possible
            outputs (2 for a coin).

    Returns:
        A list of plain Python floats, one surprise per input probability, in
        the same order.
    """
    # float() turns the NumPy scalar produced by np.round into a built-in
    # float, so the list displays as plain numbers.
    return [float(np.round(-math.log(p, outputs_number), 3)) for p in probabilities]


output = ["Heads", "Tails"]
probs = [0.1, 0.9]
# One probability is listed per possible output, so len(probs) is the number
# of outputs and therefore the base of the logarithm.
suprises = suprise_calculator(probs, len(probs))

In [6]:
# One column per output label; each column holds that output's probability
# followed by its surprise.
df = pd.DataFrame(
    dict(zip(output, map(list, zip(probs, suprises)))),
    index=["Probability", "Suprise"],
)
df

Unnamed: 0,Heads,Tails
Probability,0.1,0.9
Suprise,3.322,0.152


In [7]:
# Entropy: each column holds (probability, surprise), so the product of a
# column is one term p(x) * surprise(x); add the terms over all columns.
sum(df[label].to_numpy().prod() for label in df.columns)

0.4690000000000001

**Entropy is higher when the possible outputs are similarly likely:** it is about 0.971 for probabilities 0.6/0.4 but only about 0.469 for 0.1/0.9.