# Conditional Entropy

### References

- [DataScience - StackExchange: Conditional entropy calculation in python, H(Y|X)](https://datascience.stackexchange.com/questions/58565/conditional-entropy-calculation-in-python-hyx)

In [2]:
import pandas as pd
import numpy as np

### functions

In [3]:
##Entropy
def entropy(Y):
    """
    Shannon entropy, in bits, of the empirical distribution of Y.

    Every entry along the first axis of Y is treated as one observation, so
    the rows of a 2-D array are counted as whole (joint) outcomes.
    Reference: https://en.wikipedia.org/wiki/Entropy_(information_theory)
    """
    # Only the multiplicities of the distinct observations matter here.
    _, occurrences = np.unique(Y, return_counts=True, axis=0)
    probabilities = occurrences / len(Y)
    # Per-outcome contribution -p * log2(p); their sum is the entropy.
    surprisal_terms = -probabilities * np.log2(probabilities)
    return np.sum(surprisal_terms)


#Joint Entropy
def jEntropy(Y, X):
    """
    Joint entropy H(Y, X) of the paired observations.

    Y and X are laid side by side as columns, so each row of the combined
    array is one joint outcome; the entropy of those rows is returned.
    Reference: https://en.wikipedia.org/wiki/Joint_entropy
    """
    return entropy(np.c_[Y, X])

#Conditional Entropy
def cEntropy(Y, X):
    """
    Conditional entropy H(Y|X), computed with the chain rule:

        H(Y|X) = H(Y, X) - H(X)

    i.e. the joint entropy minus the entropy of the conditioning variable.
    Reference: https://en.wikipedia.org/wiki/Conditional_entropy
    """
    joint_bits = jEntropy(Y, X)
    conditioning_bits = entropy(X)
    return joint_bits - conditioning_bits


#Information Gain
def gain(Y, X):
    """
    Information gain I(Y;X) = H(Y) - H(Y|X).

    This is the entropy of Y minus the conditional entropy of Y given X.
    Reference: https://en.wikipedia.org/wiki/Information_gain_in_decision_trees#Formal_definition
    """
    prior_bits = entropy(Y)
    remaining_bits = cEntropy(Y, X)
    return prior_bits - remaining_bits

### dataset 1

In [4]:
## data creation
attrNms = ["x1", "x2", "x3", "x4", "x5"]
# Each row lists the five binary attributes followed by the binary label y.
data = pd.DataFrame(
    [
        [1, 0, 1, 1, 1, 1],
        [1, 1, 0, 0, 1, 1],
        [0, 1, 1, 1, 1, 1],
        [1, 0, 1, 0, 1, 0],
        [1, 0, 0, 1, 1, 0],
        [0, 0, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1],
        [1, 0, 0, 1, 1, 1],
        [0, 1, 0, 0, 1, 0],
        [0, 0, 0, 1, 1, 0],
    ],
    columns=attrNms + ["y"],
)

In [5]:
# Split the frame into the attribute matrix and the label vector.
X = data[attrNms].values
y = data["y"].values
# Show H(y), H(y|X) and the information gain I(y;X) as the cell result.
entropy(y), cEntropy(y, X), gain(y, X)

(0.9709505944546686, 0.20000000000000018, 0.7709505944546684)

### dataset 2: Iris

In [49]:
## load dataset
from sklearn.datasets import load_iris

dataset = load_iris()
dataset.keys()  # result is discarded here
# One column per measurement, plus the integer class label.
data = pd.DataFrame(dataset.data, columns=dataset.feature_names)
data["class"] = dataset.target
# dclass stays empty: mapping the integer labels to dataset.target_names is
# disabled, so the "class" column remains numeric.
dclass = {}

## column roles
col_y = "class"
cols_x = [name for name in data.columns.tolist() if name != col_y]

## data preparation
# Discretize every measurement into equal-width, left-closed bins and keep
# the bin index (0 .. num_discrete_values - 1) as an integer.
num_discrete_values = 5
for col in cols_x:
    data[col] = pd.cut(
        data[col],
        bins=num_discrete_values,
        labels=np.arange(num_discrete_values),
        right=False,
    ).astype(int)

# Add a random integer feature in [0, 5) alongside the real measurements.
data["randint5"] = np.random.randint(5, size=len(data))
# Recompute the feature list so that it includes the new column.
cols_x = [name for name in data.columns.tolist() if name != col_y]

## data collection
y = data[col_y].values
X = data[cols_x].values

In [50]:
# Class entropy, conditional entropy given all features, and information gain.
(
    entropy(y),
    cEntropy(y, X),
    gain(y, X),
)

(1.584962500721156, 0.07673183336217893, 1.5082306673589772)

In [52]:
# Conditional entropy of the class given the first i features, for every
# prefix of cols_x. The upper bound follows the feature list instead of a
# hard-coded count, so the loop stays correct if features are added or removed.
for i in range(1, len(cols_x) + 1):
    print(i, cols_x[:i], cEntropy(y, X[:, :i]))

1 ['sepal length (cm)'] 0.9447200734403691
2 ['sepal length (cm)', 'sepal width (cm)'] 0.5539641975174541
3 ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)'] 0.2289531340851374
4 ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'] 0.11972104013355711
5 ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'randint5'] 0.07673183336217893


In [56]:
# Conditional entropy of the class given only the two petal measurements.
cEntropy(
    y,
    data[["petal length (cm)", "petal width (cm)"]].values,
)

0.17713406521696218

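> NOTA: Al aumentar el número de features disminuye la entropía condicional H(y|X), es decir, aumenta la predecibilidad. Ojo: incluso la variable aleatoria `randint5` la reduce (de 0.1197 a 0.0767), un artefacto de estimar la distribución conjunta con pocas muestras por combinación de valores, no información real.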

In [53]:
# Conditional entropy of the class given each feature on its own.
# enumerate() walks the feature list directly, so there is no hard-coded
# feature count and no i-1 index arithmetic to keep in sync with cols_x.
for idx, col_name in enumerate(cols_x):
    print(idx, [col_name], cEntropy(y, X[:, idx]))

0 ['sepal length (cm)'] 0.9447200734403691
1 ['sepal width (cm)'] 1.1934869489363455
2 ['petal length (cm)'] 0.31870954563264053
3 ['petal width (cm)'] 0.26043148725915666
4 ['randint5'] 1.5680084062068063


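> NOTA: Es una buena manera de seleccionar variables: cuanto menor es H(y|x_i), más informativa es la variable; `randint5` (1.568) queda casi en H(y) = 1.585, es decir, no aporta información.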