# Decision trees

In [None]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None

In [2]:
# Load the passenger table (path is relative to the working directory) and
# preview the first row to check which columns are available.
df = pd.read_csv('titanic-homework.csv')
df.head(1)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22,1,0,0


In [28]:
# Working table for the entropy helpers below: the attribute columns together
# with 'Survived' (presumably the target -- confirm against the assignment).
# An explicit copy makes X independent of df, so later column writes to X are
# unambiguous and do not rely on the chained-assignment warning being silenced.
X = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Survived']].copy()

# Version for 4.0

### Categorical features

In [4]:
bins = [0, 20, 40, 100]
labels = ['young', 'medium', 'old']

# Bin Age into the right-closed intervals (0, 20], (20, 40], (40, 100]; pd.cut
# yields NaN for values that fall outside the bin edges or are missing.
# Then convert every column (each one independently) to the 'category' dtype.
# Building a new frame with assign/astype avoids writing column by column into
# a frame that was derived from df.
X = X.assign(Age=pd.cut(X['Age'], bins=bins, labels=labels)).astype('category')

### Entropy
$H(S) = - \sum_{y \in S} p(y) \log_2 p(y)$ <br> <br>
$U=|S|$ is the number of samples, and $p(y) = U_y / U$ is the fraction of samples with value $y$.

In [26]:
def entropy(S):
    """Return the Shannon entropy, in bits, of column ``S`` of the global ``X``.

    Each distinct value contributes ``-p * log2(p)``, where ``p`` is its count
    divided by ``len(X)``. Values with a zero count (e.g. unused categories)
    are skipped so that ``log2(0)`` is never evaluated.

    NOTE(review): ``value_counts()`` skips missing values by default while
    ``len(X)`` counts every row, so if the column contains NaNs the
    probabilities will not sum to 1 -- confirm the column is complete.
    """
    total = len(X)
    probabilities = (count / total for count in X[S].value_counts())
    return sum(-p * np.log2(p) for p in probabilities if p > 0)

### Conditional entropy

$H(S|A) = - \sum_{x \in A} \sum_{y \in S} p(x,y) \log_2 \frac{p(x,y)}{p(x)}$

In [27]:
def conditional_entropy(S, A):
    """Return the conditional entropy H(S|A), in bits, over the global ``X``.

    Implements ``H(S|A) = -sum_x sum_y p(x, y) * log2(p(x, y) / p(x))`` where
    ``x`` ranges over the values of column ``A`` and ``y`` over the values of
    column ``S``. All probabilities are counts divided by ``len(X)``.

    Parameters
    ----------
    S : column label of the variable whose uncertainty is measured.
    A : column label of the conditioning attribute.

    Returns
    -------
    The conditional entropy; ``0`` when ``X`` has no rows.
    """
    U, H = len(X), 0
    if U == 0:
        # No samples: nothing to be uncertain about (and avoids 0 / 0 below).
        return H

    for x, Ui in X[A].value_counts().items():
        px = Ui / U
        if px <= 0:
            # Categorical columns report unused categories with a zero count.
            continue

        # Only the rows where A == x contribute to the inner sum over y.
        for Uj in X.loc[X[A] == x, S].value_counts():
            pxy = Uj / U
            if pxy > 0:
                H += -pxy * np.log2(pxy / px)
    return H

### Information gain
$IG(S,A) = H(S) - H(S|A)$

In [29]:
def information_gain(S, A):
    """Return IG(S, A) = H(S) - H(S|A).

    This is the difference between ``entropy(S)`` and
    ``conditional_entropy(S, A)``.
    """
    return entropy(S) - conditional_entropy(S, A)

### Intrinsic information
$II(S,A) = - \sum_{x \in A} p(x) \log_2 p(x)$

In [30]:
def intrinsic_info(S, A):
    """Return the intrinsic (split) information of attribute ``A``, in bits.

    Computes ``-sum_x p(x) * log2(p(x))`` over the values ``x`` of column ``A``
    of the global ``X``, with ``p(x)`` being the value's count divided by
    ``len(X)``. This is the denominator of the C4.5 gain ratio.

    Parameters
    ----------
    S : target column label; accepted so the signature matches the other
        measures, but not used by the computation.
    A : column label of the attribute being evaluated.

    Returns
    -------
    The split information; ``0`` when ``X`` has no rows.
    """
    U, info = len(X), 0
    if U == 0:
        # No samples: avoid dividing zero counts by zero.
        return info

    for Ui in X[A].value_counts():
        px = Ui / U
        if px > 0:  # skip unused categories, log2(0) is undefined
            info += -px * np.log2(px)
    return info

### Gain ratio
$IGR(S,A) = \frac{IG(S,A)}{II(S,A)}$

In [31]:
def gain_ratio(S, A):
    """Return the gain ratio IGR(S, A) = IG(S, A) / II(S, A).

    Parameters
    ----------
    S : target column label.
    A : column label of the attribute being evaluated.

    Returns
    -------
    ``information_gain(S, A)`` divided by ``intrinsic_info(S, A)``, or ``0.0``
    when the intrinsic information is zero. That happens when ``A`` takes a
    single value: such an attribute cannot split the data, and the ratio
    would otherwise be an undefined 0 / 0.
    """
    II = intrinsic_info(S, A)
    if II == 0:
        return 0.0
    return information_gain(S, A) / II

# Version for 5.0