In [1]:
import numpy as np
import pandas as pd
from graphviz import Digraph
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Graph settings: render to PNG and draw every node as a square.
# G is the module-level graph that get_best_division adds nodes and edges to.
G = Digraph(format='png')
G.attr('node', shape='square')

In [3]:
# Load the Titanic data (train.csv is resolved relative to the working directory)
df = pd.read_csv("train.csv")

In [4]:
# Remove the PassengerId, Name and Ticket columns, then discard every row
# that still contains a missing value. Both operations modify df in place.
df.drop(columns=["PassengerId", "Name", "Ticket"], inplace=True)
df.dropna(inplace=True)

In [5]:
# Store these columns as strings: count_survived compares Survived with the
# strings '1'/'0', and get_best_division treats object-dtype columns as
# categorical (equality splits) rather than numeric (threshold splits).
for column in ("Survived", "Pclass", "SibSp", "Parch"):
    df[column] = df[column].apply(str)

In [6]:
# Preview the first rows of the cleaned frame
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
1,1,1,female,38.0,1,0,71.2833,C85,C
3,1,1,female,35.0,1,0,53.1,C123,S
6,0,1,male,54.0,0,0,51.8625,E46,S
10,1,3,female,4.0,1,1,16.7,G6,S
11,1,1,female,58.0,0,0,26.55,C103,S


In [7]:
# (rows, columns) remaining after the cleaning steps above
df.shape

(183, 9)

In [8]:
# Count survivors and non-survivors
def count_survived(_df):
    """Return ``(survivors, deaths)`` for the rows of ``_df``.

    The ``Survived`` column is compared against the strings ``'1'`` and
    ``'0'``, so it must hold string labels; rows with any other value are
    counted in neither total.
    """
    labels = _df["Survived"]
    survivors = len(_df[labels == "1"])
    deaths = len(_df[labels == "0"])
    return survivors, deaths

In [9]:
# Compute the Gini impurity
def calc_gini(_df):
    """Return the Gini impurity ``2 * p * (1 - p)`` of ``_df``.

    ``p`` is the fraction of survivors among the rows that
    ``count_survived`` counts (rows labelled '1' or '0').

    Returns 0.0 when there are no such rows (for example an empty frame)
    instead of raising ZeroDivisionError.
    """
    nos, nod = count_survived(_df)
    total = nos + nod
    if total == 0:
        # Nothing to measure: treat an empty node as pure.
        return 0.0
    pos = float(nos) / total
    return 2 * pos * (1 - pos)

In [10]:
# Gini impurity of the whole (root) data set
gini = calc_gini(df)
gini

0.44074173609244827

In [11]:
# Number of survivors and number of deaths in the whole data set
nos, nod = count_survived(df)
nos, nod

(123, 60)

In [12]:
# Zero-initialised information gain per feature (every column after the first).
# NOTE: igs is rebound to the result of get_best_division later in the notebook.
igs = dict.fromkeys(df.columns[1:], 0)

In [13]:
# Find the split that minimises the Gini impurity (recursive version)
def _is_categorical(column):
    """Return True when ``column`` must be split by equality, not by threshold."""
    # np.object was removed from NumPy, so ask pandas about the dtype instead.
    dtype = column.dtype
    return pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)


def _split(_df, feature, value):
    """Partition ``_df`` on ``feature``: ``== / !=`` for categorical, ``>= / <`` for numeric."""
    column = _df[feature]
    if _is_categorical(column):
        return _df[column == value], _df[column != value]
    return _df[column >= value], _df[column < value]


def get_best_division(_df, depth=1, name=None, max_depth=3):
    """Recursively grow a binary decision tree on ``_df`` and draw it into ``G``.

    At each node every candidate split of every feature is scored by its
    Gini information gain, and the best one is applied. Categorical columns
    are split as ``value`` versus the rest; numeric columns as
    ``>= value`` versus ``< value``.

    Parameters
    ----------
    _df : pandas.DataFrame
        Rows belonging to this node. The first column is skipped as the
        target, and ``count_survived`` reads the ``Survived`` column.
    depth : int
        Depth of this node; the root is 1.
    name : str or None
        Graph node name for this node. ``None`` (the default, used for the
        root call) becomes ``"root"``; children get ``name + "_0"`` and
        ``name + "_1"``.
    max_depth : int
        Nodes deeper than this become leaves without searching for a split.

    Returns
    -------
    dict
        Maps each feature to the sum, over this subtree, of
        ``information gain * number of samples`` for the splits made on it.

    Side effects: adds nodes and edges to the module-level graph ``G`` and
    prints one progress line per visited node.
    """
    # Node names are derived from the path in the tree. id() of the temporary
    # frames is not safe: CPython may reuse an id once a frame is freed, which
    # would merge unrelated nodes of different subtrees in the graph.
    if name is None:
        name = "root"

    # Use the frame that was passed in, not the notebook-level df.
    features = list(_df.columns[1:])
    _igs = {f: 0 for f in features}

    nos, nod = count_survived(_df)

    # Past the depth limit: record only the class counts for this leaf.
    if depth > max_depth:
        print("end of tree")
        G.node(name, "%d vs %d" % (nos, nod))
        return _igs

    gini = calc_gini(_df)
    max_ig = 0
    max_feature = "nothing"
    max_value = "nothing"

    for f in features:
        vals = sorted(_df[f].drop_duplicates().values)
        if not _is_categorical(_df[f]):
            # ">= smallest value" keeps every row, so it is never a real split.
            vals = vals[1:]
        for val in vals:
            tmp0, tmp1 = _split(_df, f, val)
            n0, n1 = len(tmp0), len(tmp1)
            if n0 == 0 or n1 == 0:
                continue
            weighted = (n0 * calc_gini(tmp0) + n1 * calc_gini(tmp1)) / (n0 + n1)
            ig = gini - weighted
            if ig > max_ig:
                max_ig, max_feature, max_value = ig, f, val

    # Report the chosen split
    print("IG: %f feature: %s value: %s" % (max_ig, max_feature, str(max_value)))

    # No split improved the impurity: this node is a leaf.
    if max_feature == "nothing":
        G.node(name, "%d vs %d" % (nos, nod))
        return _igs

    if _is_categorical(_df[max_feature]):
        info = "%s is %s\n\n%d vs %d" % (max_feature, max_value, nos, nod)
    else:
        info = "%s >= %f\n\n%d vs %d" % (max_feature, max_value, nos, nod)
    _df0, _df1 = _split(_df, max_feature, max_value)

    # Draw this node and the edges to its two children
    name0 = name + "_0"
    name1 = name + "_1"
    G.node(name, info)
    G.edge(name, name0)
    G.edge(name, name1)

    # Grow both branches
    _igs0 = get_best_division(_df0, depth + 1, name0, max_depth)
    _igs1 = get_best_division(_df1, depth + 1, name1, max_depth)

    # Accumulate the gains of the subtrees, then add this node's own gain
    # weighted by the number of samples it split.
    for f in features:
        _igs[f] += _igs0[f] + _igs1[f]
    _igs[max_feature] += max_ig * (nos + nod)

    return _igs


In [14]:
# Build the tree from the root (depth 1); igs receives the per-feature gains it returns
igs = get_best_division(df, 1)

IG: 0.124937 feature: Sex value: female
IG: 0.040385 feature: Age value: 4.0
IG: 0.021392 feature: Fare value: 10.5
end of tree
end of tree
IG: 0.000000 feature: nothing value: nothing
IG: 0.059421 feature: Age value: 18.0
IG: 0.024545 feature: Age value: 44.0
end of tree
end of tree
IG: 0.000000 feature: nothing value: nothing


In [15]:
# Visualise the decision tree (rendered from the graph G under the name 'tree')
G.render('tree')

'tree.png'

In [16]:
# Normalise the information gains so that they sum to 1
s = sum(igs[f] for f in df.columns[1:])
# If the tree made no split, every gain is 0; leave the values untouched
# instead of dividing by zero.
if s > 0:
    for f in df.columns[1:]:
        igs[f] /= s

In [17]:
# Normalised information gain per feature
igs

{'Age': 0.3145154042314017,
 'Cabin': 0.0,
 'Embarked': 0.0,
 'Fare': 0.05104899352897298,
 'Parch': 0.0,
 'Pclass': 0.0,
 'Sex': 0.6344356022396254,
 'SibSp': 0.0}