### ゴルフのデータセットを作る
ref. https://ja.wikipedia.org/wiki/%E6%B1%BA%E5%AE%9A%E6%9C%A8

In [1]:
import pandas as pd
import numpy as np

columns = [['weather','temperature','humidity','wind','golf']]
df = pd.DataFrame(columns=columns)

def df_append(df, lists, columns):
    for p in lists:
        foo = pd.DataFrame(p)
        foo.columns = columns
        df = df.append(foo)
    return df

lists = [
    [['sunny',29,85,'weak','no']],
    [['sunny',27,90,'strong','no']],
    [['cloudy',28,78,'weak','yes']],
    [['rain',21,96,'weak','yes']],
    [['rain',20,80,'weak','yes']],
    [['rain',18,70,'strong','no']],
    [['cloudy',18,65,'strong','yes']],
    [['sunny',22,95,'weak','no']],
    [['sunny',21,70,'weak','yes']],
    [['rain',24,80,'weak','yes']],
    [['sunny',24,70,'strong','yes']],
    [['cloudy',22,90,'strong','yes']],
    [['cloudy',27,75,'weak','yes']],
    [['rain',22,80,'strong','no']],
]

df = df_append(df, lists, columns).reset_index(drop=True)

nb_train = 3
df_test = df[-nb_train:]
df = df[:-nb_train]
df

Unnamed: 0,weather,temperature,humidity,wind,golf
0,sunny,29,85,weak,no
1,sunny,27,90,strong,no
2,cloudy,28,78,weak,yes
3,rain,21,96,weak,yes
4,rain,20,80,weak,yes
5,rain,18,70,strong,no
6,cloudy,18,65,strong,yes
7,sunny,22,95,weak,no
8,sunny,21,70,weak,yes
9,rain,24,80,weak,yes


In [2]:
df_test

Unnamed: 0,weather,temperature,humidity,wind,golf
11,cloudy,22,90,strong,yes
12,cloudy,27,75,weak,yes
13,rain,22,80,strong,no


In [3]:
if False:
    df = df.replace('sunny', 0)
    df = df.replace('cloudy', 1)
    df = df.replace('rain', 2)
    df = df.replace('weak', 0)
    df = df.replace('strong', 1)
    df = df.replace('no', 0)
    df = df.replace('yes', 1)

## information gain基準の分割を行う

### 風の強さによってデータ分割した場合のinformation gainを計算する

In [4]:
wind = df['wind'].values.flatten()
golf = df['golf'].values.flatten()
EPS = np.finfo(float).eps

print('------------------------------------------------------------------')

# 風が弱い場合にゴルフに行く確率
p_wind_weak_yes = (golf[wind=='weak']=='yes').sum()/float((wind=='weak').sum())
print('p(golf=yes | wind=weak) = '+str(p_wind_weak_yes))

# 風が弱い場合にゴルフに行かない確率
p_wind_weak_no = 1. - p_wind_weak_yes
print('p(golf=no | wind=weak) = '+str(p_wind_weak_no))

# 風が弱い場合の条件付きエントロピー
entropy_wind_weak = - p_wind_weak_yes * np.log2(p_wind_weak_yes+EPS)
entropy_wind_weak += - p_wind_weak_no * np.log2(p_wind_weak_no+EPS)
print('h(golf | wind=weak) = '+str(entropy_wind_weak))

print('------------------------------------------------------------------')

# 風が強い場合にゴルフに行く確率
p_wind_strong_yes = (golf[wind=='strong']=='yes').sum()/float((wind=='strong').sum())
print('p(golf=yes | wind=strong) = '+str(p_wind_strong_yes))

# 風が強い場合にゴルフに行かない確率
p_wind_strong_no = 1. - p_wind_strong_yes
print('p(golf=no | wind=strong) = '+str(p_wind_strong_no))

# 風が強い場合の条件付きエントロピー
entropy_wind_strong = - p_wind_strong_yes * np.log2(p_wind_strong_yes+EPS)
entropy_wind_strong += - p_wind_strong_no * np.log2(p_wind_strong_no+EPS)
print('h(golf | wind=strong) = '+str(entropy_wind_strong))

print('------------------------------------------------------------------')

------------------------------------------------------------------
p(golf=yes | wind=weak) = 0.7142857142857143
p(golf=no | wind=weak) = 0.2857142857142857
h(golf | wind=weak) = 0.8631205685666303
------------------------------------------------------------------
p(golf=yes | wind=strong) = 0.5
p(golf=no | wind=strong) = 0.5
h(golf | wind=strong) = 0.9999999999999993
------------------------------------------------------------------


In [5]:
print('------------------------------------------------------------------')

# 風が弱い確率
p_wind_weak = (wind=='weak').sum()/float(len(wind))
print('p(wind=weak) = '+str(p_wind_weak))

# 風が強い確率
p_wind_strong = 1. - p_wind_weak
print('p(wind=strong) = '+str(p_wind_strong))

# 風の有無の条件付きエントロピーの期待値
expected_entropy_wind = p_wind_weak * entropy_wind_weak + p_wind_strong * entropy_wind_strong
print('E_wind[h(golf | wind)] = '+str(expected_entropy_wind))

print('------------------------------------------------------------------')
# ゴルフに行く人の割合
p_golf_yes = (golf=='yes').sum()/float(len(golf))
print('p(golf=yes) = '+str(p_golf_yes))

# ゴルフに行かない人の割合
p_golf_no = 1. - p_golf_yes
print('p(golf=no) = '+str(p_golf_no))

# 無条件のエントロピー
entropy = - p_golf_yes * np.log2(p_golf_yes+EPS) - p_golf_no * np.log2(p_golf_no+EPS)
print('h(golf) = '+str(entropy))

print('------------------------------------------------------------------')

# Information Gain
IG_wind = entropy - expected_entropy_wind
print('IG(wind) = '+str(IG_wind))

print('------------------------------------------------------------------')


------------------------------------------------------------------
p(wind=weak) = 0.6363636363636364
p(wind=strong) = 0.36363636363636365
E_wind[h(golf | wind)] = 0.9128949072696736
------------------------------------------------------------------
p(golf=yes) = 0.6363636363636364
p(golf=no) = 0.36363636363636365
h(golf) = 0.9456603046006395
------------------------------------------------------------------
IG(wind) = 0.03276539733096595
------------------------------------------------------------------


### 特徴量が0/1の２値の場合のinformation gainをモジュール化する

In [6]:
def calc_ig(x,y):

    EPS = np.finfo(float).eps
    
    p_y1_x0 = (y[x==0]==1).sum()/float((x==0).sum())
    p_y0_x0 = 1. - p_y1_x0
    entropy_y_x0 = - p_y1_x0 * np.log2(p_y1_x0+EPS) - p_y0_x0 * np.log2(p_y0_x0+EPS)
    
    p_y1_x1 = (y[x==1]==1).sum()/float((x==1).sum())
    p_y0_x1 = 1. - p_y1_x1
    entropy_y_x1 = - p_y1_x1 * np.log2(p_y1_x1+EPS) - p_y0_x1 * np.log2(p_y0_x1+EPS)

    p_x0 = (x==0).sum()/float(len(x))
    p_x1 = 1. - p_x0
    expected_entropy_y_x = p_x0 * entropy_y_x0 + p_x1 * entropy_y_x1

    p_y0 = (y==0).sum()/float(len(y))
    p_y1 = 1. - p_y0
    entropy = - p_y0 * np.log2(p_y0+EPS) - p_y1 * np.log2(p_y1+EPS)

    IG_y_x = entropy - expected_entropy_y_x
    
    return IG_y_x

In [7]:
import copy

wind_01 = copy.deepcopy(wind)
golf_01 = copy.deepcopy(golf)

np.place(wind_01, wind_01=='weak', 0)
np.place(wind_01, wind_01=='strong', 1)
np.place(golf_01, golf_01=='no', 0)
np.place(golf_01, golf_01=='yes', 1)

ig_wind = calc_ig(x=wind_01, y=golf_01)
ig_wind

0.03276539733096595

### 特徴量が連続量の場合（まずは温度）のinformation gainを計算する

特徴量の値によってデータに0/1のラベルをつける。  
例) 湿度50%以上に1、50%未満に0をつける。  
閾値は中央値を使う。せっかくなのでデータを特徴量の値でソートして閾値を徐々に変えながらIGを計算する。  

In [8]:
import copy

temp = df['temperature'].values.astype(np.float32).flatten()

ig_temp_max = -1e10
temp_sorted = list(set(temp))

def replace(lst, thresh, foo=99999):
    np.place(lst, lst>=thresh, foo)
    np.place(lst, lst!=foo, 0)
    np.place(lst, lst==foo, 1)
    return lst

for i in range(len(temp_sorted)-1):
    thresh = (temp_sorted[i] + temp_sorted[i+1]) * 0.5
    temp_01 = replace(copy.deepcopy(temp), thresh)

    ig_temp = calc_ig(x=temp_01, y=golf_01)
    if ig_temp>ig_temp_max:
        ig_temp_max = ig_temp
        thresh_temp = thresh

    print('threshold='+str(thresh)+'  IG='+str(ig_temp))


print('------------------------------------------------------------------')

print('max(IG)='+str(ig_temp_max)+'  threshold=' +  str(thresh))

threshold=19.0  IG=0.012509167646967079
threshold=20.5  IG=0.0010821659130775263
threshold=21.5  IG=0.07205662510638466
threshold=23.0  IG=0.003430488546069199
threshold=25.5  IG=0.1051955320700465
threshold=27.5  IG=0.012509167646966968
threshold=28.5  IG=0.144486759845465
------------------------------------------------------------------
max(IG)=0.144486759845465  threshold=28.5


28.5℃を閾値にするとIGが最も大きくなるが、そもそも29℃のデータが1点しかないので、おそらくオーバーフィットする。なので、やはり閾値には中央値（厳密に言うと、中央値とその隣の値の平均値）を使うことにする。

In [9]:
def median(x):
    s = list(set(sorted(x)))
    return 0.5 * (s[len(s)//2-1] + s[len(s)//2])

thresh_temp = median(temp)

temp_01 = replace(copy.deepcopy(temp), thresh_temp)

ig_temp = calc_ig(x=temp_01, y=golf_01)

print('threshold='+str(thresh_temp)+'  IG='+str(ig_temp))


threshold=23.0  IG=0.003430488546069199


### 湿度のIGも温度のIGと同じように計算する

In [10]:
hum = df['humidity'].values.astype(np.float32).flatten()

thresh_hum = median(hum)

hum_01 = replace(copy.deepcopy(hum), thresh_hum)

ig_hum = calc_ig(x=hum_01, y=golf_01)

print('threshold='+str(thresh_hum)+'  IG='+str(ig_hum))


threshold=79.0  IG=0.07205662510638466


### 天気は3通りあるので、３クラス分類になる。

In [11]:
def calc_ig_general(x, y, nb_classes=2):

    EPS = np.finfo(float).eps

    entropy_y_xk = []
    p_xk = []
    p_yk = []
    for k in range(nb_classes):
        p_y1_xk = (y[x==k]==1).sum()/float((x==k).sum())
        p_y0_xk = 1. - p_y1_xk
        entropy_y_xk.append(- p_y1_xk * np.log2(p_y1_xk+EPS) - p_y0_xk * np.log2(p_y0_xk+EPS))
        p_xk.append((x==k).sum()/float(len(x)))
        p_yk.append((y==k).sum()/float(len(y)))
    entropy_y_xk = np.array(entropy_y_xk)
    p_xk = np.array(p_xk)
    p_yk = np.array(p_yk)

    expected_entropy_y_x = (p_xk * entropy_y_xk).sum()
    entropy = (- p_yk * np.log2(p_yk+EPS)).sum()

    IG_y_x = entropy - expected_entropy_y_x
    
    return IG_y_x

In [12]:
weather = df['weather'].values.flatten()

def binalize(x):
    classes = list(set(x.tolist()))
    nb_classes = len(classes)
    corresp = {}
    for i,c in enumerate(classes):
        corresp.update({c:i})
        np.place(x, x==c, i)
    return nb_classes, corresp
        
nb_classes, corresp = binalize(weather)

ig_weather = calc_ig_general(x=weather, y=golf_01)

print('IG(weather)='+str(ig_weather))
print(corresp)

IG(weather)=0.5043191253030632
{'cloudy': 0, 'sunny': 1, 'rain': 2}


### information grainの一覧

In [13]:
print('IG(weather)='+str(ig_weather))
print('IG(temperature)='+str(ig_temp))
print('IG(humidity)='+str(ig_hum))
print('IG(wind)='+str(ig_wind))

# weatherが最もIGが大きいので、最初の分岐はweatherにする。

IG(weather)=0.5043191253030632
IG(temperature)=0.003430488546069199
IG(humidity)=0.07205662510638466
IG(wind)=0.03276539733096595


### weatherで分割したそれぞれの葉で再度IGを計算する

In [14]:
sunny = df[(df['weather']=='sunny').values]
cloudy = df[(df['weather']=='cloudy').values]
rain = df[(df['weather']=='rain').values]

### まずは晴れ(sunny)を分割する

In [15]:
sunny

Unnamed: 0,weather,temperature,humidity,wind,golf
0,sunny,29,85,weak,no
1,sunny,27,90,strong,no
7,sunny,22,95,weak,no
8,sunny,21,70,weak,yes
10,sunny,24,70,strong,yes


In [16]:
sunny_golf = sunny['golf'].values.flatten()
_, _ = binalize(sunny_golf)

# temperature
sunny_temp = sunny['temperature'].values.astype(np.float32).flatten()
thresh_sunny_temp = median(sunny_temp)
sunny_temp = replace(sunny_temp, thresh_sunny_temp)
ig_sunny_temp = calc_ig(x=sunny_temp, y=sunny_golf)

# humidity
sunny_hum = sunny['humidity'].values.astype(np.float32).flatten()
thresh_sunny_hum = median(sunny_hum)
sunny_hum = replace(sunny_hum, thresh_sunny_hum)
ig_sunny_hum = calc_ig(x=sunny_hum, y=sunny_golf)

# wind
sunny_wind = sunny['weather'].values.flatten()
nb_classes, _ = binalize(sunny_wind)
ig_sunny_wind = calc_ig_general(sunny_wind, sunny_golf, nb_classes)

print('IG(temperature | sunny)='+str(ig_sunny_temp)+'  thresh='+str(thresh_sunny_temp))
print('IG(humidity | sunny)='+str(ig_sunny_hum)+'  thresh='+str(thresh_sunny_hum))
print('IG(wind | sunny)='+str(ig_sunny_wind))

# sunnyの葉ではhumidityが最もIGが大きいので、次の分岐はhumidityにする。

IG(temperature | sunny)=0.17095059445466865  thresh=28.0
IG(humidity | sunny)=0.9709505944546684  thresh=77.5
IG(wind | sunny)=-0.5287712379549446


### 曇りを分割する

In [17]:
cloudy

Unnamed: 0,weather,temperature,humidity,wind,golf
2,cloudy,28,78,weak,yes
6,cloudy,18,65,strong,yes


In [18]:
cloudy_golf = cloudy['golf'].values.flatten()
_, _ = binalize(cloudy_golf)

# temperature
cloudy_temp = cloudy['temperature'].values.astype(np.float32).flatten()
thresh_cloudy_temp = median(cloudy_temp)
cloudy_temp = replace(cloudy_temp, thresh_cloudy_temp)
ig_cloudy_temp = calc_ig(x=cloudy_temp, y=cloudy_golf)

# humidity
cloudy_hum = cloudy['humidity'].values.astype(np.float32).flatten()
thresh_cloudy_hum = median(cloudy_hum)
cloudy_hum = replace(cloudy_hum, thresh_cloudy_hum)
ig_cloudy_hum = calc_ig(x=cloudy_hum, y=cloudy_golf)

# wind
cloudy_wind = cloudy['wind'].values.flatten()
nb_classes, _ = binalize(cloudy_wind)
ig_cloudy_wind = calc_ig_general(cloudy_wind, cloudy_golf, nb_classes)

print('IG(temperature | cloudy)='+str(ig_cloudy_temp)+'  thresh='+str(thresh_cloudy_temp))
print('IG(humidity | cloudy)='+str(ig_cloudy_hum)+'  thresh='+str(thresh_cloudy_hum))
print('IG(wind | cloudy)='+str(ig_cloudy_wind))

# cloudyの葉はどちらもゴルフに行くので、どう分割してもIGは0

IG(temperature | cloudy)=0.0  thresh=23.0
IG(humidity | cloudy)=0.0  thresh=71.5
IG(wind | cloudy)=0.0


### 雨を分割する

In [19]:
rain

Unnamed: 0,weather,temperature,humidity,wind,golf
3,rain,21,96,weak,yes
4,rain,20,80,weak,yes
5,rain,18,70,strong,no
9,rain,24,80,weak,yes


In [20]:
rain_golf = rain['golf'].values.flatten()
_, _ = binalize(rain_golf)

# temperature
rain_temp = rain['temperature'].values.astype(np.float32).flatten()
thresh_rain_temp = median(rain_temp)
rain_temp = replace(rain_temp, thresh_rain_temp)
ig_rain_temp = calc_ig(x=rain_temp, y=rain_golf)

# humidity
rain_hum = rain['humidity'].values.astype(np.float32).flatten()
thresh_rain_hum = median(rain_hum)
rain_hum = replace(rain_hum, thresh_rain_hum)
ig_rain_hum = calc_ig(x=rain_hum, y=rain_golf)

# wind
rain_wind = rain['wind'].values.flatten()
nb_classes, _ = binalize(rain_wind)
ig_rain_wind = calc_ig_general(rain_wind, rain_golf, nb_classes)

print('IG(temperature | rain)='+str(ig_rain_temp)+'  thresh='+str(thresh_rain_temp))
print('IG(humidity | rain)='+str(ig_rain_hum)+'  thresh='+str(thresh_rain_hum))
print('IG(wind | rain)='+str(ig_rain_wind))

# rainの葉ではtemperatureとwindが同率で最もIGが大きいので、
# 次の分岐はtemperatureまたはwindにする

IG(temperature | rain)=0.8112781244591325  thresh=19.0
IG(humidity | rain)=0.12255624891826566  thresh=88.0
IG(wind | rain)=0.8112781244591325


### 木の確認
- まずは天気で分類
  - 晴れの場合: 
    - 湿度が77.5%未満の場合: ゴルフに行く
    - 湿度が77.5%以上の場合: ゴルフに行かない
  - 曇りの場合:
    - ゴルフに行く
  - 雨の場合:
    1. 温度分割
      - 温度が19℃以上の場合: ゴルフに行く
      - 温度が19℃未満の場合: ゴルフに行かない
    2. 風分割
      - 風が弱い場合: ゴルフに行く
      - 風が強い場合: ゴルフに行かない

### テスト

In [21]:
df_test

Unnamed: 0,weather,temperature,humidity,wind,golf
11,cloudy,22,90,strong,yes
12,cloudy,27,75,weak,yes
13,rain,22,80,strong,no


In [22]:
mode = 'rain_temp'
#mode = 'rain_wind'

for _,d in df_test.iterrows():
    if d['weather'].values=='sunny':
        if d['humidity'].values<77.5:
            pred = 'yes'
        else:
            pred = 'no'
    elif d['weather'].values=='cloudy':
        pred = 'yes'
    elif d['weather'].values=='rain':
        if mode=='rain_temp':
            if d['temperature'].values>=19.:
                pred = 'yes'
            else:
                pred = 'no'
        elif mode=='rain_wind':
            if d['wind'].values=='weak':
                pred = 'yes'
            else:
                pred = 'no'
    print('pred: '+pred+'   true: '+d['golf'].values)
        

['pred: yes   true: yes']
['pred: yes   true: yes']
['pred: yes   true: no']
