## Decision Tree

### Data Load

In [1]:
import pandas as pd 
import numpy as np

In [2]:
pd_data = pd.read_csv('https://raw.githubusercontent.com/AugustLONG/ML01/master/01decisiontree/AllElectronics.csv')
pd_data.drop("RID",axis=1, inplace = True) #RID는 그냥 순서라서 삭제
pd_data

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
2,middle_aged,high,no,fair,yes
3,senior,medium,no,fair,yes
4,senior,low,yes,fair,yes
5,senior,low,yes,excellent,no
6,middle_aged,low,yes,excellent,yes
7,youth,medium,no,fair,no
8,youth,low,yes,fair,yes
9,senior,medium,yes,fair,yes


### Gini Index

In [32]:
def get_gini(df, label):
    gini = 1 - (len(df.loc[df[label] == 'yes']) / len(df)) ** 2 - (len(df.loc[df[label] == 'no']) / len(df)) ** 2
    return gini

In [4]:
get_gini(pd_data,'class_buys_computer')

0.4591836734693877

### Cart Algorithm

#### - Binary Split을 전제로함

In [7]:
import itertools
from itertools import combinations

def get_binary_split(df, attribute):
    
    uniques = list(df[attribute].unique()) 
    result = [
        list(temp)
        for i in range(1, len(uniques))
        for temp in itertools.combinations(uniques, i) # 1부터 len(uniques) - 1 까지 개수의 원소를 부분집합을 형성
    ]
    
    return result

In [8]:
get_binary_split(pd_data,'income')

[['high'],
 ['medium'],
 ['low'],
 ['high', 'medium'],
 ['high', 'low'],
 ['medium', 'low']]

### Gini Index - binray split

In [10]:
def get_attribute_gini_index(df, attribute, label):
    result = {}
    keys = get_binary_split(df, attribute)
    df_len = df.shape[0]
    for key in keys:
        g_index = df[attribute].map(lambda x : x in key)
        g_len = sum(g_index)
        gini = (g_len / df_len) * get_gini(df[g_index], label) + ((df_len - g_len) / df_len) * get_gini(df[~g_index], label)
        result[tuple(key)] = gini
    return result

In [11]:
get_attribute_gini_index(pd_data, 'income', 'class_buys_computer')

{('high',): 0.4428571428571429,
 ('medium',): 0.4583333333333333,
 ('low',): 0.45,
 ('high', 'medium'): 0.45,
 ('high', 'low'): 0.4583333333333333,
 ('medium', 'low'): 0.4428571428571429}

###  Minimum Gini Index

In [12]:
min(get_attribute_gini_index(pd_data, 'income', 'class_buys_computer').items())

(('high',), 0.4428571428571429)

### Most Important Variable by using Gini Index

In [13]:
def min_gini_index(df, label):
    cols = df.columns[df.columns != label]
    minimum = []
    for col in cols:
        g_dict = get_attribute_gini_index(df, col, label)
        if g_dict:
            g_min = min(g_dict.keys())
            print(f"{col}'s Minimum Gini Index : {g_dict[g_min]} {g_min}")
            minimum.append((g_dict[g_min], col, g_min))
    minimum.sort()
    return minimum[0]

print('Total Minimum Gini Index : ', min_gini_index(pd_data, 'class_buys_computer'))

age's Minimum Gini Index : 0.35714285714285715 ('middle_aged',)
income's Minimum Gini Index : 0.4428571428571429 ('high',)
student's Minimum Gini Index : 0.3673469387755103 ('no',)
credit_rating's Minimum Gini Index : 0.42857142857142855 ('excellent',)
Total Minimum Gini Index :  (0.35714285714285715, 'age', ('middle_aged',))


##### < age의 'middle_aged'가 가장 작은 Gini Index를 가지고 있음 >

### 'middle_aged'의 Gini Index

In [14]:
### middle_aged를 기준으로 dataframe을 형성

df_1 = pd_data[pd_data['age'] == 'middle_aged']
df_2 = pd_data[pd_data['age'] != 'middle_aged']

In [15]:
df_1.head()

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
2,middle_aged,high,no,fair,yes
6,middle_aged,low,yes,excellent,yes
11,middle_aged,medium,no,excellent,yes
12,middle_aged,high,yes,fair,yes


In [16]:
df_2.head()

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
3,senior,medium,no,fair,yes
4,senior,low,yes,fair,yes
5,senior,low,yes,excellent,no


In [17]:
def res(df, label):
    cols = df.columns[df.columns != 'age'][:-1]
    for col in cols:
        g_dict = get_attribute_gini_index(df, col, label)
        if g_dict:
            g_min = min(g_dict.keys())
            print("{0}'s Minimum Gini Index : {1:.4f}% {2}".format(col, g_dict[g_min], g_min))

In [18]:
res(df_1, 'class_buys_computer')
print('#'*70)
res(df_2, 'class_buys_computer')

income's Minimum Gini Index : 0.0000% ('high',)
student's Minimum Gini Index : 0.0000% ('no',)
credit_rating's Minimum Gini Index : 0.0000% ('excellent',)
######################################################################
income's Minimum Gini Index : 0.3750% ('high',)
student's Minimum Gini Index : 0.3200% ('no',)
credit_rating's Minimum Gini Index : 0.4167% ('excellent',)


## Entropy 

<img src = https://miro.medium.com/max/1122/0*DkWdyGidNSfdT1Nu.png width = "350">

In [19]:
pd_data.head()

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
2,middle_aged,high,no,fair,yes
3,senior,medium,no,fair,yes
4,senior,low,yes,fair,yes


In [20]:
lisst = list(pd_data['income'].unique())

print(lisst)

['high', 'medium', 'low']


In [26]:
import math
from math import log2

def getEntropy(df, feature) :
    entropy = 0
    unique = list(df[feature].unique())
    for i in range(len(unique)):
        p = len(df[df[feature] == unique[i]]) / len(df[feature]) 
        entropy -= p * math.log(p,2)
    
    return(entropy)

In [31]:
getEntropy(pd_data, "class_buys_computer")

0.9402859586706309

### Imformation Gain은 상위 노드의 Enrtopy에서 하위 노드의 Entropy를 뺀 값
#### - Infomation이 클수록 Entropy가 많이 줄어든 것을 의미

In [28]:
def getGainA(df, feature) :
        
    result = {}

    info_D = getEntropy(df, feature)     ### 상위 노드의 Entropy
    columns = list(df.loc[:, df.columns != feature])
    
    gains = []

    for i in columns:
        outfo_D = 0                     ### 하위 노드의 Entropy => 각 P(확률) * 각 feature의 모든 entropy 의 합
        unique = list(df[i].unique())
        for j in range(len(unique)):
            j_entropy = len(df[df[i] == unique[j]]) / len(df[feature]) * getEntropy(df[df[i] == unique[j]], feature)
            outfo_D += j_entropy
        gain = info_D - outfo_D
        gains.append(gain)
    result = dict(zip(columns, gains))
    return(result)

In [29]:
getGainA(pd_data, "class_buys_computer")

{'age': 0.2467498197744391,
 'income': 0.029222565658954647,
 'student': 0.15183550136234136,
 'credit_rating': 0.04812703040826927}