In [1]:
# https://tobigs.gitbook.io/tobigs/data-analysis/decision-tree/python-decision-tree-1

import pandas as pd 
import numpy as np

# Load the AllElectronics sample table. RID is only a running row number,
# so it is dropped. Chaining .drop() returns a new frame instead of mutating
# one in place, which keeps this cell safe to re-run.
pd_data = pd.read_csv(
    'https://raw.githubusercontent.com/AugustLONG/ML01/master/01decisiontree/AllElectronics.csv'
).drop(columns="RID")

In [2]:
# Display the loaded table (the RID column has already been dropped above).
pd_data

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
2,middle_aged,high,no,fair,yes
3,senior,medium,no,fair,yes
4,senior,low,yes,fair,yes
5,senior,low,yes,excellent,no
6,middle_aged,low,yes,excellent,yes
7,youth,medium,no,fair,no
8,youth,low,yes,fair,yes
9,senior,medium,yes,fair,yes


In [3]:
from functools import reduce

def get_gini(df, label):
    """Return the Gini impurity of column ``label`` in ``df``.

    The value is 1 - sum((n_k / n) ** 2), where n_k is the row count of
    class k (from ``value_counts``) and n is ``df[label].count()``.

    Args:
        df: DataFrame holding the data.
        label: Name of the column whose class distribution is measured.

    Returns:
        The impurity value; stays at the initial 1 when the column has
        no counted values.
    """
    total = df[label].count()  # number of counted entries in the column
    impurity = 1
    # Start from 1 and subtract each class's squared relative frequency.
    for _, class_count in df[label].value_counts().items():
        impurity = impurity - (class_count / total) ** 2
    return impurity

In [4]:
# Inspect the column that is passed as the label to get_gini in the next cell.
pd_data['class_buys_computer']

0      no
1      no
2     yes
3     yes
4     yes
5      no
6     yes
7      no
8     yes
9     yes
10    yes
11    yes
12    yes
13     no
Name: class_buys_computer, dtype: object

In [5]:
# Gini impurity of the class_buys_computer column over the whole table.
get_gini(pd_data,'class_buys_computer')

0.4591836734693877

In [6]:
import itertools # 변수의 모든 클래시 조합을 얻기 위해 itertools 불러오기

def get_binary_split(df, attribute):
    """List every non-empty proper subset of the distinct values of ``attribute``.

    Subsets are ordered by size (1 up to the number of distinct values
    minus 1) and, within each size, in the order ``itertools.combinations``
    yields them over ``df[attribute].unique()``.

    Args:
        df: DataFrame holding the data.
        attribute: Name of the column whose distinct values are combined.

    Returns:
        A list of lists, one per subset; empty when the column has fewer
        than two distinct values.
    """
    distinct_values = df[attribute].unique()
    subsets = []
    # Sizes run from 1 to (number of distinct values - 1).
    for size in range(1, len(distinct_values)):
        # Every combination of exactly `size` distinct values.
        for combo in itertools.combinations(distinct_values, size):
            subsets.append(list(combo))
    return subsets

In [7]:
# Build a small toy frame to sanity-check get_binary_split.
df = pd.DataFrame({'d': [1, 2, 3, 4, 2, 1, 3]})
print(df['d'].unique())
a = get_binary_split(df, 'd')

[1 2 3 4]


In [8]:
# Check get_binary_split: pair the i-th subset with the i-th subset from the
# end and confirm that together they cover the full set of values.
half = len(a) // 2
for i in range(half):
    head, tail = a[i], a[-(i + 1)]
    b = sorted(head + tail)
    print(head, tail, '=>', b)

[1] [2, 3, 4] => [1, 2, 3, 4]
[2] [1, 3, 4] => [1, 2, 3, 4]
[3] [1, 2, 4] => [1, 2, 3, 4]
[4] [1, 2, 3] => [1, 2, 3, 4]
[1, 2] [3, 4] => [1, 2, 3, 4]
[1, 3] [2, 4] => [1, 2, 3, 4]
[1, 4] [2, 3] => [1, 2, 3, 4]


In [9]:
# List the candidate value subsets for the "age" attribute.
get_binary_split(pd_data, "age")

[['youth'],
 ['middle_aged'],
 ['senior'],
 ['youth', 'middle_aged'],
 ['youth', 'senior'],
 ['middle_aged', 'senior']]