Datasets used:
* UCI Arrhythmia dataset — https://data.world/uci/arrhythmia/workspace/file?filename=arrhythmia.names.txt

In [2]:
from sklearn.datasets import load_diabetes
import pandas as pd
import numpy as np

In [3]:
# Load the arrhythmia CSV. header=None tells pandas to treat the first line as
# data and to label the columns by integer position instead of by name.
df = pd.read_csv('data/arrhythmia.data.csv', header=None)

In [4]:
# Preview the first rows of the loaded frame to check how it was parsed.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,270,271,272,273,274,275,276,277,278,279
0,75,0,190,80,91,193,371,174,121,-16,...,0.0,9.0,-0.9,0.0,0.0,0.9,2.9,23.3,49.4,8
1,56,1,165,64,81,174,401,149,39,25,...,0.0,8.5,0.0,0.0,0.0,0.2,2.1,20.4,38.8,6
2,54,0,172,95,138,163,386,185,102,96,...,0.0,9.5,-2.4,0.0,0.0,0.3,3.4,12.3,49.0,10
3,55,0,175,94,100,202,380,179,143,28,...,0.0,12.2,-2.2,0.0,0.0,0.4,2.6,34.6,61.6,1
4,75,0,190,80,88,181,360,177,103,-16,...,0.0,13.1,-3.6,0.0,0.0,-0.1,3.9,25.4,62.8,7


In [58]:
class Node:
    """One node of a binary tree over a subset of rows of a shared data set.

    ``full_data`` is stored exactly as passed in; ``indices`` is the row
    selection this node stands for (``None`` when none was given).  Both
    children start out as ``None``.
    """

    def __init__(self, full_data, indices=None):
        self.full_data, self.indices = full_data, indices
        # No children yet; the tree builder attaches them after construction.
        self.left = self.right = None

class DecisionTree:
    """Binary tree built by recursively splitting rows on one feature column.

    Each split sends the rows whose feature value is <= the mean of the
    *distinct* feature values to the left child and the rows whose value is
    greater to the right child.  Splitting stops when a node holds at most one
    row, when all of its rows share a single feature value, or when a split
    would leave one side empty.

    The score returned by ``calculate_gini`` is the sum, over every node of the
    tree (internal nodes included), of that node's sum of squared label
    proportions weighted by the node's share of all rows.  Note that the
    per-node quantity is ``sum(p_k ** 2)``, i.e. one minus the textbook Gini
    impurity.
    """

    # Column holding the class label; set per instance by __init__/build_from.
    label = 279

    def __init__(self, root, df, label=279):
        """Wrap an already built tree.

        Args:
            root: root node of the tree (object with ``indices``, ``left`` and
                ``right`` attributes).
            df: the full DataFrame the node indices refer to.
            label: name of the label column in ``df`` (defaults to 279).
        """
        self.root = root
        self.df = df
        self.label = label

    @staticmethod
    def _build_subtree(df, feature, indices=None):
        """Recursively build the subtree for ``indices`` (pre-order)."""
        node = Node(df, indices)
        # Zero or one row: nothing left to split.
        if indices is not None and len(indices) <= 1:
            return node

        rows = df if indices is None else df.loc[indices]
        values = rows[feature]
        distinct = values.unique()
        # A single distinct value cannot be separated by a threshold.
        if len(distinct) <= 1:
            return node

        threshold = distinct.mean()
        left_indices = values[values <= threshold].index.tolist()
        right_indices = values[values > threshold].index.tolist()
        # Guard against a split that makes no progress (e.g. the threshold
        # rounding onto the maximum), which would otherwise recurse forever.
        if not left_indices or not right_indices:
            return node

        node.left = DecisionTree._build_subtree(df, feature, left_indices)
        node.right = DecisionTree._build_subtree(df, feature, right_indices)
        return node

    def build_using_dfs(self, df, feature, indices=None):
        """Build the (sub)tree for ``indices`` of ``df`` by splitting on ``feature``.

        The tree is built depth-first in pre-order: the node is created first,
        then its left and right subtrees.  ``self`` is not used, so the method
        works both on an instance (``tree.build_using_dfs(df, f)``) and in the
        older unbound form (``DecisionTree.build_using_dfs(DecisionTree, df, f)``).

        Args:
            df: DataFrame holding all rows; rows are re-selected with
                ``df.loc[indices]``, so index labels are expected to be unique.
            feature: column label to split on (numeric column).
            indices: list of index labels for this subtree, or ``None`` for
                all rows of ``df``.

        Returns:
            The root ``Node`` of the built subtree.
        """
        return DecisionTree._build_subtree(df, feature, indices)

    def subnode_gini(self, node):
        """Return the weighted squared-proportion score of a single node.

        The value is ``sum(p_k ** 2) * n_node / n_total`` where ``p_k`` is the
        fraction of the node's rows carrying label ``k``.  A node with
        ``indices is None`` stands for all rows of ``self.df``.  Empty nodes
        and an empty data set score 0.0.
        """
        df = self.df
        grand_total = len(df)
        if grand_total == 0:
            # Nothing to weigh against; avoids dividing by zero below.
            return 0.0

        all_rows = node.indices
        subdf = df if all_rows is None else df.loc[all_rows]
        total_rows = len(subdf)
        if total_rows == 0:
            return 0.0

        # One pass over the label column instead of one filter per label.
        counts = subdf[self.label].value_counts()
        purity = float(((counts / total_rows) ** 2).sum())
        return purity * total_rows / grand_total

    @classmethod
    def build_from(cls, df, feature, label=279):
        """Build a tree over all rows of ``df`` split on ``feature`` and wrap it.

        Args:
            df: DataFrame containing the feature and label columns.
            feature: column label to split on.
            label: column label of the class label (defaults to 279).

        Returns:
            A new instance of ``cls`` whose ``root`` is the built tree.
        """
        root = cls._build_subtree(df, feature)
        tree = cls(root, df)
        tree.label = label
        return tree

    def calculate_gini(self, root):
        """Sum ``subnode_gini`` over the subtree rooted at ``root`` (in-order).

        Returns 0 for an empty subtree (``root is None``).
        """
        if root is None:
            return 0
        return (
            self.calculate_gini(root.left)
            + self.subnode_gini(root)
            + self.calculate_gini(root.right)
        )
    
# Candidate split features: every column except the last one (the label).
features = df.columns[:-1]

for feature in features:
    # Only columns stored as int64 are scored; everything else is skipped.
    if df[feature].dtype != 'int64':
        continue
    decision_tree = DecisionTree.build_from(df, feature)
    print(feature, decision_tree.calculate_gini(decision_tree.root))

0 2.7497436316021755
1 0.6649200858745973
2 2.68691324895844
3 2.7371973829638776
4 2.8607889465604095
5 2.8805276318365327
6 3.2538304831740685
7 3.27144497973481
8 2.8314211363152353
9 3.4158702272675736
15 1.4224568816112908
16 2.1378903398172846
17 1.8612893418834
18 0.983212769921449
19 0.3246338789255227
20 1.917502564970028
21 0.6517731760626614
22 0.6512617421632333
23 0.6501491132309215
24 0.6520085101939592
25 0.6526968091123467
26 0.6546947196334872
27 1.4620474329921767
28 2.0509580692604663
29 1.851849212684258
30 1.3133066276682936
31 0.9793937591872374
32 1.8295904621210788
33 0.649889997876332
34 0.6548036285421246
35 0.6517731760626614
36 0.6504352555233596
37 0.6500911060641658
38 0.6524723425400626
39 1.8349936657035022
40 1.929872325489373
41 1.824713978686102
42 1.6703613492971396
43 0.9766024930227823
44 1.9039815507859648
45 0.6498011571469774
46 0.650098683666937
47 0.6498011571469774
48 0.6516638956679288
49 0.6505074510187302
50 0.6523723253365353
51 2.0486320

In [57]:
# Sanity check: is column 0 stored as int64 (the dtype the feature loop filters on)?
df[0].dtype == 'int64'

True

In [49]:
# Inspect the column labels of df.
df.columns

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            270, 271, 272, 273, 274, 275, 276, 277, 278, 279],
           dtype='int64', length=280)

In [11]:
# Display whatever `decision_tree` currently refers to.
decision_tree

<__main__.DecisionTree at 0x7ff2cf04fed0>

In [78]:
# Mean of the distinct values in column 0 — the same expression
# build_using_dfs uses as its split threshold.
df[0].unique().mean()

43.55844155844156

In [15]:
# Index labels of the rows whose column-0 value is below 43.
df[df[0]<43].index.tolist()

[5,
 6,
 13,
 20,
 22,
 24,
 26,
 27,
 37,
 41,
 42,
 43,
 44,
 47,
 48,
 49,
 51,
 52,
 53,
 56,
 57,
 59,
 60,
 61,
 62,
 63,
 67,
 70,
 71,
 72,
 74,
 75,
 77,
 80,
 86,
 87,
 94,
 96,
 97,
 99,
 101,
 102,
 103,
 104,
 108,
 112,
 113,
 114,
 115,
 117,
 118,
 124,
 125,
 132,
 134,
 135,
 138,
 140,
 141,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 151,
 152,
 153,
 154,
 160,
 161,
 163,
 164,
 165,
 167,
 169,
 172,
 177,
 181,
 188,
 193,
 195,
 196,
 198,
 203,
 204,
 208,
 210,
 211,
 215,
 216,
 221,
 224,
 226,
 228,
 232,
 237,
 239,
 250,
 260,
 261,
 262,
 263,
 264,
 265,
 276,
 279,
 280,
 283,
 286,
 287,
 292,
 295,
 297,
 298,
 304,
 305,
 306,
 309,
 310,
 316,
 319,
 320,
 321,
 324,
 332,
 333,
 334,
 336,
 337,
 338,
 339,
 344,
 345,
 346,
 348,
 353,
 358,
 361,
 364,
 369,
 374,
 379,
 382,
 384,
 391,
 393,
 397,
 401,
 403,
 405,
 407,
 412,
 415,
 416,
 419,
 422,
 424,
 425,
 426,
 429,
 430,
 432,
 434,
 435,
 436,
 437,
 438,
 440,
 441,
 443,
 444,
 446

In [17]:
# Re-select those rows by index label via .loc and check the resulting shape.
df.loc[df[df[0]<43].index.tolist()].shape

(177, 280)

In [18]:
# Shape of a four-row selection made by index label.
df.loc[[210, 298, 430, 432]].shape

(4, 280)

In [22]:
# Display eight rows picked by index label.
df.loc[[10, 38, 187, 241, 308, 349, 360, 404]]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,270,271,272,273,274,275,276,277,278,279
10,62,0,170,72,102,135,401,156,83,72,...,-0.5,9.0,-2.0,0.0,0.0,0.8,0.9,12.3,19.3,3
38,62,1,163,60,80,185,354,166,107,-2,...,0.0,11.2,-1.0,0.0,0.0,0.9,3.8,28.3,60.9,1
187,62,0,170,85,110,157,426,198,94,62,...,-1.1,5.9,-2.0,0.0,0.0,0.5,0.5,2.9,9.9,2
241,62,1,155,78,90,172,297,209,103,2,...,0.0,4.5,-1.1,0.0,0.0,0.6,-0.8,11.3,3.7,1
308,62,1,170,110,97,0,294,184,0,39,...,0.0,15.3,0.0,0.0,0.0,-0.5,0.2,36.7,38.9,2
349,62,0,178,89,95,181,368,156,104,2,...,0.0,5.2,-2.8,0.0,0.0,0.6,1.3,4.2,14.0,1
360,62,1,157,65,81,174,452,160,116,53,...,0.0,11.9,0.0,0.0,0.0,1.0,-0.8,35.7,28.5,2
404,62,1,165,70,73,177,381,170,93,35,...,-0.4,11.6,-0.7,0.0,0.0,0.5,3.0,24.1,51.7,1


In [24]:
# Distinct values of column 279 (the column subnode_gini treats as the label)
# among those eight rows.
df.loc[[10, 38, 187, 241, 308, 349, 360, 404]][279].unique()

array([3, 1, 2])

In [36]:
# Hand-check of the squared-share sum for one hand-picked group of rows.
all_rows = [10, 38, 187, 241, 308, 349, 360, 404]
total_rows = len(all_rows)
print(total_rows)

subdf = df.loc[all_rows]
labels = subdf[279].unique()
print(labels)

# Number of selected rows carrying each distinct label.
label_counts = [len(subdf[subdf[279] == label]) for label in labels]
print(label_counts)

# Squared fraction of the group that each label accounts for.
squared_shares = [(count / total_rows) ** 2 for count in label_counts]
print(squared_shares)

gini = sum(squared_shares)
print(gini)

8
[3 1 2]
[1, 4, 3]
[0.015625, 0.25, 0.140625]
0.40625


In [28]:
# Shape of the rows selected by the all_rows list.
df.loc[all_rows].shape

(8, 280)

In [81]:
# Build a tree that splits on column 0.
# NOTE(review): the traceback recorded under this cell ("'int' object has no
# attribute 'loc'") looks like it came from a different revision of
# DecisionTree than the one shown in this notebook — TODO re-run after
# Restart Kernel -> Run All to confirm.
decision_tree = DecisionTree.build_from(df, 0)

AttributeError: 'int' object has no attribute 'loc'

In [14]:
# Summary statistics of column 279.
df[279].describe()

count    452.000000
mean       3.880531
std        4.407097
min        1.000000
25%        1.000000
50%        1.000000
75%        6.000000
max       16.000000
Name: 279, dtype: float64