# Decision Tree

### Overall process of building decision tree

### Measuring Purity

### Information Gain

# Data Prep and Decision Tree Algo

In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

In [69]:
# Load the drug dataset into a DataFrame. The path is relative, so it is
# resolved against the notebook's current working directory.
dataset=pd.read_csv("decision_tree/dataset/drug200.csv")

In [70]:
# Preview the first ten rows to inspect the available columns and value types.
dataset.head(10)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
5,22,F,NORMAL,HIGH,8.607,drugX
6,49,F,NORMAL,HIGH,16.275,drugY
7,41,M,LOW,HIGH,11.037,drugC
8,60,M,NORMAL,HIGH,15.171,drugY
9,43,M,LOW,NORMAL,19.368,drugY


In [71]:
# Discard Age and Na_to_K so that only Sex, BP, Cholesterol and the Drug
# target remain in the working DataFrame.
cols = ["Age", "Na_to_K"]
dataset = dataset.drop(columns=cols)

In [72]:
# Number of non-null entries per remaining column.
dataset.count()

Sex            200
BP             200
Cholesterol    200
Drug           200
dtype: int64

In [73]:
# Look at ten randomly chosen rows; no random_state is passed, so the rows
# shown differ from run to run.
dataset.sample(10)

Unnamed: 0,Sex,BP,Cholesterol,Drug
162,M,NORMAL,NORMAL,drugX
5,F,NORMAL,HIGH,drugX
91,M,HIGH,NORMAL,drugY
136,F,HIGH,HIGH,drugB
19,F,HIGH,NORMAL,drugY
121,M,HIGH,NORMAL,drugY
97,F,HIGH,HIGH,drugY
182,F,LOW,NORMAL,drugX
25,F,HIGH,NORMAL,drugY
17,M,HIGH,HIGH,drugA


In [74]:
# Count missing values per column to confirm no imputation is needed.
dataset.isnull().sum()

Sex            0
BP             0
Cholesterol    0
Drug           0
dtype: int64

In [75]:
# Reduce the task to binary classification: keep only the rows whose target
# is drugX or drugY.
filtered_dataset = dataset.loc[dataset['Drug'].isin(['drugX', 'drugY'])]


In [76]:
# Number of rows left after restricting the target to drugX / drugY.
filtered_dataset.count()

Sex            145
BP             145
Cholesterol    145
Drug           145
dtype: int64

In [77]:
# Sanity check: only the two retained drug classes should be present.
filtered_dataset.Drug.unique()

array(['drugY', 'drugX'], dtype=object)

**Let's use one-hot encoding to convert the categorical features into numerical features**

In [78]:
# Instantiate the scikit-learn encoders with their default settings.
# NOTE(review): a later cell builds a fresh LabelEncoder and a separate
# `one_hot_encoder`, so these two objects look like leftovers from an
# earlier attempt - confirm before relying on them.
one_hot_encoding=OneHotEncoder()
label_encoder=LabelEncoder()

In [79]:
# One-hot encode every column of filtered_dataset.
# NOTE(review): this also encodes the `Drug` target column, and
# `final_dataset` is not referenced by any later visible cell; the encoding
# that is actually used is restricted to the feature columns. Likely an
# exploratory leftover - confirm it can be dropped.
final_dataset=one_hot_encoding.fit_transform(filtered_dataset)

In [83]:

# Encode the target as integers. LabelEncoder numbers the classes in sorted
# order, so with only drugX / drugY left this yields drugX -> 0, drugY -> 1.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(filtered_dataset['Drug'])

# Only the categorical predictors are one-hot encoded; the target is excluded.
cols_for_ohe = ["Sex", "BP", "Cholesterol"]

# Request a dense ndarray so rows can be indexed directly further down.
# The keyword was `sparse` before scikit-learn 1.2; it was renamed to
# `sparse_output` in 1.2 and the old name was removed in 1.4, where
# `sparse=False` raises TypeError. Requires scikit-learn >= 1.2.
one_hot_encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = one_hot_encoder.fit_transform(filtered_dataset[cols_for_ohe])





In [86]:
# The one-hot encoded matrix is used as-is as the training feature matrix.
X_train=one_hot_encoded

In [103]:
def entropy(p):
    """
    Binary entropy, in bits, for a class fraction p.

    Returns 0 when p is exactly 0 or 1 (a pure node); handling that case up
    front also avoids evaluating log2(0). Otherwise returns
    -p*log2(p) - (1-p)*log2(1-p).
    """
    if p != 0 and p != 1:
        complement = 1 - p
        return -(p * np.log2(p)) - complement * np.log2(complement)
    return 0
    


In [104]:
def split_indices(X,index_feature):
    """
    Partition the row positions of X on one binary feature.

    Rows whose value at `index_feature` equals 1 go to the left list, all
    other rows go to the right list. Returns the tuple (left, right), each a
    list of row positions in their original order.
    """
    ones, others = [], []
    for position, row in enumerate(X):
        bucket = ones if row[index_feature] == 1 else others
        bucket.append(position)
    return ones, others
    

In [113]:
# Try the split on one-hot column 4: the first list holds the row positions
# where that column is 1, the second list all remaining positions.
# NOTE(review): column 4 is presumably `BP == NORMAL` given the encoder's
# sorted category order - confirm with one_hot_encoder.get_feature_names_out().
split_indices(X_train, 4)

([1,
  3,
  4,
  5,
  10,
  15,
  21,
  24,
  27,
  28,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  40,
  44,
  49,
  50,
  51,
  53,
  54,
  55,
  60,
  61,
  64,
  65,
  75,
  78,
  80,
  81,
  84,
  86,
  87,
  89,
  91,
  92,
  95,
  97,
  98,
  101,
  104,
  107,
  110,
  115,
  116,
  121,
  123,
  125,
  128,
  129,
  130,
  132,
  136,
  142,
  143],
 [0,
  2,
  6,
  7,
  8,
  9,
  11,
  12,
  13,
  14,
  16,
  17,
  18,
  19,
  20,
  22,
  23,
  25,
  26,
  29,
  38,
  39,
  41,
  42,
  43,
  45,
  46,
  47,
  48,
  52,
  56,
  57,
  58,
  59,
  62,
  63,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  76,
  77,
  79,
  82,
  83,
  85,
  88,
  90,
  93,
  94,
  96,
  99,
  100,
  102,
  103,
  105,
  106,
  108,
  109,
  111,
  112,
  113,
  114,
  117,
  118,
  119,
  120,
  122,
  124,
  126,
  127,
  131,
  133,
  134,
  135,
  137,
  138,
  139,
  140,
  141,
  144])

In [114]:
def weighted_entropy(X,y,left_indices,right_indices):
    """
    Return the weighted average entropy of the two branches of a split.

    Parameters
    ----------
    X : sized collection holding the samples of the node; only len(X) is used.
    y : array of labels that supports indexing with a list of positions
        (e.g. a NumPy array). The mean of the selected labels is passed to
        `entropy` as a class fraction, so 0/1 labels are expected.
    left_indices, right_indices : positions of the samples sent to each branch.

    Returns
    -------
    Sum over both branches of (branch size / len(X)) * entropy(branch).
    An empty branch contributes nothing, and 0.0 is returned for an empty
    node, instead of raising ZeroDivisionError.
    """
    n_samples = len(X)
    if n_samples == 0:
        return 0.0

    total = 0.0
    for branch in (left_indices, right_indices):
        branch_size = len(branch)
        if branch_size == 0:
            # A split can send every sample to one side; the empty side has
            # zero weight and no defined class fraction, so skip it.
            continue
        weight = branch_size / n_samples
        p_branch = sum(y[branch]) / branch_size
        total += weight * entropy(p_branch)
    return total

In [115]:
# Split the training rows on one-hot column 0 and compute the weighted
# entropy of the two resulting branches.
# NOTE(review): column 0 is presumably `Sex == F` given the encoder's sorted
# category order - confirm with one_hot_encoder.get_feature_names_out().
left_indices, right_indices = split_indices(X_train, 0)
weighted_entropy(X_train, y_train, left_indices, right_indices)

0.952324152935568

In [116]:
def information_gain(X, y, left_indices, right_indices):
    """
    Entropy reduction obtained by splitting a node into two branches.

    X holds the samples in the node and y their respective classes; the mean
    of y is used as the node's class fraction. The result is the node's
    entropy minus the weighted entropy of the left/right branches.
    """
    parent_entropy = entropy(sum(y) / len(y))
    children_entropy = weighted_entropy(X, y, left_indices, right_indices)
    return parent_entropy - children_entropy


In [117]:
# Information gain of the column-0 split computed in the previous cell
# (left_indices / right_indices come from there). The printed value is close
# to zero, i.e. this split barely reduces the entropy of the node.
information_gain(X_train, y_train, left_indices, right_indices)

0.00018330001103084026