In [1]:
# Import necessary modules
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
# Read dataset
data = pd.read_csv("data/diabetes.csv")
print("Shape of data: ", data.shape)
print("\nData Info:")
print(data.info())
print("\nData Description:")
print(data.describe()) # Here only integer and float columns are only considered

Shape of data:  (768, 9)

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None

Data Description:
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0

In [3]:
def split_into_train_and_test():
    # Split dataset between train and test dataset
    train = data.sample(frac=0.8, random_state=1) # Get random 80% of data
    # print(train.index)
    test = data.drop(train.index) # Get remaining data, after dropping rows of indexes which are present in train
    X_train = train.iloc[:, :-1].values
    y_train = train.iloc[:, -1:].values.flatten()
    
    X_test = test.iloc[:, :-1].values
    y_test = test.iloc[:, -1:].values.flatten()
    
    return X_train, y_train, X_test, y_test

![Entropy formula](images/entropy_general.png)

Where p(X) is % of given sample wrt all samples. Given by formula: number of given sample/total samples

In [4]:
def entropy(y):
    # Get number of occurences of all class labels
    h = np.bincount(y)  # bincount gives an numpy array containing frequency of numbers in ascending order.
    # Then divide them by length of total samples, it will also give numpy array where each element is
    # division of ith element of `h` / length of y
    px = h/len(y)
    
    # Now calculate entropy using its formula
    e = -np.sum([p * np.log2(p) for p in px if p>0]) # p > 0 is checked as log is only defined for +ve numbers
    return e

# To store a info of a node
class Node:
    def __init__(self, feature=None, threshold=None, left=None,right=None, *, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def check_leaf_node(self):
        """
         Detects whether node is leaf node or not
        """
        if self.value:
            return True
        return False
        

In [5]:
class DecisionTree:
    def __init__(self, min_samples_split=2, max_depth=100, num_features=None):  # minimum samples require to split a tree 
        self.min_samples_split = min_samples_split 
        self.max_depth = max_depth
        self.num_features = num_features
        # Starting root
        self.root = None
        
    def fit(self, X, y):
        # If self.features was not specified then we will take maximum no. of features, 
        # else we will take specified features, 
        # or if features are less than specified values,then we just consider min value
        self.num_features = X.shape[1] if not self.num_features else min(self.num_features, X.shape[1])
        self.root = self.grow_tree(X, y)
        
    def predict(self, X):
        # Traverse tree
        return np.array([self.traverse_tree(x, self.root) for x in X])

    def traverse_tree(self, x, node):
        print(node.threshold)
        if node.check_leaf_node():
            return node.value

        if x[node.feature] <= node.threshold:
            return self.traverse_tree(x, node.left)
        return self.traverse_tree(x, node.right)

    def grow_tree(self, X, y, depth=0):  # Intially depth = 0(Start at root)
        n_samples, n_features = X.shape
        # Get all different labels
        n_labels = len(np.unique(y))
        
        """
        Stoppig criteria:
            - Reached max depth, or
            - minimum samples not found at node
            - No more class distribution            
        """
        # Then it is leaf node
        if depth >= self.max_depth or n_samples < self.min_samples_split or n_labels == 1:
            leaf_value = self.get_most_common_label(y)
            return Node(value=leaf_value)

        # Select some random features
        features_idxs = np.random.choice(n_features, self.num_features, replace=False) # array, size, replace=True(keep False as we don't want same index multiple times)
        
        best_feature, best_threshold = self.get_best_criteria(X, y, features_idxs)
        
        # Split tree wrt best feature and threshold
        left_idx, right_idx = self.split(X[:, best_feature], best_threshold)
        # Now we can continuing growing
        left = self.grow_tree(X[left_idx, :], y[left_idx], depth=1)
        right = self.grow_tree(X[right_idx, :], y[right_idx], depth=1)
        return Node(best_feature, best_threshold, left, right)
        
    def get_most_common_label(self, y):
        counter = Counter(y)  # It will give num of occurrences of all types of y
        # print(counter.most_common(1))
        most_common = counter.most_common(1)[0][0]
        return most_common
    
    def get_best_criteria(self, X, y, features_idxs):
        best_gain = -1
        split_idx, split_threshold = None, None
        for features_idx in features_idxs:
            # Now select only column vector at only selected index
            X_column = X[:, features_idx]
            thresholds = np.unique(X_column)
            for threshold in thresholds:
                gain = self.get_info_gain(X_column, y, threshold)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = features_idx
                    split_threshold = threshold
                    
        return split_idx, split_threshold
       
    def get_info_gain(self, X_column, y, split_threshold):
        # Calculate parent entropy
        parent_entropy = entropy(y)
        # Generate split
        left_idx, right_idx = self.split(X_column, split_threshold)
        if len(left_idx) == 0 or len(right_idx) == 0:
            return 0
        # Calculate weighted average of child entropies
        n = len(y)
        n_l, n_r = len(left_idx), len(right_idx)
        e_l, e_r = entropy(y[left_idx]), entropy(y[right_idx])
        child_entropy = (n_l/n) * e_l + (n_r/n) * e_r
        
        # Calculate information gain
        ig = parent_entropy - child_entropy
        return ig
        
    def split(self, X_column, split_threshold):
        left_idx = np.argwhere(X_column <= split_threshold).flatten() # Return array where condition is true and flatten it to get 1D array
        right_idx = np.argwhere(X_column > split_threshold).flatten() # Return array where condition is true and flatten it to get 1D array

        return left_idx, right_idx

In [6]:
X_train, y_train, X_test, y_test = split_into_train_and_test()
X_train
# from sklearn.model_selection import train_test_split
# x = data.iloc[:, :-1].values
# y = data.iloc[:, -1:].values

# #spliting the dataset into training and test set
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1234)

array([[7.00e+00, 1.36e+02, 7.40e+01, ..., 2.60e+01, 6.47e-01, 5.10e+01],
       [1.00e+00, 1.51e+02, 6.00e+01, ..., 2.61e+01, 1.79e-01, 2.20e+01],
       [6.00e+00, 1.09e+02, 6.00e+01, ..., 2.50e+01, 2.06e-01, 2.70e+01],
       ...,
       [0.00e+00, 9.30e+01, 6.00e+01, ..., 2.87e+01, 5.32e-01, 2.20e+01],
       [0.00e+00, 1.05e+02, 9.00e+01, ..., 2.96e+01, 1.97e-01, 4.60e+01],
       [7.00e+00, 1.79e+02, 9.50e+01, ..., 3.42e+01, 1.64e-01, 6.00e+01]])

In [7]:
def get_accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

classifier = DecisionTree()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
acc = get_accuracy(y_test, y_pred)
print ("Accuracy:", acc)

127.0
26.9
28.0
107.0
0.845
1.0
82.0
29.0
34.0
29.7
None


TypeError: '<=' not supported between instances of 'float' and 'NoneType'