In [1]:
# Import necessary modules
import pandas as pd
import numpy as np

In [2]:
# Read dataset
data = pd.read_csv("data/diabetes.csv")
# Printing shape of data
print("Shape of data: ", data.shape)
# Displaying data information
print("\nData Info:")
print(data.info())
print("\nData Description:")
print(data.describe()) # Here only integer and float columns are only considered
# from sklearn import datasets
# data = datasets.load_breast_cancer()

Shape of data:  (768, 9)

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None

Data Description:
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894

In [3]:
# Displaying the DataFrame
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
def split_into_train_and_test():
    """
    - Split dataset between train and test dataset
    """
    train = data.sample(frac=0.8, random_state=1) # Get random 80% of data
    # print(train.index)
    test = data.drop(train.index) # Get remaining data, after dropping rows of indexes which are present in train
    X_train = train.iloc[:, :-1].values
    y_train = train.iloc[:, -1:].values.flatten()
    
    X_test = test.iloc[:, :-1].values
    y_test = test.iloc[:, -1:].values.flatten()
    
    return X_train, y_train, X_test, y_test

![Entropy formula](images/entropy_general.png)

Where p(X) is % of given sample wrt all samples. Given by formula: number of given sample/total samples

In [33]:
from collections import Counter
def calculate_entropy(y):
    # Get number of occurences of all class labels
    h = np.bincount(y)  # bincount gives an numpy array containing frequency of numbers in ascending order.
    # Instead of above, we can also use:  _, counts = np.unique(y_train, return_counts=True); then `counts` will give same result
    # print(h): array([403, 211])

    # Then divide them by length of total samples to get the probability(or percentage), it will also give numpy array where each element is
    # division of ith element of `h` / length of y   
    px = h/len(y)
    # print(px): array([0.65635179, 0.34364821])
    
    # Now calculate entropy using its formula
    #entropy = -(sum(px * np.log2(px)))
    return -np.sum([p * np.log2(p) for p in px if p > 0])


# To store a info of a node
class Node:
    def __init__(self, feature=None, threshold=None, left=None,right=None, *, value=None):
        # We are using * as if we have to use value parameter, we have to use using keyword only parameter
        # So for a leaf node when we just have value, then we can use this
        # For every other node, other than leaf node, value will be None 
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def check_leaf_node(self):
        """
         Detects whether node is leaf node or not
        """
        return self.value is not None

In [41]:
class DecisionTree:
    def __init__(self, min_samples_split=2, max_depth=100, num_features=None):
        self.root = None  # Starting node
        self.min_samples_split = min_samples_split  # minimum samples require to split a tree 
        self.max_depth = max_depth
        self.num_features = num_features  # If we don't want to include all the given features in input dataset


    def fit(self, X, y):
        """
        - First method to be called when building the tree in training phase
        """
        # If self.features was not specified then we will take maximum no. of features, 
        # or if features are less than specified values,then we just consider min value
        # Here for now we are getting all 8 features
        self.num_features = X.shape[1] if not self.num_features else min(self.num_features, X.shape[1])
        self.root = self.grow_tree(X, y)


    def grow_tree(self, X, y, depth=0):  # Intially depth = 0(Starting at root)
        n_samples, n_features = X.shape
        # Get all different labels
        n_labels = len(np.unique(y))
        """
        Stoppig criteria:
            - Reached max depth, or
            - minimum samples not found at node(required sample count became less than min number of sample count)
            - No more class distribution            
        """
        # If it is leaf node or any stopping criteria is found
        if depth >= self.max_depth or n_labels == 1 or n_samples < self.min_samples_split:
            # If any of these conditions is true it means leaf node has been reached
            # Here we save the node with that y_label value whose count is more
            #print(depth, n_labels, n_samples)
            leaf_value = self.get_most_common_label(y)
            #print("##", leaf_value)
            return Node(value=leaf_value)  # This is the leaf node as we are just using value keyword

        # Select some random features every time for split,
        # np,random.choice generates a random sample from a given 1-D array
        # if n_features = self.num_features, then all features will be given in an array in random order
        features_idxs = np.random.choice(n_features, self.num_features, replace=False) # array, size, replace (keep False as we don't want same index multiple times)
        # Returns best feature and threshold value
        best_feature, best_threshold = self.get_best_split(X, y, features_idxs, n_samples)  # Greedy search
        #if n_samples == 17:
        #    print(features_idxs)
        #    import sys
        #    sys.exit(0)

        # Split tree wrt best feature and threshold
        left_idx, right_idx = self.split(X[:, best_feature], best_threshold)  # Return ids of X features having 
        # left_idx contains indices of indices of array(X) elements that were less than threshold and right_idx contain elements indices whose value were more than threshold

        # Now we can continuing growing
        left = self.grow_tree(X[left_idx, :], y[left_idx], depth+1)
        right = self.grow_tree(X[right_idx, :], y[right_idx], depth+1)
        
        return Node(best_feature, best_threshold, left, right)


    def get_most_common_label(self, y):
        #counts = np.bincount(y) # It will give frequency of all values of y
        #print(y)
        #print(counts)#:  array([403, 211]); 403 is count of 0 and 211 is of 1
        #most_common = np.argmax(counts)
        # print(most_common):  0
        #return most_common
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common


    def get_best_split(self, X, y, features_idxs, n_samples):
        """
        To get the best split among all given features, using information gain
        """
        best_gain = -1
        split_idx, split_threshold = None, None
        for features_idx in features_idxs:
            # Now select only column vector at only selected index
            X_column = X[:, features_idx]
            #if n_samples == 17:
            #    print(X_column)
            # Get only unique values in a numpy array for X_column
            thresholds = np.unique(X_column)
            for threshold in thresholds:
                gain = self.get_info_gain(X_column, y, threshold)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = features_idx
                    split_threshold = threshold
                    
        return split_idx, split_threshold

    
    def get_info_gain(self, X_column, y, split_threshold):
        # Calculate parent entropy
        parent_entropy = calculate_entropy(y)
        # Generate split
        left_idx, right_idx = self.split(X_column, split_threshold)
        if len(left_idx) == 0 or len(right_idx) == 0:
            return 0
        # Calculate weighted average of child entropies
        n = len(y)
        n_l, n_r = len(left_idx), len(right_idx)
        e_l, e_r = calculate_entropy(y[left_idx]), calculate_entropy(y[right_idx])
        child_entropy = (n_l/n) * e_l + (n_r/n) * e_r
        
        # Calculate information gain
        ig = parent_entropy - child_entropy
        return ig


    def split(self, X_column, split_threshold):
        left_idx = np.argwhere(X_column <= split_threshold).flatten() # Return array where condition is true and flatten it to get 1D array
        right_idx = np.argwhere(X_column > split_threshold).flatten() # Return array where condition is true and flatten it to get 1D array

        return left_idx, right_idx
    
    
    def predict(self, X):
        # Traverse tree
        return np.array([self.traverse_tree(x, self.root) for x in X])

    def traverse_tree(self, x, node):
        if node.check_leaf_node():
            return node.value
        
        if x[node.feature] <= node.threshold:
            return self.traverse_tree(x, node.left)
        return self.traverse_tree(x, node.right)

In [42]:
X_train, y_train, X_test, y_test = split_into_train_and_test()
# # X_train
# from sklearn.model_selection import train_test_split
# X = data.data
# y = data.target
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# # from sklearn.model_selection import train_test_split
# # x = data.iloc[:, :-1].values
# # y = data.iloc[:, -1:].values

# # #spliting the dataset into training and test set
# # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1234)

In [43]:
def get_accuracy(y_true, y_pred):
    """
    Function to calculate accuracy
    """
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

In [44]:
classifier = DecisionTree(max_depth=10)
classifier.fit(X_train, y_train)

In [45]:
X_test

array([[1.00e+01, 1.15e+02, 0.00e+00, ..., 3.53e+01, 1.34e-01, 2.90e+01],
       [4.00e+00, 1.10e+02, 9.20e+01, ..., 3.76e+01, 1.91e-01, 3.00e+01],
       [7.00e+00, 1.00e+02, 0.00e+00, ..., 3.00e+01, 4.84e-01, 3.20e+01],
       ...,
       [6.00e+00, 1.62e+02, 6.20e+01, ..., 2.43e+01, 1.78e-01, 5.00e+01],
       [1.00e+00, 1.06e+02, 7.60e+01, ..., 3.75e+01, 1.97e-01, 2.60e+01],
       [2.00e+00, 1.22e+02, 7.00e+01, ..., 3.68e+01, 3.40e-01, 2.70e+01]])

In [46]:
y_pred = classifier.predict(X_test)

In [47]:
acc = get_accuracy(y_test, y_pred)
print ("Accuracy:", acc)

Accuracy: 0.7337662337662337
