In [2]:
import numpy as np
import pandas as pd

# Step 1: Load and Preprocess the dataset
# "bill_authentication.csv" is expected to be in the working directory.
df = pd.read_csv("bill_authentication.csv")

# Print the first few rows of the dataset
print("First few rows of the dataset:")
print(df.head())

# Also print a random sample so rows beyond the top of the file are visible.
# random_state pins the draw, so re-running the notebook prints the same rows.
print("\nRandom sample of the dataset:")
print(df.sample(5, random_state=42))  # Change the first argument to print a different number of rows

# The "Class" column holds the target labels; every other column is a feature.
X = df.drop(columns=["Class"]).values
y = df["Class"].values

# Split the data into training and testing sets
def train_test_split(X, y, test_size=0.2, random_state=None):
    """Randomly partition ``X`` and ``y`` into training and testing subsets.

    Parameters
    ----------
    X : array-like
        Samples; must support indexing with an integer index array along
        its first axis (e.g. a NumPy array).
    y : array-like
        Labels, indexed the same way. ``len(y)`` determines the row count.
    test_size : float
        Fraction of rows assigned to the test set. The row count is
        ``int(test_size * len(y))``, i.e. truncated toward zero.
    random_state : int or None
        Seed for the shuffle. ``None`` draws from NumPy's global random
        state, so the split then depends on whatever state it is in.

    Returns
    -------
    X_train, X_test, y_train, y_test
        The test subsets take the first ``int(test_size * len(y))`` shuffled
        rows and the training subsets take the rest.
    """
    if random_state is not None:
        # A private generator yields the same shuffle as seeding the global
        # one, but does not reset NumPy's global random state for the caller.
        rng = np.random.RandomState(random_state)
    else:
        rng = np.random

    indices = np.arange(len(y))
    rng.shuffle(indices)

    n_test = int(test_size * len(y))
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]

    # The same index arrays are applied to X and y so rows stay aligned.
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    return X_train, X_test, y_train, y_test

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 2: Implement the Decision Tree from Scratch
class DecisionTree:
    """Binary decision-tree classifier grown greedily by information gain.

    Every internal node tests ``x[feature] < value``: samples that satisfy
    the test go to the left child, all others go to the right child.

    Parameters
    ----------
    max_depth : int or None
        Depth at which growth stops and a leaf is emitted. ``None`` never
        equals a depth, so growth continues until another stop rule fires.
    min_samples_split : int
        A node holding fewer samples than this becomes a leaf.
    verbose : bool
        When true (the default, which keeps the previous output), the entropy
        and best information gain of every evaluated node are printed.
    """

    def __init__(self, max_depth=None, min_samples_split=2, verbose=True):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.verbose = verbose

    def fit(self, X, y):
        """Grow the tree from samples ``X`` (2-D array) and labels ``y`` (1-D array)."""
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        """Recursively build and return the node for the samples ``X``/``y``."""
        if depth == self.max_depth or len(np.unique(y)) == 1 or len(y) < self.min_samples_split:
            return self._make_leaf(y)

        best_feature, best_value = self._find_best_split(X, y)
        if best_feature is None:
            # No threshold separates the samples (every feature is constant
            # here although the labels differ), so fall back to a leaf
            # instead of indexing with a missing feature.
            return self._make_leaf(y)

        # Split the data
        left_indices = X[:, best_feature] < best_value
        right_indices = ~left_indices

        left_child = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_child = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return DecisionTreeNode(best_feature, best_value, left_child, right_child, None)

    def _make_leaf(self, y):
        """Return a leaf node predicting the most frequent label in ``y``."""
        return DecisionTreeNode(None, None, None, None, self._majority_class(y))

    def _majority_class(self, y):
        """Return the most frequent label in ``y``; ties go to the smallest label.

        Counting through ``np.unique`` works for any sortable label type,
        not only non-negative integers.
        """
        labels, counts = np.unique(y, return_counts=True)
        return labels[counts.argmax()]

    def _find_best_split(self, X, y):
        """Search every feature/threshold pair for the highest information gain.

        Candidate thresholds are the unique values of each feature. Splits
        that leave either side empty are skipped.

        Returns
        -------
        (best_feature, best_value)
            Column index and threshold of the best split, or ``(None, None)``
            when no candidate produces two non-empty sides.
        """
        best_feature = None
        best_value = None
        best_info_gain = -1

        parent_entropy = self._calculate_entropy(y)
        if self.verbose:
            print("Entropy:", parent_entropy)

        for feature in range(X.shape[1]):
            values = np.unique(X[:, feature])
            for value in values:
                left_indices = X[:, feature] < value
                right_indices = ~left_indices

                if np.any(left_indices) and np.any(right_indices):
                    info_gain = self._calculate_info_gain(y, y[left_indices], y[right_indices], parent_entropy)
                    if info_gain > best_info_gain:
                        best_info_gain = info_gain
                        best_feature = feature
                        best_value = value

        if self.verbose:
            print("Best Information Gain:", best_info_gain)
        return best_feature, best_value

    def _calculate_entropy(self, y):
        """Return the Shannon entropy (in bits) of the label array ``y``."""
        _, label_counts = np.unique(y, return_counts=True)
        probabilities = label_counts / len(y)
        # np.unique only reports labels that occur, so every probability is
        # strictly positive and log2 needs no epsilon; a pure node yields 0.
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return entropy

    def _calculate_info_gain(self, parent, left_child, right_child, parent_entropy):
        """Return the entropy reduction obtained by splitting ``parent``.

        ``left_child`` and ``right_child`` are the label arrays on each side
        of the split; their entropies are weighted by their share of the
        parent's samples and subtracted from ``parent_entropy``.
        """
        total_samples = len(parent)
        left_weight = len(left_child) / total_samples
        right_weight = len(right_child) / total_samples

        left_entropy = self._calculate_entropy(left_child)
        right_entropy = self._calculate_entropy(right_child)

        info_gain = parent_entropy - (left_weight * left_entropy) - (right_weight * right_entropy)
        return info_gain

    def predict(self, X):
        """Return an array with the predicted label for each row of ``X``."""
        return np.array([self._predict_sample(x, self.tree) for x in X])

    def _predict_sample(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its label."""
        if node.prediction is not None:
            return node.prediction

        if x[node.feature] < node.value:
            return self._predict_sample(x, node.left_child)
        else:
            return self._predict_sample(x, node.right_child)

    def print_tree(self):
        """Print the fitted tree as indented text, starting at the root."""
        self._print_tree_node(self.tree)

    def _print_tree_node(self, node, indent=""):
        """Print ``node`` and its subtrees, indenting each level by two spaces."""
        if node is None:
            return

        if node.prediction is not None:
            print(indent + "Prediction:", node.prediction)
            return

        print(indent + "Feature:", node.feature)
        print(indent + "Value:", node.value)
        print(indent + "Left:")
        self._print_tree_node(node.left_child, indent + "  ")
        print(indent + "Right:")
        self._print_tree_node(node.right_child, indent + "  ")


class DecisionTreeNode:
    """A single node of the decision tree.

    The five constructor arguments are stored unchanged as attributes of the
    same name. The tree builder in this file creates internal nodes with
    ``prediction=None`` and leaves with only ``prediction`` set.
    """

    def __init__(self, feature, value, left_child, right_child, prediction):
        # Split rule: the column to test and the threshold it is compared with.
        self.feature, self.value = feature, value
        # Subtrees followed when the split test passes (left) or fails (right).
        self.left_child, self.right_child = left_child, right_child
        # Label returned when this node is a leaf.
        self.prediction = prediction

# Step 3: Train the Decision Tree
# Growth is capped at depth 3.
decision_tree = DecisionTree(max_depth=3)
decision_tree.fit(X_train, y_train)

# Step 4: Print the Decision Tree
print("Decision Tree:")
decision_tree.print_tree()

# Step 5: Evaluate the Decision Tree
predictions = decision_tree.predict(X_test)

# Accuracy is the fraction of test rows whose predicted label matches the true one.
accuracy = (predictions == y_test).mean()
print("Accuracy:", accuracy)


First few rows of the dataset:
   Variance  Skewness  Curtosis  Entropy  Class
0   3.62160    8.6661   -2.8073 -0.44699      0
1   4.54590    8.1674   -2.4586 -1.46210      0
2   3.86600   -2.6383    1.9242  0.10645      0
3   3.45660    9.5228   -4.0112 -3.59440      0
4   0.32924   -4.4552    4.5718 -0.98880      0

Random sample of the dataset:
      Variance  Skewness  Curtosis  Entropy  Class
1000   -2.8829    3.8964  -0.18880 -1.16720      1
1097   -2.5919   -1.0553   3.89490  0.77757      1
735     3.5358    6.7086  -0.81857  0.47886      0
942    -3.3793  -13.7731  17.92740 -2.03230      1
73      4.1654   -3.4495   3.64300  1.08790      0
Entropy: 0.9895494615608866
Best Information Gain: 0.4299412435830885
Entropy: 0.6881307949040949
Best Information Gain: 0.28694331834807263
Entropy: 0.2938465246028983
Best Information Gain: 0.06784579784409125
Entropy: 0.8440991715082329
Best Information Gain: 0.7831718800320864
Entropy: 0.44055572566712337
Best Information Gain: 0.12875197